In [None]:
# cleans existing files of false line breaks and special characters
# by ML

import re

# Input and output file paths
input_file = '2023_GSM_raw.csv'
output_file = '2023_GSM_parsed.csv'

# A genuine record starts with one or more digits followed by a comma.
# `\d+` rather than `\d*`: with `\d*` a wrapped fragment that merely begins
# with "," would be mistaken for the start of a new row instead of being
# appended to the row it continues.
valid_row_pattern = re.compile(r'^\d+,')

# Read and process the file
# Any run of whitespace (ordinary, non-breaking and other Unicode spaces) except tabs.
_WHITESPACE_RUN = re.compile(r'[^\S\t]+')


def _normalize_line(line):
    """Remove line-break characters and tidy special characters in ``line``.

    "\\n" and "\\r" are deleted, the typographic hyphen is replaced by an
    ASCII "-", and every run of whitespace other than tabs is collapsed to a
    single ordinary space (a chain of fixed-width ``str.replace`` calls leaves
    longer runs only partially collapsed).
    """
    line = line.replace("\n", "").replace("\r", "").replace("‑", "-")
    return _WHITESPACE_RUN.sub(" ", line)


def _merge_wrapped_rows(lines):
    """Yield complete rows from ``lines``, joining falsely broken rows.

    A line matching ``valid_row_pattern`` starts a new row; any other
    non-blank line is appended (space-separated) to the row being built.
    Lines seen before the first valid row are merged into one leading row.
    """
    buffer = ""
    for raw_line in lines:
        line = _normalize_line(raw_line)
        if valid_row_pattern.match(line):  # tested before stripping, so indented lines stay continuations
            if buffer:
                yield buffer.strip()  # the previous row is complete
            buffer = line.rstrip()  # start a new row
        else:
            fragment = line.strip()
            if fragment:  # blank lines would only add stray spaces
                buffer = f"{buffer} {fragment}" if buffer else fragment

    # Emit the last buffered row if there is one
    if buffer:
        yield buffer.strip()


with open(input_file, 'r', encoding='utf-8') as infile, \
        open(output_file, 'w', encoding='utf-8') as outfile:
    for row in _merge_wrapped_rows(infile):
        outfile.write(row + "\n")

In [11]:
#extract list of all brands
import pandas as pd
import os

cwd = os.getcwd()
input_file = 'brand_links_all.txt'
output_file = 'unique_brands.txt'

# Load the link list as a single 'url' column (the first line is treated as
# data, not as a header).
df = pd.read_csv(input_file, header=None, names=['url'])

# The brand slug is the shortest text found between ".com/" and "-phones".
df['brand'] = df['url'].str.extract(r'\.com\/(.*?)\-phones')

# Discard URLs without a match, keep the first occurrence of each slug,
# then turn underscores into spaces.
unique_brands = (
    df['brand']
    .dropna()
    .drop_duplicates()
    .replace('_', ' ', regex=True)
)

# Write the brands as a one-column file headed "Brand".
unique_brands_df = unique_brands.reset_index(drop=True)
unique_brands_df.to_csv(output_file, index=False, header=['Brand'])

print(f"Extracted {len(unique_brands)} unique brands. Saved to '{output_file}'.")

Extracted 119 unique brands. Saved to 'unique_brands.txt'.


In [None]:
# Separates the Name column into brand and model

import pandas as pd

# Input file paths
input_csv = '2024_GSM_ML.csv'
brands_csv = 'brand_links_all.csv'

# Load the devices and brand list
devices_df = pd.read_csv(input_csv)
to_split_col = 'Name'

brands_df = pd.read_csv(brands_csv)

# Lower-cased brand names (a list) for case-insensitive prefix matching.
# Missing entries are dropped because a NaN would make str.startswith raise,
# and duplicates are removed so each brand is tested once per device name.
brands = brands_df['brand'].dropna().str.lower().drop_duplicates().tolist()

# Function to split name into brand and model
def _is_brand_prefix(name_lower, brand):
    """Return True if ``name_lower`` starts with ``brand`` as a whole word."""
    if not name_lower.startswith(brand):
        return False
    rest = name_lower[len(brand):]
    # The brand must be followed by the end of the name or a non-alphanumeric
    # character, so "cat" does not match "caterpillar".
    return not rest or not rest[0].isalnum()


def split_brand_model(name, brand_list=None):
    """Split a device name into its brand and model parts.

    Parameters
    ----------
    name : str
        Full device name. Any non-string value (e.g. NaN from an empty cell)
        is returned unchanged as the model with no brand.
    brand_list : iterable of str, optional
        Lower-cased brand names to match against. Defaults to the
        notebook-level ``brands`` list. Empty and non-string entries are
        ignored.

    Returns
    -------
    pandas.Series
        Two elements: ``[brand, model]``. ``brand`` is the longest brand that
        prefixes ``name`` (case-insensitive, whole word), title-cased;
        ``model`` is the rest of the name, stripped. When nothing matches the
        result is ``[None, name]``.
    """
    if not isinstance(name, str):
        return pd.Series([None, name])

    candidates = brands if brand_list is None else brand_list
    name_lower = name.lower()
    # Longest match wins so multi-word brands beat their own first word.
    matched_brand = max(
        (
            brand
            for brand in candidates
            if isinstance(brand, str) and brand and _is_brand_prefix(name_lower, brand)
        ),
        key=len,
        default=None,
    )
    if matched_brand:
        # NOTE(review): str.title() turns acronym brands such as "htc" into "Htc".
        brand = matched_brand.title()
        model = name[len(matched_brand):].strip()  # Remaining part of the name is the model
        return pd.Series([brand, model])

    # If no brand is matched, return the full name as model with no brand
    return pd.Series([None, name])

# Apply the split to the configured name column (`to_split_col`) instead of a
# second hard-coded copy of the column name; the two result values per row
# are stored in the new 'Brand' and 'Model' columns.
devices_df[['Brand', 'Model']] = devices_df[to_split_col].apply(split_brand_model)

# Save the output to a new CSV
output_csv = 'devices_with_brands.csv'
devices_df.to_csv(output_csv, index=False)

print(f"Processed data saved to '{output_csv}'")