In [None]:
# cleans existing files of false line breaks and special characters
# by ML

import re

# Input and output file paths
# Both names are relative paths, so they resolve against the current working directory.
input_file = '2023_GSM_raw.csv'
output_file = '2023_GSM_parsed.csv'

# Define regex pattern for valid rows: start with a number and comma
# NOTE(review): `\d*` also accepts zero digits, so a line that begins with a
# bare comma is treated as the start of a new row as well — confirm whether
# that is intended or whether `\d+` was meant.
valid_row_pattern = re.compile(r'^\d*,')

# Read and process the file
# Physical lines that do not start a new row are glued onto the row that is
# currently being assembled, which undoes line breaks inside a record.
with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
    buffer = ""  # text of the row being assembled; may span several physical lines
    for line in infile:
        # Order matters in this chain: line terminators are removed first, then
        # look-alike characters are mapped to ASCII, then runs of three and of
        # two spaces are shortened in a single pass each.
        # NOTE(review): some literals below are non-ASCII look-alikes (a
        # hyphen variant and, presumably, special space characters) — verify
        # the exact code points in an editor that shows them.
        # NOTE(review): one pass per pattern does not collapse every long run
        # of spaces (e.g. seven spaces end up as two) — confirm this is acceptable.
        line = line.replace("\n", "").replace("\r", "").replace("‑", "-").replace(" ", " ").replace(" ", " ").replace("   ", " ").replace("  ", " ")    # remove linebreaks & spec. char.
        if valid_row_pattern.match(line):  # Line matches the pattern, it's a valid new row
            if buffer:
                outfile.write(buffer.strip() + "\n")  # Write the previous valid row
            buffer = line  # Start a new buffer
        else:
            # Continuation of the previous row. If no row has started yet,
            # this text becomes the start of the first buffered row.
            buffer += " " + line.strip()  # Append invalid line to the current buffer

    # Write the last buffered row if exists
    if buffer:
        outfile.write(buffer.strip() + "\n")

In [11]:
#extract list of all brands
import pandas as pd
import os

cwd = os.getcwd()  # not used by the steps below; kept in case other cells rely on it
input_file = 'brand_links_all.txt'
output_file = 'unique_brands.txt'

# The input has no header row; every line is read into a single 'url' column.
df = pd.read_csv(input_file, header=None, names=['url'])


# Extract substring between ".com/" and "-phones"
df['brand'] = df['url'].str.extract(r'\.com\/(.*?)\-phones')

# Drop rows where the pattern did not match, turn underscores into spaces and
# only THEN de-duplicate. De-duplicating before the underscore replacement
# could leave two entries that differ only by '_' versus ' ' in the output.
unique_brands = (
    df['brand']
    .dropna()
    .replace('_', ' ', regex=True)
    .drop_duplicates()
)

# Store to file (single column with the header 'Brand')
unique_brands_df = unique_brands.reset_index(drop=True)  # Reset index for a clean CSV file
unique_brands_df.to_csv(output_file, index=False, header=['Brand'])

print(f"Extracted {len(unique_brands)} unique brands. Saved to '{output_file}'.")

Extracted 119 unique brands. Saved to 'unique_brands.txt'.


In [None]:
# Separates the Name column into brand and model

import pandas as pd

# Input file paths
input_csv = '2024_GSM_ML.csv'
brands_csv = 'brand_links_all.csv'

# Load the devices and brand list
devices_df = pd.read_csv(input_csv)
to_split_col = 'Name'

brands_df = pd.read_csv(brands_csv)

# Lower-cased brand names as a list, used for case-insensitive prefix matching.
# Missing values are dropped first: a NaN entry would stay a float in the list
# and make `str.startswith` raise a TypeError during matching. Duplicates are
# dropped because they add nothing to a "longest matching prefix" search.
brands = brands_df['brand'].dropna().str.lower().drop_duplicates().tolist()

# Function to split name into brand and model
def split_brand_model(name, brand_list=None):
    """Split a device name into a (brand, model) pair.

    The brand is the longest entry of the brand list that the lower-cased
    name starts with; the model is whatever follows it, stripped of
    surrounding whitespace.

    Parameters
    ----------
    name : str
        Full device name. Any non-string value (e.g. NaN from a missing
        cell) is passed through unchanged as the model with no brand.
    brand_list : iterable of str, optional
        Lower-cased brand names to match against. Defaults to the
        module-level ``brands`` list.

    Returns
    -------
    pandas.Series
        Two elements: ``[brand, model]``. ``brand`` is ``None`` when no brand
        matches.
    """
    if brand_list is None:
        brand_list = brands

    # Missing or non-text names cannot be matched; keep them as-is instead of
    # failing on `.lower()`.
    if not isinstance(name, str):
        return pd.Series([None, name])

    name_lower = name.lower()
    # Find the longest matching brand in the name (non-string entries are skipped)
    matched_brand = max(
        (brand for brand in brand_list
         if isinstance(brand, str) and name_lower.startswith(brand)),
        key=len,
        default=None,
    )
    if matched_brand:
        # NOTE(review): `.title()` turns acronyms such as "htc" into "Htc" —
        # confirm this casing is what downstream steps expect.
        brand = matched_brand.title()
        model = name[len(matched_brand):].strip()  # Remaining part of the name is the model
        return pd.Series([brand, model])

    # If no brand is matched, return the full name as model with no brand
    return pd.Series([None, name])

# Destination for the enriched device table
output_csv = 'devices_with_brands.csv'

# Run the splitter over every 'Name' value; the two-column result is then
# stored as the 'Brand' and 'Model' columns.
brand_model = devices_df['Name'].apply(split_brand_model)
devices_df[['Brand', 'Model']] = brand_model

# Write the table without the index column and report where it went
devices_df.to_csv(output_csv, index=False)
print(f"Processed data saved to '{output_csv}'")



In [21]:
# Build a unique 'id' column (from name, or brand + model) and warn about duplicates
def parse_df(file, target):
    """Normalise a device CSV so that it has an 'id' column and save it.

    Column headers are lower-cased. The 'id' column is obtained as follows:

    * an existing 'id' column is kept untouched, so running the function on
      an already parsed file works (previously this raised);
    * otherwise an existing 'name' column is renamed to 'id';
    * otherwise 'id' is inserted as the first column, built from
      ``brand + ' ' + model``.

    Duplicate ids are only reported with a printed warning; no rows are
    removed.

    Parameters
    ----------
    file : str
        Path of the CSV file to read.
    target : str
        Path the normalised CSV is written to (without the index column).

    Raises
    ------
    KeyError
        If none of 'id', 'name' or both 'brand' and 'model' is present.
    """
    df = pd.read_csv(file, delimiter=",", header="infer")
    # All column headers into lower case
    df.columns = df.columns.str.lower()

    if 'id' in df.columns:
        # Already parsed: nothing to build, which makes re-runs safe.
        pass
    elif 'name' in df.columns:
        df = df.rename(columns={'name': 'id'})
    else:
        df.insert(0, 'id', df['brand'] + ' ' + df['model'])

    # Check for duplicates
    if df['id'].duplicated().any():
        print(f"caution, df {file} contains duplicate")

    df.to_csv(target, index=False)

# Folder that receives the parsed files
new_path = "../1_parsed_data/"

# Inputs and their outputs, matched by position. The 2023 entry is read from
# and written back to the same path.
file_list = ["2017_GSM.csv", new_path + "2023_GSM_parsed.csv"]
target_list = [new_path + parsed_name for parsed_name in ("2017_GSM_parsed.csv", "2023_GSM_parsed.csv")]

for file, target in zip(file_list, target_list):
    parse_df(file, target)



0
0


In [None]:
# 2017
# parse numerical cols
# keep only relevant

import string
import re
import pandas as pd

# The raw 2017 export is read from the working directory; the processed copy
# keeps the same file name under ../1_parsed_data/.
read_file = "2017_GSM.csv"
write_file = "../1_parsed_data/" + read_file

# Comma-separated file whose first line holds the column names
df_orig = pd.read_csv(read_file, sep=",", header="infer")

# Function to extract a number using a regex; returns None when that is not possible.
def number_or_none(regex, value, pick_group=1, type="int"):
    """Search ``value`` with ``regex`` and convert one capture group to a number.

    Parameters
    ----------
    regex : str
        Pattern handed to ``re.search``.
    value : str
        Text to search. Any non-string value (e.g. NaN from a missing cell)
        yields ``None``.
    pick_group : int, optional
        Capture group to convert (default 1).
    type : {"int", "float"}, optional
        Target numeric type (default "int"). Any other value yields ``None``.

    Returns
    -------
    int, float or None
        The converted number, or ``None`` when there is no match, the group
        does not exist or did not take part in the match, or the matched text
        cannot be converted.

    Notes
    -----
    An invalid ``regex`` raises ``re.error`` instead of being silently
    swallowed, so a broken pattern no longer looks like "no data".
    """
    converters = {"int": int, "float": float}
    convert = converters.get(type)
    if convert is None or not isinstance(value, str):
        return None

    match = re.search(regex, value)
    if match is None:
        return None

    try:
        return convert(match.group(pick_group))
    except (IndexError, TypeError, ValueError):
        # IndexError: no such group; TypeError: group did not participate
        # (None); ValueError: matched text is not a valid number.
        return None

# Function to turn text into number and mb/kb into gb
def conv2gb(input:str)->float:
    """Extract a memory size from free text and return it in gigabytes.

    Punctuation is replaced by spaces before matching, except for a '.' that
    sits between two digits, so decimal sizes such as "1.5 GB" keep their
    value. The text is then searched for "<number> GB", then "<number> MB",
    then "<number> KB" (case-insensitive, exactly one space before the unit);
    the first unit that is found wins. MB and KB values are converted with
    binary factors (1024 MB per GB, 1024 KB per MB).

    Parameters
    ----------
    input : str
        Free-text description, e.g. "128 MB ROM| 64 MB RAM".

    Returns
    -------
    float or None
        Size in GB, or ``None`` when ``input`` is not a string or contains no
        recognisable size.
    """
    if not isinstance(input, str):
        return None

    # Every punctuation character except '.' is always a separator; a '.' is
    # only a separator when it is not flanked by digits on both sides.
    other_punctuation = re.escape(string.punctuation.replace('.', ''))
    cleaned = re.sub(r"[{}]|(?<!\d)\.|\.(?!\d)".format(other_punctuation), ' ', input)

    # Units in priority order with the divisor that converts them to GB.
    for unit, divisor in (('GB', 1), ('MB', 1024), ('KB', 1024 * 1024)):
        match = re.search(r'(\d+(?:\.\d+)?) {}'.format(unit), cleaned, flags=re.IGNORECASE)
        if match:
            return float(match.group(1)) / divisor
    return None

# Function to standardize the date format
def parse_announced(value):
    """Convert an 'announced' value into a date.

    Recognised forms:

    * ``"<year> Q<n>"``   -> string ``"<year>-MM-01"`` with MM = 03/06/09/12
      for Q1..Q4 (an unknown quarter label falls back to ``"01"``);
    * ``"<year> <Month>"`` -> result of ``pd.to_datetime`` with format
      ``'%Y %B'`` (``NaT`` if the text does not fit that format exactly);
    * ``"<year>"`` (four digits) -> string ``"<year>-01-01"``.

    Parameters
    ----------
    value : object
        Raw cell value; missing values are accepted.

    Returns
    -------
    str, pandas.Timestamp, NaT or None
        See the list above; ``None`` for missing or unrecognised values.
        Note that the month form returns a Timestamp while the other forms
        return ISO-formatted strings.
    """
    if pd.isna(value):  # Handle NaN values
        return None
    value = str(value).strip()
    if 'Q' in value:  # Handle quarters
        parts = value.split()
        # Anything other than exactly "<digits> <quarter>" (e.g. trailing
        # "Released ..." text, or the year and quarter swapped) is not
        # recognised; previously extra tokens raised a ValueError.
        if len(parts) != 2 or not parts[0].isdigit():
            return None
        year, quarter = parts
        month = {'Q1': '03', 'Q2': '06', 'Q3': '09', 'Q4': '12'}.get(quarter, '01')
        return f"{year}-{month}-01"
    elif any(month in value for month in
             ['January', 'February', 'March', 'April', 'May', 'June',
              'July', 'August', 'September', 'October', 'November', 'December']):  # Handle months
        return pd.to_datetime(value, format='%Y %B', errors='coerce')
    elif len(value) == 4 and value.isdigit():  # Handle year only
        return f"{value}-01-01"
    else:
        return None

# data cleaning:
# Work on a deep copy so df_orig stays untouched and this cell can be re-run.
df = df_orig.copy(deep=True)

# dropping 'No cellular connectivity'
# (the filters on status 'Discontinued' / 'Cancelled' and on rows without a
#  price are currently disabled)
df.drop(df.loc[
    ##(df['status'] == 'Discontinued') | 
    ##(df['status'] == 'Cancelled') |  
    ##(df['approx_price_EUR'].isnull()) |
    (df['network_technology'] == 'No cellular connectivity')
].index, inplace=True)

# because model is not a unique value for all rows, we add brand to it.
df['Name'] = df['brand'] + ' ' + df['model']

# 'internal_memory': find "__ GB" (or MB / KB, converted to GB) to make the col numeric.
# The numeric column is placed right after the text column, which is then dropped.
new_col = df['internal_memory'].apply(conv2gb)
df.insert(df.columns.get_loc('internal_memory')+1, 'internal_memory_in_GB', new_col)
df.drop(columns=['internal_memory'], inplace=True)

# 'memory_card' regex for "microSD  up to 16 GB (dedicated slot)" -> make numerical
# Raw string: '\d' in a normal string literal is an invalid escape sequence
# and triggers a warning.
new_col = df['memory_card'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) GB'))
df.insert(df.columns.get_loc('memory_card')+1, 'max_memory_card_size_GB', new_col)

# 'RAM' regex "128 MB ROM| 64 MB RAM" -> make numerical
ram_gb = df['RAM'].apply(conv2gb)
df.insert(df.columns.get_loc('RAM')+1, 'RAM_in_GB', ram_gb)
df.drop(columns=['RAM'], inplace=True)

# 'primary_camera' -> extract MP
new_col = df['primary_camera'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
df.insert(df.columns.get_loc('primary_camera')+1, 'primary_camera_mega_pixel', new_col)

# 'secondary_camera' -> extract MP
new_col = df['secondary_camera'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
df.insert(df.columns.get_loc('secondary_camera')+1, 'secondary_camera_mega_pixel', new_col)

# 'announced'
#df['announced_date'] = df['announced'].apply(parse_announced)
#df['announced_date'] = pd.to_datetime(df['announced_date'], errors='coerce')

# 'display_resolution' -> 'display_size_inches'
# The decimal point is escaped and the fractional part is optional. With the
# previous unescaped '.', whole-number sizes matched only when they had three
# or more digits, and any character could stand in for the decimal point.
new_col = df['display_resolution'].apply(lambda x: number_or_none(value=x, regex=r'(\d+(?:\.\d+)?) inches', type="float"))
df.insert(df.columns.get_loc('display_resolution')+1, 'display_size_inches', new_col)

df.rename(columns={'display_size': 'display_width_length'}, inplace=True)

# any weight in numeric format (values that cannot be parsed become NaN)
df['weight_g'] = pd.to_numeric(df['weight_g'], errors='coerce')
df['weight_oz'] = pd.to_numeric(df['weight_oz'], errors='coerce')

# battery mAh to numeric
# The text column is renamed to 'battery_description' and the numeric
# capacity takes over the name 'battery'.
new_col = df['battery'].apply(lambda x: number_or_none(value=x, regex=r'(\d+)\s*mAh', type="int"))
df.rename(columns={'battery': 'battery_description'}, inplace=True)
df.insert(df.columns.get_loc('battery_description')+1, 'battery', new_col)


#TODO: split status into 2 cols, second parsed as date
#TODO: remove id col from csv

## -> generally negative values are No or NaN, any other value means: exists

# keep only relevant


# NOTE: the DataFrame index is written as the first, unnamed column (see the TODO above).
df.to_csv(write_file, sep=',')

# NOTE(review): this list is defined after the file has been written and is
# not applied anywhere in this cell; several entries do not match the column
# names produced above (e.g. 'announced_date', 'internal_memory', 'ram') —
# confirm the intended selection.
relevant = ["display_size_inches", "internal_memory_in_GB", "primary_camera", "loud_speaker", "GPS", "colors", "approx_price_EUR", "announced_date", "battery", "internal_memory", "approx_price_eur", "ram"]

  new_col = df['memory_card'].apply(lambda x: number_or_none(value=x, regex='(\d+) GB'))


Unnamed: 0,brand,model,network_technology,2G_bands,3G_bands,4G_bands,network_speed,GPRS,EDGE,announced,...,NFC,radio,USB,sensors,battery_description,battery,colors,approx_price_EUR,img_url,Name
0,Acer,Iconia Talk S,GSM / HSPA / LTE,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2,HSDPA 850 / 1900 / 2100,LTE band 1(2100)| 3(1800)| 7(2600)| 8(900)| 20...,HSPA 42.2/11.5 Mbps LTE Cat4 150/50 Mbps,Yes,Yes,2016 August,...,,FM radio,microUSB 2.0,Accelerometer| proximity,Non-removable Li-Ion 3400 mAh battery (12.92 Wh),3400.0,Black,170.0,http://cdn2.gsmarena.com/vv/bigpic/acer-iconia...,Acer Iconia Talk S
1,Acer,Liquid Z6 Plus,GSM / HSPA / LTE,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2 (d...,HSDPA,LTE,HSPA 42.2/5.76 Mbps LTE Cat4 150/50 Mbps,Yes,Yes,2016 August,...,,FM radio,microUSB 2.0,Fingerprint (front-mounted)| accelerometer| pr...,Removable Li-Po 4080 mAh battery,4080.0,Black| White,250.0,http://cdn2.gsmarena.com/vv/bigpic/acer-liquid...,Acer Liquid Z6 Plus
2,Acer,Liquid Z6,GSM / HSPA / LTE,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2 (d...,HSDPA,LTE,HSPA LTE,Yes,Yes,2016 August,...,,FM radio,microUSB 2.0,Accelerometer| proximity,Removable Li-Ion 2000 mAh battery,2000.0,Black| White,120.0,http://cdn2.gsmarena.com/vv/bigpic/acer-liquid...,Acer Liquid Z6
4,Acer,Liquid X2,GSM / HSPA / LTE,GSM 850 / 900 / 1800 / 1900,HSDPA 900 / 1900 / 2100 - Europe| Taiwan,LTE 800 / 1800 / 2100 / 2600 - Europe,HSPA 42.2/5.76 Mbps LTE Cat4 150/50 Mbps,Yes,Yes,2015 April,...,,FM radio,microUSB 2.0,Accelerometer| proximity| compass,Removable Li-Po 4020 mAh battery,4020.0,Black| Gold,230.0,http://cdn2.gsmarena.com/vv/bigpic/acer-liquid...,Acer Liquid X2
5,Acer,Liquid Jade 2,GSM / HSPA / LTE,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2,HSDPA 900 / 2100,LTE,HSPA 42.2/5.76 Mbps LTE Cat4 150/50 Mbps,Yes,Yes,2016 February,...,,FM radio,microUSB 2.0,Accelerometer| gyro| proximity| compass,,,Black,,http://cdn2.gsmarena.com/vv/bigpic/acer-liquid...,Acer Liquid Jade 2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8626,ZTE,F600,GSM / UMTS,GSM 850 / 900 / 1800 / 1900,UMTS 850 / 1900 / 2100,,Yes 384 kbps,Class 10,Class 10,2009. Released 2009,...,,FM radio,2.0,,Removable Li-Ion 800 mAh battery,800.0,Black,,http://cdn2.gsmarena.com/vv/bigpic/zte-f600.jpg,ZTE F600
8627,ZTE,F103,GSM / UMTS,GSM 850 / 900 / 1800 / 1900,UMTS 850 / 1900 / 2100,,Yes 384 kbps,Class 10,Class 10,2009. Released 2009,...,,FM radio,2.0,,Removable Li-Ion 1000 mAh battery,1000.0,Black| Silver,,http://cdn2.gsmarena.com/vv/bigpic/zte-f103.jpg,ZTE F103
8628,ZTE,F101,GSM / UMTS,GSM 850 / 900 / 1800 / 1900,UMTS 850 / 1900 / 2100,,Yes 384 kbps,Class 10,Class 10,2009. Released 2009,...,,FM radio,2.0,,Removable Li-Ion 1000 mAh battery,1000.0,White| Red| Green,,http://cdn2.gsmarena.com/vv/bigpic/zte-f101.jpg,ZTE F101
8629,ZTE,F100,GSM / UMTS,GSM 850 / 900 / 1800 / 1900,UMTS 850 / 1900 / 2100,,Yes 384 kbps,Class 10,Class 10,2009. Released 2009,...,,FM radio,2.0,,Removable Li-Ion 1000 mAh battery,1000.0,White| Red| Green,,http://cdn2.gsmarena.com/vv/bigpic/zte-f100.jpg,ZTE F100
