In [47]:
# cleans existing 20203 raw file of false line breaks and special characters
# by ML

import re

# Input and output file paths
input_file = '2023_GSM_raw.csv'
output_file = '2023_GSM.csv'

# Define regex pattern for valid rows: start with a number and comma
valid_row_pattern = re.compile(r'^\d*,')

# Read and process the file
with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
    buffer = ""
    for line in infile:
        line = line.replace("\n", "").replace("\r", "").replace("‑", "-").replace(" ", " ").replace(" ", " ").replace("   ", " ").replace("  ", " ")    # remove linebreaks & spec. char.
        if valid_row_pattern.match(line):  # Line matches the pattern, it's a valid new row
            if buffer:
                outfile.write(buffer.strip() + "\n")  # Write the previous valid row
            buffer = line  # Start a new buffer
        else:
            buffer += " " + line.strip()  # Append invalid line to the current buffer

    # Write the last buffered row if exists
    if buffer:
        outfile.write(buffer.strip() + "\n")

In [82]:
# 2017
# parse numerical cols
# keep only relevant

import string
import re
import pandas as pd

#csv_path = "../data/"
read_file = "2017_GSM.csv"
write_file = f"../1_parsed_data/2017_GSM_parsed.csv"

df_orig = pd.read_csv(read_file, delimiter=",", header="infer")

# Function to extract value usig regex. if not possible return none.
def number_or_none(regex, value, pick_group=1, type="int"):
    try:
        if type == "int":
            return int(re.search(regex, value).group(pick_group))
        elif type == "float":
            return float(re.search(regex, value).group(pick_group))
    except:
        pass
    return None

# Function to turn text into number and mb/kb into gb
def conv2gb(input:str)->float:
    result = None
    if isinstance(input, str):
        input = re.sub(r"[{}]".format(re.escape(string.punctuation)), ' ', input)
        try:
            result = float(re.search(r'(\d+) GB', input, flags=re.IGNORECASE)[0][:-3])
        except TypeError:
            try:
                result = float(re.search(r'(\d+) MB', input, flags=re.IGNORECASE)[0][:-3])/1028
            except TypeError:
                try:
                    result = (float(re.search(r'(\d+) KB', input, flags=re.IGNORECASE)[0][:-3])/1028)/1028
                except:
                    pass
    return result

# Standardize the date format
def parse_date(value):
    if pd.isna(value):  # Handle NaN values
        return None
    # Map for replacing three-letter months with full month names
    month_map = {
        'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April',
        'May': 'May', 'Jun': 'June', 'Jul': 'July', 'Aug': 'August',
        'Sep': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December'
    }
    for abbrev, full in month_map.items():
        if re.search(rf"\b{abbrev}\b", value):
            value = re.sub(rf"\b{abbrev}\b", full, value)
            break

    value = str(value).strip()
    pattern = r"(?<=eleased\s|release\s)(\d{4}),?\s+(Q[1-4]|[A-Za-z]+)(?:\s+\d{1,2})?"
    match = re.search(pattern, value)
    if match:
        year, date_part = match.groups()
        if date_part.startswith('Q'):  # Handle quarter
            month = {'Q1': '03', 'Q2': '06', 'Q3': '09', 'Q4': '12'}[date_part]
            date = f"{year}-{month}-01"
            return pd.to_datetime(date, format="%Y-%m-%d", errors='coerce')
        else:  # Handle month
            try:
                date = f"{year}-{date_part}-01"
                return pd.to_datetime(date, format="%Y-%B-%d", errors='coerce')
            except ValueError:
                return None
    else:
        pattern = r'(\d{4}),?\s+([1-4]Q|Q[1-4]|[A-Za-z]+)'
        match = re.search(pattern, value)
        if match:
            year, date_part = match.groups()
            if 'Q' in date_part:  # Handle quarter
                month = {'Q1': '01', 'Q2': '06', 'Q3': '09', 'Q4': '12', '1Q': '01', '2Q': '06', '3Q': '09', '4Q': '12'}[date_part]
                date = f"{year}-{month}-01"
                return pd.to_datetime(date, format="%Y-%m-%d", errors='coerce')
            else:  # Handle month
                try:
                    date = f"{year}-{date_part}-01"
                    return pd.to_datetime(date, format="%Y-%B-%d", errors='coerce')
                except ValueError:
                    return None
        else:
            pattern = r"(\d{4})$"
            match = re.search(pattern, value)
            if match:
                return pd.to_datetime(f"{match.group(1)}-01-01", format="%Y-%m-%d", errors='coerce')
            else:
                return None

# data cleaning:
df = df_orig.copy(deep=True)

# dropping 'No cellular connectivity' , none values in model
df.drop(df.loc[
    ##(df['status'] == 'Discontinued') | 
    ##(df['status'] == 'Cancelled') |  
    (df['model'].isnull()) |
    (df['network_technology'] == 'No cellular connectivity')
].index, inplace=True)

# Parse model col to keep uniform model format
df['model'] = df['model'].apply(lambda x: re.sub(r'(\d)(?=Pro)', r'\1 ', str(x)))
df['model'] = df['model'].apply(lambda x: re.sub(r'(Note)(\d)', r'\1 \2', str(x)))

# because model is not a unique value for all rows, we add brand to it. drop tablets & watches
df['id'] = df['brand'] + ' ' + df['model']

# Dropping obvious non-phones like tablets and watches
keywords = ["tablet", "watch", "ipad", "pad"]
df.drop(df.loc[df['id'].str.contains("|".join(keywords), case=False, na=False)].index, inplace=True)

# 'internal_memory' regex to identify __ GB * to make col numeric.find MB and KB and convert to GB
new_col = df['internal_memory'].apply(lambda x: conv2gb(x))
df.insert(df.columns.get_loc('internal_memory')+1, 'internal_memory_in_GB', new_col)
df.drop(columns=['internal_memory'], inplace=True)

# 'memory_card' regex for microSD  up to 16 GB (dedicated slot) -> make numerical
new_col = df['memory_card'].apply(lambda x: number_or_none(value=x, regex='(\d+) GB'))
df.insert(df.columns.get_loc('memory_card')+1, 'max_memory_card_size_GB', new_col)

# 'RAM' regex 128 MB ROM| 64 MB RAM -> make numerical
ram_gb = df['RAM'].apply(lambda x: conv2gb(x))
df.insert(df.columns.get_loc('RAM')+1, 'RAM_in_GB', ram_gb)
df.drop(columns=['RAM'], inplace=True)

# 'primary_camera' -> extract MP
new_col = df['primary_camera'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
df.insert(df.columns.get_loc('primary_camera')+1, 'primary_camera_mega_pixel', new_col)

# 'secondary_camera' -> extract MP
new_col = df['secondary_camera'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
df.insert(df.columns.get_loc('secondary_camera')+1, 'secondary_camera_mega_pixel', new_col)

# date & year parsed and combined
df['status_date'] = df['status'].apply(parse_date)
df['announced_date'] = df['announced'].apply(parse_date)
df.insert(df.columns.get_loc('status')+1, 'date', df['status_date'].combine_first(df['announced_date']))
df.insert(df.columns.get_loc('date')+1, 'year', df['date'].dt.year)

# 'display_resolution' -> 'display_diagonal_inches'
new_col = df['display_resolution'].apply(lambda x: number_or_none(value=x, regex=r'(\d+.\d+) inches', type="float"))
df.insert(df.columns.get_loc('display_resolution')+1, 'display_size_inches', new_col)

df.rename(columns={'display_size': 'display_width_length'}, inplace=True)

# any weight in numeric format
df['weight_g'] = pd.to_numeric(df['weight_g'], errors='coerce')
df['weight_oz'] = pd.to_numeric(df['weight_oz'], errors='coerce')

# battery mAh to numeric
new_col = df['battery'].apply(lambda x: number_or_none(value=x, regex=r'(\d+)\s*mAh', type="int"))
df.rename(columns={'battery': 'battery_description'}, inplace=True)
df.insert(df.columns.get_loc('battery_description')+1, 'battery_mAh', new_col)


## -> generally negative values are No or NaN, any other value means: exists

# keep only relevant
df.columns = df.columns.str.lower()
relevant = ["id", "brand", "model", "date", "year",  "display_size_inches", "internal_memory_in_gb", "primary_camera_mega_pixel", "primary_camera", "loud_speaker", "gps", "colors", "approx_price_eur", "battery_mah", "ram_in_gb"]

df_relevant = df[relevant]

df_relevant.to_csv(write_file, sep=',', index=False)

print(len(df_relevant))


  new_col = df['memory_card'].apply(lambda x: number_or_none(value=x, regex='(\d+) GB'))


8202


In [83]:
# 2023
# parse numerical cols
# keep only relevant

import string
import re
import pandas as pd

#csv_path = "../data/"
read_file = "2023_GSM.csv"
write_file = f"../1_parsed_data/2023_GSM_parsed.csv"

df_orig = pd.read_csv(read_file, delimiter=",", header="infer")

# Function to extract value usig regex. if not possible return none.
def number_or_none(regex, value, pick_group=1, type="int"):
    try:
        if type == "int":
            return int(re.search(regex, value).group(pick_group))
        elif type == "float":
            return float(re.search(regex, value).group(pick_group))
    except:
        pass
    return None

# Function to turn text into number and mb/kb into gb
def conv2gb(input:str, ram=False)->float:
    result = None
    pick_group = 1
    if (input == '') | (input == None):
        return result
    if isinstance(input, str):
        input = re.sub(r"[{}]".format(re.escape(string.punctuation)), ' ', input)
        if ram:
            gb = r'(\d+)GB RAM'
            mb = r'(\d+)MB RAM'
            kb = r'(\d+)KB RAM'
        else:
            gb = r'(\d+)GB'
            mb = r'(\d+)MB'
            kb = r'(\d+)KB'
        try:
            result = float(re.search(gb, input, flags=re.IGNORECASE).group(pick_group))
        except:
            try:
                result = float(re.search(mb, input, flags=re.IGNORECASE).group(pick_group))/1028
            except:
                try:
                    result = (float(re.search(kb, input, flags=re.IGNORECASE).group(pick_group))/1028)/1028
                except:
                    pass
    return result

# Standardize the date format
def parse_date(value):
    if pd.isna(value):  # Handle NaN values
        return None
    # Map for replacing three-letter months with full month names
    month_map = {
        'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April',
        'May': 'May', 'Jun': 'June', 'Jul': 'July', 'Aug': 'August',
        'Sep': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December'
    }
    for abbrev, full in month_map.items():
        if re.search(rf"\b{abbrev}\b", value):
            value = re.sub(rf"\b{abbrev}\b", full, value)
            break

    value = str(value).strip()
    pattern = r"(?<=eleased\s|release\s)(\d{4}),?\s+(Q[1-4]|[A-Za-z]+)(?:\s+\d{1,2})?"
    match = re.search(pattern, value)
    if match:
        year, date_part = match.groups()
        if date_part.startswith('Q'):  # Handle quarter
            month = {'Q1': '03', 'Q2': '06', 'Q3': '09', 'Q4': '12'}[date_part]
            date = f"{year}-{month}-01"
            return pd.to_datetime(date, format="%Y-%m-%d", errors='coerce')
        else:  # Handle month
            try:
                date = f"{year}-{date_part}-01"
                return pd.to_datetime(date, format="%Y-%B-%d", errors='coerce')
            except ValueError:
                return None
    else:
        pattern = r'(\d{4}),?\s+([1-4]Q|Q[1-4]|[A-Za-z]+)'
        match = re.search(pattern, value)
        if match:
            year, date_part = match.groups()
            if 'Q' in date_part:  # Handle quarter
                month = {'Q1': '01', 'Q2': '06', 'Q3': '09', 'Q4': '12', '1Q': '01', '2Q': '06', '3Q': '09', '4Q': '12'}[date_part]
                date = f"{year}-{month}-01"
                return pd.to_datetime(date, format="%Y-%m-%d", errors='coerce')
            else:  # Handle month
                try:
                    date = f"{year}-{date_part}-01"
                    return pd.to_datetime(date, format="%Y-%B-%d", errors='coerce')
                except ValueError:
                    return None
        else:
            pattern = r"(\d{4})$"
            match = re.search(pattern, value)
            if match:
                return pd.to_datetime(f"{match.group(1)}-01-01", format="%Y-%m-%d", errors='coerce')
            else:
                return None

# data cleaning:
df = df_orig.copy(deep=True)

# because model is not a unique value for all rows, we add brand to it. drop tablets & watches
# this dataset already usus name as combined col
df.rename(columns={'Name': 'id'}, inplace=True)
df.columns = df.columns.str.lower() # all cols to lower case

# dropping rows with none values in id
df.drop(df.loc[
    (df['id'].isnull())
].index, inplace=True)
df.drop(columns=['unnamed: 0'], inplace=True)

# Parse model col to keep uniform model format
df['id'] = df['id'].apply(lambda x: re.sub(r'(\d)(?=Pro)', r'\1 ', str(x)))
df['id'] = df['id'].apply(lambda x: re.sub(r'(Note)(\d)', r'\1 \2', str(x)))

# Dropping obvious non-phones like tablets and watches
keywords = ["tablet", "watch", "ipad", "pad"]
df.drop(df.loc[df['id'].str.contains("|".join(keywords), case=False, na=False)].index, inplace=True)


# 'internal_memory' regex to identify __ GB * to make col numeric.find MB and KB and convert to GB
new_col = df['memory(internal)'].apply(lambda x: conv2gb(x))
df.insert(df.columns.get_loc('memory(internal)')+1, 'internal_memory_in_gb', new_col)
df.rename(columns={'memory(internal)': 'internal_memory'}, inplace=True)

# 'RAM' regex 128 MB ROM| 64 MB RAM -> make numerical
new_col = df['internal_memory'].apply(lambda x: conv2gb(x, ram=True))
df.insert(df.columns.get_loc('internal_memory_in_gb')+1, 'ram_in_gb', new_col)

# 'primary_camera' -> extract MP
new_col = df['main camera'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
df.rename(columns={'main camera': 'primary_camera'}, inplace=True)
df.insert(df.columns.get_loc('primary_camera')+1, 'primary_camera_mega_pixel', new_col)

# 'secondary_camera' -> extract MP
new_col = df['front camera'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
df.insert(df.columns.get_loc('front camera')+1, 'secondary_camera_mega_pixel', new_col)

# date & year parsed and combined
df['status_date'] = df['released'].apply(parse_date)
df['announced_date'] = df['announced'].apply(parse_date)
df.insert(df.columns.get_loc('released')+1, 'date', df['status_date'].combine_first(df['announced_date']))
df.insert(df.columns.get_loc('date')+1, 'year', df['date'].dt.year)

# 'display_resolution' -> 'display_diagonal_inches'
new_col = df['display size'].apply(lambda x: number_or_none(value=x, regex=r'(\d+.\d+) inches', type="float"))
df.insert(df.columns.get_loc('display size')+1, 'display_size_inches', new_col)

df.rename(columns={'display size': 'display_size'}, inplace=True)

# battery mAh to numeric
new_col = df['battery type'].apply(lambda x: number_or_none(value=x, regex=r'(\d+)\s*mAh', type="int"))
df.rename(columns={'battery type': 'battery_description'}, inplace=True)
df.insert(df.columns.get_loc('battery_description')+1, 'battery_mah', new_col)

# price into numeric euro
new_col = df['price'].apply(lambda x: number_or_none(value=x, regex=r'€\s*([\d.,]+)|(?:[Aa]bout\s+([\d.,]+)\s*EUR)', type="float"))
df.insert(df.columns.get_loc('price')+1, 'approx_price_eur', new_col)

## -> generally negative values are No or NaN, any other value means: exists

# add dummy brand and model col for consistency
df.insert(df.columns.get_loc('id')+1, 'brand', None)
df.insert(df.columns.get_loc('brand')+1, 'model', None)

# Rename for consistency
df.rename(columns={'speker': 'loud_speaker'}, inplace=True)

# keep only relevant
relevant = ["id", "brand", "model", "date", "year",  "display_size_inches", "internal_memory_in_gb", "primary_camera_mega_pixel", "primary_camera", "loud_speaker", "gps", "colors", "approx_price_eur", "battery_mah", "ram_in_gb"]

df_relevant = df[relevant]

df_relevant.to_csv(write_file, sep=',', index=False)

print(len(df_relevant))

113


In [84]:
#2020
# parse numerical cols
# keep only relevant


import string
import re
import pandas as pd

#csv_path = "../data/"
read_file = "2020_GSM.csv"
write_file = f"../1_parsed_data/2020_GSM_parsed.csv"

df_orig = pd.read_csv(read_file, delimiter=",", header="infer")

# Function to extract value usig regex. if not possible return none.
def number_or_none(regex, value, pick_group=1, type="int"):
    try:
        if type == "int":
            return int(not_none_group(re.search(regex, value), pick_group))
        elif type == "float":
            return float(not_none_group(re.search(regex, value), pick_group))
    except:
        pass
    return None

def not_none_group(match, pick_group):
    result = match.group(pick_group)
    if result == None:
        result = next((g for g in match.groups() if g is not None), None) # Pick first group thats not none
    return result
    

# Function to turn text into number and mb/kb into gb
def conv2gb(input:str, ram=False)->float:
    result = None
    pick_group = 1
    if (input == '') | (input == None):
        return result
    if isinstance(input, str):
        input = re.sub(r"[{}]".format(re.escape(string.punctuation)), ' ', input)
        if ram:
            gb = r'(\d+)GB RAM'
            mb = r'(\d+)MB RAM'
            kb = r'(\d+)KB RAM'
        else:
            gb = r'(\d+)GB'
            mb = r'(\d+)MB'
            kb = r'(\d+)KB'
        try:
            result = float(re.search(gb, input, flags=re.IGNORECASE).group(pick_group))
        except:
            try:
                result = float(re.search(mb, input, flags=re.IGNORECASE).group(pick_group))/1028
            except:
                try:
                    result = (float(re.search(kb, input, flags=re.IGNORECASE).group(pick_group))/1028)/1028
                except:
                    pass
    return result

# Standardize the date format
def parse_date(value):
    if pd.isna(value):  # Handle NaN values
        return None
    # Map for replacing three-letter months with full month names
    month_map = {
        'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April',
        'May': 'May', 'Jun': 'June', 'Jul': 'July', 'Aug': 'August',
        'Sep': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December'
    }
    for abbrev, full in month_map.items():
        if re.search(rf"\b{abbrev}\b", value):
            value = re.sub(rf"\b{abbrev}\b", full, value)
            break

    value = str(value).strip()
    pattern = r"(?<=eleased\s|release\s)(\d{4}),?\s+(Q[1-4]|[A-Za-z]+)(?:\s+\d{1,2})?"
    match = re.search(pattern, value)
    if match:
        year, date_part = match.groups()
        if date_part.startswith('Q'):  # Handle quarter
            month = {'Q1': '03', 'Q2': '06', 'Q3': '09', 'Q4': '12'}[date_part]
            date = f"{year}-{month}-01"
            return pd.to_datetime(date, format="%Y-%m-%d", errors='coerce')
        else:  # Handle month
            try:
                date = f"{year}-{date_part}-01"
                return pd.to_datetime(date, format="%Y-%B-%d", errors='coerce')
            except ValueError:
                return None
    else:
        pattern = r'(\d{4}),?\s+([1-4]Q|Q[1-4]|[A-Za-z]+)'
        match = re.search(pattern, value)
        if match:
            year, date_part = match.groups()
            if 'Q' in date_part:  # Handle quarter
                month = {'Q1': '01', 'Q2': '06', 'Q3': '09', 'Q4': '12', '1Q': '01', '2Q': '06', '3Q': '09', '4Q': '12'}[date_part]
                date = f"{year}-{month}-01"
                return pd.to_datetime(date, format="%Y-%m-%d", errors='coerce')
            else:  # Handle month
                try:
                    date = f"{year}-{date_part}-01"
                    return pd.to_datetime(date, format="%Y-%B-%d", errors='coerce')
                except ValueError:
                    return None
        else:
            pattern = r"(\d{4})$"
            match = re.search(pattern, value)
            if match:
                return pd.to_datetime(f"{match.group(1)}-01-01", format="%Y-%m-%d", errors='coerce')
            else:
                return None


# data cleaning:
df = df_orig.copy(deep=True)
df.columns = df.columns.str.lower()

# Parse model col to keep uniform model format
df['model'] = df['model'].apply(lambda x: re.sub(r'(\d)(?=Pro)', r'\1 ', str(x)))
df['model'] = df['model'].apply(lambda x: re.sub(r'(Note)(\d)', r'\1 \2', str(x)))

# because model is not a unique value for all rows, we add brand to it. drop tablets & watches
df['id'] = df['oem'] + ' ' + df['model']
df.rename(columns={'oem': 'brand'},  inplace=True)
df = df.drop_duplicates(subset='id', keep='first')

# dropping 'No cellular connectivity'
# dropping rows without name
df.drop(df.loc[
    ##(df['status'] == 'Discontinued') | 
    ##(df['status'] == 'Cancelled') |  
    (df['network_technology'] == 'No cellular connectivity') |
    (df['id'].isnull())
].index, inplace=True)

# Dropping obvious non-phones like tablets and watches
keywords = ["tablet", "watch", "ipad", "pad"]
df.drop(df.loc[df['id'].str.contains("|".join(keywords), case=False, na=False)].index, inplace=True)

# 'internal_memory' regex to identify __ GB * to make col numeric.find MB and KB and convert to GB
new_col = df['memory_internal'].apply(lambda x: conv2gb(x))
df.insert(df.columns.get_loc('memory_internal')+1, 'internal_memory_in_GB', new_col)
df.rename(columns={'memory_internal': 'internal_memory'},  inplace=True)

# 'RAM' regex 128 MB ROM| 64 MB RAM -> make numerical
new_col = df['internal_memory'].apply(lambda x: conv2gb(x, ram=True))
df.insert(df.columns.get_loc('internal_memory_in_GB')+1, 'ram_in_gb', new_col)

# 'primary_camera' -> extract MP
new_col_single = df['main_camera_single'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
new_col_dual = df['main_camera_dual'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
new_col_dual_triple = df['main_camera_dual_or_triple'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
new_col_triple = df['main_camera_triple'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
new_col_quad = df['main_camera_quad'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
new_col_five = df['main_camera_five'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))

new_col = new_col_single.combine_first(new_col_dual)
new_col = new_col.combine_first(new_col_dual_triple)
new_col = new_col.combine_first(new_col_triple)
new_col = new_col.combine_first(new_col_quad)
new_col = new_col.combine_first(new_col_five)

df.insert(df.columns.get_loc('main_camera')+1, 'primary_camera_mega_pixel', new_col)
df.rename(columns={'main_camera_features': 'primary_camera'},  inplace=True)

# date & year parsed and combined
df['status_date'] = df['launch_status'].apply(parse_date)
df['announced_date'] = df['launch_announced'].apply(parse_date)
df.insert(df.columns.get_loc('launch_status')+1, 'date', df['status_date'].combine_first(df['announced_date']))
df.insert(df.columns.get_loc('date')+1, 'year', df['date'].dt.year)

# 'display_resolution' -> 'display_diagonal_inches'
new_col = df['display_size'].apply(lambda x: number_or_none(value=x, regex=r'(\d+.\d+) inches', type="float"))
df.insert(df.columns.get_loc('display_size')+1, 'display_size_inches', new_col)

# battery mAh to numeric
new_col = df['battery'].apply(lambda x: number_or_none(value=x, regex=r'(\d+)\s*mAh', type="int"))
df.insert(df.columns.get_loc('battery')+1, 'battery_mah', new_col)

# price into numeric euro
# <c2><a3><e2><80><89>575.00 / <e2><82><ac><e2><80><89>710.00 / $<e2><80><89>730.00
# second value is eur
new_col = df['misc_price'].apply(lambda x: number_or_none(value=x, regex=r'<e2><82><ac><e2><80><89>\s*([\d.,]+)|(?:[Aa]bout\s+([\d.,]+)\s*EUR)', type="float"))
df.insert(df.columns.get_loc('misc_price')+1, 'approx_price_eur', new_col)

## -> generally negative values are No or NaN, any other value means: exists

# rename for consistency
df.rename(columns={'sound_loudspeaker': 'loud_speaker'},  inplace=True)
df.rename(columns={'comms_gps': 'gps'},  inplace=True)
df.rename(columns={'misc_colors': 'colors'},  inplace=True)

# keep only relevant
df.columns = df.columns.str.lower()
relevant = ["id", "brand", "model", "date", "year",  "display_size_inches", "internal_memory_in_gb", "primary_camera_mega_pixel", "primary_camera", "loud_speaker", "gps", "colors", "approx_price_eur", "battery_mah", "ram_in_gb"]

df_relevant = df[relevant]

df_relevant.to_csv(write_file, sep=',', index=False)

print(len(df_relevant))

  df_orig = pd.read_csv(read_file, delimiter=",", header="infer")


9592


In [85]:
#2024
# parse numerical cols
# keep only relevant

import string
import re
import pandas as pd

#csv_path = "../data/"
read_file1 = "2024_GSM_adam.csv"
read_file2 = "2024_GSM_ML.csv"
read_file3 = "2024_GSM_secondary.csv"
read_file4 = "2024_GSM_jess_2.csv"
write_file = f"../1_parsed_data/2024_GSM_parsed.csv"

df_1 = pd.read_csv(read_file1, delimiter=",", header="infer")
df_2 = pd.read_csv(read_file2, delimiter=",", header="infer")
df_3 = pd.read_csv(read_file3, delimiter=",", header="infer")
df_4 = pd.read_csv(read_file4, delimiter=",", header="infer")
df_orig = pd.concat([df_1, df_2, df_3, df_4], ignore_index=True)

# Function to extract value usig regex. if not possible return none.
def number_or_none(regex, value, pick_group=1, type="int"):
    try:
        if type == "int":
            return int(not_none_group(re.search(regex, value), pick_group))
        elif type == "float":
            return float(not_none_group(re.search(regex, value), pick_group))
    except:
        pass
    return None

def not_none_group(match, pick_group):
    result = match.group(pick_group)
    if result == None:
        result = next((g for g in match.groups() if g is not None), None) # Pick first group thats not none
    return result
    

# Function to turn text into number and mb/kb into gb
def conv2gb(input:str, ram=False)->float:
    result = None
    pick_group = 1
    if (input == '') | (input == None):
        return result
    if isinstance(input, str):
        input = re.sub(r"[{}]".format(re.escape(string.punctuation)), ' ', input)
        if ram:
            gb = r'(\d+)GB RAM'
            mb = r'(\d+)MB RAM'
            kb = r'(\d+)KB RAM'
        else:
            gb = r'(\d+)GB'
            mb = r'(\d+)MB'
            kb = r'(\d+)KB'
        try:
            result = float(re.search(gb, input, flags=re.IGNORECASE).group(pick_group))
        except:
            try:
                result = float(re.search(mb, input, flags=re.IGNORECASE).group(pick_group))/1028
            except:
                try:
                    result = (float(re.search(kb, input, flags=re.IGNORECASE).group(pick_group))/1028)/1028
                except:
                    pass
    return result

# Standardize the date format
def parse_date(value):
    if pd.isna(value):  # Handle NaN values
        return None
    # Map for replacing three-letter months with full month names
    month_map = {
        'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April',
        'May': 'May', 'Jun': 'June', 'Jul': 'July', 'Aug': 'August',
        'Sep': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December'
    }
    for abbrev, full in month_map.items():
        if re.search(rf"\b{abbrev}\b", value):
            value = re.sub(rf"\b{abbrev}\b", full, value)
            break

    value = str(value).strip()
    pattern = r"(?<=eleased\s|release\s)(\d{4}),?\s+(Q[1-4]|[A-Za-z]+)(?:\s+\d{1,2})?"
    match = re.search(pattern, value)
    if match:
        year, date_part = match.groups()
        if date_part.startswith('Q'):  # Handle quarter
            month = {'Q1': '03', 'Q2': '06', 'Q3': '09', 'Q4': '12'}[date_part]
            date = f"{year}-{month}-01"
            return pd.to_datetime(date, format="%Y-%m-%d", errors='coerce')
        else:  # Handle month
            try:
                date = f"{year}-{date_part}-01"
                return pd.to_datetime(date, format="%Y-%B-%d", errors='coerce')
            except ValueError:
                return None
    else:
        pattern = r'(\d{4}),?\s+([1-4]Q|Q[1-4]|[A-Za-z]+)'
        match = re.search(pattern, value)
        if match:
            year, date_part = match.groups()
            if 'Q' in date_part:  # Handle quarter
                month = {'Q1': '01', 'Q2': '06', 'Q3': '09', 'Q4': '12', '1Q': '01', '2Q': '06', '3Q': '09', '4Q': '12'}[date_part]
                date = f"{year}-{month}-01"
                return pd.to_datetime(date, format="%Y-%m-%d", errors='coerce')
            else:  # Handle month
                try:
                    date = f"{year}-{date_part}-01"
                    return pd.to_datetime(date, format="%Y-%B-%d", errors='coerce')
                except ValueError:
                    return None
        else:
            pattern = r"(\d{4})$"
            match = re.search(pattern, value)
            if match:
                return pd.to_datetime(f"{match.group(1)}-01-01", format="%Y-%m-%d", errors='coerce')
            else:
                return None


# data cleaning:
df = df_orig.copy(deep=True)
df.columns = df.columns.str.lower()
df = df.drop_duplicates(subset='name', keep='first')

# because model is not a unique value for all rows, we add brand to it.
df.rename(columns={'name': 'id'}, inplace=True)

# dropping 'discontinued' 
# dropping 'No cellular connectivity'
# dropping rows without price
df.drop(df.loc[
    ##(df['status'] == 'Discontinued') | 
    ##(df['status'] == 'Cancelled') |  
    ##(df['network_technology'] == 'No cellular connectivity') |
    (df['id'].isnull())
].index, inplace=True)


# Parse model col to keep uniform model format
df['id'] = df['id'].apply(lambda x: re.sub(r'(\d)(?=Pro)', r'\1 ', str(x)))
df['id'] = df['id'].apply(lambda x: re.sub(r'(Note)(\d)', r'\1 \2', str(x)))

# Dropping obvious non-phones like tablets and watches
keywords = ["tablet", "watch", "ipad", "pad"]
df.drop(df.loc[df['id'].str.contains("|".join(keywords), case=False, na=False)].index, inplace=True)

# 'internal_memory' regex to identify __ GB * to make col numeric.find MB and KB and convert to GB
new_col = df['memory(internal)'].apply(lambda x: conv2gb(x))
df.insert(df.columns.get_loc('memory(internal)')+1, 'internal_memory_in_GB', new_col)
df.rename(columns={'memory(internal)': 'internal_memory'},  inplace=True)

# 'RAM' regex 128 MB ROM| 64 MB RAM -> make numerical
new_col = df['internal_memory'].apply(lambda x: conv2gb(x, ram=True))
df.insert(df.columns.get_loc('internal_memory_in_GB')+1, 'ram_in_gb', new_col)

# 'primary_camera' -> extract MP
new_col = df['main camera'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
df.insert(df.columns.get_loc('main camera')+1, 'primary_camera_mega_pixel', new_col)
df.rename(columns={'main camera': 'primary_camera'},  inplace=True)

# 'secondary_camera' -> extract MP
new_col = df['front camera'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
df.insert(df.columns.get_loc('front camera')+1, 'secondary_camera_mega_pixel', new_col)
df.rename(columns={'front camera': 'secondary_camera'},  inplace=True)

# date & year parsed and combined
df['status_date'] = df['released'].apply(parse_date)
df['announced_date'] = df['announced'].apply(parse_date)
df.insert(df.columns.get_loc('released')+1, 'date', df['status_date'].combine_first(df['announced_date']))
df.insert(df.columns.get_loc('date')+1, 'year', df['date'].dt.year)

# 'display_resolution' -> 'display_diagonal_inches'
new_col = df['display size'].apply(lambda x: number_or_none(value=x, regex=r'(\d+.\d+) inches', type="float"))
df.rename(columns={'display size': 'display_size'},  inplace=True)
df.insert(df.columns.get_loc('display_size')+1, 'display_size_inches', new_col)

# battery mAh to numeric
new_col = df['battery type'].apply(lambda x: number_or_none(value=x, regex=r'(\d+)\s*mAh', type="int"))
df.rename(columns={'battery type': 'battery_description'}, inplace=True)
df.insert(df.columns.get_loc('battery_description')+1, 'battery_mah', new_col)

# price into numeric euro
new_col = df['price'].apply(lambda x: number_or_none(value=x, regex=r'€\s*([\d.,]+)|(?:[Aa]bout\s+([\d.,]+)\s*EUR)', type="float"))
df.insert(df.columns.get_loc('price')+1, 'approx_price_eur', new_col)

# add dummy brand and model col for consistency
df.insert(df.columns.get_loc('id')+1, 'brand', None)
df.insert(df.columns.get_loc('brand')+1, 'model', None)

## -> generally negative values are No or NaN, any other value means: exists

# rename for consistency
df.rename(columns={'speker': 'loud_speaker'},  inplace=True)

# keep only relevant
df.columns = df.columns.str.lower()
relevant = ["id", "brand", "model", "date", "year",  "display_size_inches", "internal_memory_in_gb", "primary_camera_mega_pixel", "primary_camera", "loud_speaker", "gps", "colors", "approx_price_eur", "battery_mah", "ram_in_gb"]

df_relevant = df[relevant]

df_relevant.to_csv(write_file, sep=',', index=False)

print(len(df_relevant))

783


In [None]:
### unused code

In [None]:
# unused
# Seperates Name Column into brand and model

import pandas as pd

# Input file paths
input_csv = '2024_GSM_ML.csv'
brands_csv = 'brand_links_all.csv'

# Load the devices and brand list
devices_df = pd.read_csv(input_csv)
to_split_col = 'Name'

brands_df = pd.read_csv(brands_csv)

# Convert brand list to a set for efficient matching
brands = brands_df['brand'].str.lower().tolist()

# Function to split name into brand and model
def split_brand_model(name):
    name_lower = name.lower()
    # Find the longest matching brand in the name
    matched_brand = max((brand for brand in brands if name_lower.startswith(brand)), key=len, default=None)
    if matched_brand:
        brand = matched_brand.title()  # Capitalize correctly
        model = name[len(matched_brand):].strip()  # Remaining part of the name is the model
        return pd.Series([brand, model])
    else:
        # If no brand is matched, return the full name as model with NaN for brand
        return pd.Series([None, name])

# Apply the split to the 'Name' column
devices_df[['Brand', 'Model']] = devices_df['Name'].apply(split_brand_model)

# Save the output to a new CSV
output_csv = 'devices_with_brands.csv'
devices_df.to_csv(output_csv, index=False)

print(f"Processed data saved to '{output_csv}'")

