In [10]:
import pandas as pd

# Load the datasets
phone_data_df = pd.read_csv('2024_GSM_ML.csv')
models_df = pd.read_csv('user_models.txt', names=['Model'])

# Preprocess the Name column in phone_data_df
phone_data_df['Processed_Name'] = phone_data_df['Name'].str.lower()

# Preprocess the Model column in models_df
models_df['Processed_Model'] = models_df['Model'].str.lower()

# Function to find matches
def find_match(name, models):
    for model in models:
        if model in name:
            return model
    return None

# Apply the matching function
phone_data_df['Matched_Model'] = phone_data_df['Processed_Name'].apply(
    lambda x: find_match(x, models_df['Processed_Model'])
)

# Merge the dataframes
result_df = pd.merge(
    phone_data_df,
    models_df,
    left_on='Matched_Model',
    right_on='Processed_Model',
    how='inner'
)

# Display the matched rows
print(result_df[['Name', 'Model']])


                      Name      Model
0  Apple iPhone 12 Pro Max  iPhone 12
1      Apple iPhone 12 Pro  iPhone 12
2          Apple iPhone 12  iPhone 12
3     Apple iPhone 12 mini  iPhone 12


In [11]:

import pandas as pd

# Load URLs from the text file into a list
with open('phone_links.txt', 'r') as file:
    urls = file.read().splitlines()

# Load the phone models CSV into a DataFrame
models_df = pd.read_csv('user_models.txt')

# Match models to URLs
matching_urls = []
for model in models_df['Model']:
    # Check if the model matches any part of the URL
    matched = [url for url in urls if all(word.lower() in url.lower() for word in model.split())]
    matching_urls.append(matched if matched else None)

# Add the matching URLs as a new column in the DataFrame
models_df['URLs'] = matching_urls

# Save the updated DataFrame to a new CSV (optional)
models_df.to_csv('models_with_urls.csv', index=False)

# Display the DataFrame
print(models_df)


FileNotFoundError: [Errno 2] No such file or directory: 'phone_links.txt'

In [None]:
# cleans existing files of false line breaks and special characters
# by ML

import re

# Input and output file paths
input_file = '2023_GSM_raw.csv'
output_file = '2023_GSM_parsed.csv'

# Define regex pattern for valid rows: start with a number and comma
valid_row_pattern = re.compile(r'^\d*,')

# Read and process the file
with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
    buffer = ""
    for line in infile:
        line = line.replace("\n", "").replace("\r", "").replace("‑", "-").replace(" ", " ").replace(" ", " ").replace("   ", " ").replace("  ", " ")    # remove linebreaks & spec. char.
        if valid_row_pattern.match(line):  # Line matches the pattern, it's a valid new row
            if buffer:
                outfile.write(buffer.strip() + "\n")  # Write the previous valid row
            buffer = line  # Start a new buffer
        else:
            buffer += " " + line.strip()  # Append invalid line to the current buffer

    # Write the last buffered row if exists
    if buffer:
        outfile.write(buffer.strip() + "\n")

In [11]:
#extract list of all brands
import pandas as pd
import os

cwd = os.getcwd()
input_file = 'brand_links_all.txt'
output_file = 'unique_brands.txt'

df = pd.read_csv(input_file, header=None, names=['url'])


# Extract substring between ".com/" and "-phones"
df['brand'] = df['url'].str.extract(r'\.com\/(.*?)\-phones')

unique_models = df['brand'].dropna().drop_duplicates().replace('_', ' ', regex=True)  # Remove NaN values and get unique brands

# Store to file
#with open(output_csv, 'w') as f:
#    for brand in sorted(unique_brands):
#        f.write(f"{brand}\n")

unique_models_df = unique_models.reset_index(drop=True)  # Reset index for a clean CSV file
unique_models_df.to_csv(output_file, index=False, header=['Brand'])

print(f"Extracted {len(unique_models)} unique brands. Saved to '{output_file}'.")

Extracted 119 unique brands. Saved to 'unique_brands.txt'.


In [9]:
#extract list of all models
import pandas as pd
import os

cwd = os.getcwd()
input_file = '../../user/user_behavior_dataset.csv'
output_file = 'user_models.txt'

df = pd.read_csv(input_file, header='infer')

# Extract substring between ".com/" and "-phones"

unique_models = df['Device Model'].dropna().drop_duplicates()  # Remove NaN values and get unique models

unique_models_df = unique_models.reset_index(drop=True)  # Reset index for a clean CSV file
unique_models_df.to_csv(output_file, index=False, header=['Model'])

print(unique_models)
print(f"Extracted {len(unique_models)} unique models. Saved to '{output_file}'.")

0        Google Pixel 5
1             OnePlus 9
2          Xiaomi Mi 11
4             iPhone 12
6    Samsung Galaxy S21
Name: Device Model, dtype: object
Extracted 5 unique models. Saved to 'user_models.txt'.


In [None]:
# Seperates Name Column into brand and model

import pandas as pd

# Input file paths
input_csv = '2024_GSM_ML.csv'
brands_csv = 'brand_links_all.csv'

# Load the devices and brand list
devices_df = pd.read_csv(input_csv)
to_split_col = 'Name'

brands_df = pd.read_csv(brands_csv)

# Convert brand list to a set for efficient matching
brands = brands_df['brand'].str.lower().tolist()

# Function to split name into brand and model
def split_brand_model(name):
    name_lower = name.lower()
    # Find the longest matching brand in the name
    matched_brand = max((brand for brand in brands if name_lower.startswith(brand)), key=len, default=None)
    if matched_brand:
        brand = matched_brand.title()  # Capitalize correctly
        model = name[len(matched_brand):].strip()  # Remaining part of the name is the model
        return pd.Series([brand, model])
    else:
        # If no brand is matched, return the full name as model with NaN for brand
        return pd.Series([None, name])

# Apply the split to the 'Name' column
devices_df[['Brand', 'Model']] = devices_df['Name'].apply(split_brand_model)

# Save the output to a new CSV
output_csv = 'devices_with_brands.csv'
devices_df.to_csv(output_csv, index=False)

print(f"Processed data saved to '{output_csv}'")



In [21]:
# Combine brand and model to name & remove duplicates
def parse_df(file, target):
    df = pd.read_csv(file, delimiter=",", header="infer")
    # All column headers into lower case
    df.columns = df.columns.str.lower()
    
    # Create unique Name col from brand and model
    if 'name' not in df.columns:
        new_col = df['brand'] + ' ' + df['model']
        df.insert(0, 'id', new_col)
    else:
        df = df.rename(columns={'name': 'id'})
    
    # Check for duplicates
    duplicate_rows = df[df.duplicated(subset=['id'], keep=False)]
    if duplicate_rows.size>0:
        print(f"caution, df {file} contains duplicate")

    df.to_csv(target, index=False)

new_path = "../1_parsed_data/"
file_list = ["2017_GSM.csv", new_path+"2023_GSM_parsed.csv"]
target_list = [new_path+"2017_GSM_parsed.csv", new_path+"2023_GSM_parsed.csv"]

for file, target in zip(file_list, target_list):
    parse_df(file, target)



0
0


In [None]:
# 2017
# parse numerical cols
# keep only relevant

import string
import re
import pandas as pd

#csv_path = "../data/"
read_file = "2017_GSM.csv"
write_file = f"../1_parsed_data/2017_GSM_parsed.csv"

df_orig = pd.read_csv(read_file, delimiter=",", header="infer")

# Function to extract value usig regex. if not possible return none.
def number_or_none(regex, value, pick_group=1, type="int"):
    try:
        if type == "int":
            return int(re.search(regex, value).group(pick_group))
        elif type == "float":
            return float(re.search(regex, value).group(pick_group))
    except:
        pass
    return None

# Function to turn text into number and mb/kb into gb
def conv2gb(input:str)->float:
    result = None
    if isinstance(input, str):
        input = re.sub(r"[{}]".format(re.escape(string.punctuation)), ' ', input)
        try:
            result = float(re.search(r'(\d+) GB', input, flags=re.IGNORECASE)[0][:-3])
        except TypeError:
            try:
                result = float(re.search(r'(\d+) MB', input, flags=re.IGNORECASE)[0][:-3])/1028
            except TypeError:
                try:
                    result = (float(re.search(r'(\d+) KB', input, flags=re.IGNORECASE)[0][:-3])/1028)/1028
                except:
                    pass
    return result

# Standardize the date format
def parse_date(value):
    if pd.isna(value):  # Handle NaN values
        return None
    value = str(value).strip()
    pattern = r"(?<=Released\s| release\s)(\d{4})\s+(Q[1-4]|[A-Za-z]+)"
    match = re.search(pattern, value)
    if match:
        year, date_part = match.groups()
        if date_part.startswith('Q'):  # Handle quarter
            month = {'Q1': '03', 'Q2': '06', 'Q3': '09', 'Q4': '12'}[date_part]
            date = f"{year}-{month}-01"
            return pd.to_datetime(date, format="%Y-%m-%d", errors='coerce')
        else:  # Handle month
            try:
                date = f"{year}-{date_part}-01"
                return pd.to_datetime(date, format="%Y-%B-%d", errors='coerce')
            except ValueError:
                return None
    else:
        try:
            year, month = value.split("  ")
            date = f"{year}-{month}-01"
            return pd.to_datetime(date, format="%Y-%B-%d", errors='coerce')
        except ValueError:
            return None

    # Return None if no valid date pattern found
    return None

# data cleaning:
df = df_orig.copy(deep=True)

# dropping 'No cellular connectivity' , none values in model
df.drop(df.loc[
    ##(df['status'] == 'Discontinued') | 
    ##(df['status'] == 'Cancelled') |  
    (df['model'].isnull()) |
    (df['network_technology'] == 'No cellular connectivity')
].index, inplace=True)

# because model is not a unique value for all rows, we add brand to it. drop tablets & watches
keywords = ["tablet", "watch", "ipad", "pad"]
df['id'] = df['brand'] + ' ' + df['model']
df.drop(df.loc[df['id'].str.contains("|".join(keywords), case=False, na=False)].index, inplace=True)

# 'internal_memory' regex to identify __ GB * to make col numeric.find MB and KB and convert to GB
new_col = df['internal_memory'].apply(lambda x: conv2gb(x))
df.insert(df.columns.get_loc('internal_memory')+1, 'internal_memory_in_GB', new_col)
df.drop(columns=['internal_memory'], inplace=True)

# 'memory_card' regex for microSD  up to 16 GB (dedicated slot) -> make numerical
new_col = df['memory_card'].apply(lambda x: number_or_none(value=x, regex='(\d+) GB'))
df.insert(df.columns.get_loc('memory_card')+1, 'max_memory_card_size_GB', new_col)

# 'RAM' regex 128 MB ROM| 64 MB RAM -> make numerical
ram_gb = df['RAM'].apply(lambda x: conv2gb(x))
df.insert(df.columns.get_loc('RAM')+1, 'RAM_in_GB', ram_gb)
df.drop(columns=['RAM'], inplace=True)

# 'primary_camera' -> extract MP
new_col = df['primary_camera'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
df.insert(df.columns.get_loc('primary_camera')+1, 'primary_camera_mega_pixel', new_col)

# 'secondary_camera' -> extract MP
new_col = df['secondary_camera'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
df.insert(df.columns.get_loc('secondary_camera')+1, 'secondary_camera_mega_pixel', new_col)

# Apply parse_date function to the 'status' and 'announced' columns
df['status_date'] = df['status'].apply(parse_date)
df['announced_date'] = df['announced'].apply(parse_date)

# Combine the 'status_date' and 'announced_date' columns into 'date'
df['date'] = df['status_date'].combine_first(df['announced_date'])
df['year'] = df['date'].dt.year

# 'display_resolution' -> 'display_diagonal_inches'
new_col = df['display_resolution'].apply(lambda x: number_or_none(value=x, regex=r'(\d+.\d+) inches', type="float"))
df.insert(df.columns.get_loc('display_resolution')+1, 'display_size_inches', new_col)

df.rename(columns={'display_size': 'display_width_length'}, inplace=True)

# any weight in numeric format
df['weight_g'] = pd.to_numeric(df['weight_g'], errors='coerce')
df['weight_oz'] = pd.to_numeric(df['weight_oz'], errors='coerce')

# battery mAh to numeric
new_col = df['battery'].apply(lambda x: number_or_none(value=x, regex=r'(\d+)\s*mAh', type="int"))
df.rename(columns={'battery': 'battery_description'}, inplace=True)
df.insert(df.columns.get_loc('battery_description')+1, 'battery_mAh', new_col)


## -> generally negative values are No or NaN, any other value means: exists

# keep only relevant
df.columns = df.columns.str.lower()
relevant = ["id", "brand", "model", "date", "year",  "display_size_inches", "internal_memory_in_gb", "primary_camera_mega_pixel", "primary_camera", "loud_speaker", "gps", "colors", "approx_price_eur", "battery_mah", "ram_in_gb"]

df_relevant = df[relevant]

df_relevant.to_csv(write_file, sep=',', index=False)



  new_col = df['memory_card'].apply(lambda x: number_or_none(value=x, regex='(\d+) GB'))


In [None]:
# 2023
# parse numerical cols
# keep only relevant

import string
import re
import pandas as pd

#csv_path = "../data/"
read_file = "2023_GSM_parsed.csv"
write_file = f"../1_parsed_data/2023_GSM_parsed.csv"

df_orig = pd.read_csv(read_file, delimiter=",", header="infer")

# Function to extract value usig regex. if not possible return none.
def number_or_none(regex, value, pick_group=1, type="int"):
    try:
        if type == "int":
            return int(re.search(regex, value).group(pick_group))
        elif type == "float":
            return float(re.search(regex, value).group(pick_group))
    except:
        pass
    return None

# Function to turn text into number and mb/kb into gb
def conv2gb(input:str)->float:
    result = None
    if isinstance(input, str):
        input = re.sub(r"[{}]".format(re.escape(string.punctuation)), ' ', input)
        try:
            result = float(re.search(r'(\d+) GB', input, flags=re.IGNORECASE)[0][:-3])
        except TypeError:
            try:
                result = float(re.search(r'(\d+) MB', input, flags=re.IGNORECASE)[0][:-3])/1028
            except TypeError:
                try:
                    result = (float(re.search(r'(\d+) KB', input, flags=re.IGNORECASE)[0][:-3])/1028)/1028
                except:
                    pass
    return result

# Standardize the date format
def parse_date(value):
    if pd.isna(value):  # Handle NaN values
        return None
    value = str(value).strip()
    pattern = r"(?<=Released\s)(\d{4})\s+([A-Za-z]+|Q[1-4])"
    match = re.search(pattern, value)
    if match:
        year, date_part = match.groups()
        if date_part.startswith('Q'):  # Handle quarter
            month = {'Q1': '03', 'Q2': '06', 'Q3': '09', 'Q4': '12'}[date_part]
            return f"{year}-{month}-01"
        else:  # Handle month
            try:
                return pd.Timestamp(f"{year} {date_part}")
            except ValueError:
                return None

    # Return None if no valid date pattern found
    return None

# data cleaning:
df = df_orig.copy(deep=True)

# dropping 'discontinued' 
# dropping 'No cellular connectivity'
# dropping rows without price
df.drop(df.loc[
    ##(df['status'] == 'Discontinued') | 
    ##(df['status'] == 'Cancelled') |  
    ##(df['approx_price_EUR'].isnull()) |
    (df['network_technology'] == 'No cellular connectivity')
].index, inplace=True)

# 'internal_memory' regex to identify __ GB * to make col numeric.find MB and KB and convert to GB
new_col = df['memory(internal)'].apply(lambda x: conv2gb(x))
df.insert(df.columns.get_loc('memory(internal)')+1, 'internal_memory_in_GB', new_col)
df.drop(columns=['memory(internal)'], inplace=True)

# 'memory_card' regex for microSD  up to 16 GB (dedicated slot) -> make numerical
#new_col = df['memory_card'].apply(lambda x: #number_or_none(value=x, regex='(\d+) GB'))
#df.insert(df.columns.get_loc('memory_card')+1, #'max_memory_card_size_GB', new_col)

# 'RAM' regex 128 MB ROM| 64 MB RAM -> make numerical
#ram_gb = df['RAM'].apply(lambda x: conv2gb(x))
#df.insert(df.columns.get_loc('RAM')+1, #'RAM_in_GB', ram_gb)
#df.drop(columns=['RAM'], inplace=True)

# 'primary_camera' -> extract MP
new_col = df['main camera'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
df.insert(df.columns.get_loc('main camera')+1, 'primary_camera_mega_pixel', new_col)

# 'secondary_camera' -> extract MP
new_col = df['front camera'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
df.insert(df.columns.get_loc('front camera')+1, 'secondary_camera_mega_pixel', new_col)

# 'announced'
#df['announced_date'] = df['announced'].apply(parse_date)
#df['announced_date'] = pd.to_datetime(df['announced_date'], errors='coerce')

# 'display_resolution' -> 'display_diagonal_inches'
new_col = df['display_resolution'].apply(lambda x: number_or_none(value=x, regex=r'(\d+.\d+) inches', type="float"))
df.insert(df.columns.get_loc('display_resolution')+1, 'display_size_inches', new_col)

df.rename(columns={'display_size': 'display_width_length'}, inplace=True)

# any weight in numeric format
df['weight_g'] = pd.to_numeric(df['weight_g'], errors='coerce')
df['weight_oz'] = pd.to_numeric(df['weight_oz'], errors='coerce')

# battery mAh to numeric
new_col = df['battery'].apply(lambda x: number_or_none(value=x, regex=r'(\d+)\s*mAh', type="int"))
df.rename(columns={'battery': 'battery_description'}, inplace=True)
df.insert(df.columns.get_loc('battery_description')+1, 'battery_mAh', new_col)


#TODO: split staus into 2 col, second parsed as date
#TODO: remove id col from csv

## -> generally negative values are No or NaN, any other value means: exists

# keep only relevant
relevant = ["id", "brand", "model", "display_size_inches", "internal_memory_in_gb", "primary_camera_mega_pixel", "loud_speaker", "gps", "colors", "approx_price_eur", "announced", "battery_mah", "RAM_in_GB"]

df_relevant = df[relevant]


df_relevant.to_csv(write_file, sep=',')



In [8]:
#2024
# parse numerical cols
# keep only relevant

import string
import re
import pandas as pd

#csv_path = "../data/"
read_file1 = "2024_GSM_adam.csv"
read_file2 = "2024_GSM_ML.csv"
read_file3 = "2024_GSM_secondary.csv"
write_file = f"../1_parsed_data/2024_GSM_parsed.csv"

df_1 = pd.read_csv(read_file1, delimiter=",", header="infer")
df_2 = pd.read_csv(read_file2, delimiter=",", header="infer")
df_3 = pd.read_csv(read_file3, delimiter=",", header="infer")
df_orig = pd.concat([df_1, df_2, df_3], ignore_index=True)

# Function to extract value usig regex. if not possible return none.
def number_or_none(regex, value, pick_group=1, type="int"):
    try:
        if type == "int":
            return int(not_none_group(re.search(regex, value), pick_group))
        elif type == "float":
            return float(not_none_group(re.search(regex, value), pick_group))
    except:
        pass
    return None

def not_none_group(match, pick_group):
    result = match.group(pick_group)
    if result == None:
        result = next((g for g in match.groups() if g is not None), None) # Pick first group thats not none
    return result
    

# Function to turn text into number and mb/kb into gb
def conv2gb(input:str, ram=False)->float:
    result = None
    pick_group = 1
    if (input == '') | (input == None):
        return result
    if isinstance(input, str):
        input = re.sub(r"[{}]".format(re.escape(string.punctuation)), ' ', input)
        if ram:
            gb = r'(\d+)GB RAM'
            mb = r'(\d+)MB RAM'
            kb = r'(\d+)KB RAM'
        else:
            gb = r'(\d+)GB'
            mb = r'(\d+)MB'
            kb = r'(\d+)KB'
        try:
            result = float(re.search(gb, input, flags=re.IGNORECASE).group(pick_group))
        except:
            try:
                result = float(re.search(mb, input, flags=re.IGNORECASE).group(pick_group))/1028
            except:
                try:
                    result = (float(re.search(kb, input, flags=re.IGNORECASE).group(pick_group))/1028)/1028
                except:
                    pass
    return result

# Standardize the date format
def parse_date(value):
    if pd.isna(value):  # Handle NaN values
        return None
    value = str(value).strip()

    # eg. "Available. Released 2024, April 06"
    pattern = r"(?<=Released\s)(\d{4}),\s+([A-Za-z]+) (\d{2})"
    match = re.search(pattern, value)
    if match:
        year, month, day = match.groups()
        try:
            date = f"{year}-{month}-{day}"
            return pd.to_datetime(date, format="%Y-%B-%d", errors='coerce')
        except ValueError:
            return None
    else:   # eg. "Available. Released 2024, April" or "Available. Released 2024, Q1"
        pattern = r"(?<=Released\s)(\d{4}),\s+(Q[1-4]|[A-Za-z]+)"
        match = re.search(pattern, value)
        if match:
            year, date_part = match.groups()
            if date_part.startswith('Q'):  # Handle quarter
                month = {'Q1': '03', 'Q2': '06', 'Q3': '09', 'Q4': '12'}[date_part]
                date = f"{year}-{month}-01"
                return pd.to_datetime(date, format="%Y-%m-%d", errors='coerce')
            else:  # Handle month
                try:
                    date = f"{year}-{date_part}-01"
                    return pd.to_datetime(date, format="%Y-%B-%d", errors='coerce')
                except ValueError:
                    return None
        else:   # eg. "2024, April" or "Available. Released 2024, Q1"
            try:
                year, month = value.split(",  ")
                date = f"{year}-{month}-01"
                return pd.to_datetime(date, format="%Y-%B-%d", errors='coerce')
            except ValueError:
                return None

    # Return None if no valid date pattern found
    return None

# data cleaning:
df = df_orig.copy(deep=True)
df.columns = df.columns.str.lower()
df = df.drop_duplicates(subset='name', keep='first')


# dropping 'discontinued' 
# dropping 'No cellular connectivity'
# dropping rows without price
df.drop(df.loc[
    ##(df['status'] == 'Discontinued') | 
    ##(df['status'] == 'Cancelled') |  
    ##(df['network_technology'] == 'No cellular connectivity') |
    (df['name'].isnull())
].index, inplace=True)

# because model is not a unique value for all rows, we add brand to it.
df.rename(columns={'name': 'id'}, inplace=True)
df.drop(df.loc[df['id'].str.contains("|".join(keywords), case=False, na=False)].index, inplace=True)

# 'internal_memory' regex to identify __ GB * to make col numeric.find MB and KB and convert to GB
new_col = df['memory(internal)'].apply(lambda x: conv2gb(x))
df.insert(df.columns.get_loc('memory(internal)')+1, 'internal_memory_in_GB', new_col)
df.rename(columns={'memory(internal)': 'internal_memory'},  inplace=True)

# 'RAM' regex 128 MB ROM| 64 MB RAM -> make numerical
new_col = df['internal_memory'].apply(lambda x: conv2gb(x, ram=True))
df.insert(df.columns.get_loc('internal_memory_in_GB')+1, 'ram_in_gb', new_col)

# 'primary_camera' -> extract MP
new_col = df['main camera'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
df.insert(df.columns.get_loc('main camera')+1, 'primary_camera_mega_pixel', new_col)
df.rename(columns={'main camera': 'primary_camera'},  inplace=True)

# 'secondary_camera' -> extract MP
new_col = df['front camera'].apply(lambda x: number_or_none(value=x, regex=r'(\d+) MP'))
df.insert(df.columns.get_loc('front camera')+1, 'secondary_camera_mega_pixel', new_col)
df.rename(columns={'front camera': 'secondary_camera'},  inplace=True)

# Apply parse_date function to the 'status' and 'announced' columns
df['status_date'] = df['released'].apply(parse_date)
df['announced_date'] = df['announced'].apply(parse_date)

# Combine the 'status_date' and 'announced_date' columns into 'date'
df['date'] = df['status_date'].combine_first(df['announced_date'])
df['year'] = df['date'].dt.year

# 'display_resolution' -> 'display_diagonal_inches'
new_col = df['display size'].apply(lambda x: number_or_none(value=x, regex=r'(\d+.\d+) inches', type="float"))
df.rename(columns={'display size': 'display_size'},  inplace=True)
df.insert(df.columns.get_loc('display_size')+1, 'display_size_inches', new_col)

# battery mAh to numeric
new_col = df['battery type'].apply(lambda x: number_or_none(value=x, regex=r'(\d+)\s*mAh', type="int"))
df.rename(columns={'battery type': 'battery_description'}, inplace=True)
df.insert(df.columns.get_loc('battery_description')+1, 'battery_mah', new_col)

# price into numeric euro
new_col = df['price'].apply(lambda x: number_or_none(value=x, regex=r'€\s*([\d.,]+)|(?:[Aa]bout\s+([\d.,]+)\s*EUR)', type="float"))
df.insert(df.columns.get_loc('price')+1, 'approx_price_eur', new_col)

# split id into brand and model
df.insert(df.columns.get_loc('id')+1, 'brand', None)
df.insert(df.columns.get_loc('brand')+1, 'model', None)

## -> generally negative values are No or NaN, any other value means: exists

# rename for consistency
df.rename(columns={'speker': 'loud_speaker'},  inplace=True)

# keep only relevant
df.columns = df.columns.str.lower()
relevant = ["id", "brand", "model", "date", "year",  "display_size_inches", "internal_memory_in_gb", "primary_camera_mega_pixel", "primary_camera", "loud_speaker", "gps", "colors", "approx_price_eur", "battery_mah", "ram_in_gb"]

df_relevant = df[relevant]


df_relevant.to_csv(write_file, sep=',', index=False)

