In [27]:
import pandas as pd
import numpy as np

In [28]:
import json
import os
import pandas as pd

input_directory = './Jsons'
output_file = './mobile_data.csv'

columns = [
    "name", "brand", "2G", "3G", "4G", "5G", "Announced", "Status", "Weight", "Length", "Width", 
    "Diameter", "SIM", "Display Type", "Display Size", "PPI", "Body Ratio", "OS", "Battery_capacity", 
    "Price", "CPU", "Ratio", "Pixel", "WLAN", "Colors", "Sensors", "Bluetooth", "GPU", 
    "LoudSpeaker", "3.5mm jack", "Chipset", "Network", "Internal", "Card slot", "RAM", "Storage"
]
# Initialize an empty list to store rows for the CSV file
data = []

# Function to extract data from each JSON file
def extract_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = json.load(file)
        
        for top_key, models in content.items():
            # Extract brand name from the key (remove numbers if needed)
            brand_name = ''.join(filter(str.isalpha, top_key))  #"Sony158" -> "Sony"
            print(f"Processing brand: {brand_name}")
            
            # Process each model under the brand
            for model, specs in models.items():
                dimensions = specs.get("Body", [{}])[0].get("Dimensions", "")
                dimensions_split = dimensions.split(' x ') if dimensions else []
                price = next((item.get("Price", "") for item in specs.get("Misc", []) if "Price" in item), "")
                chipset = next((item.get("Chipset", "") for item in specs.get("Platform", []) if "Chipset" in item), "")
                gpu = next((item.get("GPU", "") for item in specs.get("Platform", []) if "GPU" in item), "")
                loudspeaker = specs.get("Sound", [{}])[0].get("Loudspeaker", "") == "Yes" if len(specs.get("Sound", [])) > 0 else False
                jack_3_5mm = specs.get("Sound", [{}])[1].get("3.5mm jack", "") == "Yes" if len(specs.get("Sound", [])) > 1 else False
                ram = next((item.get("Internal", "") for item in specs.get("Memory", []) if "RAM" in item.get("Internal", "")), "")
                storage = next((item.get("Internal", "") for item in specs.get("Memory", []) if "Internal" in item), "")
                memory = specs.get("Memory", [{}])
                internal_memory = next((item.get("Internal", "") for item in memory if "Internal" in item), "")
                card_slot = memory[0].get("Card slot", "")


                # Add the extracted data as a row
                row = {
                    "name": model,
                    "brand": brand_name,
                    "2G": any("2G" in spec.get("Technology", "") for spec in specs.get("Network", [])),
                    "3G": any("3G" in spec.get("Technology", "") for spec in specs.get("Network", [])),
                    "4G": any("4G" in spec.get("Technology", "") for spec in specs.get("Network", [])),
                    "5G": any("5G" in spec.get("Technology", "") for spec in specs.get("Network", [])),
                    "Announced": specs.get("Launch", [{}])[0].get("Announced", ""),
                    "Status": specs.get("Launch", [{}])[1].get("Status", ""),
                    "Weight": specs.get("Body", [{}])[1].get("Weight", ""),
                    "Length": dimensions_split[0] if len(dimensions_split) > 0 else "",
                    "Width": dimensions_split[1] if len(dimensions_split) > 1 else "",
                    "Diameter": dimensions_split[2] if len(dimensions_split) > 2 else "",
                    "SIM": specs.get("Body", [{}])[2].get("SIM", ""),
                    "Display Type": specs.get("Display", [{}])[0].get("Type", ""),
                    "Display Size": specs.get("Display", [{}])[1].get("Size", ""),
                    "PPI": specs.get("Display", [{}])[2].get("Resolution", ""),
                    "Body Ratio": specs.get("Display", [{}])[1].get("Size", "").split("(~")[-1].replace(" screen-to-body ratio)", "") if "(~" in specs.get("Display", [{}])[1].get("Size", "") else "",
                    "OS": specs.get("Platform", [{}])[0].get("OS", ""),
                    "Battery_capacity": specs.get("Battery", [{}])[0].get("Type", ""),
                    "Price": price,
                    "CPU": chipset,
                    "Ratio": specs.get("Display", [{}])[2].get("Resolution", ""),
                    "Pixel": specs.get("Display", [{}])[2].get("Resolution", "").split("(~")[-1] if "(~" in specs.get("Display", [{}])[2].get("Resolution", "") else "",
                    "WLAN": specs.get("Comms", [{}])[0].get("WLAN", ""),
                    "Colors": specs.get("Misc", [{}])[0].get("Colors", ""),
                    "Sensors": specs.get("Features", [{}])[0].get("Sensors", ""),
                    "Bluetooth": specs.get("Comms", [{}])[1].get("Bluetooth", ""),
                    "GPU": gpu,
                    "LoudSpeaker": loudspeaker,
                    "3.5mm jack": jack_3_5mm,
                    "Chipset": chipset,
                    "Network": specs.get("Network", [{}])[0].get("Technology", ""),
                    "Internal": internal_memory,
                    "Card slot": card_slot,
                    "RAM": ram,
                    "Storage": storage
                }
                data.append(row)


# Process each JSON file
for file_name in os.listdir(input_directory):
    if file_name.endswith(".json"):
        print(f"Processing file: {file_name}")
        extract_data(os.path.join(input_directory, file_name))

# Convert to DataFrame and save to CSV
df = pd.DataFrame(data, columns=columns)
#df.to_csv(output_file, index=False, encoding='utf-8-sig')
print(f"Data has been extracted and saved to {output_file}")


Processing file: Alcatel_data.json
Processing brand: alcatel
Processing file: Samsung_data.json
Processing brand: Samsung
Processing file: Lenovo_data.json
Processing brand: Lenovo
Processing file: ZTE_data.json
Processing brand: ZTE
Processing file: Nokia_data.json
Processing brand: Nokia
Processing file: Blu_data.json
Processing brand: BLU
Processing file: Sony_data.json
Processing brand: Sony
Processing brand: Sony
Processing file: Asus_data.json
Processing brand: Asus
Processing file: Apple_data.json
Processing brand: Apple
Processing file: Xiaomi_data.json
Processing brand: Xiaomi
Processing file: LG_data.json
Processing brand: LG
Processing file: Huawei_data.json
Processing brand: Huawei
Processing file: Infinix_data.json
Processing brand: Infinix
Processing file: HTC_data.json
Processing brand: HTC
Data has been extracted and saved to ./mobile_data.csv


In [29]:
df['Status'] = df['Status'].apply(lambda x: 'Available' if isinstance(x, str) and x.lower().startswith('available') else (
                                   'Coming' if isinstance(x, str) and x.lower().startswith('coming') else (
                                   'discontinued' if pd.notna(x) else x)))

sim_options = ['nano-sim', 'micro-sim', 'mini-sim', 'esim']
df['SIM'] = df['SIM'].str.lower().str.strip()
df['SIM'] = df['SIM'].apply(lambda x: ', '.join([sim for sim in sim_options if pd.notna(x) and sim in x]) if pd.notna(x) else x)
df['SIM'] = np.where(df['SIM'].str.contains(r'\byes\b'), 'mini-sim', df['SIM'])
df['PPI_density'] = df['PPI'].str.extract(r'~(\d+)\s+ppi')
df['PPI'] = df['PPI_density']

df.to_csv(output_file, index=False, encoding='utf-8-sig')