In [30]:
import re
import numpy as np
import pandas as pd
import json_repair
import os

# open the file
with open('latest_phones_combined.json', 'r') as f:
    latest_phones_combined = json_repair.loads(f.read())



# flatten the data
latest_phones_combined_flat = {}
for b in latest_phones_combined:
    for p in latest_phones_combined[b]:
        latest_phones_combined_flat[p] = latest_phones_combined[b][p]

# remove the data with no Lazada entries
latest_phones_combined_flat = {k: v for k, v in latest_phones_combined_flat.items() if v.get('Lazada')}



In [31]:
# add values for the min and max of ram and storage
find_ram = re.compile(r"(\d+)GB (\d+)GB RAM", re.IGNORECASE)
[
    (  
        ['|'.join(set(tup)) for tup in zip(*x)]
     ) if (x := find_ram.findall(latest_phones_combined_flat[p]['specifications']['Memory']['Internal'][0])) 
    else (
        np.nan,
        np.nan
        ) for p in latest_phones_combined_flat]

[['64|128|256', '4'],
 ['64|256|512', '4'],
 ['64|256|512', '4'],
 ['64|128|256', '4'],
 ['64|128|256', '4'],
 ['128|256|512', '6'],
 ['128|256|512', '6'],
 ['128|256|512', '4'],
 ['128|256|512', '4'],
 ['128|256|512', '6'],
 ['128|256|512', '6'],
 ['128|256|512', '6'],
 ['128|256|512', '6'],
 ['128|256|512', '6'],
 ['128|256|512', '6'],
 ['128|256|512', '6'],
 ['128|256|512', '8'],
 ['256|512', '8'],
 ['64|128|256', '3'],
 ['64|128|256', '4'],
 ['128|256', '12|8|16'],
 ['128|256|512', '12|8|16|18'],
 ['128|256|512', '12|8|16'],
 ['256', '12'],
 ['512', '16'],
 ['512', '18'],
 ['256', '12|16'],
 ['512', '16'],
 ['256|512', '12|16'],
 ['512', '16'],
 ['256', '12|16'],
 ['128|256|512', '8|16'],
 ['128', '6'],
 ['256', '12'],
 ['64', '3'],
 ['64', '4'],
 ['128', '8'],
 ['128', '8'],
 ['256', '12|8'],
 ['128', '8'],
 ['258', '8'],
 ['512', '12'],
 ['128', '8'],
 ['128', '8'],
 ['512', '12'],
 ['32', '4'],
 ['64', '4'],
 ['128', '4'],
 ['128', '6'],
 ['64', '4'],
 ['64', '4'],
 ['128', '6']

In [42]:



clean_data = pd.DataFrame()

# add rows as 'phone name key' to the dataframe and column as the 'os'
clean_data = pd.DataFrame(
    index=latest_phones_combined_flat.keys(), 
    columns=[
        'brand',
        'colors',
        'model',
        'battery_size',
        'battery_endurance',
        'weight',
        'os',
        'release',
        'screen_size',
        'screen_resolution',
        'screen_type',
        'camera_count_back',
        'camera_count_front',
        'camera_list_resolution_back',
        'camera_list_resolution_front',
        'camera_resolution_back_recording',
        'camera_resolution_front_recording',
        'chipset',
        'benchmark_antutu',
        'benchmark_geekbench',
        'gpu',
        'ram_lst',
        'storage_lst',
        'storage_type',
        'price_min',
        'price_max',
        'lazada_ratings',
        'lazada_reviews',
        'build_material',
        'has_wifi',
        'has_bluetooth',
        'has_cardslot',
    ])

x = []


# set column 1 header as 'phone'
clean_data.index.name = 'phone'


# add brand
clean_data['brand'] = [latest_phones_combined_flat[p]['brand'] for p in latest_phones_combined_flat]

# add model
clean_data['model'] = [latest_phones_combined_flat[p]['phone_name'] for p in latest_phones_combined_flat]

# add colors
clean_data['colors'] = [latest_phones_combined_flat[p]['specifications']['Misc']['Colors'][0].replace(', ', '|') if latest_phones_combined_flat[p]['specifications']['Misc'].get('Colors') else 'Unknown' for p in latest_phones_combined_flat]

# add battery size
find_battery = re.compile(r"(\d+) mAh", re.IGNORECASE)
# get average battery size
average_battery_size = np.mean([float(find_battery.findall(latest_phones_combined_flat[p]['specifications']['Battery']['Type'][0])[0]) for p in latest_phones_combined_flat if latest_phones_combined_flat[p]['specifications']['Battery'].get('Type') ])
clean_data['battery_size'] = [
    int(
        find_battery.findall(latest_phones_combined_flat[p]['specifications']['Battery']['Type'][0])[0])
        if 
            latest_phones_combined_flat[p]['specifications']['Battery'].get('Type') 
        else 
            np.nan 
        for p in latest_phones_combined_flat]


# add battery endurance
find_b_endurance = re.compile(r"(\d+)h", re.IGNORECASE)
# get average battery endurance
average_battery_endurance = np.mean([float(find_b_endurance.findall(latest_phones_combined_flat[p]['specifications']['Tests']['Battery (old)'][0])[0]) for p in latest_phones_combined_flat  if latest_phones_combined_flat[p]['specifications'].get('Tests') and latest_phones_combined_flat[p]['specifications']['Tests'].get('Battery (old)') ])
clean_data['battery_endurance'] = [
    float(
        find_b_endurance.findall(latest_phones_combined_flat[p]['specifications']['Tests']['Battery (old)'][0])[0]) 
        if 
            latest_phones_combined_flat[p]['specifications'].get('Tests') and latest_phones_combined_flat[p]['specifications']['Tests'].get('Battery (old)')  
        else 
            (average_battery_endurance/average_battery_size)*float(find_battery.findall(latest_phones_combined_flat[p]['specifications']['Battery']['Type'][0])[0])
        for p in latest_phones_combined_flat]


# add weight
find_weight = re.compile(r"(\d+\.?\d+) g", re.IGNORECASE)
clean_data['weight'] = [
    float(x[0])
    if 
        latest_phones_combined_flat[p]['specifications']['Body'].get('Weight') and (x := find_weight.findall(latest_phones_combined_flat[p]['specifications']['Body']['Weight'][0])) 
    else 
        np.nan 
    for p in latest_phones_combined_flat]

# supply missing values
clean_data['weight'] = clean_data['weight'].fillna(clean_data['weight'].mean())

# add os
find = re.compile(r"Android|iOS|Windows|Blackberry|Symbian|Bada|Tizen|KaiOS|Feature phone|HarmonyOS|EMUI", re.IGNORECASE)
clean_data['os'] = ['Android' if (r := find.match(latest_phones_combined_flat[p]['os'])[0]).lower() == 'emui' else r for p in latest_phones_combined_flat]

# add release
clean_data['release'] = [latest_phones_combined_flat[p]['specifications']['Launch']['Announced'][0] if latest_phones_combined_flat[p]['specifications']['Launch'].get('Announced') else None for p in latest_phones_combined_flat]

# add screen size
find_screen_size = re.compile(r"(\d+\.?\d+) inches", re.IGNORECASE)
clean_data['screen_size'] = [
    float(x[0]) 
    if 
        latest_phones_combined_flat[p]['specifications']['Display'].get('Size') and (x := find_screen_size.findall(latest_phones_combined_flat[p]['specifications']['Display']['Size'][0])) 
    else 
        np.nan 
    for p in latest_phones_combined_flat]

# add screen resolution
find_screen_resolution = re.compile(r"(\d+) x (\d+)", re.IGNORECASE)

clean_data['screen_resolution'] = [
    float(x[0][0]) * float(x[0][1])
    if 
        latest_phones_combined_flat[p]['specifications']['Display'].get('Resolution') and len(x := find_screen_resolution.findall(latest_phones_combined_flat[p]['specifications']['Display']['Resolution'][0])) == 1
    else 
        np.nan 
    for p in latest_phones_combined_flat]

# add screen type
find_screen_type = re.compile(r"(\b(?:LCD|OLED|AMOLED|IPS|TFT|P-OLED|Super AMOLED|Super Retina XDR OLED)\b)", re.IGNORECASE)
clean_data['screen_type'] = [
    x[0]
    if 
        latest_phones_combined_flat[p]['specifications']['Display'].get('Type') and (x := find_screen_type.findall(latest_phones_combined_flat[p]['specifications']['Display']['Type'][0])) 
    else 
        'Unknown' 
    for p in latest_phones_combined_flat]

find_camera_resolution = re.compile(r"(\d+(?:\.\d*)?) MP", re.IGNORECASE)
# add camera count, if Main Camera has key 'Quad' or 'Triple' or 'Dual' or 'Single
clean_data[['camera_count_back', 'camera_list_resolution_back']] = [
    (  
        4, 
        '|'.join(find_camera_resolution.findall(x['Quad'][0])[:4])
     ) if (x := latest_phones_combined_flat[p]['specifications'].get('Main Camera')) and 'Quad' in x 
    else (
        3,
        '|'.join(find_camera_resolution.findall(x['Triple'][0])[:3])
        ) if x and 'Triple' in x 
    else (
        2,
        '|'.join(find_camera_resolution.findall(x['Dual'][0])[:2])
        ) if x and 'Dual' in x 
    else (
        1,
        '|'.join(find_camera_resolution.findall(x['Single'][0])[:1])
          ) if x and 'Single' in x 
    else (
        0,
        '0'
        ) for p in latest_phones_combined_flat]

# add camera count, if Front Camera has key 'Quad' or 'Triple' or 'Dual' or 'Single
clean_data[['camera_count_front', 'camera_list_resolution_front']] = [
    (  
        4, 
        '|'.join(find_camera_resolution.findall(x['Quad'][0])[:4])
     ) if (x := latest_phones_combined_flat[p]['specifications'].get('Selfie camera')) and 'Quad' in x 
    else (
        3,
        '|'.join(find_camera_resolution.findall(x['Triple'][0])[:3])
        ) if x and 'Triple' in x 
    else (
        2,
        '|'.join(find_camera_resolution.findall(x['Dual'][0])[:2])
        ) if x and 'Dual' in x 
    else (
        1,
        '|'.join(find_camera_resolution.findall(x['Single'][0]))
          ) if x and 'Single' in x 
    else (
        0,
        '0'
        ) for p in latest_phones_combined_flat]

# add chipset
clean_data['chipset'] = [
    x[0].split('(')[0].strip() 
    if 
        latest_phones_combined_flat[p]['specifications'].get('Platform') and 
        (x := latest_phones_combined_flat[p]['specifications']['Platform'].get('Chipset'))
    else 
        None 
    for p in latest_phones_combined_flat]


# add benchmark antutu
find_benchmark_antutu = re.compile(r"AnTuTu: (\d+)", re.IGNORECASE)
clean_data['benchmark_antutu'] = [
    int(x[0]) 
    if 
        latest_phones_combined_flat[p]['specifications'].get('Tests') and 
        latest_phones_combined_flat[p]['specifications']['Tests'].get('Performance') and
        (x := find_benchmark_antutu.findall(latest_phones_combined_flat[p]['specifications']['Tests']['Performance'][0]))
    else 
        np.nan 
    for p in latest_phones_combined_flat]


# Group the similar chipsets. Get the average of the benchmark, combining the similar chipsets
for chipset in clean_data['chipset'].value_counts().index:
    # fill only the null values
    clean_data.loc[(clean_data['chipset'] == chipset) , 'benchmark_antutu'] = clean_data.loc[(clean_data['chipset'] == chipset), 'benchmark_antutu'].fillna(clean_data.loc[clean_data['chipset'] == chipset, 'benchmark_antutu'].mean())

# fill the missing values
clean_data['benchmark_antutu'] = clean_data['benchmark_antutu'].fillna(clean_data['benchmark_antutu'].mean()).astype(int)


# add benchmark geekbench
find_benchmark_geekbench = re.compile(r"GeekBench: (\d+)", re.IGNORECASE)
clean_data['benchmark_geekbench'] = [
    int(x[0]) 
    if 
        latest_phones_combined_flat[p]['specifications'].get('Tests') and 
        latest_phones_combined_flat[p]['specifications']['Tests'].get('Performance') and
        (x := find_benchmark_geekbench.findall(latest_phones_combined_flat[p]['specifications']['Tests']['Performance'][0]))
    else 
        np.nan 
    for p in latest_phones_combined_flat]

# Group the similar chipsets. Get the average of the benchmark, combining the similar chipsets
for chipset in clean_data['chipset'].value_counts().index:
    # fill only the null values
    clean_data.loc[(clean_data['chipset'] == chipset) , 'benchmark_geekbench'] = clean_data.loc[(clean_data['chipset'] == chipset), 'benchmark_geekbench'].fillna(clean_data.loc[clean_data['chipset'] == chipset, 'benchmark_geekbench'].mean())

# fill the missing values
clean_data['benchmark_geekbench'] = clean_data['benchmark_geekbench'].fillna(clean_data['benchmark_geekbench'].mean()).astype(int)

# add gpu
clean_data['gpu'] = [
    x[0].split('(')[0].strip() 
    if 
        latest_phones_combined_flat[p]['specifications'].get('Platform') and 
        (x := latest_phones_combined_flat[p]['specifications']['Platform'].get('GPU'))
    else 
        None 
    for p in latest_phones_combined_flat]

# supply missing values from the non missing gpu, get the data from the same chipset
for gpu in clean_data[~clean_data['gpu'].isnull()]['gpu'].value_counts().index:
    # fill only the null values
    clean_data.loc[(clean_data['gpu'] == gpu) , 'chipset'] = clean_data.loc[(clean_data['gpu'] == gpu), 'chipset'].fillna(clean_data.loc[clean_data['gpu'] == gpu, 'chipset'].mode()[0])


# add values for the min and max of ram and storage
find_ram = re.compile(r"(\d+)GB (\d+)GB RAM", re.IGNORECASE)
clean_data[['storage_lst', 'ram_lst']] = [
    (  
        ['|'.join(set(tup)) for tup in zip(*x)]
     ) if (x := find_ram.findall(latest_phones_combined_flat[p]['specifications']['Memory']['Internal'][0])) 
    else (
        np.nan,
        np.nan
    ) for p in latest_phones_combined_flat]

# add storage type
find_storage_type = re.compile(r"(\b(?:eMMC|UFS|NVMe)\b)", re.IGNORECASE)
clean_data['storage_type'] = [
    x[0]
    if 
        latest_phones_combined_flat[p]['specifications']['Memory'].get('Other') and (x := find_storage_type.findall(latest_phones_combined_flat[p]['specifications']['Memory']['Other'][0])) 
    else 
        None
    for p in latest_phones_combined_flat]

# supply missing values
clean_data['storage_type'] = clean_data['storage_type'].fillna(clean_data['storage_type'].mode()[0])

# add lazada ratings
clean_data['lazada_ratings'] = [
    (latest_phones_combined_flat[p]['Lazada']['Ratings'] if x['Ratings'] != '' else 0)
    if (x := latest_phones_combined_flat[p].get('Lazada')) else 0 
    for p in latest_phones_combined_flat]

# add lazada reviews
clean_data['lazada_reviews'] = [
    (latest_phones_combined_flat[p]['Lazada']['Reviews'] if x['Reviews'] != '' else 0)
    if (x := latest_phones_combined_flat[p].get('Lazada')) else 0 
    for p in latest_phones_combined_flat]

# add build material
find_build_material = re.compile(r"(\b(?:glass|aluminum|plastic|titanium|ceramic|stainless steel|carbon fiber)\b)", re.IGNORECASE)
clean_data['build_material'] = [
    '|'.join(x)
    if 
        latest_phones_combined_flat[p]['specifications']['Body'].get('Build') and (x := find_build_material.findall(latest_phones_combined_flat[p]['specifications']['Body']['Build'][0])) 
    else 
        None 
    for p in latest_phones_combined_flat]

# fill the missing values
clean_data['build_material'] = clean_data['build_material'].fillna(clean_data['build_material'].mode()[0])

# euro to php
def euro_to_php(euro):
    if euro is None or euro == '':
        return None
    return float(euro) * 60.25

# dollar to php
def dollar_to_php(dollar):
    if dollar is None or dollar == '':
        return None
    return float(dollar) * 55.90


# add min and max price
find_price = re.compile(r"(\d+(?:\.\d*)?) EUR|\$\u2009(\d+(?:\.\d+)?)", re.IGNORECASE)

clean_data[['price_min', 'price_max']] = [
        [min(y := [
            euro_to_php(x[0][0]) or dollar_to_php(x[0][1]), 
            float(latest_phones_combined_flat[p]['Lazada']['Price']),
            float(latest_phones_combined_flat[p]['Lazada']['Original Price'])],
            ),
        max(y)]
    if 
        latest_phones_combined_flat[p]['specifications']['Misc'].get('Price') and (x:= find_price.findall(latest_phones_combined_flat[p]['specifications']['Misc']['Price'][0]))
    else
    
        (np.nan, np.nan)
    for p in latest_phones_combined_flat]

# supply missing values
clean_data['price_min'] = clean_data['price_min'].fillna(clean_data['price_min'].mean())
clean_data['price_max'] = clean_data['price_max'].fillna(clean_data['price_max'].mean())

# add camera resolution back recording
find_camera_resolution_back_recording = re.compile(r"(\d+)fps", re.IGNORECASE)

# add camera resolution front recording
find_camera_resolution_recording = re.compile(r"((?:8K|4K|1080p|720p).*.fps|4K|1080p|720p)", re.IGNORECASE)

ll = []
for p in latest_phones_combined_flat:
    if latest_phones_combined_flat[p]['specifications'].get('Main Camera') and latest_phones_combined_flat[p]['specifications']['Main Camera'].get('Video'):
        o = []
        for i in latest_phones_combined_flat[p]['specifications']['Main Camera']['Video'][0].split(','):
            if (x := find_camera_resolution_recording.findall(i)):
                o.append(x[0])
            elif i.lower().find('yes') != -1:
                o.append('1080p@30fps')

        if o:
            ll.append('|'.join(o))
        else:
            ll.append(None)
    else:
        ll.append(None)
clean_data['camera_resolution_back_recording'] = ll

# add camera resolution front recording
ll = []
for p in latest_phones_combined_flat:
    if latest_phones_combined_flat[p]['specifications'].get('Selfie camera') and latest_phones_combined_flat[p]['specifications']['Selfie camera'].get('Video'):
        o = []
        for i in latest_phones_combined_flat[p]['specifications']['Selfie camera']['Video'][0].split(','):
            if (x := find_camera_resolution_recording.findall(i)):
                o.append(x[0])
            elif i.lower().find('yes') != -1:
                o.append('1080p@30fps')

        if o:
            ll.append('|'.join(o))
        else:
            ll.append(None)
    else:
        ll.append(None)
clean_data['camera_resolution_front_recording'] = ll

# add has wireless
find_wifi = re.compile(r".*\|.*", re.IGNORECASE)
ll = []
for p in latest_phones_combined_flat:
    if latest_phones_combined_flat[p]['specifications'].get('Comms') and latest_phones_combined_flat[p]['specifications']['Comms'].get('WLAN'):
        done = False
        if (x := latest_phones_combined_flat[p]['specifications']['Comms']['WLAN'][0].split(',')[0]).lower() != 'yes':
            for i in x.split():
                if find_wifi.match(i := i.replace('/', '|')):
                    ll.append(i)
                    done = True
                    break

            if not done:
                ll.append(None)
        else:
            ll.append(None)
    else:
        ll.append('No')
clean_data['has_wifi'] = ll

# supply missing values
clean_data['has_wifi'] = clean_data['has_wifi'].fillna(clean_data['has_wifi'].mode()[0])

# add has bluetooth
ll = []
for p in latest_phones_combined_flat:
    if latest_phones_combined_flat[p]['specifications'].get('Comms') and latest_phones_combined_flat[p]['specifications']['Comms'].get('Bluetooth'):
        if (x:=latest_phones_combined_flat[p]['specifications']['Comms']['Bluetooth'][0].split(',')[0]).lower() != 'no' and x != 'yes':
            ll.append(x)
        elif x.lower() == 'yes':
            ll.append(None)
        else:
            ll.append('No')
    else:
        ll.append('No')
clean_data['has_bluetooth'] = ll

# supply missing values
clean_data['has_bluetooth'] = clean_data['has_bluetooth'].fillna(clean_data['has_bluetooth'].mode()[0])

# add has cardslot
clean_data['has_cardslot'] = ['Yes' if (x := latest_phones_combined_flat[p]['specifications']['Memory'].get('Card slot')) and x[0].lower() != 'no' else 'No' for p in latest_phones_combined_flat]

# add details, (this is the combinatin of all the data)
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)







In [43]:
clean_data.head(10)

Unnamed: 0_level_0,brand,colors,model,battery_size,battery_endurance,weight,os,release,screen_size,screen_resolution,...,storage_lst,storage_type,price_min,price_max,lazada_ratings,lazada_reviews,build_material,has_wifi,has_bluetooth,has_cardslot
phone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
apple_iphone_11-9848,Apple,Black|Green|Yellow|Purple|Red|White,iPhone 11,3110,94.0,194.0,iOS,2019,6.1,1483776.0,...,64|128|256,NVMe,12745.2,37990.0,4.939130434782609,805,Glass|glass|glass|glass|aluminum,a|b|g|n|ac|6,5.0,No
apple_iphone_11_pro-9847,Apple,Matte Space Gray|Matte Silver|Matte Gold|Matte...,iPhone 11 Pro,3046,86.0,188.0,iOS,2019,5.8,2740500.0,...,64|256|512,NVMe,16155.1,34990.0,0.0,0,Glass|glass|glass|glass|stainless steel,a|b|g|n|ac|6,5.0,No
apple_iphone_11_pro_max-9846,Apple,Matte Space Gray|Matte Silver|Matte Gold|Matte...,iPhone 11 Pro Max,3969,102.0,226.0,iOS,2019,6.5,3338496.0,...,64|256|512,NVMe,18600.725,43460.0,0.0,0,Glass|glass|glass|glass|stainless steel,a|b|g|n|ac|6,5.0,No
apple_iphone_12-10509,Apple,Black|White|Red|Green|Blue|Purple,iPhone 12,2815,84.0,164.0,iOS,2020,6.1,2962440.0,...,64|128|256,NVMe,13886.678,53990.0,4.92479674796748,492,Glass|glass|glass|glass|aluminum,a|b|g|n|ac|6,5.0,No
apple_iphone_12_mini-10510,Apple,Black|White|Red|Green|Blue|Purple,iPhone 12 mini,2227,69.0,135.0,iOS,2020,5.4,2527200.0,...,64|128|256,NVMe,13807.3,25990.0,5.0,22,Glass|glass|glass|glass|aluminum,a|b|g|n|ac|6,5.0,No
apple_iphone_12_pro-10508,Apple,Silver|Graphite|Gold|Pacific Blue,iPhone 12 Pro,2815,81.0,189.0,iOS,2020,6.1,2962440.0,...,128|256|512,NVMe,21571.81,44990.0,5.0,10,Glass|glass|glass|glass|stainless steel,a|b|g|n|ac|6,5.0,No
apple_iphone_12_pro_max-10237,Apple,Silver|Graphite|Gold|Pacific Blue,iPhone 12 Pro Max,3687,95.0,228.0,iOS,2020,6.7,3566952.0,...,128|256|512,NVMe,24917.425,56180.0,0.0,0,Glass|glass|glass|glass|stainless steel,a|b|g|n|ac|6,5.0,No
apple_iphone_13-11103,Apple,Starlight|Midnight|Blue|Pink|Red|Green,iPhone 13,3240,89.0,174.0,iOS,2021,6.1,2962440.0,...,128|256|512,NVMe,21687.523,52990.0,4.940397350993377,151,Glass|glass|glass|glass|aluminum,a|b|g|n|ac|6,5.0,No
apple_iphone_13_mini-11104,Apple,Starlight|Midnight|Blue|Pink|Red|Green,iPhone 13 mini,2438,75.0,141.0,iOS,2021,5.4,2527200.0,...,128|256|512,NVMe,23464.025,52490.0,4.853658536585366,41,Glass|glass|glass|glass|aluminum,a|b|g|n|ac|6,5.0,No
apple_iphone_13_pro-11102,Apple,Graphite|Gold|Silver|Sierra Blue|Alpine Green,iPhone 13 Pro,3095,85.0,204.0,iOS,2021,6.1,2962440.0,...,128|256|512,NVMe,28786.823,68990.0,5.0,2,Glass|glass|glass|glass|stainless steel,a|b|g|n|ac|6,5.0,No


In [44]:
clean_data.isna().sum()


brand                                 0
colors                                0
model                                 0
battery_size                          0
battery_endurance                     0
weight                                0
os                                    0
release                               0
screen_size                           0
screen_resolution                     0
screen_type                           0
camera_count_back                     0
camera_count_front                    0
camera_list_resolution_back           0
camera_list_resolution_front          0
camera_resolution_back_recording     26
camera_resolution_front_recording    39
chipset                              17
benchmark_antutu                      0
benchmark_geekbench                   0
gpu                                  31
ram_lst                              15
storage_lst                          15
storage_type                          0
price_min                             0


In [38]:


# remove the data with null values
clean_data = clean_data.dropna()


# remove the data with '' values
clean_data = clean_data.replace('', np.nan)

# find nan values
clean_data.isna().sum()


brand                                0
colors                               0
model                                0
battery_size                         0
battery_endurance                    0
weight                               0
os                                   0
release                              0
screen_size                          0
screen_resolution                    0
screen_type                          0
camera_count_back                    0
camera_count_front                   0
camera_list_resolution_back          0
camera_list_resolution_front         3
camera_resolution_back_recording     0
camera_resolution_front_recording    0
chipset                              0
benchmark_antutu                     0
benchmark_geekbench                  0
gpu                                  0
ram_lst                              0
storage_lst                          0
storage_type                         0
price_min                            0
price_max                

In [45]:

# supply missing values
clean_data['camera_list_resolution_front'] = clean_data['camera_list_resolution_front'].fillna(clean_data['camera_list_resolution_front'].mode()[0])


clean_data['price_avg'] = clean_data[['price_min', 'price_max']].mean(axis=1)

clean_data['price_range'] = pd.cut(clean_data['price_avg'], bins=[0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, 100000000], labels=['<10k', '10k-20k', '20k-30k', '30k-40k', '40k-50k', '50k-60k', '60k-70k', '70k-80k', '80k-90k', '90k-100k', '>100k'])

    



# save the data
clean_data.to_csv('clean_phone_data_final.csv')

print(clean_data.shape, clean_data.isna().sum().sum())
clean_data.head()

(884, 34) 143


Unnamed: 0_level_0,brand,colors,model,battery_size,battery_endurance,weight,os,release,screen_size,screen_resolution,...,price_min,price_max,lazada_ratings,lazada_reviews,build_material,has_wifi,has_bluetooth,has_cardslot,price_avg,price_range
phone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
apple_iphone_11-9848,Apple,Black|Green|Yellow|Purple|Red|White,iPhone 11,3110,94.0,194.0,iOS,2019,6.1,1483776.0,...,12745.2,37990.0,4.939130434782609,805,Glass|glass|glass|glass|aluminum,a|b|g|n|ac|6,5.0,No,25367.6,20k-30k
apple_iphone_11_pro-9847,Apple,Matte Space Gray|Matte Silver|Matte Gold|Matte...,iPhone 11 Pro,3046,86.0,188.0,iOS,2019,5.8,2740500.0,...,16155.1,34990.0,0.0,0,Glass|glass|glass|glass|stainless steel,a|b|g|n|ac|6,5.0,No,25572.55,20k-30k
apple_iphone_11_pro_max-9846,Apple,Matte Space Gray|Matte Silver|Matte Gold|Matte...,iPhone 11 Pro Max,3969,102.0,226.0,iOS,2019,6.5,3338496.0,...,18600.725,43460.0,0.0,0,Glass|glass|glass|glass|stainless steel,a|b|g|n|ac|6,5.0,No,31030.3625,30k-40k
apple_iphone_12-10509,Apple,Black|White|Red|Green|Blue|Purple,iPhone 12,2815,84.0,164.0,iOS,2020,6.1,2962440.0,...,13886.678,53990.0,4.92479674796748,492,Glass|glass|glass|glass|aluminum,a|b|g|n|ac|6,5.0,No,33938.339,30k-40k
apple_iphone_12_mini-10510,Apple,Black|White|Red|Green|Blue|Purple,iPhone 12 mini,2227,69.0,135.0,iOS,2020,5.4,2527200.0,...,13807.3,25990.0,5.0,22,Glass|glass|glass|glass|aluminum,a|b|g|n|ac|6,5.0,No,19898.65,10k-20k


In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.neighbors import NearestNeighbors
import pandas as pd

def get_top_recommendation(non_existing_phone_specs):
    # Sample data
    data = pd.read_csv('clean_phone_data_final.csv')
    df = pd.DataFrame(data)
    
    
    # make all columns string
    df = df.astype(str)

    # Define the non-existing phone specs
    non_existing_phone_specs = non_existing_phone_specs

    # Combine the pipelines only for those non-existing phone specs keys
    feature_union_keys = []
    for key in non_existing_phone_specs.keys():
        if key in df.columns:
            feature_union_keys.append(
                (key, Pipeline([
                    ('selector', FunctionTransformer(lambda x: x[key], validate=False)),
                    ('tfidf', TfidfVectorizer(stop_words='english'))
                ]))
            )

    # Combine the pipelines using FeatureUnion
    feature_union = FeatureUnion(feature_union_keys)

    # Fit and transform the data
    feature_vectors = feature_union.fit_transform(df)

    # Now feature_vectors contains TF-IDF representations for both 'brand' and 'model'
    from sklearn.neighbors import NearestNeighbors

    # Create a DataFrame for the non-existing phone specs
    non_existing_df = pd.DataFrame([non_existing_phone_specs])

    # Transform the non-existing phone specs using the previously defined FeatureUnion
    non_existing_feature_vectors = feature_union.transform(non_existing_df)

    # Create a NearestNeighbors model
    knn = NearestNeighbors(n_neighbors=10, metric='minkowski')

    # Fit the feature vectors to the knn model
    knn.fit(feature_vectors)

    # Find the nearest neighbors for the non-existing phone specs
    distances, indices = knn.kneighbors(non_existing_feature_vectors)

    # Print top recommendation
    top_recommendation = df.iloc[indices[0]]
    return top_recommendation

# Example usage:
non_existing_phone_specs = {
    'storage_avg': '128',
    'price_range': '<10k',
}

get_top_recommendation(non_existing_phone_specs)[:10]





Unnamed: 0,phone,brand,colors,model,battery_size,battery_endurance,weight,os,release,screen_size,...,price_max,lazada_ratings,lazada_reviews,build_material,has_wifi,has_bluetooth,has_cardslot,details,price_avg,price_range
37,blackview_a95-11572,Blackview,Black|Gold|Blue,A95,4380,90.29325392799404,195.0,Android,2022,6.53,...,11447.5,5.0,8,Glass|plastic|plastic,b|g|n|ac,4.2,Yes,//www.lazada.com.ph/products/global-version-bl...,7923.25,<10k
55,blackview_oscal_c70-12267,Blackview,Dim Forest Green|Summer Sky Blue|Stardust Grey,Oscal C70,5180,106.78517245365504,192.0,Android,2023,6.56,...,9999.0,0.0,0,Glass|plastic|plastic,a|b|g|n|ac,5.0,Yes,//www.lazada.com.ph/products/blackview-oscal-c...,7409.5,<10k
60,coolpad_cool_20-11620,Coolpad,Black|Blue|White,Cool 20,4500,92.76704170684316,199.6,Android,2021,6.52,...,11698.0,5.0,7,Glass|plastic|plastic,a|b|g|n|ac,5.0,Yes,//www.lazada.com.ph/products/2021-coolpad-cool...,8560.25,<10k
32,blackview_a100-11571,Blackview,Gray|Blue|Green|Pink,A100,4680,96.47772337511692,195.0,Android,2021,6.67,...,10845.0,0.0,0,Glass|plastic|plastic,a|b|g|n|ac,4.2,Yes,//www.lazada.com.ph/products/blackview-a100-12...,9803.5,<10k
98,doogee_x97_pro-11815,Doogee,Black|Blue|Green,X97 Pro,4200,86.5825722597203,205.747181372549,Android,2022,6.0,...,12000.0,0.0,0,Glass|plastic|plastic,a|b|g|n|ac,5.0,Yes,//www.lazada.com.ph/products/doogee-x97-pro-4g...,8649.5,<10k
56,blackview_oscal_c80-11872,Blackview,Early Sunny Snow|Midnight Black|Navy Blue,Oscal C80,5180,106.78517245365504,188.0,Android,2022,6.5,...,12237.0,0.0,0,Glass|plastic|plastic,a|b|g|n|ac,5.0,Yes,//www.lazada.com.ph/products/blackview-oscal-c...,9682.005,<10k
34,blackview_a50-11588,Blackview,Black|Green|Blue|Gradient,A50,4280,88.2317641122864,156.0,Android,2022,6.01,...,7230.0,0.0,0,Glass|plastic|plastic,b|g|n|ac,5.0,Yes,//www.lazada.com.ph/products/blackview-a50-3gb...,6364.5,<10k
101,google_pixel_4-9896,Google,Clearly White|Just Black|Oh So Orange,Pixel 4,2800,62.0,162.0,Android,2019,5.7,...,9500.0,4.754716981132075,53,Glass|Glass|glass|Glass|aluminum,a|b|g|n|ac,5.0,No,//www.lazada.com.ph/products/google-pixel-4-an...,7656.5205,<10k
59,coolpad_cool_20+-12421,Coolpad,Black|Blue,Cool 20+,4500,92.76704170684316,197.1,Android,2023,6.52,...,7832.5,0.0,0,Glass|glass,a|b|g|n|ac,5.0,Yes,//www.lazada.com.ph/products/coolpad-cool-20-h...,6765.75,<10k
27,asus_rog_phone_6d_ultimate-11880,Asus,Space Gray,ROG Phone 6D Ultimate,6000,103.0,247.0,Android,2022,6.78,...,15000.0,0.0,0,Glass|Glass|glass|Glass|aluminum,a|b|g|n|ac|6e,5.3,No,//www.lazada.com.ph/products/asus-rog-phone-6d...,7555.9,<10k
