In [2]:
import numpy as np
import pandas as pd

In [3]:
smartphones = pd.read_csv("smartphones.csv")
smartphones.head()

Unnamed: 0,model,price,rating,sim,processor,ram,battery,display,camera,card,os
0,OnePlus 11 5G,"₹54,999",89.0,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC","Snapdragon 8 Gen2, Octa Core, 3.2 GHz Processor","12 GB RAM, 256 GB inbuilt",5000 mAh Battery with 100W Fast Charging,"6.7 inches, 1440 x 3216 px, 120 Hz Display wit...",50 MP + 48 MP + 32 MP Triple Rear & 16 MP Fron...,Memory Card Not Supported,Android v13
1,OnePlus Nord CE 2 Lite 5G,"₹19,989",81.0,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi","Snapdragon 695, Octa Core, 2.2 GHz Processor","6 GB RAM, 128 GB inbuilt",5000 mAh Battery with 33W Fast Charging,"6.59 inches, 1080 x 2412 px, 120 Hz Display wi...",64 MP + 2 MP + 2 MP Triple Rear & 16 MP Front ...,"Memory Card (Hybrid), upto 1 TB",Android v12
2,Samsung Galaxy A14 5G,"₹16,499",75.0,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi","Exynos 1330, Octa Core, 2.4 GHz Processor","4 GB RAM, 64 GB inbuilt",5000 mAh Battery with 15W Fast Charging,"6.6 inches, 1080 x 2408 px, 90 Hz Display with...",50 MP + 2 MP + 2 MP Triple Rear & 13 MP Front ...,"Memory Card Supported, upto 1 TB",Android v13
3,Motorola Moto G62 5G,"₹14,999",81.0,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi","Snapdragon 695, Octa Core, 2.2 GHz Processor","6 GB RAM, 128 GB inbuilt",5000 mAh Battery with Fast Charging,"6.55 inches, 1080 x 2400 px, 120 Hz Display wi...",50 MP + 8 MP + 2 MP Triple Rear & 16 MP Front ...,"Memory Card (Hybrid), upto 1 TB",Android v12
4,Realme 10 Pro Plus,"₹24,999",82.0,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi","Dimensity 1080, Octa Core, 2.6 GHz Processor","6 GB RAM, 128 GB inbuilt",5000 mAh Battery with 67W Fast Charging,"6.7 inches, 1080 x 2412 px, 120 Hz Display wit...",108 MP + 8 MP + 2 MP Triple Rear & 16 MP Front...,Memory Card Not Supported,Android v13


In [4]:
df = smartphones.copy()

## Data Assessing

### Quality Issues

1. **model** - Some brands are written differently (e.g., OPPO) - `consistency`
2. **price** - Contains unnecessary '₹' symbol - `validity`
3. **price** - Contains commas between numbers - `validity`
4. **price** - Phone Namotel has a price of 99 - `accuracy`
5. **ratings** - Missing values - `completeness`
6. **processor** - Incorrect values for some Samsung phones (rows: 642, 647, 649, 659, 667, 701, 750, 759, 819, 859, 883, 884, 919, 927, 929, 932, 1002) - `validity`
7. **model** - Contains an iPod entry (row 756) - `validity`
8. **memory** - Incorrect values in rows (441, 485, 534, 553, 584, 610, 613, 642, 647, 649, 659, 667, 701, 750, 759, 819, 859, 884, 919, 927, 929, 932, 990, 1002) - `validity`
9. **battery** - Incorrect values in rows (113, 151, 309, 365, 378, 441, 450, 553, 584, 610, 613, 630, 642, 647, 649, 659, 667, 701, 750, 756, 759, 764, 819, 855, 859, 884, 915, 916, 927, 929, 932, 990, 1002) - `validity`
10. **display** - Missing frequency information - `completeness`
11. **display** - Incorrect values in rows (378, 441, 450, 553, 584, 610, 613, 630, 642, 647, 649, 659, 667, 701, 750, 759, 764, 819, 859, 884, 915, 916, 927, 929, 932, 990, 1002) - `validity`
12. **model** - Foldable phone information is scattered - `validity`
13. **camera** - Uses terms like Dual, Triple, and Quad to represent the number of cameras; front and rear cameras are separated by '&' - `validity`
14. **camera** - Issues with rows (100, 113, 151, 157, 161, 238, 273, 308, 309, 323, 324, 365, 367, 378, 394, 441, 450, 484, 506, 534, 553, 571, 572, 575, 584, 610, 613, 615, 630, 642, 647, 649, 659, 667, 684, 687, 705, 711, 723, 728, 750, 756, 759, 764, 792, 819, 846, 854, 855, 858, 883, 884, 896, 915, 916, 927, 929, 932, 945, 956, 990, 995, 1002, 1016) - `validity`
15. **card** - Sometimes contains information about OS and camera - `validity`
16. **os** - Sometimes contains information about Bluetooth and FM radio - `validity`
17. **os** - Issues with rows (324, 378) - `validity`
18. **os** - Sometimes contains OS version names like Lollipop - `consistency`
19. **camera**, **card**, **os** - Missing values - `completeness`
20. **price**, **rating** - Incorrect data types - `validity`

### Tidiness Issues

1. **sim** - Can be split into three columns: `has_5G`, `has_NFC`, `has_IR_Blaster`
2. **ram** - Can be split into two columns: `RAM` and `ROM`
3. **processor** - Can be split into `processor_name`, `cores`, and `cpu_speed`
4. **battery** - Can be split into `battery_capacity` and `fast_charging_available`
5. **display** - Can be split into `size`, `resolution_width`, `resolution_height`, and `frequency`
6. **camera** - Can be split into `front_camera` and `rear_camera`
7. **card** - Can be split into `supported` and `extended_upto`


In [5]:
# Handling price feature
df['price'] = df['price'].str.replace("₹", "").str.replace(",", "").astype(np.int64)
df['price'].sample(5)

937     9999
648    16685
546    20990
406    16999
314    16999
Name: price, dtype: int64

In [6]:
# In CSV file, index is being started from 2 but in this notebook(dataframe) it is from 0 - For the consistency
df = df.reset_index()
df['index'] = df['index'] + 2

In [7]:
# Handling non - smartphones phones

# The smartphones with some kind of erros
processor_rows = set((642,647,649,659,667,701,750,759,819,859,883,884,919,927,929,932,1002))
ram_rows = set((441,485,534,553,584,610,613,642,647,649,659,667,701,750,759,819,859,884,919,927,929,932,990,1002))
battery_rows = set((113,151,309,365,378,441,450,553,584,610,613,630,642,647,649,659,667,701,750,756,759,764,819,855,859,884,915,916,927,929,932,990,1002))
display_rows = set((378,441,450,553,584,610,613,630,642,647,649,659,667,701,750,759,764,819,859,884,915,916,927,929,932,990,1002))
camera_rows = set((100,113,151,157,161,238,273,308,309,323,324,365,367,378,394,441,450,484,506,534,553,571,572,575,584,610,613,615,630,642,647,649,659,667,684,687,705,711,723,728,750,756,759,764,792,819,846,854,855,858,883,884,896,915,916,927,929,932,945,956,990,995,1002,1016 ))

In [8]:
# Below are the records which contains certain error
df[df['index'].isin(processor_rows | ram_rows | battery_rows | display_rows | camera_rows)]

Unnamed: 0,index,model,price,rating,sim,processor,ram,battery,display,camera,card,os
98,100,Vivo X Fold 5G,106990,,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC","Snapdragon 8 Gen1, Octa Core, 3 GHz Processor","12 GB RAM, 256 GB inbuilt",4600 mAh Battery with 66W Fast Charging,"8.03 inches, 1916 x 2160 px, 120 Hz Display",Foldable Display,50 MP Quad Rear & 16 MP Front Camera,Android v12
111,113,Apple iPhone 12,51999,74.0,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC","Bionic A14, Hexa Core, 3.1 GHz Processor","4 GB RAM, 64 GB inbuilt","6.1 inches, 1170 x 2532 px Display with Large ...",12 MP + 12 MP Dual Rear & 12 MP Front Camera,Memory Card Not Supported,iOS v14,No FM Radio
149,151,Apple iPhone 12 Mini,40999,74.0,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC","Bionic A14, Hexa Core, 3.1 GHz Processor","4 GB RAM, 64 GB inbuilt","5.4 inches, 1080 x 2340 px Display",12 MP + 12 MP Dual Rear & 12 MP Front Camera,Memory Card Not Supported,iOS v14,No FM Radio
155,157,Nokia 2780 Flip,4990,,"Dual Sim, 3G, 4G, Wi-Fi","Snapdragon QM215, Quad Core, 1.3 GHz Processor","4 GB RAM, 512 MB inbuilt",1450 mAh Battery,"2.7 inches, 240 x 320 px Display",Dual Display,5 MP Rear Camera,"Memory Card Supported, upto 32 GB"
159,161,Oppo Find N2 5G,94990,,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC","Snapdragon 8+ Gen1, Octa Core, 3.2 GHz Processor","12 GB RAM, 256 GB inbuilt",4520 mAh Battery with 67W Fast Charging,"7.1 inches, 1792 x 1920 px, 120 Hz Display wit...","Foldable Display, Dual Display",50 MP + 48 MP + 32 MP Triple Rear & 32 MP + 32...,Memory Card Not Supported
...,...,...,...,...,...,...,...,...,...,...,...,...
954,956,Vivo X Fold 5G (12GB RAM + 512GB),118990,,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC","Snapdragon 8 Gen1, Octa Core, 3 GHz Processor","12 GB RAM, 512 GB inbuilt",4600 mAh Battery with 66W Fast Charging,"8.03 inches, 1916 x 2160 px, 120 Hz Display",Foldable Display,50 MP Quad Rear & 16 MP Front Camera,Android v12
988,990,Nokia 5310 Dual Sim,3399,,Dual Sim,"8 MB RAM, 16 MB inbuilt",1200 mAh Battery,"2.4 inches, 240 x 320 px Display",0.3 MP Rear Camera,"Memory Card Supported, upto 32 GB",Bluetooth,Browser
993,995,Huawei Mate X,169000,,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC, IR Bl...","Kirin 990, Octa Core, 2.86 GHz Processor","8 GB RAM, 512 GB inbuilt",4500 mAh Battery with 55W Fast Charging,"8 inches, 2200 x 2480 px Display",Foldable Display,48 MP Quad Rear Camera,"Memory Card (Hybrid), upto 256 GB"
1000,1002,XTouch F40 Flip,1999,,Dual Sim,No 3G,No Wifi,"32 MB RAM, 32 MB inbuilt",800 mAh Battery,"1.77 inches, 240 x 320 px Display",Dual Display,1.3 MP Rear Camera


In [9]:
# These are the records which contais error in all of its columns - Basically "Shifting Error" reflected directly from website
df[df['index'].isin(processor_rows & ram_rows & battery_rows & display_rows & camera_rows)]

# These are feature phones and not the smartphones - so we can remove them

Unnamed: 0,index,model,price,rating,sim,processor,ram,battery,display,camera,card,os
640,642,Nokia 105 Plus,1299,,Dual Sim,"4 MB RAM, 4 MB inbuilt",800 mAh Battery,"1.77 inches, 128 x 160 px Display",No Rear Camera,"Memory Card Supported, upto 32 GB",Bluetooth,
645,647,Nokia 2760 Flip,5490,,"Dual Sim, 3G, 4G, Wi-Fi",1450 mAh Battery,"3.6 inches, 240 x 320 px Display",5 MP Rear & 5 MP Front Camera,"Memory Card Supported, upto 32 GB",Kaios v3.0,Bluetooth,
647,649,Motorola Moto A10,1339,,Dual Sim,"4 MB RAM, 4 MB inbuilt",1750 mAh Battery,"1.8 inches, 160 x 128 px Display",No Rear Camera,"Memory Card Supported, upto 32 GB",,
657,659,Zanco Tiny T1,2799,,Single Sim,"32 MB RAM, 32 MB inbuilt",200 mAh Battery,"0.49 inches, 64 x 32 px Display",No Rear Camera,No FM Radio,Bluetooth,
665,667,itel it2163S,958,,Dual Sim,"4 MB RAM, 4 MB inbuilt",1200 mAh Battery,"1.8 inches, 160 x 128 px Display",No Rear Camera,"Memory Card Supported, upto 32 GB",Bluetooth,
748,750,Nokia 400 4G,3290,,"Dual Sim, 4G, VoLTE, Wi-Fi",2000 mAh Battery,"2.4 inches, 240 x 320 px Display",0.3 MP Rear & 0.3 MP Front Camera,"Memory Card Supported, upto 64 GB",Bluetooth,Browser,
757,759,Karbonn KU3i,995,,Dual Sim,"52 MB RAM, 32 MB inbuilt",1000 mAh Battery,"1.8 inches, 128 x 160 px Display",No Rear Camera,"Memory Card Supported, upto 16 GB",Bluetooth,
817,819,itel Magic X,2239,,"Dual Sim, 3G, 4G, VoLTE, Wi-Fi",No 3G,T117,"48 MB RAM, 128 MB inbuilt",1200 mAh Battery,"2.4 inches, 240 x 320 px Display",1.3 MP Rear Camera,"Memory Card Supported, upto 64 GB"
882,884,Nokia 5710 XpressAudio,4799,,"Dual Sim, 3G, 4G",No Wifi,Unisoc T107,"48 MB RAM, 128 MB inbuilt",1450 mAh Battery,"2.4 inches, 240 x 320 px Display",0.3 MP Rear Camera,"Memory Card Supported, upto 32 GB"
925,927,Nokia 3310 4G,3999,,"Dual Sim, 3G, 4G, VoLTE, Wi-Fi","256 MB RAM, 512 MB inbuilt",1200 mAh Battery,"2.4 inches, 240 x 320 px Display",2 MP Rear Camera,"Memory Card Supported, upto 32 GB",Bluetooth,Browser


In [10]:
# The range of price of all the smartphones are above 3400
df = df[df['price'] >= 3400]
df.sample(5)

Unnamed: 0,index,model,price,rating,sim,processor,ram,battery,display,camera,card,os
783,785,Vivo V23e,26990,82.0,"Dual Sim, 3G, 4G, VoLTE, Wi-Fi","Helio G96, Octa Core, 2.05 GHz Processor","8 GB RAM, 128 GB inbuilt",4050 mAh Battery with 44W Fast Charging,"6.44 inches, 1080 x 2400 px Display with Water...",64 MP + 8 MP + 2 MP Triple Rear & 50 MP Front ...,Memory Card Not Supported,Android v11
51,53,iQOO Neo 7 5G,29999,82.0,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC, IR Bl...","Dimensity 8200, Octa Core, 3.1 GHz Processor","8 GB RAM, 128 GB inbuilt",5000 mAh Battery with 120W Fast Charging,"6.78 inches, 1080 x 2400 px, 120 Hz Display wi...",64 MP + 2 MP + 2 MP Triple Rear & 16 MP Front ...,Android v13,No FM Radio
993,995,Huawei Mate X,169000,,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC, IR Bl...","Kirin 990, Octa Core, 2.86 GHz Processor","8 GB RAM, 512 GB inbuilt",4500 mAh Battery with 55W Fast Charging,"8 inches, 2200 x 2480 px Display",Foldable Display,48 MP Quad Rear Camera,"Memory Card (Hybrid), upto 256 GB"
475,477,Samsung Galaxy A82 5G,39990,86.0,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC","Snapdragon 855+, Octa Core, 2.96 GHz Processor","6 GB RAM, 128 GB inbuilt",4500 mAh Battery with 25W Fast Charging,"6.71 inches, 1440 x 3200 px, 120 Hz Display wi...",64 MP + 12 MP + 5 MP Triple Rear & 10 MP Front...,"Memory Card Supported, upto 1 TB",Android v11
189,191,LeEco S1 Pro,10999,65.0,"Dual Sim, 3G, 4G, VoLTE, Wi-Fi","Tiger T7150, Quad Core, 1.8 GHz Processor","8 GB RAM, 128 GB inbuilt",5000 mAh Battery with 10W Fast Charging,"6.5 inches, 720 x 1600 px Display with Small N...",13 MP + Depth Sensor Dual Rear & 5 MP Front Ca...,Android v11,No FM Radio


In [11]:
# Handling processor feature
df[df['index'].isin(processor_rows)]

Unnamed: 0,index,model,price,rating,sim,processor,ram,battery,display,camera,card,os
645,647,Nokia 2760 Flip,5490,,"Dual Sim, 3G, 4G, Wi-Fi",1450 mAh Battery,"3.6 inches, 240 x 320 px Display",5 MP Rear & 5 MP Front Camera,"Memory Card Supported, upto 32 GB",Kaios v3.0,Bluetooth,
857,859,LG Folder 2,11999,,"Single Sim, 3G, 4G, Wi-Fi","1 GB RAM, 8 GB inbuilt",1470 mAh Battery,"2.8 inches, 240 x 320 px Display",2 MP Rear Camera,Memory Card Supported,Bluetooth,
882,884,Nokia 5710 XpressAudio,4799,,"Dual Sim, 3G, 4G",No Wifi,Unisoc T107,"48 MB RAM, 128 MB inbuilt",1450 mAh Battery,"2.4 inches, 240 x 320 px Display",0.3 MP Rear Camera,"Memory Card Supported, upto 32 GB"
925,927,Nokia 3310 4G,3999,,"Dual Sim, 3G, 4G, VoLTE, Wi-Fi","256 MB RAM, 512 MB inbuilt",1200 mAh Battery,"2.4 inches, 240 x 320 px Display",2 MP Rear Camera,"Memory Card Supported, upto 32 GB",Bluetooth,Browser


In [12]:
# Since all these phones are feature phones, we can drop them
df = df.drop(index = [645, 857, 882, 925])
df[df['index'].isin(processor_rows)]

Unnamed: 0,index,model,price,rating,sim,processor,ram,battery,display,camera,card,os


In [13]:
# Handling RAM feature
df[df['index'].isin(ram_rows)]

Unnamed: 0,index,model,price,rating,sim,processor,ram,battery,display,camera,card,os
439,441,Apple iPhone SE 3 2022,43900,,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC","Bionic A15, Hexa Core, 3.22 GHz Processor",64 GB inbuilt,"4.7 inches, 750 x 1334 px Display",12 MP Rear & 7 MP Front Camera,Memory Card Not Supported,iOS v15,No FM Radio
483,485,Huawei Mate 50 RS Porsche Design,239999,81.0,"Dual Sim, 3G, 4G, VoLTE, Wi-Fi, NFC, IR Blaster","Snapdragon 8+ Gen1, Octa Core, 3.2 GHz Processor",512 GB inbuilt,4700 mAh Battery with 66W Fast Charging,"6.74 inches, 1212 x 2616 px, 120 Hz Display",50 MP + 48 MP + 13 MP Triple Rear & 13 MP Fron...,"Memory Card (Hybrid), upto 256 GB",Hongmeng OS v3.0
582,584,Nokia 8210 4G,3749,,"Dual Sim, 3G, 4G",No Wifi,Unisoc T107,"48 MB RAM, 128 MB inbuilt",1450 mAh Battery,"2.8 inches, 240 x 320 px Display",0.3 MP Rear Camera,"Memory Card Supported, upto 32 GB"


In [14]:
# Nokia 8210 4G is a feature phone - lets drop it
df = df.drop(index = [582])

# Rest 2 phones are smart phone but the RAM is not mentioned in the RAM column, only ROM is mentioned - we will handle them while splitting the columns into RAM and ROM

In [15]:
df = df.drop([376,754]) # Dropping one IPod and one nokia feature phone

In [16]:
# All the IPhones have missing values in the battery column but they have extra feature - FM radio -> So that we have to right shift the corrosponding data by 1 place
right_shift_df = df[df['index'].isin(battery_rows)].copy()
df.loc[df['index'].isin(battery_rows), df.columns[7:]] = right_shift_df.iloc[:, 7:].shift(1, axis=1)
# df.loc[df['index'].isin(battery_rows)].iloc[:, 7:] = right_shift_df.iloc[:, 7:].shift(1, axis=1) -> This will create a copy

In [17]:
df.loc[df['index'].isin(battery_rows)].iloc[:, 7:]

Unnamed: 0,battery,display,camera,card,os
111,,"6.1 inches, 1170 x 2532 px Display with Large ...",12 MP + 12 MP Dual Rear & 12 MP Front Camera,Memory Card Not Supported,iOS v14
149,,"5.4 inches, 1080 x 2340 px Display",12 MP + 12 MP Dual Rear & 12 MP Front Camera,Memory Card Not Supported,iOS v14
307,,"6.1 inches, 1170 x 2532 px Display with Large ...",12 MP + 12 MP Dual Rear & 12 MP Front Camera,Memory Card Not Supported,iOS v14
363,,"5.4 inches, 1080 x 2340 px Display",12 MP + 12 MP Dual Rear & 12 MP Front Camera,Memory Card Not Supported,iOS v14
439,,"4.7 inches, 750 x 1334 px Display",12 MP Rear & 7 MP Front Camera,Memory Card Not Supported,iOS v15
448,,"6.06 inches, 1170 x 2532 px, 120 Hz Display wi...",50 MP + 12 MP + 12 MP Triple Rear & 12 MP Fron...,Memory Card Not Supported,iOS v15
628,,"6.1 inches, 1170 x 2532 px Display with Large ...",12 MP + 12 MP + 12 MP Triple Rear & 12 MP Fron...,Memory Card Not Supported,iOS v14.0
762,,"6.1 inches, 750 x 1580 px Display",12 MP Rear & 10.8 MP Front Camera,Memory Card Not Supported,iOS v16
853,,"6.1 inches, 1170 x 2532 px Display with Large ...",12 MP + 12 MP + 12 MP Triple Rear & 12 MP Fron...,Memory Card Not Supported,iOS v14.0
913,,"5.4 inches, 1080 x 2340 px Display",12 MP + 12 MP Dual Rear & 12 MP Front Camera,Memory Card Not Supported,iOS v14


In [18]:
df = df.drop(index = [155, 271]) # Removing feature phones

# The problem in camera feature is some values are actual values like 12MP.. and others are Dual Camera Quad Camera...
camera_records_df = df[df['index'].isin(camera_rows)]

In [19]:
# Based on MPwe can filter and left shift the data
new_df = camera_records_df.loc[~ camera_records_df['camera'].str.contains("MP"), ['camera', 'card', 'os']].shift(-1, axis = 1)
df.loc[new_df.index, new_df.columns] = new_df 

In [20]:
df.loc[new_df.index, new_df.columns] # By left shifting, The os values are shifted to card column

Unnamed: 0,camera,card,os
98,50 MP Quad Rear & 16 MP Front Camera,Android v12,
159,50 MP + 48 MP + 32 MP Triple Rear & 32 MP + 32...,Memory Card Not Supported,
236,50 MP + 13 MP + 8 MP Triple Rear & 20 MP Front...,Android v12,
306,12 MP + 12 MP Dual Rear & 10 MP Front Camera,Memory Card Not Supported,
321,50 MP + 12 MP + 10 MP Triple Rear & 10 MP + 4 ...,Android v12,
322,64 MP + 16 MP + 8 MP Triple Rear & 32 MP Front...,"Memory Card Supported, upto 256 GB",
365,50 MP + 8 MP Dual Rear & 32 MP Front Camera,Memory Card Not Supported,
392,50 MP + 8 MP Dual Rear & 32 MP Front Camera,Memory Card Not Supported,
482,50 MP Quad Rear & 16 MP + 16 MP Dual Front Camera,Android v12,
504,12 MP + 12 MP Dual Rear & 10 MP Front Camera,Memory Card Not Supported,


In [21]:
df['card'].value_counts()

card
Memory Card Supported, upto 1 TB                         171
Memory Card Not Supported                                129
Android v12                                              114
Memory Card Supported, upto 512 GB                       105
Memory Card (Hybrid), upto 1 TB                           91
Memory Card Supported, upto 256 GB                        89
Memory Card Supported                                     89
Android v13                                               46
Android v11                                               44
Memory Card (Hybrid)                                      31
Memory Card (Hybrid), upto 256 GB                         15
Android v10                                               12
Memory Card (Hybrid), upto 512 GB                         11
Memory Card Supported, upto 128 GB                         7
Memory Card Supported, upto 2 TB                           5
Memory Card Supported, upto 32 GB                          4
Memory Card (Hybrid

In [22]:
df.loc[df['card'].str.contains("MP"), 'camera'] = df.loc[df['card'].str.contains("MP"), 'card']
df.loc[df['card'].str.contains("MP"), 'card'] = np.NaN

In [23]:
df['card'] = df['card'].fillna("Undefined")

In [24]:
card_error_df = df.loc[~df['card'].str.contains("Memory"), ['card', 'os']]
df.loc[card_error_df.index, card_error_df.columns] = card_error_df.shift(1, axis = 1)

In [25]:
df.loc[card_error_df.index, card_error_df.columns]

Unnamed: 0,card,os
8,,Android v12
9,,Android v12
12,,Android v12
17,,Android v13
18,,Android v12
...,...,...
1011,,Android v12
1012,,Android v11
1013,,Android v13
1014,,Android v12


In [26]:
df['card'].value_counts()

card
Memory Card Supported, upto 1 TB       171
Memory Card Not Supported              129
Memory Card Supported, upto 512 GB     105
Memory Card (Hybrid), upto 1 TB         91
Memory Card Supported                   89
Memory Card Supported, upto 256 GB      89
Memory Card (Hybrid)                    31
Memory Card (Hybrid), upto 256 GB       15
Memory Card (Hybrid), upto 512 GB       11
Memory Card Supported, upto 128 GB       7
Memory Card Supported, upto 2 TB         5
Memory Card Supported, upto 32 GB        4
Memory Card (Hybrid), upto 128 GB        3
Memory Card (Hybrid), upto 64 GB         3
Memory Card (Hybrid), upto 2 TB          3
Memory Card Supported, upto 1000 GB      1
Name: count, dtype: int64

In [27]:
df['os'].value_counts().index

Index(['Android v12', 'Android v11', 'Android v13', 'Android v10',
       'Android v9.0 (Pie)', 'Android v10.0', 'iOS v16', 'iOS v15',
       'Android v8.1 (Oreo)', 'iOS v14', 'iOS v13', 'Android v8.0 (Oreo)',
       'Android v11.0', 'iOS v15.0', 'Harmony v2.0',
       'Android v6.0 (Marshmallow)', 'Android v5.1.1 (Lollipop)',
       'HarmonyOS v2.0', 'iOS v14.0', 'EMUI v12', 'Android v12.1',
       'RTOS (Series 30+)', 'Hongmeng OS v4.0', 'Android v7.1 (Nougat)',
       'Android', 'HarmonyOS', 'Android v4.4.2 (KitKat)', 'iOS v10',
       'Pragati OS (Powered by Android)', 'Bluetooth', 'Hongmeng OS v3.0',
       'Android v5.1 (Lollipop)', 'iOS v17', 'iOS v12.3', 'iOS v13.0',
       'HarmonyOS v2', 'Undefined', 'Android v9 (Pie)'],
      dtype='object', name='os')

In [28]:
df.loc[df['os'] == "Bluetooth", 'os'] = None

In [29]:
df.columns

Index(['index', 'model', 'price', 'rating', 'sim', 'processor', 'ram',
       'battery', 'display', 'camera', 'card', 'os'],
      dtype='object')

In [30]:
df['os'].value_counts()

os
Android v12                        393
Android v11                        274
Android v13                         91
Android v10                         69
Android v9.0 (Pie)                  29
Android v10.0                       23
iOS v16                             15
iOS v15                             12
Android v8.1 (Oreo)                 10
iOS v14                              6
Android v11.0                        4
iOS v13                              4
Android v8.0 (Oreo)                  4
iOS v15.0                            3
Android v5.1.1 (Lollipop)            2
Harmony v2.0                         2
HarmonyOS v2.0                       2
Android v6.0 (Marshmallow)           2
iOS v14.0                            2
EMUI v12                             2
Android v12.1                        2
RTOS (Series 30+)                    1
Hongmeng OS v4.0                     1
Android v7.1 (Nougat)                1
Android                              1
HarmonyOS             

In [31]:
# Resolving Tidiness Issues
# Creating a Brand Name from Model Column
df['brand'] = df['model'].str.split().apply(lambda x: x[0].lower())

In [32]:
df['brand'].unique()

array(['oneplus', 'samsung', 'motorola', 'realme', 'apple', 'xiaomi',
       'nothing', 'oppo', 'vivo', 'poco', 'iqoo', 'jio', 'gionee',
       'tecno', 'tesla', 'google', 'infinix', 'cola', 'letv', 'ikall',
       'leeco', 'duoqin', 'nokia', 'lava', 'honor', 'nubia', 'redmi',
       'asus', 'itel', 'royole', 'sony', 'oukitel', 'vertu', 'blu', 'lyf',
       'huawei', 'zte', 'lenovo', 'lg', 'micromax', 'leitz', 'cat',
       'doogee', 'tcl', 'sharp', 'blackview'], dtype=object)

In [33]:
# Handling sim column -> Creating 3 new columns (has_5G, has_IR_blasters, has_NFC) because those are the differentiate factors for the price of smartphones
df['has_5G'] = df['sim'].str.lower().str.contains("5g")
df['has_IR_blasters'] = df['sim'].str.lower().str.contains("ir blaster")
df['has_NFC'] = df['sim'].str.lower().str.contains("nfc")

# removing sim column
df = df.drop(columns = ['sim'])

In [34]:
pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.max_colwidth', None)  # Display full column width

In [35]:
# Handling processor column
null_processor_brand_df = df[df['processor'].str.strip().str.split(", ").str.get(0).str.contains('Core')]
df["processor_name"] = df['processor'].str.split(", ").str.get(0).str.split().str.get(0)
df.loc[df["index"].isin(null_processor_brand_df['index'].to_list()), 'processor_name'] = None

In [36]:
def encodeCores(core: str) -> int:
    if "Octa Core" in core or "Octa Core Processor" in core:
        return 8
    elif "Hexa Core" in core or "Hexa Core Processor" in core:
        return 6
    elif "Quad Core" in core:
        return 4
    else:
        return None  # Null Value

df['no_of_cores'] = df['processor'].map(encodeCores)

In [37]:
import re
def find_strength(processor: str) -> list:
    pattern = r'\b\d+(\.\d+)?\s*GHz\b'
    matches = re.findall(pattern, processor, re.IGNORECASE)
    complete_matches = [match.group() for match in re.finditer(pattern, processor, re.IGNORECASE)]
    try:
        return complete_matches[0]
    except IndexError:
        return None

df['processor_strength'] = df['processor'].map(find_strength)

In [None]:
df.loc[856, 'processor_name'] = "Mediatek MT6739" # Filling Missing Data -> (28 nm) to Mediatek MT6739

df = df.drop(columns = ['processor'])

In [49]:
# Handling RAM Column
unavailable_ram_data_df = df[df['ram'].str.split(", ").str.get(1).isnull()]
df['RAM'] = df['ram'].str.split(", ").str.get(0)
df.loc[unavailable_ram_data_df.index, 'RAM'] = None

In [None]:
df['internal_storage'] = df['ram'].str.split(", ").str.get(1)
df.loc[unavailable_ram_data_df.index, 'internal_storage'] = unavailable_ram_data_df['ram']

df = df.drop(columns = ['ram'])

In [52]:
df['internal_storage']

0       256 GB inbuilt
1       128 GB inbuilt
2        64 GB inbuilt
3       128 GB inbuilt
4       128 GB inbuilt
5       128 GB inbuilt
6       128 GB inbuilt
7       256 GB inbuilt
8       128 GB inbuilt
9       128 GB inbuilt
10      128 GB inbuilt
11      128 GB inbuilt
12      128 GB inbuilt
13      128 GB inbuilt
14      256 GB inbuilt
15      128 GB inbuilt
16       32 GB inbuilt
17      256 GB inbuilt
18      128 GB inbuilt
19       64 GB inbuilt
20      128 GB inbuilt
21      128 GB inbuilt
22       64 GB inbuilt
23      128 GB inbuilt
24      256 GB inbuilt
25      128 GB inbuilt
26      128 GB inbuilt
27      128 GB inbuilt
28      128 GB inbuilt
29      128 GB inbuilt
30      128 GB inbuilt
31      128 GB inbuilt
32      256 GB inbuilt
33       64 GB inbuilt
34      128 GB inbuilt
35      128 GB inbuilt
36      256 GB inbuilt
37      256 GB inbuilt
38      128 GB inbuilt
39      256 GB inbuilt
40      128 GB inbuilt
41      128 GB inbuilt
42      128 GB inbuilt
43      256

In [63]:
# Handling Battery Column
df['battery_capacity'] = df['battery'].str.strip().str.split(" with ").str.get(0).str.findall(r'\b(\d+)\b').str.get(0)

In [67]:
df.iloc[3, :]

index                                                                           5
model                                                        Motorola Moto G62 5G
price                                                                       14999
rating                                                                       81.0
battery                                       5000 mAh Battery with Fast Charging
display               6.55 inches, 1080 x 2400 px, 120 Hz Display with Punch Hole
camera                       50 MP + 8 MP + 2 MP Triple Rear & 16 MP Front Camera
card                                              Memory Card (Hybrid), upto 1 TB
os                                                                    Android v12
brand                                                                    motorola
has_5G                                                                       True
has_IR_blasters                                                             False
has_NFC         

In [71]:
df['charger_power'] = df['battery'].str.strip().str.split(" with ").str.get(1).str.split("W").str.get(0)
df.loc[df['charger_power'] == "Fast Charging", 'charger_power'] = None

In [73]:
df['charger_power'].unique()
df = df.drop(columns = ['battery'])

In [79]:
# Handling display column
df['screen_size'] = df['display'].str.strip().str.split(", ").str.get(0)

In [85]:
df['screen_resolution'] = df['display'].str.extract(r'(\d+ x \d+ px)')

In [89]:
df['refresh_rate'] = df['display'].str.extract(r'(\d+\s*Hz)')

In [92]:
df = df.drop(columns = ['display'])

In [111]:
# Handling camera feature -> No. front cameras and No. rear cameras
def camera_extractor(text):
  if 'Quad' in text:
    return '4'
  elif 'Triple' in text:
    return '3'
  elif 'Dual' in text:
    return '2'
  elif 'Missing' in text:
    return 'Missing'
  else:
    return '1'

# Number of Rear cameras
df['total_rear_cameras'] = df['camera'].str.split("&").str.get(0).apply(camera_extractor)

# Highest MP Rear camera
df['rear_max_camera'] = df['camera'].str.strip().str.findall(r'(\d+\s*)').str.get(0).astype(np.int32)

In [None]:
# Number of Front cameras
df['total_front_cameras'] = df['camera'].str.split("&").str.get(1).fillna("").apply(camera_extractor)

# Front max Camera
df['front_max_camera'] = df['camera'].str.split("&").str.get(1).fillna("").str.findall(r'(\d+\s*)').str.get(0).fillna(0).astype(np.int32)

df = df.drop(columns = ['camera'])

In [None]:
# Handling memory card column
df['supports_memory_card'] = df['card'].apply(lambda x: False if x == None or "Not" in x else True)

In [131]:
# Handling os column
def os_extractor(text):
    if 'Android' in text:
        return 'android'
    elif 'iOS' in text:
        return 'ios'
    elif 'Not Specified':
        return text
    elif 'Harmony' in text or 'Hongmeng' in text or 'EMUI' in text:
        return 'other'
    else:
        return 'other'
  

df['os'] = df['os'].fillna('other').apply(os_extractor)

In [133]:
df = df.drop(columns = ['index', 'model', 'card'])
df.head()

Unnamed: 0,price,rating,os,brand,has_5G,has_IR_blasters,has_NFC,processor_name,no_of_cores,processor_strength,RAM,internal_storage,battery_capacity,charger_power,screen_size,screen_resolution,refresh_rate,rear_max_camera,total_rear_cameras,total_front_cameras,front_max_camera,supports_memory_card
0,54999,89.0,android,oneplus,True,False,True,Snapdragon,8.0,3.2 GHz,12 GB RAM,256 GB inbuilt,5000,100.0,6.7 inches,1440 x 3216 px,120 Hz,50,3,1,16,False
1,19989,81.0,android,oneplus,True,False,False,Snapdragon,8.0,2.2 GHz,6 GB RAM,128 GB inbuilt,5000,33.0,6.59 inches,1080 x 2412 px,120 Hz,64,3,1,16,True
2,16499,75.0,android,samsung,True,False,False,Exynos,8.0,2.4 GHz,4 GB RAM,64 GB inbuilt,5000,15.0,6.6 inches,1080 x 2408 px,90 Hz,50,3,1,13,True
3,14999,81.0,android,motorola,True,False,False,Snapdragon,8.0,2.2 GHz,6 GB RAM,128 GB inbuilt,5000,,6.55 inches,1080 x 2400 px,120 Hz,50,3,1,16,True
4,24999,82.0,android,realme,True,False,False,Dimensity,8.0,2.6 GHz,6 GB RAM,128 GB inbuilt,5000,67.0,6.7 inches,1080 x 2412 px,120 Hz,108,3,1,16,False


In [137]:
df = df.reindex(
    columns = ['brand', 'os', 'rating', 'has_5G', 'has_IR_blasters', 'has_NFC', 'has_5G', 'has_IR_blasters', 'has_NFC', 'RAM', 'internal_storage', 'battery_capacity', 'charger_power', 'screen_size', 'screen_resolution', 'refresh_rate', 'rear_max_camera', 'total_rear_cameras', 'total_front_cameras', 'front_max_camera', 'supports_memory_card' ,'price']
).reset_index(drop = True)

In [138]:
df.to_csv('smartphone_cleaned.csv',index = False)