In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [3]:
data_path = '../data/smartwatches.csv'
df = pd.read_csv(data_path)
df.drop_duplicates(inplace=True)
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)
train_df = train_df.reset_index()
test_df = test_df.reset_index()

In [4]:
df.shape, train_df.shape, test_df.shape

((450, 16), (360, 17), (90, 17))

In [5]:
train_df.head()

Unnamed: 0.1,index,Unnamed: 0,Brand,Current Price,Original Price,Discount Percentage,Rating,Number OF Ratings,Model Name,Dial Shape,Strap Color,Strap Material,Touchscreen,Battery Life (Days),Bluetooth,Display Size,Weight
0,407,407,zebronics,2949.0,4199.0,29.768993,3.1,10.0,LEATHER fit-650,Oval,Black,Leather,Yes,,Yes,2.7 inches,
1,444,444,fire-boltt,6999.0,15999.0,56.253516,2.6,,bsw020,Circle,Brown,Silicon,Yes,8.0,Yes,1.4 inches,
2,117,117,fire-boltt,2499.0,11999.0,79.173264,4.1,1990.0,BSW070,,,,Yes,8.0,Yes,1.9 inches,75g +
3,30,30,boat,1999.0,7990.0,74.981227,3.6,827.0,,,,,,8.0,Yes,,75g +
4,415,415,fire-boltt,2299.0,5999.0,61.676946,2.5,24.0,NINJA PRO MAX,Square,Blue,Silicon,Yes,22.0,Yes,1.6 inches,


In [7]:
train_df.drop(['index', 'Unnamed: 0'], axis=1, inplace=True)

In [8]:
train_df.head()

Unnamed: 0,Brand,Current Price,Original Price,Discount Percentage,Rating,Number OF Ratings,Model Name,Dial Shape,Strap Color,Strap Material,Touchscreen,Battery Life (Days),Bluetooth,Display Size,Weight
0,zebronics,2949.0,4199.0,29.768993,3.1,10.0,LEATHER fit-650,Oval,Black,Leather,Yes,,Yes,2.7 inches,
1,fire-boltt,6999.0,15999.0,56.253516,2.6,,bsw020,Circle,Brown,Silicon,Yes,8.0,Yes,1.4 inches,
2,fire-boltt,2499.0,11999.0,79.173264,4.1,1990.0,BSW070,,,,Yes,8.0,Yes,1.9 inches,75g +
3,boat,1999.0,7990.0,74.981227,3.6,827.0,,,,,,8.0,Yes,,75g +
4,fire-boltt,2299.0,5999.0,61.676946,2.5,24.0,NINJA PRO MAX,Square,Blue,Silicon,Yes,22.0,Yes,1.6 inches,


In [9]:
train_df['Display Size'].isna().sum()

27

In [11]:
train_df['Display Size'].value_counts().count()

34

In [12]:
train_df['Display Size'].fillna('0.0 inches', inplace=True)

In [13]:
train_df['Display Size'].isna().sum()

0

In [16]:
train_df['Display Size'] = train_df['Display Size'].apply(lambda x: float(x.split()[0]))

In [17]:
train_df.head()

Unnamed: 0,Brand,Current Price,Original Price,Discount Percentage,Rating,Number OF Ratings,Model Name,Dial Shape,Strap Color,Strap Material,Touchscreen,Battery Life (Days),Bluetooth,Display Size,Weight
0,zebronics,2949.0,4199.0,29.768993,3.1,10.0,LEATHER fit-650,Oval,Black,Leather,Yes,,Yes,2.7,
1,fire-boltt,6999.0,15999.0,56.253516,2.6,,bsw020,Circle,Brown,Silicon,Yes,8.0,Yes,1.4,
2,fire-boltt,2499.0,11999.0,79.173264,4.1,1990.0,BSW070,,,,Yes,8.0,Yes,1.9,75g +
3,boat,1999.0,7990.0,74.981227,3.6,827.0,,,,,,8.0,Yes,0.0,75g +
4,fire-boltt,2299.0,5999.0,61.676946,2.5,24.0,NINJA PRO MAX,Square,Blue,Silicon,Yes,22.0,Yes,1.6,


In [18]:
train_df['Display Size'].replace(0.0, np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Display Size'].replace(0.0, np.nan, inplace=True)


In [19]:
train_df['Display Size'].isna().sum()

28

In [20]:
train_df['Weight'].head()

0      NaN
1      NaN
2    75g +
3    75g +
4      NaN
Name: Weight, dtype: object

In [21]:
train_df['Weight'].value_counts()

Weight
20 - 35 g    63
75g +        58
35 - 50 g    45
<= 20 g      30
50 - 75 g    15
Name: count, dtype: int64

In [35]:
def cal(format):
    cal = sum(int(x) for x in re.findall('\d+', format)) / 2
    train_df['Weight'].replace(format, cal, inplace=True)
    


In [36]:
cal('20 - 35 g')
cal('35 - 50 g')
cal('50 - 75 g')

In [39]:
train_df['Weight'].replace('75g +', float(re.findall('\d+', '75g +')[0]), inplace=True)

In [42]:
train_df['Weight'].replace('<= 20 g', float(re.findall('\d+', '<= 20 g')[0]), inplace=True)

In [43]:
train_df['Weight'].value_counts()

Weight
27.5    63
75.0    58
42.5    45
20.0    30
62.5    15
Name: count, dtype: int64

In [44]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Brand                360 non-null    object 
 1   Current Price        354 non-null    float64
 2   Original Price       304 non-null    float64
 3   Discount Percentage  304 non-null    float64
 4   Rating               356 non-null    float64
 5   Number OF Ratings    315 non-null    float64
 6   Model Name           330 non-null    object 
 7   Dial Shape           260 non-null    object 
 8   Strap Color          260 non-null    object 
 9   Strap Material       304 non-null    object 
 10  Touchscreen          329 non-null    object 
 11  Battery Life (Days)  330 non-null    float64
 12  Bluetooth            355 non-null    object 
 13  Display Size         332 non-null    float64
 14  Weight               211 non-null    float64
dtypes: float64(8), object(7)
memory usage: 4

In [45]:
train_df.head()

Unnamed: 0,Brand,Current Price,Original Price,Discount Percentage,Rating,Number OF Ratings,Model Name,Dial Shape,Strap Color,Strap Material,Touchscreen,Battery Life (Days),Bluetooth,Display Size,Weight
0,zebronics,2949.0,4199.0,29.768993,3.1,10.0,LEATHER fit-650,Oval,Black,Leather,Yes,,Yes,2.7,
1,fire-boltt,6999.0,15999.0,56.253516,2.6,,bsw020,Circle,Brown,Silicon,Yes,8.0,Yes,1.4,
2,fire-boltt,2499.0,11999.0,79.173264,4.1,1990.0,BSW070,,,,Yes,8.0,Yes,1.9,75.0
3,boat,1999.0,7990.0,74.981227,3.6,827.0,,,,,,8.0,Yes,,75.0
4,fire-boltt,2299.0,5999.0,61.676946,2.5,24.0,NINJA PRO MAX,Square,Blue,Silicon,Yes,22.0,Yes,1.6,
