In [14]:
#import required libraries
import pandas as pd
import numpy as np
import warnings
#NOTE(review): this silences every warning for the rest of the session, including
#pandas deprecation / FutureWarning messages — consider narrowing the filter
warnings.filterwarnings('ignore')

In [15]:
#obtain the dataset from the url below
#the dataset is in csv format
#header=None tells pandas that the file has no header row: the first line is read
#as data and the columns get integer labels
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/laptop_pricing_dataset_base.csv"
df = pd.read_csv(url, header=None)

In [16]:
#give the columns descriptive names (the csv was loaded without a header row)
header = [
    "Manufacturer",
    "Category",
    "Screen",
    "GPU",
    "OS",
    "CPU_core",
    "Screen_Size_cm",
    "CPU_frequency",
    "RAM_GB",
    "Storage_GB_SSD",
    "Weight_kg",
    "Price",
]
df.columns = header
#preview the first 10 rows
df.head(10)

Unnamed: 0,Manufacturer,Category,Screen,GPU,OS,CPU_core,Screen_Size_cm,CPU_frequency,RAM_GB,Storage_GB_SSD,Weight_kg,Price
0,Acer,4,IPS Panel,2,1,5,35.56,1.6,8,256,1.6,978
1,Dell,3,Full HD,1,1,3,39.624,2.0,4,256,2.2,634
2,Dell,3,Full HD,1,1,7,39.624,2.7,8,256,2.2,946
3,Dell,4,IPS Panel,2,1,5,33.782,1.6,8,128,1.22,1244
4,HP,4,Full HD,2,1,7,39.624,1.8,8,256,1.91,837
5,Dell,3,Full HD,1,1,5,39.624,1.6,8,256,2.2,1016
6,HP,3,Full HD,3,1,5,39.624,1.6,8,256,2.1,1117
7,Acer,3,IPS Panel,2,1,5,38.1,1.6,4,256,2.2,866
8,Dell,3,Full HD,1,1,5,39.624,2.5,4,256,2.3,812
9,Acer,3,IPS Panel,3,1,7,38.1,1.8,8,256,2.2,1068


In [17]:
#the dataset marks missing entries with "?"; convert them to NaN so that
#pandas recognises them as missing values
df = df.replace('?', np.nan)
#preview the first 10 rows
df.head(10)

Unnamed: 0,Manufacturer,Category,Screen,GPU,OS,CPU_core,Screen_Size_cm,CPU_frequency,RAM_GB,Storage_GB_SSD,Weight_kg,Price
0,Acer,4,IPS Panel,2,1,5,35.56,1.6,8,256,1.6,978
1,Dell,3,Full HD,1,1,3,39.624,2.0,4,256,2.2,634
2,Dell,3,Full HD,1,1,7,39.624,2.7,8,256,2.2,946
3,Dell,4,IPS Panel,2,1,5,33.782,1.6,8,128,1.22,1244
4,HP,4,Full HD,2,1,7,39.624,1.8,8,256,1.91,837
5,Dell,3,Full HD,1,1,5,39.624,1.6,8,256,2.2,1016
6,HP,3,Full HD,3,1,5,39.624,1.6,8,256,2.1,1117
7,Acer,3,IPS Panel,2,1,5,38.1,1.6,4,256,2.2,866
8,Dell,3,Full HD,1,1,5,39.624,2.5,4,256,2.3,812
9,Acer,3,IPS Panel,3,1,7,38.1,1.8,8,256,2.2,1068


In [21]:
#for every column, count how many entries are missing (True) and present (False)
missing_data = df.isnull()
for col, is_missing in missing_data.items():
    print(is_missing.value_counts())
    print(" ")

Manufacturer
False    238
Name: count, dtype: int64
 
Category
False    238
Name: count, dtype: int64
 
Screen
False    238
Name: count, dtype: int64
 
GPU
False    238
Name: count, dtype: int64
 
OS
False    238
Name: count, dtype: int64
 
CPU_core
False    238
Name: count, dtype: int64
 
Screen_Size_cm
False    234
True       4
Name: count, dtype: int64
 
CPU_frequency
False    238
Name: count, dtype: int64
 
RAM_GB
False    238
Name: count, dtype: int64
 
Storage_GB_SSD
False    238
Name: count, dtype: int64
 
Weight_kg
False    233
True       5
Name: count, dtype: int64
 
Price
False    238
Name: count, dtype: int64
 


From the summary, we can see that two columns have missing data:
- Screen_Size_cm: 4
- Weight_kg: 5

Although "Screen_Size_cm" is numeric, it takes only a small set of standard values, so we treat it as categorical and replace the missing values with the most frequent value.

In [26]:
#find the most frequent value in the column "Screen_Size_cm"
#(value_counts() ignores NaN by default, so only observed values are counted)
freq_screen_size = df['Screen_Size_cm'].value_counts().idxmax()
#replace missing data with the most frequent screen size
#the result is assigned back to the column: calling replace(..., inplace=True) on
#df['Screen_Size_cm'] is chained assignment, which pandas 2.2 warns about and which
#no longer updates df once Copy-on-Write is enabled
df['Screen_Size_cm'] = df['Screen_Size_cm'].fillna(freq_screen_size)

The "Weight_kg" column contains continuous data. We will replace missing data with the average of the values.

In [30]:
#calculate the average of the column "Weight_kg"
#(cast to float first because the column has not been converted to a numeric dtype yet;
#mean() skips NaN by default)
avg_weight = df['Weight_kg'].astype('float').mean(axis=0)
#replace missing data with the average weight
#the result is assigned back to the column: calling replace(..., inplace=True) on
#df['Weight_kg'] is chained assignment, which pandas 2.2 warns about and which
#no longer updates df once Copy-on-Write is enabled
df['Weight_kg'] = df['Weight_kg'].fillna(avg_weight)

In [31]:
#check the data for missing values after handling missing data
missing_data = df.isnull()
for col in missing_data.columns:
  print(missing_data[col].value_counts())
  print(" ")
#fail loudly if any gaps are left instead of relying on reading the printout above
assert not missing_data.to_numpy().any(), (
    "missing values remain in: " + ", ".join(missing_data.columns[missing_data.any()])
)

Manufacturer
False    238
Name: count, dtype: int64
 
Category
False    238
Name: count, dtype: int64
 
Screen
False    238
Name: count, dtype: int64
 
GPU
False    238
Name: count, dtype: int64
 
OS
False    238
Name: count, dtype: int64
 
CPU_core
False    238
Name: count, dtype: int64
 
Screen_Size_cm
False    238
Name: count, dtype: int64
 
CPU_frequency
False    238
Name: count, dtype: int64
 
RAM_GB
False    238
Name: count, dtype: int64
 
Storage_GB_SSD
False    238
Name: count, dtype: int64
 
Weight_kg
False    238
Name: count, dtype: int64
 
Price
False    238
Name: count, dtype: int64
 


We can see that our dataset doesn't contain missing data now.

### Fixing the data types

In [32]:
#check the data type pandas assigned to each column
df.dtypes

Unnamed: 0,0
Manufacturer,object
Category,int64
Screen,object
GPU,int64
OS,int64
CPU_core,int64
Screen_Size_cm,object
CPU_frequency,float64
RAM_GB,int64
Storage_GB_SSD,int64


Both "Weight_kg" and "Screen_Size_cm" have the data type `object`, but both should be `float`. Let's fix the data type of these two columns.

In [33]:
#cast the two measurement columns to float, one column at a time
for numeric_col in ('Weight_kg', 'Screen_Size_cm'):
    df[numeric_col] = df[numeric_col].astype('float')

In [34]:
#check the data types after conversions
df.dtypes

Unnamed: 0,0
Manufacturer,object
Category,int64
Screen,object
GPU,int64
OS,int64
CPU_core,int64
Screen_Size_cm,float64
CPU_frequency,float64
RAM_GB,int64
Storage_GB_SSD,int64


In [None]:
#save the dataframe to 'laptops.csv' (relative path), without writing the index column
df.to_csv('laptops.csv', index=False)