# Filtering And Aggregating Raw Data

## Importing Requisite Libraries

In [1]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup

## List Of The Features To Be Extracted From The Raw Data

- laptop_company = []
- processor_company = []
- processor = []
- operating_system = []
- RAM = []
- storage = []
- storage_type = [] # If not SSD Default will be HDD
- rating = []
- No_reviews = []
- screen_size = []
- price = [] # Target column

## Filtering Features From Raw Data

In [2]:
# Loading the raw CSV
df = pd.read_csv(r"data\raw.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Raw Data
0,0,"<!doctype html><html lang=""en""><head><link hre..."
1,1,"<!doctype html><html lang=""en""><head><link hre..."
2,2,"<!doctype html><html lang=""en""><head><link hre..."
3,3,"<!doctype html><html lang=""en""><head><link hre..."
4,4,"<!doctype html><html lang=""en""><head><link hre..."


In [3]:
pages = []
for i,j in df.iterrows():
    pages.append(j["Raw Data"])

In [4]:
# Iterating over all pages and for each page filtering out the laptop brand for each product
laptop_brand = []
for page in pages:
    soup = BeautifulSoup(page)
    for i in soup.find_all("div",class_="tUxRFH"):
        regex = re.findall("Compare(\w+)",i.text)
        if regex:
            laptop_brand.append(regex[0])
        else:
            laptop_brand.append(np.nan)

In [5]:
len(laptop_brand)

1632

In [6]:
# Filtering out laptop names
laptop_name = []
for page in pages:
    soup = BeautifulSoup(page)
    for i in soup.find_all("div",class_="KzDlHZ"):
        regex = re.findall("(^.+)\s(?:Intel|intel|AMD|M1|M2|M3|Chromebook|Snapdragon)",i.text)
        if regex:
            laptop_name.append(regex[0])
        else:
            laptop_name.append(np.nan)

In [7]:
len(laptop_name)

1632

In [8]:
processor = []
processor_company = []
for page in pages:
    soup = BeautifulSoup(page)
    for i in soup.find_all("div",class_="KzDlHZ"):        
        
        # Regex to find the processor company of the laptop
        regex1 = re.findall("Intel|intel|AMD|M1|M2|M3|Chromebook|Snapdragon",str(i.text))
        if regex1:
            processor_company.append(regex1[0])
        else:
            processor_company.append(np.nan)

        # Regex to find the exact processor in the laptop
        regex2 = re.findall("(?:Intel|intel|AMD|M1|M2|M3|Chromebook|Snapdragon)\s(.+) - ",str(i.text))
        if regex2:
            processor.append(regex2[0])
        else:
            processor.append(np.nan)

In [9]:
# Checking the number of datapoints after applying the regex
print(len(processor_company))

1632


In [10]:
print(len(processor))

1632


In [11]:
# Filtering out operating System of the laptops
operating_system = []
for page in pages:
    soup = BeautifulSoup(page)
    for i in soup.find_all("div",class_="KzDlHZ"):

        # Regex to find Operating Systems
        regex = re.findall(".+((?:Windows 10|Mac OS|DOS|Andorid|Chrome|Windows 11|Windows 11 Home))",str(i.text))

        if regex:
            operating_system.append(regex[0])
        else:
            operating_system.append(np.nan)

In [12]:
# Checking the number of datapoints
len(operating_system)

1632

In [13]:
# Filtering out RAM of the laptops
RAM = []
for page in pages:
    soup = BeautifulSoup(page)
    for i in soup.find_all("div",class_="KzDlHZ"):

        # Regex to find RAM
        regex = re.findall("(\d+)\sGB\/",str(i.text))

        if regex:
            RAM.append(regex[0])
        else:
            RAM.append(np.nan)

In [14]:
len(RAM)

1632

In [15]:
# Filtering out Storage of the laptops
storage = []
for page in pages:
    soup = BeautifulSoup(page)
    for i in soup.find_all("div",class_="KzDlHZ"):

        # Regex to find Storage Size
        regex = re.findall("\d+\sGB\/(\d+)\s(?:GB|TB)\s(?:SSD|HDD|EMMC)",str(i.text))

        if regex:
            storage.append(regex[0])
        else:
            storage.append(np.nan)

In [16]:
len(storage)

1632

In [17]:
# Filtering out Storage type of the laptops
storage_type = []
for page in pages:
    soup = BeautifulSoup(page)
    for i in soup.find_all("div",class_="KzDlHZ"):

        # Regex to find Storage type
        regex = re.findall("\d+\sGB\/\d+\s(?:GB|TB)\s((?:SSD|HDD|EMMC))",str(i.text))

        if regex:
            storage_type.append(regex[0])
        else:
            storage_type.append(np.nan)

In [18]:
len(storage_type)

1632

In [19]:
# Filtering out Rating of the laptops
rating = []
for page in pages:
    soup = BeautifulSoup(page)
    for i in soup.find_all("div",class_="tUxRFH"):

        r = i.find("div",class_="XQDdHH")

        if r:
            rating.append(r.text)
        else:
            rating.append(np.nan)

In [20]:
len(rating)

1632

In [21]:
# Filtering out number of reviews for each laptops
No_reviews = []
for page in pages:
    soup = BeautifulSoup(page)
    for i in soup.find_all("div",class_="tUxRFH"):
        p =i.find("span",class_="Wphh3N")
        if p:
            regex = re.findall("\&\s(.+)\sReviews",p.text)
            if regex:
                No_reviews.append(regex[0])
            else:
                No_reviews.append(np.nan)
        else:
            No_reviews.append(np.nan)

In [22]:
len(No_reviews)

1632

In [23]:
# Filtering out Screen Size of the laptops
screen_size = []
for page in pages:
    soup = BeautifulSoup(page)
    for i in soup.find_all("div",class_="_6NESgJ"):
        regex = re.findall("\d+?\.?\d+?",i.text)
        if regex:
            screen_size.append(regex[0])
        else:
            screen_size.append(np.nan)

In [24]:
len(screen_size)

1632

In [25]:
# Filtering out the target column: price of the laptop
price = []
for page in pages:
    soup = BeautifulSoup(page)
    for i in soup.find_all("div",class_="tUxRFH"):
        
        p =i.find("div",class_="Nx9bqj _4b5DiR")

        if p:
            price.append(p.text)
        else:
            price.append(np.nan)

In [26]:
len(price)

1632

In [27]:
# Creating a dataframe from all the extracted features present in list
feature_dict = {"Laptop_Brand":laptop_brand,
                "Laptop_Name":laptop_name,
                "Processor_Company":processor_company,
                "Processor":processor,
                "Operating_System":operating_system,
                "RAM":RAM,
                "Storage":storage,
                "Storage_Type":storage_type,
                "Screen_Size":screen_size,
                "Rating":rating,
                "Number_of_Reviews": No_reviews,
                "Price":price}

In [28]:
# Creating a dataframe from the above dictionary
laptop_df = pd.DataFrame(feature_dict)

In [29]:
# Saving the dataframe as a csv file for further analysis
laptop_df.to_csv(r"data\flipkart_laptop_data.csv",index=False)

# Cleaning The Aggregated Data

## Loading the Necessary Modules

In [30]:
import numpy as np
import pandas as pd
import re
import warnings
warnings.filterwarnings("ignore")

## Loading the CSV file

In [31]:
laptop_df = pd.read_csv(r"data\flipkart_laptop_data.csv")
laptop_df.head()

Unnamed: 0,Laptop_Brand,Laptop_Name,Processor_Company,Processor,Operating_System,RAM,Storage,Storage_Type,Screen_Size,Rating,Number_of_Reviews,Price
0,HP,HP Victus,Intel,Core i5 12th Gen,Windows 11,16,512,SSD,12,4.4,38.0,"₹58,990"
1,MSI,MSI Thin 15,Intel,Core i5 12th Gen 12450H,Windows 11,16,512,SSD,12,4.3,34.0,"₹57,990"
2,HP,HP Laptop,AMD,Ryzen 3 Quad Core 5300U,Windows 11,8,512,SSD,11,4.3,482.0,"₹30,999"
3,Acer,Acer One,Intel,Core i3 11th Gen 1115G4,Windows 11,8,512,SSD,11,4.2,571.0,"₹26,990"
4,HP,HP,AMD,Ryzen 5 Hexa Core 5500U,Windows 11,16,512,SSD,16,4.3,268.0,"₹42,990"


## Cleaning Data

In [32]:
laptop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1632 entries, 0 to 1631
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Laptop_Brand       1632 non-null   object 
 1   Laptop_Name        1632 non-null   object 
 2   Processor_Company  1632 non-null   object 
 3   Processor          1632 non-null   object 
 4   Operating_System   1632 non-null   object 
 5   RAM                1632 non-null   int64  
 6   Storage            1632 non-null   int64  
 7   Storage_Type       1632 non-null   object 
 8   Screen_Size        1632 non-null   int64  
 9   Rating             1503 non-null   float64
 10  Number_of_Reviews  1503 non-null   float64
 11  Price              1623 non-null   object 
dtypes: float64(2), int64(3), object(7)
memory usage: 153.1+ KB


### Observation
- There are a total of `1632` datapoints but looking at the columns `Rating` and `Number_of_Reviews`, there are some null values which needs to be dealt with later.
- Need to check for wrong values or outliers in the data.
- The target column `Price` should be integer but is stored as an object so it must be converted to right datatype as well as the missing data needs to be replaced.

In [33]:
# Inspecting Price Column
laptop_df["Price"].head(10)

0    ₹58,990
1    ₹57,990
2    ₹30,999
3    ₹26,990
4    ₹42,990
5    ₹64,990
6    ₹52,990
7    ₹35,990
8    ₹20,990
9    ₹36,990
Name: Price, dtype: object

- Based on the above cell output, we see that Price is being treated as object column becuase of an extra symbol `₹` and `,`.
- Therefore they need to be removed as they dont contribute in the analysis.

In [34]:
# Removing extra characters from the price column
laptop_df["Price"] = laptop_df["Price"].str.replace(',','').str.replace('₹', '')
laptop_df.head()

Unnamed: 0,Laptop_Brand,Laptop_Name,Processor_Company,Processor,Operating_System,RAM,Storage,Storage_Type,Screen_Size,Rating,Number_of_Reviews,Price
0,HP,HP Victus,Intel,Core i5 12th Gen,Windows 11,16,512,SSD,12,4.4,38.0,58990
1,MSI,MSI Thin 15,Intel,Core i5 12th Gen 12450H,Windows 11,16,512,SSD,12,4.3,34.0,57990
2,HP,HP Laptop,AMD,Ryzen 3 Quad Core 5300U,Windows 11,8,512,SSD,11,4.3,482.0,30999
3,Acer,Acer One,Intel,Core i3 11th Gen 1115G4,Windows 11,8,512,SSD,11,4.2,571.0,26990
4,HP,HP,AMD,Ryzen 5 Hexa Core 5500U,Windows 11,16,512,SSD,16,4.3,268.0,42990


In [35]:
# Analysing the complete description summary of the dataframe
laptop_df.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Laptop_Brand,1632.0,9.0,HP,388.0,,,,,,,
Laptop_Name,1632.0,49.0,HP,136.0,,,,,,,
Processor_Company,1632.0,4.0,Intel,1040.0,,,,,,,
Processor,1632.0,38.0,Core i3 12th Gen 1215U,182.0,,,,,,,
Operating_System,1632.0,2.0,Windows 11,1541.0,,,,,,,
RAM,1632.0,,,,12.252451,5.865025,4.0,8.0,8.0,16.0,32.0
Storage,1632.0,,,,440.907475,171.443332,1.0,512.0,512.0,512.0,512.0
Storage_Type,1632.0,2.0,SSD,1541.0,,,,,,,
Screen_Size,1632.0,,,,19.865809,18.555163,11.0,11.0,12.0,16.0,81.0
Rating,1503.0,,,,4.203127,0.238804,3.3,4.1,4.2,4.3,5.0


### Observation Based Tasks:
- Convert the price column to int after handling missing values.
- Convert the laptop names to proper product names
- In the storage column, there is a min vlaue of 1, which is measuring the data in TB, so all the values must be converted to a common GB measurement.
- Check the screen size for values and impute the outliers accordingly.
- In rating and no of reviews replace null values with 0.

In [36]:
# Analyzing the datapoints which have null price
laptop_df[laptop_df["Price"].isnull()]

Unnamed: 0,Laptop_Brand,Laptop_Name,Processor_Company,Processor,Operating_System,RAM,Storage,Storage_Type,Screen_Size,Rating,Number_of_Reviews,Price
25,Lenovo,Lenovo LOQ,Intel,Core i5 13th Gen 13450HX,Windows 11,16,512,SSD,13,4.2,44.0,
145,Lenovo,Lenovo LOQ,Intel,Core i5 13th Gen 13450HX,Windows 11,16,512,SSD,13,4.2,44.0,
217,Lenovo,Lenovo LOQ,Intel,Core i5 13th Gen 13450HX,Windows 11,16,512,SSD,13,4.2,44.0,
241,Lenovo,Lenovo LOQ,Intel,Core i5 13th Gen 13450HX,Windows 11,16,512,SSD,13,4.2,44.0,
265,Lenovo,Lenovo LOQ,Intel,Core i5 13th Gen 13450HX,Windows 11,16,512,SSD,13,4.2,44.0,
641,Lenovo,Lenovo LOQ,Intel,Core i5 13th Gen 13450HX,Windows 11,16,512,SSD,13,4.2,44.0,
1457,Lenovo,Lenovo LOQ,Intel,Core i5 13th Gen 13450HX,Windows 11,16,512,SSD,13,4.2,44.0,
1553,Lenovo,Lenovo LOQ,Intel,Core i5 13th Gen 13450HX,Windows 11,16,512,SSD,13,4.2,44.0,
1577,Lenovo,Lenovo LOQ,Intel,Core i5 13th Gen 13450HX,Windows 11,16,512,SSD,13,4.2,44.0,


- All the above data points are duplicated, so will be dropped.
- The process of handling will be as follows:
    - Removing all the null values in Price column excpet for the 1 datapoint.
    - Replacing the null value in Price with the mean of 50th percentile and 75th percentile of all Lenovo Laptop Prices. 

In [37]:
# Step 1: Drop 8 rows where Laptop_Name is 'Lenovo LOQ' and Price is null
lenovo_loq_null_price = laptop_df[(laptop_df['Laptop_Name'] == 'Lenovo LOQ') & (laptop_df['Price'].isnull())]
laptop_df = laptop_df.drop(lenovo_loq_null_price.index[:8])

# Step 2: Calculate the mean of the 50th and 75th percentiles of the Price column for Lenovo laptops
lenovo_prices = laptop_df[laptop_df['Laptop_Brand'] == 'Lenovo']['Price'].dropna().astype(float)
percentile_50 = lenovo_prices.quantile(0.50)
percentile_75 = lenovo_prices.quantile(0.75)
mean_price = (percentile_50 + percentile_75) / 2

# Step 3: Replace the null value in the remaining 'Lenovo LOQ' row with the calculated mean price
laptop_df.loc[(laptop_df['Laptop_Name'] == 'Lenovo LOQ') & (laptop_df['Price'].isnull()), 'Price'] = mean_price

# Verifying the changes
laptop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1624 entries, 0 to 1631
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Laptop_Brand       1624 non-null   object 
 1   Laptop_Name        1624 non-null   object 
 2   Processor_Company  1624 non-null   object 
 3   Processor          1624 non-null   object 
 4   Operating_System   1624 non-null   object 
 5   RAM                1624 non-null   int64  
 6   Storage            1624 non-null   int64  
 7   Storage_Type       1624 non-null   object 
 8   Screen_Size        1624 non-null   int64  
 9   Rating             1495 non-null   float64
 10  Number_of_Reviews  1495 non-null   float64
 11  Price              1624 non-null   object 
dtypes: float64(2), int64(3), object(7)
memory usage: 164.9+ KB


In [38]:
# Converting Price column to numbers
laptop_df["Price"] = laptop_df["Price"].astype("float")
laptop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1624 entries, 0 to 1631
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Laptop_Brand       1624 non-null   object 
 1   Laptop_Name        1624 non-null   object 
 2   Processor_Company  1624 non-null   object 
 3   Processor          1624 non-null   object 
 4   Operating_System   1624 non-null   object 
 5   RAM                1624 non-null   int64  
 6   Storage            1624 non-null   int64  
 7   Storage_Type       1624 non-null   object 
 8   Screen_Size        1624 non-null   int64  
 9   Rating             1495 non-null   float64
 10  Number_of_Reviews  1495 non-null   float64
 11  Price              1624 non-null   float64
dtypes: float64(3), int64(3), object(6)
memory usage: 164.9+ KB


In [39]:
# Removing the brand name from Laptop Name to just have product name
def remove_company(name):
    words = name.split(' ',1)
    
    if len(words)>1:
        # Removing anything which comes after , or -
        trunc = words[1].split(',')[0].split('-')[0].strip()
        return trunc if trunc else name
        
    return name

laptop_df["Laptop_Name"] = laptop_df["Laptop_Name"].apply(remove_company)

In [40]:
laptop_df.head(10)

Unnamed: 0,Laptop_Brand,Laptop_Name,Processor_Company,Processor,Operating_System,RAM,Storage,Storage_Type,Screen_Size,Rating,Number_of_Reviews,Price
0,HP,Victus,Intel,Core i5 12th Gen,Windows 11,16,512,SSD,12,4.4,38.0,58990.0
1,MSI,Thin 15,Intel,Core i5 12th Gen 12450H,Windows 11,16,512,SSD,12,4.3,34.0,57990.0
2,HP,Laptop,AMD,Ryzen 3 Quad Core 5300U,Windows 11,8,512,SSD,11,4.3,482.0,30999.0
3,Acer,One,Intel,Core i3 11th Gen 1115G4,Windows 11,8,512,SSD,11,4.2,571.0,26990.0
4,HP,HP,AMD,Ryzen 5 Hexa Core 5500U,Windows 11,16,512,SSD,16,4.3,268.0,42990.0
5,Infinix,GT Book,Intel,Core i5 12th Gen 12450H,Windows 11,16,512,SSD,12,4.4,13.0,64990.0
6,Acer,Aspire 7,Intel,Core i5 12th Gen 12450H,Windows 11,16,512,SSD,12,4.1,214.0,52990.0
7,ASUS,Vivobook 15,Intel,Core i3 12th Gen 1215U,Windows 11,8,512,SSD,12,4.2,360.0,35990.0
8,Acer,Aspire 3,Intel,Celeron Dual Core N4500,Windows 11,8,512,SSD,11,3.8,25.0,20990.0
9,MSI,Modern 14,AMD,Ryzen 5 Hexa Core 7530U,Windows 11,16,512,SSD,16,4.3,246.0,36990.0


In [41]:
# Hnadling the values of Storage column
laptop_df[["Storage"]].describe()

Unnamed: 0,Storage
count,1624.0
mean,440.557266
std,171.792497
min,1.0
25%,512.0
50%,512.0
75%,512.0
max,512.0


- The min value of a laptop storage can be 128 GB nothing less than that, if it is less than it means the value is TB and needs to be converted into GB or its a wrong data point.
- Lets filter out all the datapoints where the storage is less than 128 GB.

In [42]:
# Filtering out the datapoints where the storage is less than 128
filtered_df = laptop_df[laptop_df['Storage'] < 128]
filtered_df

Unnamed: 0,Laptop_Brand,Laptop_Name,Processor_Company,Processor,Operating_System,RAM,Storage,Storage_Type,Screen_Size,Rating,Number_of_Reviews,Price
17,ASUS,ROG Strix Scar 16,Intel,Core i9 14th Gen 14900HX,Windows 11,32,2,SSD,14,,,339990.0
18,HP,Chromebook MediaTek MT8183,Chromebook,MediaTek MT8183,Chrome,4,32,EMMC,81,3.8,501.0,11990.0
20,Acer,Predator Neo,Intel,Core i7 13th Gen 13700HX,Windows 11,16,1,SSD,13,4.4,89.0,104990.0
29,MSI,Claw AI PC,Intel,Core Ultra 7 155H,Windows 11,16,1,SSD,16,5.0,1.0,74990.0
41,Acer,Acer Predator Helios Neo 16,Intel,Core i9 13th Gen 13900HX,Windows 11,16,1,SSD,13,4.2,8.0,134990.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1601,Lenovo,Yoga Slim 7x Qualcomm,Snapdragon,X Elite,Windows 11,32,1,SSD,32,,,149990.0
1602,HP,Chromebook MediaTek MT8183,Chromebook,MediaTek MT8183,Chrome,4,32,EMMC,81,3.8,501.0,11990.0
1613,Lenovo,Yoga AI PC,Intel,Core Ultra 7 155H,Windows 11,32,1,SSD,32,,,244890.0
1625,Lenovo,Yoga Slim 7x Qualcomm,Snapdragon,X Elite,Windows 11,32,1,SSD,32,,,149990.0


### Observation
- EMMC Storage are exception to the traditional laptops as they aare made for extremely light weight load so whatever storage is provided need not to be changed.
- However the storage value in for SSDs/HDDs needs to be updated.

In [43]:
# Converting the Storage values of Storage in TB to GB
condition = (laptop_df['Storage_Type'].isin(['SSD', 'HDD'])) & (laptop_df['Storage'] < 128)
laptop_df.loc[condition, 'Storage'] *= 1024

In [44]:
# Looking at Data Sumamry
laptop_df.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Laptop_Brand,1624.0,9.0,HP,388.0,,,,,,,
Laptop_Name,1624.0,47.0,HP,136.0,,,,,,,
Processor_Company,1624.0,4.0,Intel,1032.0,,,,,,,
Processor,1624.0,38.0,Core i3 12th Gen 1215U,182.0,,,,,,,
Operating_System,1624.0,2.0,Windows 11,1533.0,,,,,,,
RAM,1624.0,,,,12.23399,5.873543,4.0,8.0,8.0,16.0,32.0
Storage,1624.0,,,,526.857143,202.351364,32.0,512.0,512.0,512.0,2048.0
Storage_Type,1624.0,2.0,SSD,1533.0,,,,,,,
Screen_Size,1624.0,,,,19.899631,18.594559,11.0,11.0,12.0,16.0,81.0
Rating,1495.0,,,,4.203144,0.239443,3.3,4.1,4.2,4.3,5.0


### Observation
- Based on the above describe(Mean, precentiles and median) of the storage column, we can be sure that all the values are now accurate.

In [45]:
# Dealing with Screen Size outlier values
laptop_df[laptop_df["Screen_Size"] > 17].head()

Unnamed: 0,Laptop_Brand,Laptop_Name,Processor_Company,Processor,Operating_System,RAM,Storage,Storage_Type,Screen_Size,Rating,Number_of_Reviews,Price
12,HP,15s,AMD,Ryzen 3 Quad Core 5300U,Windows 11,8,512,SSD,64,4.2,405.0,32490.0
18,HP,Chromebook MediaTek MT8183,Chromebook,MediaTek MT8183,Chrome,4,32,EMMC,81,3.8,501.0,11990.0
36,HP,15s,AMD,Ryzen 3 Quad Core 5300U,Windows 11,8,512,SSD,64,4.2,405.0,32490.0
42,HP,Chromebook MediaTek MT8183,Chromebook,MediaTek MT8183,Chrome,4,32,EMMC,81,3.8,501.0,11990.0
60,HP,15s,AMD,Ryzen 3 Quad Core 5300U,Windows 11,8,512,SSD,64,4.2,405.0,32490.0


In [46]:
laptop_df[laptop_df["Screen_Size"] > 17].shape

(265, 12)

### Observation
- The Screen Size values have been mislabelled during the aggregation or were not present in the give data.
- In order to impute these outlier values, these values will be replaced by the mean value of screen size based on each laptop company

In [47]:
# Filtering out what all values are there in Screen Size
screen_size_counts = laptop_df['Screen_Size'].value_counts().sort_index()

In [48]:
screen_size_counts

Screen_Size
11    485
12    403
13    218
14      5
16    248
25     22
32     62
64    113
81     68
Name: count, dtype: int64

### Observation
- The above values confirm that some values are mis-represented, so we will be replacing it with the mean value based on each laptop.
- This will be done for screen size greater than 32, and for values between 17 and 32 will be converted to inches.

In [49]:
# Imputing outliers in Screen_Size Column

# Function to replace screen sizes greater than 32 with the median screen size for each brand
# and convert screen sizes between 17 and 32 from cm to inches
def update_screen_size(group):
    median_size = group['Screen_Size'].median()
    group.loc[group['Screen_Size'] > 32, 'Screen_Size'] = median_size
    group.loc[(group['Screen_Size'] > 17) & (group['Screen_Size'] <= 32), 'Screen_Size'] *= 0.393701
    return group

# Apply the function to each group of Laptop_Brand
laptop_df = laptop_df.groupby('Laptop_Brand').apply(update_screen_size).reset_index(drop=True)

In [50]:
# Filtering out what all values are there in Screen Size
screen_size_counts = laptop_df['Screen_Size'].value_counts().sort_index()
screen_size_counts

Screen_Size
9.842525      22
11.000000    507
12.000000    403
12.598432     62
13.000000    218
14.000000      5
16.000000    407
Name: count, dtype: int64

### Observation
- Need to replace all the screen size values less than 11 to 11 inches
- Make the the screen size to definitive 12.5 inches

In [51]:
# Replace specific Screen_Size values
laptop_df['Screen_Size'] = laptop_df['Screen_Size'].replace({9.842525: 11.00, 12.598432: 12.5})

# Filtering out what all values are there in Screen Size
screen_size_counts = laptop_df['Screen_Size'].value_counts().sort_index()
screen_size_counts

Screen_Size
11.0    529
12.0    403
12.5     62
13.0    218
14.0      5
16.0    407
Name: count, dtype: int64

### Observation
- All the screen sizes are now valid.

In [52]:
laptop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1624 entries, 0 to 1623
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Laptop_Brand       1624 non-null   object 
 1   Laptop_Name        1624 non-null   object 
 2   Processor_Company  1624 non-null   object 
 3   Processor          1624 non-null   object 
 4   Operating_System   1624 non-null   object 
 5   RAM                1624 non-null   int64  
 6   Storage            1624 non-null   int64  
 7   Storage_Type       1624 non-null   object 
 8   Screen_Size        1624 non-null   float64
 9   Rating             1495 non-null   float64
 10  Number_of_Reviews  1495 non-null   float64
 11  Price              1624 non-null   float64
dtypes: float64(4), int64(2), object(6)
memory usage: 152.4+ KB


In [53]:
laptop_df.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Laptop_Brand,1624.0,9.0,HP,388.0,,,,,,,
Laptop_Name,1624.0,47.0,Aspire 7,136.0,,,,,,,
Processor_Company,1624.0,4.0,Intel,1032.0,,,,,,,
Processor,1624.0,38.0,Core i3 12th Gen 1215U,182.0,,,,,,,
Operating_System,1624.0,2.0,Windows 11,1533.0,,,,,,,
RAM,1624.0,,,,12.23399,5.873543,4.0,8.0,8.0,16.0,32.0
Storage,1624.0,,,,526.857143,202.351364,32.0,512.0,512.0,512.0,2048.0
Storage_Type,1624.0,2.0,SSD,1533.0,,,,,,,
Screen_Size,1624.0,,,,12.836207,1.94802,11.0,11.0,12.0,16.0,16.0
Rating,1495.0,,,,4.203144,0.239443,3.3,4.1,4.2,4.3,5.0


# Observation
- Replacing all the missing values in `Rating` and `Number_of_Reviews` with zero, as they will be treated as the laptop not being sold or not that appealing to customers.

In [54]:
# Replacing Nan values in the Rating and Number of Reviews column
laptop_df['Rating'] = laptop_df['Rating'].fillna(0)
laptop_df['Number_of_Reviews'] = laptop_df['Number_of_Reviews'].fillna(0)

In [55]:
# Taking a final look at info and description of data
laptop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1624 entries, 0 to 1623
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Laptop_Brand       1624 non-null   object 
 1   Laptop_Name        1624 non-null   object 
 2   Processor_Company  1624 non-null   object 
 3   Processor          1624 non-null   object 
 4   Operating_System   1624 non-null   object 
 5   RAM                1624 non-null   int64  
 6   Storage            1624 non-null   int64  
 7   Storage_Type       1624 non-null   object 
 8   Screen_Size        1624 non-null   float64
 9   Rating             1624 non-null   float64
 10  Number_of_Reviews  1624 non-null   float64
 11  Price              1624 non-null   float64
dtypes: float64(4), int64(2), object(6)
memory usage: 152.4+ KB


In [56]:
laptop_df.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Laptop_Brand,1624.0,9.0,HP,388.0,,,,,,,
Laptop_Name,1624.0,47.0,Aspire 7,136.0,,,,,,,
Processor_Company,1624.0,4.0,Intel,1032.0,,,,,,,
Processor,1624.0,38.0,Core i3 12th Gen 1215U,182.0,,,,,,,
Operating_System,1624.0,2.0,Windows 11,1533.0,,,,,,,
RAM,1624.0,,,,12.23399,5.873543,4.0,8.0,8.0,16.0,32.0
Storage,1624.0,,,,526.857143,202.351364,32.0,512.0,512.0,512.0,2048.0
Storage_Type,1624.0,2.0,SSD,1533.0,,,,,,,
Screen_Size,1624.0,,,,12.836207,1.94802,11.0,11.0,12.0,16.0,16.0
Rating,1624.0,,,,3.869273,1.159917,0.0,4.075,4.2,4.3,5.0


In [57]:
# Since the data is now cleaned, it can be exported as a clean CSV for further analysis
laptop_df.to_csv(r"data\flipkart_laptop_cleaned.csv",index=False)