In [None]:
# Import Libraries

import pandas as pd
import numpy as np
import re
from scipy.stats import mode

In [None]:
# Load data

df = pd.read_csv('Data/laptop_details.csv')
df.head()

Unnamed: 0,Product,Rating,MRP,Feature
0,Lenovo IdeaPad 3 Core i3 11th Gen - (8 GB/256 ...,4.2,"₹36,990",Intel Core i3 Processor (11th Gen)8 GB DDR4 RA...
1,Lenovo IdeaPad 3 Core i3 11th Gen - (8 GB/512 ...,4.2,"₹39,990",Intel Core i3 Processor (11th Gen)8 GB DDR4 RA...
2,ASUS VivoBook 15 (2022) Core i3 10th Gen - (8 ...,4.3,"₹32,990",Intel Core i3 Processor (10th Gen)8 GB DDR4 RA...
3,HP Pavilion Ryzen 5 Hexa Core AMD R5-5600H - (...,4.4,"₹49,990",AMD Ryzen 5 Hexa Core Processor8 GB DDR4 RAM64...
4,ASUS TUF Gaming F15 Core i5 10th Gen - (8 GB/5...,4.4,"₹49,990",Intel Core i5 Processor (10th Gen)8 GB DDR4 RA...


In [None]:
# Shape of the dataset

df.shape

(720, 4)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Product  720 non-null    object 
 1   Rating   581 non-null    float64
 2   MRP      720 non-null    object 
 3   Feature  720 non-null    object 
dtypes: float64(1), object(3)
memory usage: 22.6+ KB


In [None]:
# Check duplicate values
 
df.duplicated().sum()

297

In [None]:
df[df.duplicated()]

Unnamed: 0,Product,Rating,MRP,Feature
25,RedmiBook Pro Core i5 11th Gen - (8 GB/512 GB ...,4.1,"₹39,990",Intel Core i5 Processor (11th Gen)8 GB DDR4 RA...
26,Lenovo IdeaPad 3 Core i3 11th Gen - (8 GB/512 ...,4.2,"₹39,990",Intel Core i3 Processor (11th Gen)8 GB DDR4 RA...
27,ASUS ROG Strix G15 (2022) with 90Whr Battery R...,,"₹1,96,990",AMD Ryzen 9 Octa Core Processor16 GB DDR5 RAMW...
29,realme Book (Slim) Core i3 11th Gen - (8 GB/25...,4.4,"₹46,990",Stylish & Portable Thin and Light Laptop14 inc...
30,Lenovo IdeaPad 1 Ryzen 5 Quad Core 3500U - (8 ...,4.2,"₹38,705",AMD Ryzen 5 Quad Core Processor8 GB DDR4 RAM64...
...,...,...,...,...
708,ASUS VivoBook 15 Core i5 10th Gen - (8 GB/1 TB...,3.9,"₹44,990",Intel Core i5 Processor (10th Gen)8 GB DDR4 RA...
710,Lenovo IdeaPad 1 Ryzen 3 Dual Core 3250U - (8 ...,4.2,"₹35,590",AMD Ryzen 3 Dual Core Processor8 GB DDR4 RAM64...
711,ASUS ROG Strix G15 Advantage Edition with 90Wh...,4.9,"₹1,50,990",AMD Ryzen 9 Octa Core Processor16 GB DDR4 RAMW...
716,Lenovo IdeaPad Gaming 3 Ryzen 7 Octa Core AMD ...,4.5,"₹73,490",AMD Ryzen 7 Octa Core Processor8 GB DDR4 RAM64...


In [None]:
# Drop duplicates

df = df.drop_duplicates(keep='first')     

df.shape

(423, 4)

### **Extracting features using Regex**

In [None]:
print(df.iloc[0,0])
print(df.iloc[0,-1])

print(df.iloc[1,0])
print(df.iloc[1,-1])

Lenovo IdeaPad 3 Core i3 11th Gen - (8 GB/256 GB SSD/Windows 11 Home) 14ITL05 Thin and Light Laptop
Intel Core i3 Processor (11th Gen)8 GB DDR4 RAM64 bit Windows 11 Operating System256 GB SSD35.56 cm (14 inch) DisplayOffice Home and Student 20212 Year Onsite�Warranty
Lenovo IdeaPad 3 Core i3 11th Gen - (8 GB/512 GB SSD/Windows 11 Home) 82H801L7IN | 82H802FJIN | 82H802...
Intel Core i3 Processor (11th Gen)8 GB DDR4 RAM64 bit Windows 11 Operating System512 GB SSD39.62 cm (15.6 inch) DisplayOffice Home and Student 20212 Year Onsite�Warranty


In [None]:
# Extract features

def extract_features(row):
    product = row['Product']
    feature = row['Feature']
    
    ram_size = re.findall(r'(\d+) ?(?:GB|TB)', feature)
    ram_size = int(ram_size[0]) if ram_size else None
    
    ram_type = re.findall(r'(?:LP)?DDR\d\S*|Unified\sMemory', feature)
    ram_type = ram_type[0] if ram_type else None

    display = re.findall(r'\d+(?:\.\d+)?\s*(?:cm|inch)\s*(?:\(|:)?\s*\d+(?:\.\d+)?\s*(?:cm|inch)?', feature)
    display = display[0] if display else None
    
    processor = re.findall(r'(?:AMD|Intel|M\d+|Qualcomm Snapdragon)[\s\w]*\b', feature)
    processor = processor[0] if processor else None

    storage = re.findall(r'(\d+) ?(?:GB|TB) ?(SSD|HDD)', feature)
    storage = list(set([(f"{s[0]} {s[1]}") for s in storage])) if storage else None
    storage = ", ".join(storage) if storage else None
  
    os = re.findall(r'(Windows|Mac OS|Linux|DOS|Chrome)[\s\w]', feature)
    os = os[0] if os else None
    
    brand = re.findall(r'^\w+', product)
    brand = brand[0] if brand else None
    
    return pd.Series([ram_size, ram_type, display, processor, storage, os, brand],
                     index=['Ram Size', 'Ram Type', 'Display', 'Processor', 'Storage', 'OS', 'Brand'])
    
df[['RAM_Size(GB)', 'RAM_Type', 'Display', 'Processor', 'Storage', 'OS', 'Brand']] = df.apply(extract_features, axis=1)

df.sample(3)

Unnamed: 0,Product,Rating,MRP,Feature,RAM_Size(GB),RAM_Type,Display,Processor,Storage,OS,Brand
122,DELL Ryzen 7 Octa Core AMD R7-5800H - (16 GB/5...,5.0,"₹89,081",NVIDIA RTX 3050 Graphics upto 90W TGP15.6 inch...,16,DDR4,39.62 cm (15.6 inch,AMD Ryzen 7 Octa Core Processor16 GB DDR4 RAM6...,512 SSD,Windows,DELL
394,Lenovo Ideapad Slim 5 Ryzen 7 Octa Core 4700U ...,4.4,"₹55,600",AMD Ryzen 7 Octa Core Processor8 GB DDR4 RAM64...,8,DDR4,35.56 cm (14 inch,AMD Ryzen 7 Octa Core Processor8 GB DDR4 RAM64...,512 SSD,Windows,Lenovo
121,acer Nitro 5 Ryzen 5 Hexa Core 5600H - (16 GB/...,4.3,"₹66,990",AMD Ryzen 5 Hexa Core Processor16 GB DDR4 RAMW...,16,DDR4,39.62 cm (15.6,AMD Ryzen 5 Hexa Core Processor16 GB DDR4 RAMW...,512 SSD,Windows,acer


In [None]:
df['RAM_Type'].value_counts()

DDR4              320
DDR5               42
LPDDR4X            30
Unified Memory     12
LPDDR5              9
LPDDR4              8
DDR4,               1
LPDDR3              1
Name: RAM_Type, dtype: int64

In [None]:
df['Storage'].value_counts()

512 SSD             223
1 SSD                74
256 SSD              63
1 HDD, 256 SSD       25
1 HDD                14
2 SSD                 6
128 SSD               4
512 SSD, 1 HDD        2
128 SSD, 1 HDD        2
256 HDD, 256 SSD      1
Name: Storage, dtype: int64

In [None]:
df['Brand'].value_counts()

ASUS         136
Lenovo        62
HP            57
DELL          56
acer          34
MSI           29
APPLE         19
Infinix       14
realme         4
ALIENWARE      3
RedmiBook      2
SAMSUNG        2
Ultimus        2
Vaio           1
GIGABYTE       1
Nokia          1
Name: Brand, dtype: int64

In [None]:
df['Processor'].value_counts()

Intel Core i5 Processor                                                                        109
Intel Core i3 Processor                                                                         69
Intel Core i7 Processor                                                                         52
AMD Ryzen 5 Hexa Core Processor8 GB DDR4 RAM64 bit Windows 11 Operating System512 GB SSD39      11
Intel Core i9 Processor                                                                         11
                                                                                              ... 
M1 Max Processor32 GB Unified Memory RAMMac OS Operating System1 TB SSD41                        1
AMD Ryzen 3 Dual Core Processor8 GB DDR4 RAM64 bit Windows 11 Operating System512 GB SSD35       1
AMD Ryzen 5 Quad Core Processor8 GB DDR4 RAM32 bit Windows 11 Operating System512 GB SSD35       1
AMD Ryzen 3                                                                                      1
AMD Ryzen 

In [None]:
df['Display'].value_counts()

39.62 cm (15.6 inch    172
39.62 cm (15.6          68
35.56 cm (14 inch       66
35.56 cm (14            28
33.78 cm (13.3 inch      9
33.78 cm (13.3           9
40.64 cm (16             9
43.94 cm (17.3           6
40.64 cm (16 inch        6
40.89 cm (16.1 inch      5
34.04 cm (13.4           5
41.15 cm (16.2 inch      5
43.94 cm (17.3 inch      4
35.81 cm (14.1 inch      4
29.46 cm (11.6 inch      4
38.1 cm (15 inch         4
29.46 cm (11.6           4
34.54 cm (13.6           3
40.89 cm (16.1           3
88.9 cm (35 cm           2
36.07 cm (14.2 inch      2
14 inch 2                1
33.02 cm (13 inch        1
38.0 cm (14.96           1
34.29 cm (13.5 inch      1
42.16 cm (16.6 inch      1
Name: Display, dtype: int64

In [None]:
# Clean columns

def clean_processor(processor):
    processor = re.sub(r'Processor.*', '', str(processor))
    processor = processor.rstrip()
    processor = re.sub(r'Intel i3', 'Intel Core i3', str(processor))
    processor = re.sub(r'Intel i7', 'Intel Core i7', str(processor))
    processor = re.sub(r'AMD Dual Core', 'AMD Ryzen 3 Dual Core', str(processor))
    processor = re.sub(r'AMD Ryzen R5', 'AMD Ryzen 5', str(processor))
    return processor

def convert_to_inches(display):
    # remove anything before a bracket
    display = re.sub(r'^.*\(', '', display)
    # remove anything after the word inch
    display = re.sub(r'\s*\S*$', '', display)
    # remove any remaining whitespace
    display = re.sub(r'\s', '', display)
    display = re.sub('35','13.78', str(display))
    display = display.replace('inch', '')
    return display

def clean_storage(storage):
    storage = re.sub(r'1 HDD','1 TB HDD', str(storage))
    storage = re.sub(r'1 SSD','128 SSD', str(storage))
    storage = re.sub(r'2 SSD','256 SSD', str(storage))
    storage = re.sub(r'128 SSD, 1 HDD','1 TB HDD, 128 SSD', str(storage))
    storage = re.sub(r'256 HDD, 256 SSD','1 TB HDD, 256 SSD', str(storage))
    storage = re.sub(r'51256 SSD','512 SSD', str(storage))
    storage = re.sub(r'None','512 SSD', str(storage))
    return storage

df['Processor'] = df['Processor'].apply(clean_processor)

df['Display'] = df['Display'].apply(convert_to_inches)
df['Display'] = df['Display'].astype('float')

df['Storage'] = df['Storage'].apply(clean_storage)

df['MRP'] = df['MRP'].str.replace('₹', '')
df['MRP'] = df['MRP'].str.replace(',', '')
df['MRP'] = df['MRP'].astype('int')

df['RAM_Type'] = df['RAM_Type'].str.replace(',', '')

df.head(2)

Unnamed: 0,Product,Rating,MRP,Feature,RAM_Size(GB),RAM_Type,Display,Processor,Storage,OS,Brand
0,Lenovo IdeaPad 3 Core i3 11th Gen - (8 GB/256 ...,4.2,36990,Intel Core i3 Processor (11th Gen)8 GB DDR4 RA...,8,DDR4,14.0,Intel Core i3,256 SSD,Windows,Lenovo
1,Lenovo IdeaPad 3 Core i3 11th Gen - (8 GB/512 ...,4.2,39990,Intel Core i3 Processor (11th Gen)8 GB DDR4 RA...,8,DDR4,15.6,Intel Core i3,512 SSD,Windows,Lenovo


In [None]:
df['Processor'].value_counts()

Intel Core i5                   110
Intel Core i3                    73
Intel Core i7                    53
AMD Ryzen 5 Hexa Core            40
AMD Ryzen 7 Octa Core            38
Intel Celeron Dual Core          18
AMD Ryzen 3 Dual Core            17
AMD Ryzen 9 Octa Core            16
Intel Core i9                    11
AMD Ryzen 5 Quad Core             8
M1                                7
M1 Pro                            6
M2                                5
AMD Ryzen 3 Quad Core             3
AMD Ryzen 5                       2
Intel Celeron Quad Core           2
AMD Athlon Dual Core              2
Intel Pentium Silver              2
Intel Pentium Quad Core           2
Intel Evo Core i5                 1
Qualcomm Snapdragon 7c Gen 2      1
Intel PQC                         1
AMD Ryzen 3                       1
M1 Max                            1
AMD Ryzen 7 Quad Core             1
AMD Ryzen 5 Dual Core             1
AMD Ryzen 3 Hexa Core             1
Name: Processor, dtype: int6

In [None]:
df['Storage'].value_counts()

512 SSD              232
128 SSD               78
256 SSD               69
1 TB HDD, 256 SSD     26
1 TB HDD              14
512 SSD, 1 TB HDD      2
128 SSD, 1 TB HDD      2
Name: Storage, dtype: int64

In [None]:
df['Storage'].describe()

count         423
unique          7
top       512 SSD
freq          232
Name: Storage, dtype: object

In [None]:
# Remove unnecessary columns

df = df.drop(['Product','Rating', 'Feature'], axis=1)
df = df.reset_index(drop=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423 entries, 0 to 422
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MRP           423 non-null    int64  
 1   RAM_Size(GB)  423 non-null    int64  
 2   RAM_Type      423 non-null    object 
 3   Display       423 non-null    float64
 4   Processor     423 non-null    object 
 5   Storage       423 non-null    object 
 6   OS            423 non-null    object 
 7   Brand         423 non-null    object 
dtypes: float64(1), int64(2), object(5)
memory usage: 26.6+ KB


In [None]:
df.to_csv('cleaned_data.csv', index=False)