# **Flipkart Laptop Data - Business Insights on Product Pricing**

The data used in this notebook was scraped from Flipkart on 21-12-2022.

This notebook aims to **mine laptop features from the data for analysis and price prediction**. 


In [None]:
# Import Libraries

import pandas as pd
import numpy as np
import re
from scipy.stats import mode

In [None]:
# Load data

# Forward slashes are used on purpose: in a normal string literal "\d" and "\l" are
# invalid escape sequences, and a backslash-separated path only resolves on Windows.
df = pd.read_csv('../data/laptop_details.csv')
df.head()

In [None]:
# Shape of the dataset as a (rows, columns) tuple

df.shape

In [None]:
# Column names, dtypes and non-null counts
df.info()

In [None]:
# Count the rows that exactly repeat an earlier row

df.duplicated().sum()

In [None]:
# Show the repeated rows themselves (the first occurrence of each is not flagged)
df[df.duplicated()]

In [None]:
# Drop duplicates, keeping the first occurrence of every repeated row

df = df.drop_duplicates(keep='first')

# Shape after de-duplication
df.shape

### **Extracting features using Regex**

In [None]:
# Print the first and the last column of the first two rows
for row_pos in (0, 1):
    print(df.iloc[row_pos, 0])
    print(df.iloc[row_pos, -1])

In [None]:
# Extract features

def _first_match(pattern, text):
    """Return the first `re.findall` hit of `pattern` in `text`, or None when there is none."""
    hits = re.findall(pattern, text)
    return hits[0] if hits else None


def extract_features(row):
    """Mine laptop specifications out of one listing row with regular expressions.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'Product' and 'Feature'. A value that is not a
        string (None, NaN, ...) is treated as empty text instead of raising.

    Returns
    -------
    pandas.Series
        Indexed by 'Ram Size', 'Ram Type', 'Display', 'Processor', 'Storage',
        'OS' and 'Brand'. An entry is None when its pattern found nothing.

        * Ram Size  - int, the first "<number> GB/TB" found in the feature text.
        * Ram Type  - first DDR/LPDDR token or "Unified Memory".
        * Display   - raw "<n> cm/inch ... <n> cm/inch" span, not yet converted.
        * Processor - text starting at AMD / Intel / M<digit> / Qualcomm Snapdragon.
        * Storage   - distinct "<number> SSD|HDD" pairs (unit omitted), sorted and
                      joined with ", ".
        * OS        - first of Windows, Mac OS, Linux, DOS, Chrome.
        * Brand     - first word of the product text.
    """
    product = row['Product'] if isinstance(row['Product'], str) else ''
    feature = row['Feature'] if isinstance(row['Feature'], str) else ''

    # The first capacity mentioned in the text is taken as the RAM size.
    ram_size = _first_match(r'(\d+) ?(?:GB|TB)', feature)
    ram_size = int(ram_size) if ram_size is not None else None

    ram_type = _first_match(r'(?:LP)?DDR\d\S*|Unified\sMemory', feature)

    display = _first_match(
        r'\d+(?:\.\d+)?\s*(?:cm|inch)\s*(?:\(|:)?\s*\d+(?:\.\d+)?\s*(?:cm|inch)?', feature)

    processor = _first_match(r'(?:AMD|Intel|M\d+|Qualcomm Snapdragon)[\s\w]*\b', feature)

    # De-duplicate the drives, then sort them: iterating a bare set would make the
    # order of the joined string depend on string hashing, i.e. vary between runs.
    drives = {f"{size} {kind}"
              for size, kind in re.findall(r'(\d+) ?(?:GB|TB) ?(SSD|HDD)', feature)}
    storage = ", ".join(sorted(drives)) if drives else None

    # No trailing character is required, so an OS name that ends the text still matches.
    os_name = _first_match(r'(Windows|Mac OS|Linux|DOS|Chrome)', feature)

    brand = _first_match(r'^\w+', product)

    return pd.Series([ram_size, ram_type, display, processor, storage, os_name, brand],
                     index=['Ram Size', 'Ram Type', 'Display', 'Processor', 'Storage', 'OS', 'Brand'])
    
# Run the extractor on every row. The labels returned by extract_features
# ('Ram Size', 'Ram Type', ...) differ from the target column names
# ('RAM_Size', 'RAM_Type', ...), so the result is relabelled explicitly; the
# assignment then does not rely on how pandas pairs up mismatched column labels.
feature_columns = ['RAM_Size', 'RAM_Type', 'Display', 'Processor', 'Storage', 'OS', 'Brand']
extracted = df.apply(extract_features, axis=1)
extracted.columns = feature_columns
df[feature_columns] = extracted

# Random sample of three rows to spot-check the new columns
df.sample(3)

In [None]:
# Frequency of each extracted RAM type
df['RAM_Type'].value_counts()

In [None]:
# Frequency of each extracted storage combination
df['Storage'].value_counts()

In [None]:
# Frequency of each extracted brand
df['Brand'].value_counts()

In [None]:
# Frequency of each extracted (still uncleaned) processor string
df['Processor'].value_counts()

In [None]:
# Frequency of each extracted (still unconverted) display string
df['Display'].value_counts()

**Clean Columns**

In [None]:
def convert_to_inches(display):
    """Reduce an extracted display string to the number that states its size.

    The extracted text holds two numbers, e.g. "39.62 cm (15.6 inch"; the last
    one is kept. Taking the last number (instead of stripping the text around
    it) also works when nothing follows the number or when there is no bracket.

    Parameters
    ----------
    display : str or missing
        Raw display text; None / NaN is allowed.

    Returns
    -------
    str or float
        The size as a numeric string (e.g. "15.6"), or NaN when the input is
        missing or contains no number, so the column can still be cast to float.
    """
    if pd.isna(display):
        return float('nan')

    numbers = re.findall(r'\d+(?:\.\d+)?', str(display))
    if not numbers:
        return float('nan')
    size = numbers[-1]

    # A size of exactly 35 is mapped to 13.78 (35 / 2.54 rounds to 13.78).
    # NOTE(review): presumably a listing that states centimetres where inches
    # are expected - confirm against the raw data. Only the exact value is
    # mapped, so other sizes that merely contain "35" are left untouched.
    return '13.78' if size == '35' else size

# Convert every display string with convert_to_inches, then cast the column to float
df['Display'] = df['Display'].apply(convert_to_inches).astype('float')

In [None]:
# Strip the rupee sign and the commas from the price text, then store it as an integer
df['MRP'] = (
    df['MRP']
    .str.replace('₹', '')
    .str.replace(',', '')
    .astype('int')
)

# Remove commas from the extracted RAM type
df['RAM_Type'] = df['RAM_Type'].str.replace(',', '')

df.head(2)

In [None]:
def clean_processor(processor):
    """Normalise an extracted processor string.

    Everything from the word "Processor" onwards is dropped, trailing
    whitespace is removed, and a few abbreviated names are rewritten to the
    longer spelling used elsewhere in the column.

    Parameters
    ----------
    processor : str or missing
        Raw processor text; None / NaN is allowed.

    Returns
    -------
    str, or the input unchanged when it is missing
        Missing values are passed through so they stay detectable as nulls
        instead of turning into the literal text "None" / "nan".
    """
    if pd.isna(processor):
        return processor

    name = re.sub(r'Processor.*', '', str(processor)).rstrip()

    # (pattern, replacement) pairs, applied in this order.
    aliases = (
        (r'Intel i3', 'Intel Core i3'),
        (r'Intel i7', 'Intel Core i7'),
        (r'AMD Dual Core', 'AMD Ryzen 3 Dual Core'),
        (r'AMD Ryzen R5', 'AMD Ryzen 5'),
        (r'Intel PQC', 'Intel Pentium Quad Core'),
    )
    for pattern, replacement in aliases:
        name = re.sub(pattern, replacement, name)
    return name

# Normalise every extracted processor string with clean_processor
df['Processor'] = df['Processor'].apply(clean_processor)

In [None]:
# Processor frequencies after cleaning
df['Processor'].value_counts()

In [None]:
def clean_storage(storage):
    """Restore the capacity unit in an extracted storage string and fill gaps.

    Parameters
    ----------
    storage : str or missing
        Text such as "512 SSD" or "1 HDD, 256 SSD" (number and drive type,
        unit omitted); None / NaN is allowed.

    Returns
    -------
    str
        The text with units, e.g. "512 GB SSD" or "1 TB HDD, 256 GB SSD".
        Missing input (None or NaN) is filled with "512 GB SSD".
    """
    if pd.isna(storage):
        # Same fill value the 'None' rule below produces for the text "None";
        # handled up front so NaN is filled too instead of becoming "nan".
        return '512 GB SSD'

    # (pattern, replacement) pairs. The order matters: the patterns are plain
    # substrings, so "512 SSD" is first hit by the '2 SSD' rule (giving
    # "51256 GB SSD") and then repaired by the '51256' rule.
    # Combined "SSD, HDD" strings need no rule of their own because the
    # single-drive rules already rewrite each part.
    # NOTE(review): '1 SSD' -> '128 GB SSD' and '2 SSD' -> '256 GB SSD' are kept
    # as they were; confirm against the raw data that these are not 1 TB / 2 TB.
    rules = (
        (r'1 HDD', '1 TB HDD'),
        (r'128 SSD', '128 GB SSD'),
        (r'256 SSD', '256 GB SSD'),
        (r'1 SSD', '128 GB SSD'),
        (r'2 SSD', '256 GB SSD'),
        (r'51256', '512'),
        (r'256 HDD, 256 GB SSD', '1 TB HDD, 256 GB SSD'),
        (r'None', '512 GB SSD'),
    )
    cleaned = str(storage)
    for pattern, replacement in rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Apply clean_storage to every value of the Storage column
df['Storage'] = df['Storage'].apply(clean_storage)

In [None]:
# Storage frequencies after cleaning
df['Storage'].value_counts()

In [None]:
# Summary of the cleaned Storage column
df['Storage'].describe()

In [None]:
# Drop the raw columns that are no longer needed and renumber the rows from 0

df = (
    df
    .drop(columns=['Product', 'Rating', 'Feature'])
    .reset_index(drop=True)
)

df.info()

In [None]:
# Save the cleaned frame. Forward slashes are used on purpose: in a normal string
# literal "\d" and "\c" are invalid escape sequences, and a backslash-separated
# path only resolves on Windows.
df.to_csv('../data/cleaned_data.csv', index = False)