# **Laptop Price Predictor**

Data Source: https://github.com/campusx-official/laptop-price-predictor-regression-project/blob/main/laptop_data.csv

**Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

**Import Data**

In [None]:
df = pd.read_csv('https://github.com/campusx-official/laptop-price-predictor-regression-project/raw/main/laptop_data.csv')

In [None]:
df.head()


In [None]:
df.shape

In [None]:
df.info()

**Checking missing values or duplicate rows.**

In [None]:
df.duplicated().sum()

In [None]:
df.isnull().sum()

**Optimising Dataset** 
* Remove the "Unnamed: 0" column; it is unnecessary.
* Strip "GB" from Ram and "kg" from Weight, then convert Ram to int and Weight to float.


In [None]:
# Remove the unneeded 'Unnamed: 0' column.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Strip the unit suffix from each column and cast to a numeric dtype in one chain:
# Ram becomes int32, Weight becomes float32.
df['Ram'] = df['Ram'].str.replace('GB', '').astype('int32')
df['Weight'] = df['Weight'].str.replace('kg', '').astype('float32')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot with a
# KDE overlay on a density axis is the documented axes-level replacement.
sns.histplot(df['Price'], kde=True, stat='density')

This plot shows number of laptops in each price range.

In [None]:
df['Company'].value_counts().plot(kind='bar')

This plot shows number of laptops of each company.

In [None]:
sns.barplot(x=df['Company'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()

This plot shows the average price of laptop for each company.

In [None]:
df['TypeName'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['TypeName'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot with a
# KDE overlay on a density axis is the documented axes-level replacement.
sns.histplot(df['Inches'], kde=True, stat='density')

# Feature engineering 
Studying the "ScreenResolution" column and deriving new attributes such as 'Touchscreen' and 'IPS display' from it using lambda functions.

In [None]:
df['ScreenResolution'].value_counts()


In [None]:
# 1 when the resolution text mentions 'Touchscreen', otherwise 0.
df['Touchscreen'] = [int('Touchscreen' in resolution) for resolution in df['ScreenResolution']]

In [None]:
df.sample(2)

A new column named "touchscreen" has been added to the dataframe.

In [None]:
df['Touchscreen'].value_counts().plot(kind='bar')

This plot shows the number of touchscreen laptops .

In [None]:
sns.barplot(x=df['Touchscreen'],y=df['Price'])

This plot shows that touchscreen laptops are usually more expensive than non-touchscreen laptops.

In [None]:
# 1 when the resolution text mentions 'IPS', otherwise 0.
df['Ips'] = [int('IPS' in resolution) for resolution in df['ScreenResolution']]

In [None]:
df.sample(2)

In [None]:
df['Ips'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Ips'],y=df['Price'])

This shows that laptops with IPS displays usually cost more.

**Making two columns containing the X resolution and Y resolution separately.**

In [None]:
new = df['ScreenResolution'].str.split('x',expand=True)
new.head()

In [None]:
df['X_res'] = new[0]
df['Y_res'] = new[1]
df.sample(5)

In [None]:
# X_res still carries the descriptive prefix that sat in front of the 'x'
# separator; keep only the first multi-digit number found in it.
x_res_text = df['X_res'].str.replace(',', '')
x_res_numbers = x_res_text.str.findall(r'(\d+\.?\d+)')
df['X_res'] = x_res_numbers.apply(lambda matches: matches[0])
df.sample(5)

convert type of X and Y resolution to INT

In [None]:
df['X_res'] = df['X_res'].astype('int')
df['Y_res'] = df['Y_res'].astype('int')
df.info()

In [None]:
df.corr(numeric_only=True)['Price']

In [None]:
# Pixels per inch: length of the screen diagonal in pixels divided by the
# diagonal size in inches.
diagonal_px = ((df['X_res'] ** 2) + (df['Y_res'] ** 2)) ** 0.5
df['ppi'] = (diagonal_px / df['Inches']).astype('float')
df.corr(numeric_only=True)['Price']

We no longer need the ScreenResolution column, so we can remove it.
We now have PPI, so we can also drop Inches, X_res and Y_res.


In [None]:
# These columns are superseded by Touchscreen, Ips and ppi.
superseded_columns = ['ScreenResolution', 'Inches', 'X_res', 'Y_res']
df.drop(columns=superseded_columns, inplace=True)

In [None]:
df.head()

**Feature engineering in CPU** 

In [None]:
df['Cpu'].value_counts()

In [None]:
df['Cpu Name'] = df['Cpu'].apply(lambda x:" ".join(x.split()[0:3]))
df.head()

Function to simplify the CPU value: it keeps Intel Core i3/i5/i7 as they are, groups every other Intel CPU as "Other Intel Processor", and puts every non-Intel CPU in the AMD bucket.

In [None]:
def fetch_processor(text):
    """Collapse a shortened CPU name into one of five coarse processor labels.

    'Intel Core i7', 'Intel Core i5' and 'Intel Core i3' are returned unchanged.
    Any other name whose first word is 'Intel' becomes 'Other Intel Processor'.
    Everything else, whatever the vendor, becomes 'AMD Process'.
    """
    # NOTE(review): the label 'AMD Process' looks like a truncated 'AMD Processor';
    # it is kept as-is because it ends up as category data in the frame and model.
    if text in ('Intel Core i7', 'Intel Core i5', 'Intel Core i3'):
        return text
    vendor = text.split()[0]
    return 'Other Intel Processor' if vendor == 'Intel' else 'AMD Process'

In [None]:
df['Cpu brand'] = df['Cpu Name'].apply(fetch_processor)
df.sample(5)

In [None]:
df['Cpu brand'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Cpu brand'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
df.drop(columns=['Cpu','Cpu Name'],inplace=True)
df.head()

**RAM Correlations**

In [None]:
df['Ram'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Ram'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()

**Feature engineering on Memory**
Making different storage categories (HDD, SSD, Hybrid, Flash Storage) from the Memory data.

In [None]:
df['Memory'].value_counts()

In [None]:
# Normalise the free-text 'Memory' column so every capacity is a plain GB number:
#   "1.0TB HDD" -> "1TB HDD" -> "1000 HDD",   "256GB SSD" -> "256 SSD"
# The regex is a raw string: '\.0' in a normal literal is an invalid escape sequence.
df['Memory'] = df['Memory'].astype(str).replace(r'\.0', '', regex=True)
# 'GB' and 'TB' are literal substrings, so say so explicitly instead of relying on
# the pandas-version-dependent default of str.replace.
df["Memory"] = df["Memory"].str.replace('GB', '', regex=False)
df["Memory"] = df["Memory"].str.replace('TB', '000', regex=False)

# Split "<primary> + <secondary>" into two columns; rows with a single drive get
# a missing value in the second column.
new = df["Memory"].str.split("+", n=1, expand=True)
df["first"] = new[0].str.strip()
df["second"] = new[1]

# 0/1 flags for the storage type of the primary drive.
layer1_markers = {
    "Layer1HDD": "HDD",
    "Layer1SSD": "SSD",
    "Layer1Hybrid": "Hybrid",
    "Layer1Flash_Storage": "Flash Storage",
}
for flag_column, marker in layer1_markers.items():
    df[flag_column] = df["first"].str.contains(marker, regex=False).astype(int)

df.head()

In [None]:
import re

# Blank out every non-digit in the primary-drive text so only the capacity is left
# (the surrounding spaces are tolerated by the later integer conversion).
df['first'] = df['first'].apply(lambda x: re.sub(r'\D', ' ', x))
# Rows without a secondary drive get "0". Assign the result back: calling
# fillna(inplace=True) on a selected column is a chained in-place operation that
# does not update df under pandas copy-on-write.
df["second"] = df["second"].fillna("0")
df.head()

In [None]:

# 0/1 flags for the storage type of the secondary drive.
layer2_markers = (
    ("Layer2HDD", "HDD"),
    ("Layer2SSD", "SSD"),
    ("Layer2Hybrid", "Hybrid"),
    ("Layer2Flash_Storage", "Flash Storage"),
)
for flag_column, marker in layer2_markers:
    df[flag_column] = df["second"].apply(lambda text, marker=marker: 1 if marker in text else 0)
df.head()

In [None]:
# Blank out every non-digit in the secondary-drive text so only the capacity is left.
non_digit = re.compile(r'\D')
df['second'] = df['second'].apply(lambda text: non_digit.sub(' ', text))
df.head()

In [None]:
# Capacities become integers, then each storage type's total is
# (primary capacity * primary flag) + (secondary capacity * secondary flag).
df = df.astype({"first": int, "second": int})

storage_kinds = ("HDD", "SSD", "Hybrid", "Flash_Storage")
for kind in storage_kinds:
    primary_part = df["first"] * df[f"Layer1{kind}"]
    secondary_part = df["second"] * df[f"Layer2{kind}"]
    df[kind] = primary_part + secondary_part

# The helper columns are no longer needed once the totals exist.
helper_columns = ['first', 'second']
helper_columns += [f"Layer{layer}{kind}" for layer in (1, 2) for kind in storage_kinds]
df.drop(columns=helper_columns, inplace=True)
df.sample(5)

In [None]:
df.drop(columns=['Memory'],inplace=True)
df.head()

In [None]:
df.corr(numeric_only=True)['Price']

Hybrid and Flash_Storage have a weak correlation with Price, so we can drop them.

In [None]:
df.drop(columns=['Hybrid','Flash_Storage'],inplace=True)
df.head()

**Feature engineering on GPU**
Issue: there are too many distinct values, so we need to group them into fewer categories such as Intel, AMD, Nvidia, etc.


In [None]:
df['Gpu'].value_counts()

In [None]:
df['Gpu brand'] = df['Gpu'].apply(lambda x:x.split()[0])
df.head()

In [None]:
df['Gpu'].value_counts()

In [None]:
df['Gpu brand'].value_counts()

In [None]:
# Drop the rows whose GPU brand is 'ARM'.
# .copy() makes the result an independent frame: later cells add columns and drop
# in place on df, which on a bare boolean-mask slice triggers SettingWithCopyWarning.
df = df[df['Gpu brand'] != 'ARM'].copy()
df['Gpu brand'].value_counts()

In [None]:
sns.barplot(x=df['Gpu brand'],y=df['Price'],estimator=np.median)
plt.xticks(rotation='vertical')
plt.show()

Laptops with Nvidia GPUs are the most expensive.

In [None]:
df['Gpu'].value_counts()

In [None]:
df.drop(columns=['Gpu'],inplace=True)
df.head()

**Feature engineering on OS**

In [None]:
df['OpSys'].value_counts()

In [None]:
sns.barplot(x=df['OpSys'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()

Defining a function to categorise os into mainly windows , mac ,linux etc

In [None]:
def cat_os(inp):
    """Map a raw operating-system name onto one of three coarse groups.

    Returns 'Windows' for 'Windows 10', 'Windows 7' and 'Windows 10 S',
    'Mac' for 'macOS' and 'Mac OS X', and 'Others/No OS/Linux' for anything else.
    """
    if inp in ('Windows 10', 'Windows 7', 'Windows 10 S'):
        return 'Windows'
    if inp in ('macOS', 'Mac OS X'):
        return 'Mac'
    return 'Others/No OS/Linux'
    
df['os'] = df['OpSys'].apply(cat_os)
df.head()

In [None]:
df.drop(columns=['OpSys'],inplace=True)

In [None]:
df.head()

In [None]:
sns.barplot(x=df['os'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot with a
# KDE overlay on a density axis is the documented axes-level replacement.
sns.histplot(df['Weight'], kde=True, stat='density')

In [None]:
sns.scatterplot(x=df['Weight'],y=df['Price'])

Weight and Price show only a very weak linear relationship.

In [None]:
df.corr(numeric_only=True)['Price']

In [None]:
df.corr(numeric_only=True)

The correlation matrix can also be plotted as a heatmap:

In [None]:
sns.heatmap(df.corr(numeric_only=True))

In [None]:
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot with a
# KDE overlay on a density axis is the documented axes-level replacement.
sns.histplot(df['Price'], kde=True, stat='density')

The distribution is skewed, so we apply a log transformation to it.

In [None]:
# Same distribution plot on the log-transformed price.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot with a
# KDE overlay on a density axis is the documented axes-level replacement.
sns.histplot(np.log(df['Price']), kde=True, stat='density')

In [None]:
X = df.drop(columns=['Price'])
y = np.log(df['Price'])

In [None]:
X

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=2)
X_train

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

# Linear Regression

In [None]:
# One-hot encode the categorical columns, addressed by position in X.
# NOTE(review): positions 0, 1, 7, 10, 11 are presumably Company, TypeName,
# Cpu brand, Gpu brand and os; verify against X.columns if the feature
# engineering above changes. Positions are kept so the pipeline also accepts
# plain arrays.
#
# handle_unknown='ignore': a category that exists in df but not in the training
# split would otherwise make transform/predict raise. With 'ignore' it is encoded
# as all zeros (the same encoding as the dropped first category) and
# scikit-learn emits a warning.
step1 = ColumnTransformer(transformers=[
    ('col_tnf',
     OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
     [0, 1, 7, 10, 11])
], remainder='passthrough')

step2 = LinearRegression()

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

# Both metrics are on the log-price scale, because y is np.log(Price).
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))

# KNN

In [None]:
# One-hot encode the categorical columns, addressed by position in X.
# NOTE(review): positions 0, 1, 7, 10, 11 are presumably Company, TypeName,
# Cpu brand, Gpu brand and os; verify against X.columns.
#
# handle_unknown='ignore': a category that exists in df but not in the training
# split would otherwise make transform/predict raise. With 'ignore' it is encoded
# as all zeros (the same encoding as the dropped first category) and
# scikit-learn emits a warning.
step1 = ColumnTransformer(transformers=[
    ('col_tnf',
     OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
     [0, 1, 7, 10, 11])
], remainder='passthrough')

# NOTE(review): the remaining numeric columns are passed through unscaled, so
# wide-range columns (e.g. HDD/SSD capacities) dominate the neighbour distance
# over the 0/1 flags; consider adding a scaler before comparing models.
step2 = KNeighborsRegressor(n_neighbors=3)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

# Both metrics are on the log-price scale, because y is np.log(Price).
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))

# Random Forest

In [None]:
# One-hot encode the categorical columns, addressed by position in X.
# NOTE(review): positions 0, 1, 7, 10, 11 are presumably Company, TypeName,
# Cpu brand, Gpu brand and os; verify against X.columns. Positions are kept so
# the exported pipeline also accepts plain arrays.
#
# handle_unknown='ignore': this pipeline is the one pickled below. A category
# that exists in df but not in the training split would otherwise make predict
# raise; with 'ignore' it is encoded as all zeros (the same encoding as the
# dropped first category) and scikit-learn emits a warning.
step1 = ColumnTransformer(transformers=[
    ('col_tnf',
     OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
     [0, 1, 7, 10, 11])
], remainder='passthrough')

# random_state pins the bootstrap samples and feature subsets for reproducible scores.
step2 = RandomForestRegressor(n_estimators=100,
                              random_state=3,
                              max_samples=0.5,
                              max_features=0.75,
                              max_depth=15)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

# Both metrics are on the log-price scale, because y is np.log(Price).
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))

# Exporting the model

In [None]:
import pickle

# Persist the engineered frame and the fitted pipeline.
# `pipe` is whichever model cell ran last (the Random Forest when the notebook is
# executed top to bottom).
# Context managers make sure each file is flushed and closed; passing a bare
# open(...) to pickle.dump leaves the handle open until garbage collection.
with open('df.pkl', 'wb') as df_file:
    pickle.dump(df, df_file)
with open('pipe.pkl', 'wb') as pipe_file:
    pickle.dump(pipe, pipe_file)

In [None]:
df

In [None]:
X_train