In [None]:
pip install scikit-learn

In [None]:
# importing necessary modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings("ignore")

In [None]:
# Load the laptop listings from 'laptop.csv' (path is relative to the working directory).
data=pd.read_csv('laptop.csv',encoding='UTF-8')
# Preview the first 5 rows of the raw frame.
data.head()

In [None]:
# Column names, non-null counts and dtypes of the raw frame.
data.info()

In [None]:
# Remove the two leftover index columns; they carry no information about the laptops.
data = data.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
data

In [None]:
# (rows, columns) after removing the leftover index columns.
data.shape

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Drop every row that has at least one missing value (modifies `data` in place).
data.dropna(inplace=True)

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# drop_duplicates() returns a new frame; assign it back, otherwise the
# duplicates counted above stay in `data` and leak into the model.
data = data.drop_duplicates()
data.shape

In [None]:
# Strip the unit suffixes and convert the spec columns to numeric dtypes.
# regex=False makes the literal matching explicit: the default of `regex` in
# Series.str.replace changed between pandas versions, and '?' is a regex metacharacter.
data['Ram'] = data['Ram'].str.replace('GB', '', regex=False).astype('int32')
data['Weight'] = data['Weight'].str.replace('kg', '', regex=False).astype('float32')
# '?' placeholders in the screen size are filled with 15.6
# (NOTE(review): presumably the most common size -- see the mode() check in the next cell).
data['Inches'] = data['Inches'].str.replace('?', '15.6', regex=False).astype('float32')


In [None]:
# Most frequent screen size(s) after the '?' placeholders were filled.
data['Inches'].mode()

In [None]:
# Re-check dtypes: Ram, Weight and Inches should now be numeric.
data.info()

# Exploratory data analysis

In [None]:
# Distribution of the target column.
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay on a
# density scale is its documented replacement.
plt.figure(figsize=(10,4))
sns.histplot(data['Price'], kde=True, stat='density', color='red')
plt.show()

In [None]:
# Average price per brand (bar height is the mean of Price within each Company).
plt.figure(figsize=(10, 4))
sns.barplot(data=data, x='Company', y='Price')
plt.xticks(rotation=90)
plt.show()

In [None]:
# How many laptops fall into each TypeName category.
data['TypeName'].value_counts()


In [None]:
# Average price for each type of laptop.
sns.barplot(data=data, x='TypeName', y='Price')
plt.xticks(rotation=90)
plt.show()


In [None]:
# Price against screen size, one point per laptop.
plt.figure(figsize=(15, 7))
sns.scatterplot(data=data, x='Inches', y='Price')
plt.show()


In [None]:
# Screen resolution (feature engineering): inspect the raw strings before deriving features from them.
data['ScreenResolution'].value_counts()

# Creating new column : TouchScreen


In [None]:
# 1 when the resolution text mentions 'Touchscreen', otherwise 0.
data['TouchScreen'] = data['ScreenResolution'].apply(
    lambda resolution: int('Touchscreen' in resolution)
)
data.sample(10)


In [None]:
# Number of laptops with (1) and without (0) a touchscreen.
data['TouchScreen'].value_counts()

In [None]:
# Count of touchscreen vs non-touchscreen laptops.
plt.figure(figsize=(15, 7))
sns.countplot(data=data, x='TouchScreen', palette='plasma')
plt.show()

In [None]:
# Average price with and without a touchscreen.
plt.figure(figsize=(10, 4))
sns.barplot(data=data, x='TouchScreen', y='Price')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Creating new column : IPS (1 when the resolution text mentions an IPS panel, otherwise 0)
data['IPS'] = data['ScreenResolution'].apply(
    lambda resolution: int("IPS" in resolution)
)
data.sample(5)

In [None]:

# Average price for IPS vs non-IPS panels.
# The 'IPS' flag is already created in the previous cell, so it is not recomputed here.
sns.barplot(x=data['IPS'], y=data['Price'])
plt.show()

In [None]:
def findXresolution(s):
    """Return the part before the first 'x' in the last whitespace-separated token of `s`.

    For a description such as 'Full HD 1920x1080' this is the string '1920'.
    Raises IndexError when `s` contains no tokens.
    """
    dimensions = s.split()[-1]
    return dimensions.partition("x")[0]


def findYresolution(s):
    """Return the part after the first 'x' in the last whitespace-separated token of `s`.

    For a description such as 'Full HD 1920x1080' this is the string '1080'.
    Raises IndexError when `s` contains no tokens or the last token has no 'x'.
    """
    pieces = s.split()[-1].split("x")
    return pieces[1]

# Parse the horizontal / vertical pixel counts out of the resolution text
# and store them as integers.
data['X_res'] = data['ScreenResolution'].apply(findXresolution).astype('int')
data['Y_res'] = data['ScreenResolution'].apply(findYresolution).astype('int')

In [None]:
# Confirm that X_res and Y_res were added with integer dtype.
data.info()

In [None]:
# Pixels per inch: diagonal length in pixels divided by the diagonal size in inches.
data['PPI'] = (
    (data['X_res'].pow(2) + data['Y_res'].pow(2)).pow(0.5).div(data['Inches'])
).astype('float')
# Correlation of every numeric column with Price, strongest first.
data.corr(numeric_only=True)['Price'].sort_values(ascending=False)

In [None]:
# Pairwise correlations between all numeric columns.
plt.figure(figsize=(15, 7))
sns.heatmap(data=data.corr(numeric_only=True), cmap='plasma', annot=True)
plt.show()

In [None]:
# The screen information is now summarised by TouchScreen, IPS and PPI,
# so the columns they were derived from are removed.
data = data.drop(columns=['ScreenResolution', 'Inches', 'X_res', 'Y_res'])
data.head()

In [None]:
# CPU analysis: frequency of each raw CPU description.
data['Cpu'].value_counts()

In [None]:
# Keep only the first three words of the CPU description (e.g. vendor, family, tier).
data['CPU'] = data['Cpu'].map(lambda description: " ".join(description.split()[:3]))
data['CPU'].unique()

In [None]:
def processortype(text):
    """Collapse a three-word CPU label into one of five processor categories.

    'Intel Core i7', 'Intel Core i5' and 'Intel Core i3' are returned unchanged.
    Any other label whose first word is 'Intel' becomes 'Other Intel Processor';
    everything else becomes 'AMD Processor'.
    Raises IndexError for a label with no words.
    """
    if text in ('Intel Core i7', 'Intel Core i5', 'Intel Core i3'):
        return text
    vendor = text.split()[0]
    # NOTE(review): every non-Intel vendor is labelled AMD here -- confirm that is intended.
    return 'Other Intel Processor' if vendor == 'Intel' else 'AMD Processor'
    
# Replace the three-word CPU label with its processor category.
data['CPU'] = data['CPU'].apply(processortype)
data.head()

In [None]:
# Number of laptops in each processor category.
plt.figure(figsize=(15, 7))
sns.countplot(data=data, x='CPU', palette='plasma')
plt.xticks(rotation=90)
plt.show()


In [None]:
# Average price for each processor category.
plt.figure(figsize=(15, 7))
sns.barplot(data=data, x='CPU', y='Price')
plt.xticks(rotation=90)
plt.show()

In [None]:
# The raw description is superseded by the categorical 'CPU' column.
data = data.drop(columns=['Cpu'])


In [None]:
# RAM analysis: number of laptops per RAM size.
sns.countplot(data=data, x='Ram', palette='autumn')
plt.show()

In [None]:
# Average price for each RAM size.
plt.figure(figsize=(15, 7))
sns.barplot(data=data, x='Ram', y='Price')
plt.show()


In [None]:
# MEMORY column: frequency of each raw storage description.
data['Memory'].value_counts()

In [None]:
# Normalise the storage text so only numbers and drive types remain.
# A raw string is used for the pattern: '\.' in a normal string literal is an
# invalid escape sequence that newer Python versions warn about.
data['Memory'] = data['Memory'].astype('str').replace(r'\.0', '', regex=True)
# Literal replacements: drop the 'GB' unit and turn 'TB' into three zeros (1TB -> 1000).
data['Memory'] = data['Memory'].str.replace('GB', '', regex=False)
data['Memory'] = data['Memory'].str.replace('TB', '000', regex=False)
# Split "primary + secondary" drive descriptions at the first '+' into two columns.
newdata = data['Memory'].str.split("+", n=1, expand=True)
newdata

In [None]:
# Primary drive description, with surrounding whitespace removed.
data['first'] = newdata[0].str.strip()
data.head()

In [None]:
# Creating Layer 1 column for each memory type
def applychanges(value):
    """Add a 0/1 column 'Layer1<value>' to `data`.

    The flag is 1 for rows whose primary drive text (data['first']) contains `value`.
    """
    data['Layer1' + value] = data['first'].apply(lambda entry: int(value in entry))


listtoapply = ['HDD', 'SSD', 'Hybrid', 'Flash Storage']
for value in listtoapply:
    applychanges(value)

data.sample(5)

In [None]:
# Remove the drive-type words from the primary drive text so only the capacity remains.
listtoapply = ['HDD', 'SSD', 'Hybrid', 'Flash Storage']
for value in listtoapply:
    stripped = data['first'].str.replace(value, '')
    data['first'] = stripped

In [None]:
# Remaining values of the primary drive column after the drive-type words were removed.
data['first'].value_counts()

In [None]:
# Secondary drive description: the part after '+'. Rows without one hold a missing value
# here (they are filled with "0" in the next cell).
data['Second'] = newdata[1]
data.head()

In [None]:
#Creating Layer 2 column for each memory type
def applychanges1(value, keyword=None):
    """Add a 0/1 column 'Layer2<value>' to `data`.

    The flag is 1 for rows whose secondary drive text (data['Second']) contains
    `keyword`. `keyword` defaults to `value`; it exists because the column suffix
    'FlashStorage' is not the text that appears in the data ('Flash Storage').
    """
    needle = value if keyword is None else keyword
    data['Layer2'+value] = data['Second'].apply(lambda x: 1 if needle in x else 0)

# Column suffixes are kept as they were so the 'Layer2...' column names do not change.
listtoapply1 = ['HDD','SSD','Hybrid','FlashStorage']
# Suffix -> text to look for, where the two differ. The first-layer list searches for
# 'Flash Storage' (with a space); 'FlashStorage' can never match that text.
layer2_keywords = {'FlashStorage': 'Flash Storage'}
data['Second'] = data['Second'].fillna("0")
for value in listtoapply1:
    applychanges1(value, layer2_keywords.get(value, value))
#We will remove all the characters and just keeping the numbers again
for value in listtoapply1:
    data['Second'] = data['Second'].str.replace(layer2_keywords.get(value, value), '', regex=False)


data['Second'].value_counts()

In [None]:
# Distinct values left in the secondary drive column; they are cast to int further down.
data['Second'].unique()

In [None]:
# Replace the '?' placeholder capacity with 256.
# regex=False: '?' is a regex metacharacter and the default of `regex` in
# Series.str.replace differs between pandas versions, so match it literally.
# NOTE(review): 256 is presumably the most common capacity -- confirm.
data['first'] = data['first'].str.replace('?', '256', regex=False)

In [None]:
# Both capacity columns now hold digit strings only; convert them to integers.
data = data.astype({'first': 'int', 'Second': 'int'})
data.sample(10)

In [None]:
# Total capacity per drive type: each drive's capacity counts only where its type flag is 1.
data["HDD"] = data["first"].mul(data["Layer1HDD"]).add(data["Second"].mul(data["Layer2HDD"]))
data["SSD"] = data["first"].mul(data["Layer1SSD"]).add(data["Second"].mul(data["Layer2SSD"]))
data["Hybrid"] = data["first"].mul(data["Layer1Hybrid"]).add(data["Second"].mul(data["Layer2Hybrid"]))
data["Flash_Storage"] = data["first"].mul(data["Layer1Flash Storage"]).add(
    data["Second"].mul(data["Layer2FlashStorage"])
)


In [None]:
# Dropping unnecessary columns: the helper columns are no longer needed once the totals exist.
data = data.drop(columns=[
    'first', 'Second',
    'Layer1HDD', 'Layer1SSD', 'Layer1Hybrid', 'Layer1Flash Storage',
    'Layer2HDD', 'Layer2SSD', 'Layer2Hybrid', 'Layer2FlashStorage',
])

In [None]:
# Check the columns and dtypes after the storage features were built.
data.info()

In [None]:
# Show one random row as a spot check.
data.sample()

In [None]:
# Keep only the HDD and SSD totals; the raw Memory text and the Hybrid / Flash totals are removed.
data = data.drop(columns=['Hybrid', 'Flash_Storage', 'Memory'])
data.sample()

In [None]:
# Distinct raw GPU descriptions.
data['Gpu'].unique()

In [None]:

# Which brand GPU is in laptop: the brand is the first word of the GPU description.
data['Gpu_brand'] = data['Gpu'].apply(lambda gpu_name: gpu_name.split()[0])
plt.figure(figsize=(15, 7))
sns.countplot(data=data, x='Gpu_brand', palette='plasma')
plt.show()

In [None]:
# Exclude rows whose GPU brand is 'ARM', then compare the median price per remaining brand.
data = data[data['Gpu_brand'].ne('ARM')]
plt.figure(figsize=(15, 7))
sns.barplot(data=data, x='Gpu_brand', y='Price', estimator=np.median)
plt.show()


In [None]:
# Gpu_brand now carries the GPU information, so the raw Gpu column can be dropped.
data = data.drop('Gpu', axis=1)
data.head()

In [None]:
# Distinct operating-system labels before they are grouped.
data['OpSys'].unique()

In [None]:
# Barplot - average price for each raw operating-system label.
plt.figure(figsize=(15, 7))
sns.barplot(data=data, x='OpSys', y='Price')
plt.xticks(rotation=90)
plt.show()


In [None]:
def OS_category(text):
    """Map a raw operating-system label to 'Windows', 'Mac' or 'Other'.

    'Windows 10', 'Windows 7' and 'Windows 10 S' map to 'Windows';
    'Mac OS X' and 'macOS' map to 'Mac'; every other label maps to 'Other'.
    """
    if text in ('Windows 10', 'Windows 7', 'Windows 10 S'):
        return 'Windows'
    if text in ('Mac OS X', 'macOS'):
        return 'Mac'
    return 'Other'
    
# Replace each raw operating-system label with its group.
data['OpSys'] = data['OpSys'].apply(OS_category)
data.head()

In [None]:

# Number of laptops in each operating-system group.
sns.countplot(data=data, x='OpSys', palette='plasma')
plt.show()

In [None]:
# Barplot - average price for each operating-system group.
plt.figure(figsize=(15, 7))
sns.barplot(data=data, x='OpSys', y='Price')
plt.show()

In [None]:
# Weight analysis: distribution of the Weight column.
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay on a
# density scale is its documented replacement.
plt.figure(figsize=(10,4))
sns.histplot(x=data['Weight'], kde=True, stat='density')
plt.show()

In [None]:
# Distribution of log(Price); the log-transformed price is what the models below are trained on.
# sns.distplot is deprecated in seaborn, so histplot with a KDE overlay is used instead.
plt.figure(figsize=(10,5))
sns.histplot(np.log(data['Price']), kde=True, stat='density')
plt.show()


In [None]:
# Heatmap - pairwise correlations between the numeric features after feature engineering.
sns.heatmap(data=data.corr(numeric_only=True), cmap='plasma', annot=True)
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV 
from sklearn.model_selection import RepeatedKFold

In [None]:
# Features are every column except Price; the target is the natural log of Price.
X = data.drop(columns=['Price'])
Y = np.log(data['Price'])

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=45
)
X_train.shape, X_test.shape

In [None]:
# Linear Regression
# One-hot encode the categorical columns (selected by position in X) and pass the
# remaining columns through unchanged.
# `sparse_output` replaces the `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
# NOTE(review): positions 0, 1, 3, 8, 11 are presumably Company, TypeName, OpSys,
# CPU and Gpu_brand -- confirm against X.columns.
step1 = ColumnTransformer(
    transformers=[('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 3, 8, 11])],
    remainder='passthrough',
)
step2 = LinearRegression()
pipe = Pipeline([('step1', step1), ('step2', step2)])
pipe.fit(X_train, Y_train)
y_pred = pipe.predict(X_test)
# The target is log(Price), so both metrics are measured on the log scale.
print('R2 score', r2_score(Y_test, y_pred))
print('MAE', mean_absolute_error(Y_test, y_pred))

In [None]:
# RandomForestRegressor
# Same preprocessing as the linear model: one-hot encode the categorical columns
# (selected by position in X) and pass the remaining columns through unchanged.
# `sparse_output` replaces the `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
step1 = ColumnTransformer(
    transformers=[('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 3, 8, 11])],
    remainder='passthrough',
)
step2 = RandomForestRegressor(
    n_estimators=100,
    random_state=3,
    max_samples=0.5,
    max_features=0.75,
    max_depth=15,
)
pipe = Pipeline([('step1', step1), ('step2', step2)])
pipe.fit(X_train, Y_train)
y_pred = pipe.predict(X_test)
# The target is log(Price), so both metrics are measured on the log scale.
print('R2 score', r2_score(Y_test, y_pred))
print('MAE', mean_absolute_error(Y_test, y_pred))

In [None]:
# GradientBoostingRegressor
# Same preprocessing as the other models: one-hot encode the categorical columns
# (selected by position in X) and pass the remaining columns through unchanged.
# `sparse_output` replaces the `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
step1 = ColumnTransformer(
    transformers=[('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 3, 8, 11])],
    remainder='passthrough',
)
# A fixed random_state makes the fitted model repeatable across runs.
step2 = GradientBoostingRegressor(n_estimators=100, random_state=3)
pipe = Pipeline([('step1', step1), ('step2', step2)])
pipe.fit(X_train, Y_train)
y_pred = pipe.predict(X_test)
# The target is log(Price), so both metrics are measured on the log scale.
print('R2 score', r2_score(Y_test, y_pred))
print('MAE', mean_absolute_error(Y_test, y_pred))

In [None]:
# Final set of columns and dtypes of the processed frame.
data.info()


In [None]:
# NOTE(review): same call as the previous cell -- one of the two can probably be removed.
data.info()


In [None]:
import joblib

# Persist the fitted pipeline (preprocessing + regressor), not the DataFrame:
# `data` is the processed table, while `pipe` is the object that has .predict().
# `pipe` is whichever pipeline was fitted last in the cells above.
# The pipeline predicts log(Price); apply np.exp to its output to get a price.
joblib.dump(pipe, 'laptop_price_model.pkl')


In [None]:
!pip install streamlit

In [None]:
# Launch the Streamlit app defined in streamlit_app.py through the shell.
# NOTE(review): presumably this keeps the cell running until the server is stopped -- confirm.
!streamlit run streamlit_app.py