In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import warnings



In [2]:
df = pd.read_csv("laptop_final.csv")
df.head()

Unnamed: 0,Company,TypeName,Ram,Weight,Price,TouchScreen,Ips,ppi,Cpu Brand,HDD,SSD,Hybrid,Flash_Storage,Gpu brand,os
0,Apple,Ultrabook,8,1.37,71378.6832,0,1,226.983005,Intel Core i5,0,128,0,0,Intel,Mac
1,Apple,Ultrabook,8,1.34,47895.5232,0,0,127.67794,Intel Core i5,0,0,0,128,Intel,Mac
2,HP,Notebook,8,1.86,30636.0,0,0,141.211998,Intel Core i5,0,256,0,0,Intel,Others/No Os/Linux
3,Apple,Ultrabook,16,1.83,135195.336,0,1,220.534624,Intel Core i7,0,512,0,0,AMD,Mac
4,Apple,Ultrabook,8,1.37,96095.808,0,1,226.983005,Intel Core i5,0,256,0,0,Intel,Mac


In [3]:
df

Unnamed: 0,Company,TypeName,Ram,Weight,Price,TouchScreen,Ips,ppi,Cpu Brand,HDD,SSD,Hybrid,Flash_Storage,Gpu brand,os
0,Apple,Ultrabook,8,1.37,71378.6832,0,1,226.983005,Intel Core i5,0,128,0,0,Intel,Mac
1,Apple,Ultrabook,8,1.34,47895.5232,0,0,127.677940,Intel Core i5,0,0,0,128,Intel,Mac
2,HP,Notebook,8,1.86,30636.0000,0,0,141.211998,Intel Core i5,0,256,0,0,Intel,Others/No Os/Linux
3,Apple,Ultrabook,16,1.83,135195.3360,0,1,220.534624,Intel Core i7,0,512,0,0,AMD,Mac
4,Apple,Ultrabook,8,1.37,96095.8080,0,1,226.983005,Intel Core i5,0,256,0,0,Intel,Mac
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,Lenovo,2 in 1 Convertible,4,1.80,33992.6400,1,1,157.350512,Intel Core i7,0,128,0,0,Intel,Windows
1298,Lenovo,2 in 1 Convertible,16,1.30,79866.7200,1,1,276.053530,Intel Core i7,0,512,0,0,Intel,Windows
1299,Lenovo,Notebook,2,1.50,12201.1200,0,0,111.935204,Other Intel Processor,0,0,0,64,Intel,Windows
1300,HP,Notebook,6,2.19,40705.9200,0,0,100.454670,Intel Core i7,1000,0,0,0,AMD,Windows


In [4]:

df = df.rename(columns={'Cpu Brand': 'Cpu_Brand'})
df = df.rename(columns={'Gpu brand': 'Gpu_Brand'})
df = df.rename(columns={'HDD': 'Hdd'})
df = df.rename(columns={'SSD': 'Ssd'})
df.columns


Index(['Company', 'TypeName', 'Ram', 'Weight', 'Price', 'TouchScreen', 'Ips',
       'ppi', 'Cpu_Brand', 'Hdd', 'Ssd', 'Hybrid', 'Flash_Storage',
       'Gpu_Brand', 'os'],
      dtype='object')

In [5]:
sample = {'Company': ['HP'], 
          'TypeName':['Ultrabook'], 
          'Ram':[4],
          'Weight':[1.3],
          'TouchScreen':[0],
          'Ips':[1],
          'ppi':[226.983005], 
          'Cpu_Brand':['Intel Core i3'], 
          'Hdd':[0], 
          'Ssd':[128], 
          'Hybrid':[0], 
          'Flash_Storage':[0],
         'Gpu_Brand': ['Intel'],
         'os':['Windows']}
samp = pd.DataFrame(sample)
samp

Unnamed: 0,Company,TypeName,Ram,Weight,TouchScreen,Ips,ppi,Cpu_Brand,Hdd,Ssd,Hybrid,Flash_Storage,Gpu_Brand,os
0,HP,Ultrabook,4,1.3,0,1,226.983005,Intel Core i3,0,128,0,0,Intel,Windows


In [6]:
df['Hybrid'].unique()

array([   0, 1000,  508], dtype=int64)

In [7]:
df['target'] = df['Price']
df.drop(['Price'], axis=1, inplace=True)
df.head()

Unnamed: 0,Company,TypeName,Ram,Weight,TouchScreen,Ips,ppi,Cpu_Brand,Hdd,Ssd,Hybrid,Flash_Storage,Gpu_Brand,os,target
0,Apple,Ultrabook,8,1.37,0,1,226.983005,Intel Core i5,0,128,0,0,Intel,Mac,71378.6832
1,Apple,Ultrabook,8,1.34,0,0,127.67794,Intel Core i5,0,0,0,128,Intel,Mac,47895.5232
2,HP,Notebook,8,1.86,0,0,141.211998,Intel Core i5,0,256,0,0,Intel,Others/No Os/Linux,30636.0
3,Apple,Ultrabook,16,1.83,0,1,220.534624,Intel Core i7,0,512,0,0,AMD,Mac,135195.336
4,Apple,Ultrabook,8,1.37,0,1,226.983005,Intel Core i5,0,256,0,0,Intel,Mac,96095.808


In [8]:
df['Hybrid'].value_counts()

Hybrid
0       1290
1000      11
508        1
Name: count, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

X = df.drop(['target'], axis=1)
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=10)


## <font color = "E38B29"> pipeline model using bagging




In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [11]:
ncols =  X.select_dtypes(include=['int64','float64']).columns
ccols = X_train.select_dtypes(include=['object']).columns

In [12]:
ntransformer = Pipeline(steps=[
    ("imputeN",SimpleImputer(strategy='mean')),
    ("scale",StandardScaler())
])

In [13]:
ctransformer = Pipeline(steps=[
    ("imputeN",SimpleImputer(strategy='most_frequent')),
    ("encode",OneHotEncoder(handle_unknown='ignore'))
])

In [14]:
from sklearn.compose import ColumnTransformer
preprocessing = ColumnTransformer(transformers=[
    ('numerical',ntransformer,ncols),
    ('categorical',ctransformer,ccols)
])

In [15]:
#Lets get estimator first
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor

#Create estimator
estimator = KNeighborsRegressor(n_neighbors=3, weights='distance', algorithm='kd_tree')

In [16]:
bagging = BaggingRegressor(base_estimator=estimator, n_estimators=10, max_samples=0.5)


In [17]:
#Now add estimator to pipeline
pipelinemodel = Pipeline(steps=[
    ('pre',preprocessing),
    ('est',bagging)
])


In [18]:
from sklearn import set_config
set_config(display='diagram')
pipelinemodel.fit(X_train, y_train)


In [19]:
 ypred = pipelinemodel.predict(X_test)


In [20]:
res = pipelinemodel.predict(samp)
res

array([57056.46698822])

In [21]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


mae = mean_absolute_error(y_test, ypred)
mse = mean_squared_error(y_test, ypred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, ypred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Absolute Error: 11957.985511289884
Mean Squared Error: 303330296.74928695
Root Mean Squared Error: 17416.38012760651
R-squared: 0.7395252783895181


In [22]:
print(pipelinemodel.score(X_train, y_train))
print(pipelinemodel.score(X_test, y_test))

0.9006424329239868
0.7395252783895181


## <font color = "E38B29"> pipeline model using voting




In [23]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso


model1 = DecisionTreeRegressor()
model2 = LinearRegression()
model3 = KNeighborsRegressor()
model4 = Lasso(alpha=0.1)
from sklearn.ensemble import VotingRegressor
ensemble = VotingRegressor(estimators=[('dt', model1), ('lr', model2), ('kn', model3),("la", model4)])


In [24]:
pipelinemodel1 = Pipeline(steps=[
    ('pre',preprocessing),
    ('est',ensemble)
])

In [25]:
from sklearn import set_config
set_config(display='diagram')
pipelinemodel1.fit(X_train, y_train)


  model = cd_fast.enet_coordinate_descent(


In [26]:
 ypred1 = pipelinemodel1.predict(X_test)


In [27]:
mae = mean_absolute_error(y_test, ypred1)
mse = mean_squared_error(y_test, ypred1)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, ypred1)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Absolute Error: 10475.293460413846
Mean Squared Error: 220267139.11279225
Root Mean Squared Error: 14841.39949980433
R-squared: 0.8108529798862675


In [28]:
print(pipelinemodel1.score(X_train, y_train))
print(pipelinemodel1.score(X_test, y_test))

0.8999076972875585
0.8108529798862675


## <font color = "E38B29"> pipeline model using stacking




In [29]:
from sklearn.ensemble import StackingRegressor
m1 = DecisionTreeRegressor()
m2 = KNeighborsRegressor()
meta = LinearRegression()
stack = StackingRegressor(estimators=[('dt',m1),('km',m2)], final_estimator=meta)


In [30]:
pipelinemodel2 = Pipeline(steps=[
    ('pre',preprocessing),
    ('est',stack)
])

In [31]:
from sklearn import set_config
set_config(display='diagram')
pipelinemodel2.fit(X_train, y_train)

In [32]:
 ypred2 = pipelinemodel2.predict(X_test)

In [33]:
mae = mean_absolute_error(y_test, ypred2)
mse = mean_squared_error(y_test, ypred2)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, ypred2)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Absolute Error: 11560.546543858349
Mean Squared Error: 284380427.8353629
Root Mean Squared Error: 16863.58288844227
R-squared: 0.7557978429266149


In [34]:
print(pipelinemodel2.score(X_train, y_train))
print(pipelinemodel2.score(X_test, y_test))

0.9384735150317042
0.7557978429266149


## <font color = "E38B29"> pipeline model using boosting




In [35]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import Lasso
base_estimator = DecisionTreeRegressor(max_depth=5)
ada_boost = AdaBoostRegressor(base_estimator = base_estimator, n_estimators = 10, random_state = 1)

In [36]:
pipelinemodel3 = Pipeline(steps=[
    ('pre',preprocessing),
    ('est',ada_boost)
])

In [37]:
from sklearn import set_config
set_config(display='diagram')
pipelinemodel3.fit(X_train, y_train)

In [38]:
 ypred3 = pipelinemodel3.predict(X_test)

In [39]:
mae = mean_absolute_error(y_test, ypred3)
mse = mean_squared_error(y_test, ypred3)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, ypred3)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Absolute Error: 13417.618642737472
Mean Squared Error: 317039216.9585812
Root Mean Squared Error: 17805.595102623814
R-squared: 0.727753202822508


In [40]:
print(pipelinemodel3.score(X_train, y_train))
print(pipelinemodel3.score(X_test, y_test))

0.839535963687277
0.727753202822508


In [41]:
import pickle

pickle.dump(pipelinemodel1, open('model.pkl', 'wb'))