In [None]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt

In [None]:
# Load the raw laptop listing and discard the stray index column written by a previous export.
df = pd.read_csv('laptop_data.csv').drop(columns=['Unnamed: 0'])

In [None]:
# Strip the unit suffixes so Ram and Weight become numeric columns.
df = df.assign(
    Ram=df['Ram'].str.replace('GB', '').astype('int32'),
    Weight=df['Weight'].str.replace('kg', '').astype('float32'),
)

In [None]:
# 1 when the resolution text mentions 'Touchscreen', otherwise 0.
df['Touchscreen'] = df['ScreenResolution'].map(lambda res: int('Touchscreen' in res))

In [None]:
# 1 when the resolution text mentions 'IPS', otherwise 0.
df['Ips'] = df['ScreenResolution'].map(lambda res: int('IPS' in res))

In [None]:
# Split the text on the first 'x': the left part ends with the horizontal
# pixel count, the right part is the vertical pixel count.
new = df['ScreenResolution'].str.split('x', n=1, expand=True)
df['X_res'] = (
    new[0]
    .str.replace(',', '')
    .str.findall(r'(\d+\.?\d+)')
    .map(lambda numbers: numbers[0])  # keep the first multi-digit number found
)
df['Y_res'] = new[1]

In [None]:
# Cast both resolution parts to integers, derive pixels-per-inch from the
# diagonal pixel count and the screen size, then drop the raw resolution text.
df = (
    df.astype({'X_res': 'int', 'Y_res': 'int'})
      .assign(
          ppi=lambda frame: (
              ((frame['X_res'] ** 2) + (frame['Y_res'] ** 2)) ** 0.5 / frame['Inches']
          ).astype('float')
      )
      .drop(columns=['ScreenResolution'])
)

In [None]:
# Inches, X_res and Y_res were only needed to compute ppi.
df = df.drop(columns=['Inches', 'X_res', 'Y_res'])
df['Cpu']

In [None]:
# Keep only the first three whitespace-separated words of the CPU description.
df['Cpu Name'] = df['Cpu'].map(lambda cpu: " ".join(cpu.split()[:3]))

In [None]:
def fetch_processor(text):
    """Map a three-word CPU name onto a coarse processor category.

    Returns ``text`` unchanged when it is exactly 'Intel Core i7',
    'Intel Core i5' or 'Intel Core i3'. Any other name whose first word is
    'Intel' becomes 'Other Intel Processor'; everything else is labelled
    'AMD Processor'.
    """
    if text in ('Intel Core i7', 'Intel Core i5', 'Intel Core i3'):
        return text
    if text.split()[0] == 'Intel':
        return 'Other Intel Processor'
    return 'AMD Processor'

# Collapse the CPU names into the categories produced by fetch_processor.
df['Cpu brand'] = df['Cpu Name'].map(fetch_processor)
df['Cpu brand']

In [None]:
# 'Cpu brand' replaces the raw CPU text columns.
df = df.drop(columns=['Cpu', 'Cpu Name'])
df['Memory']

In [None]:
# Spot-check three random rows.
df.sample(n=3)

In [None]:
df['mem'] = df['Memory'].str.split(' ')


def fetch_storage_types(text):
    """Return the storage technologies mentioned in a Memory description.

    Checks ``text`` for the substrings 'SSD', 'HDD', 'Flash' and 'Hybrid'
    (in that order) and returns the matching labels joined by single spaces.
    A 'Flash' hit is reported as 'Flash Storage'. Returns an empty string
    when none of them occur.

    Named separately from ``fetch_processor`` so that this cell no longer
    overwrites the CPU classifier defined earlier in the notebook.
    """
    types = []
    if 'SSD' in text:
        types.append('SSD')
    if 'HDD' in text:
        types.append('HDD')
    if 'Flash' in text:
        types.append('Flash Storage')
    if 'Hybrid' in text:
        types.append('Hybrid')
    return ' '.join(types)


df['_Memory'] = df['Memory'].apply(fetch_storage_types)
# Preview only the first rows instead of dumping the whole frame.
df.head()

In [None]:
# df["Layer1HDD"] = df["_Memory"].apply(lambda x: 1 if "HDD" in x else 0)
# df["Layer1SSD"] = df["_Memory"].apply(lambda x: 1 if "SSD" in x else 0)
# df["Layer1Hybrid"] = df["_Memory"].apply(lambda x: 1 if "Hybrid" in x else 0)
# df["Layer1Flash_Storage"] = df["_Memory"].apply(lambda x: 1 if "Flash Storage" in x else 0)

In [None]:
# Preview only the first rows instead of dumping the whole frame.
df.head()

In [None]:
def extract_storage(memory_str, storage_type):
    """Return the total capacity, in GB, that ``memory_str`` lists for ``storage_type``.

    Every "<size><GB|TB> <storage_type>" occurrence is summed, so a string
    describing several drives of the same type is added up. Sizes may be
    whole numbers or decimals (e.g. '1.0TB'); TB values are converted to GB
    with a factor of 1024.

    Parameters
    ----------
    memory_str : str
        Free-text storage description, e.g. '128GB SSD +  1TB HDD'.
    storage_type : str
        Literal label to look for after the size, e.g. 'SSD' or
        'Flash Storage'. It is matched literally, not as a regex.

    Returns
    -------
    int
        Total number of GB found, or 0 when the type does not occur.
    """
    # Accept an optional fractional part: with a bare (\d+), '1.0TB HDD'
    # matched only '0TB HDD' and was counted as 0 GB.
    pattern = re.compile(
        r'(\d+(?:\.\d+)?)\s*(GB|TB)\s+' + re.escape(storage_type)
    )
    total_storage = 0.0
    for size, unit in pattern.findall(memory_str):
        amount = float(size)
        if unit == 'TB':
            amount *= 1024  # Convert TB to GB
        total_storage += amount
    return int(round(total_storage))

# One numeric column per storage technology: total GB parsed from the Memory text
df['SSD'] = df['Memory'].apply(extract_storage, args=('SSD',))
df['HDD'] = df['Memory'].apply(extract_storage, args=('HDD',))
df['Flash Storage'] = df['Memory'].apply(extract_storage, args=('Flash Storage',))
df['Hybrid'] = df['Memory'].apply(extract_storage, args=('Hybrid',))


In [None]:
# Spot-check one random row.
df.sample(n=1)

In [None]:
# The raw Memory text and its helper columns are replaced by the per-type GB columns.
df = df.drop(columns=['Memory', 'mem', '_Memory'])
df.sample(n=1)

In [None]:
# Only the SSD and HDD capacities are kept as features.
df = df.drop(columns=['Hybrid', 'Flash Storage'])
df.sample(n=1)

In [None]:
# The GPU brand is the first whitespace-separated word of the GPU description.
df['Gpu brand'] = df['Gpu'].map(lambda gpu: gpu.split()[0])

In [None]:
# Number of rows per GPU brand.
df.loc[:, 'Gpu brand'].value_counts()

In [None]:
# Drop rows whose GPU brand is 'ARM'. Take an explicit copy: the frame is
# modified again in later cells, and mutating a boolean-filtered selection
# raises SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[df['Gpu brand'] != 'ARM'].copy()

In [None]:
# 'Gpu brand' replaces the raw GPU text.
df = df.drop(columns=['Gpu'])

In [None]:
def cat_os(inp):
    """Collapse an operating-system label into one of three groups.

    'Windows 10', 'Windows 7' and 'Windows 10 S' map to 'Windows';
    'macOS' and 'Mac OS X' map to 'Mac'; any other value maps to
    'Others/No OS/Linux'.
    """
    if inp in ('Windows 10', 'Windows 7', 'Windows 10 S'):
        return 'Windows'
    if inp in ('macOS', 'Mac OS X'):
        return 'Mac'
    return 'Others/No OS/Linux'

# Replace the detailed OS label with the three-way grouping from cat_os.
df['os'] = df['OpSys'].map(cat_os)
df = df.drop(columns=['OpSys'])

In [None]:
# Target: natural log of the price. Features: every remaining column.
y = np.log(df['Price'])
X = df.drop(columns='Price')

In [None]:
# Preview only the first rows of the feature matrix instead of dumping it all.
X.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [None]:
# Hold out 15% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=2,
)

## Linear Regression

In [None]:
# One-hot encode the columns at positions 0, 1, 7, 10 and 11 (dropping the
# first level of each) and pass every other column through unchanged.
step1 = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 7, 10, 11]),
    ],
    remainder='passthrough',
)
step2 = LinearRegression()

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])

# fit() returns the pipeline itself, so training and prediction can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Both metrics are reported multiplied by 100.
print('R2 score', r2_score(y_test, y_pred) * 100)
print('MAE', mean_absolute_error(y_test, y_pred) * 100)

## Ridge Regression

In [None]:
# One-hot encode the columns at positions 0, 1, 7, 10 and 11 (dropping the
# first level of each) and pass every other column through unchanged.
step1 = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 7, 10, 11]),
    ],
    remainder='passthrough',
)
step2 = Ridge(alpha=10)

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])

# fit() returns the pipeline itself, so training and prediction can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Both metrics are reported multiplied by 100.
print('R2 score', r2_score(y_test, y_pred) * 100)
print('MAE', mean_absolute_error(y_test, y_pred) * 100)

## Lasso Regression

In [None]:
# One-hot encode the columns at positions 0, 1, 7, 10 and 11 (dropping the
# first level of each) and pass every other column through unchanged.
step1 = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 7, 10, 11]),
    ],
    remainder='passthrough',
)
step2 = Lasso(alpha=0.001)

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])

# fit() returns the pipeline itself, so training and prediction can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Both metrics are reported multiplied by 100.
print('R2 score', r2_score(y_test, y_pred) * 100)
print('MAE', mean_absolute_error(y_test, y_pred) * 100)

## KNN

In [None]:
# One-hot encode the columns at positions 0, 1, 7, 10 and 11 (dropping the
# first level of each) and pass every other column through unchanged.
step1 = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 7, 10, 11]),
    ],
    remainder='passthrough',
)
step2 = KNeighborsRegressor(n_neighbors=3)

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])

# fit() returns the pipeline itself, so training and prediction can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Both metrics are reported multiplied by 100.
print('R2 score', r2_score(y_test, y_pred) * 100)
print('MAE', mean_absolute_error(y_test, y_pred) * 100)

## Decision Tree

In [None]:
# One-hot encode the columns at positions 0, 1, 7, 10 and 11 (dropping the
# first level of each) and pass every other column through unchanged.
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse_output=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')

# random_state is pinned so that re-running the cell reproduces the same
# tree and the same scores; 3 matches the seed used by the other models here.
step2 = DecisionTreeRegressor(max_depth=8,random_state=3)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(X_train,y_train)

y_pred = pipe.predict(X_test)

# Both metrics are reported multiplied by 100.
print('R2 score',r2_score(y_test,y_pred)*100)
print('MAE',mean_absolute_error(y_test,y_pred)*100)

## SVM

In [None]:
# One-hot encode the columns at positions 0, 1, 7, 10 and 11 (dropping the
# first level of each) and pass every other column through unchanged.
step1 = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 7, 10, 11]),
    ],
    remainder='passthrough',
)
step2 = SVR(kernel='rbf', C=10000, epsilon=0.1)

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])

# fit() returns the pipeline itself, so training and prediction can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Both metrics are reported multiplied by 100.
print('R2 score', r2_score(y_test, y_pred) * 100)
print('MAE', mean_absolute_error(y_test, y_pred) * 100)

## Random Forest

In [None]:
# One-hot encode the columns at positions 0, 1, 7, 10 and 11 (dropping the
# first level of each) and pass every other column through unchanged.
step1 = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 7, 10, 11]),
    ],
    remainder='passthrough',
)
step2 = RandomForestRegressor(
    n_estimators=100,
    random_state=3,
    max_samples=0.5,
    max_features=0.75,
    max_depth=15,
)

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])

# fit() returns the pipeline itself, so training and prediction can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Both metrics are reported multiplied by 100.
print('R2 score', r2_score(y_test, y_pred) * 100)
print('MAE', mean_absolute_error(y_test, y_pred) * 100)

## ExtraTrees

In [None]:
# One-hot encode the columns at positions 0, 1, 7, 10 and 11 (dropping the
# first level of each) and pass every other column through unchanged.
step1 = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 7, 10, 11]),
    ],
    remainder='passthrough',
)
step2 = ExtraTreesRegressor(
    n_estimators=100,
    random_state=3,
    max_features=0.75,
    max_depth=15,
)

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])

# fit() returns the pipeline itself, so training and prediction can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Both metrics are reported multiplied by 100.
print('R2 score', r2_score(y_test, y_pred) * 100)
print('MAE', mean_absolute_error(y_test, y_pred) * 100)

## AdaBoost

In [None]:
# One-hot encode the columns at positions 0, 1, 7, 10 and 11 (dropping the
# first level of each) and pass every other column through unchanged.
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse_output=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')

# random_state is pinned so that re-running the cell reproduces the same
# ensemble and the same scores; 3 matches the seed used by the other models here.
step2 = AdaBoostRegressor(n_estimators=15,learning_rate=1.0,random_state=3)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(X_train,y_train)

y_pred = pipe.predict(X_test)

# Both metrics are reported multiplied by 100.
print('R2 score',r2_score(y_test,y_pred)*100)
print('MAE',mean_absolute_error(y_test,y_pred)*100)

## Gradient Boost

In [None]:
# One-hot encode the columns at positions 0, 1, 7, 10 and 11 (dropping the
# first level of each) and pass every other column through unchanged.
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse_output=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')

# random_state is pinned so that re-running the cell reproduces the same
# ensemble and the same scores; 3 matches the seed used by the other models here.
step2 = GradientBoostingRegressor(n_estimators=500,random_state=3)

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(X_train,y_train)

y_pred = pipe.predict(X_test)

# Both metrics are reported multiplied by 100.
print('R2 score',r2_score(y_test,y_pred)*100)
print('MAE',mean_absolute_error(y_test,y_pred)*100)

## XGBoost

In [None]:
# One-hot encode the columns at positions 0, 1, 7, 10 and 11 (dropping the
# first level of each) and pass every other column through unchanged.
step1 = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 7, 10, 11]),
    ],
    remainder='passthrough',
)
step2 = XGBRegressor(n_estimators=45, max_depth=5, learning_rate=0.5)

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])

# fit() returns the pipeline itself, so training and prediction can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Both metrics are reported multiplied by 100.
print('R2 score', r2_score(y_test, y_pred) * 100)
print('MAE', mean_absolute_error(y_test, y_pred) * 100)

## Voting Regressor

In [None]:
from sklearn.ensemble import VotingRegressor,StackingRegressor

# One-hot encode the columns at positions 0, 1, 7, 10 and 11 (dropping the
# first level of each) and pass every other column through unchanged.
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse_output=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')


rf = RandomForestRegressor(n_estimators=350,random_state=3,max_features=0.75,max_depth=15)
# gbdt subsamples features (max_features=0.5); without a seed the ensemble
# changed on every run, so it is pinned like the other tree models.
gbdt = GradientBoostingRegressor(n_estimators=100,max_features=0.5,random_state=3)
xgb = XGBRegressor(n_estimators=25,learning_rate=0.3,max_depth=5)
et = ExtraTreesRegressor(n_estimators=100,random_state=3,max_features=0.75,max_depth=10)

# Weighted average of the four regressors; the random forest counts five times.
step2 = VotingRegressor([('rf', rf), ('gbdt', gbdt), ('xgb',xgb), ('et',et)],weights=[5,1,1,1])

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(X_train,y_train)

y_pred = pipe.predict(X_test)

# Both metrics are reported multiplied by 100.
print('R2 score',r2_score(y_test,y_pred)*100)
print('MAE',mean_absolute_error(y_test,y_pred)*100)

## Stacking

In [None]:
from sklearn.ensemble import VotingRegressor,StackingRegressor

# One-hot encode the columns at positions 0, 1, 7, 10 and 11 (dropping the
# first level of each) and pass every other column through unchanged.
step1 = ColumnTransformer(transformers=[
    ('col_tnf',OneHotEncoder(sparse_output=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')


# gbdt subsamples features (max_features=0.5); without a seed the stacked
# model changed on every run, so it is pinned like the random forest.
estimators = [
    ('rf', RandomForestRegressor(n_estimators=350,random_state=3,max_features=0.75,max_depth=15)),
    ('gbdt',GradientBoostingRegressor(n_estimators=100,max_features=0.5,random_state=3)),
    ('xgb', XGBRegressor(n_estimators=25,learning_rate=0.3,max_depth=5))
]

# A ridge regression combines the base estimators' predictions.
step2 = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=100))

pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(X_train,y_train)

y_pred = pipe.predict(X_test)

# Both metrics are reported multiplied by 100.
print('R2 score',r2_score(y_test,y_pred)*100)
print('MAE',mean_absolute_error(y_test,y_pred)*100)

## Exporting The Model

In [None]:
import pickle

# Persist the processed frame and the most recently fitted pipeline.
# The context managers guarantee both files are flushed and closed even if
# pickling fails part-way through.
with open('df.pkl','wb') as df_file:
    pickle.dump(df,df_file)

with open('pipe.pkl','wb') as pipe_file:
    pickle.dump(pipe,pipe_file)