In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [15]:
df = pd.read_csv(r"/content/cleaned_smartphone_data.csv")

In [16]:
df.head()

Unnamed: 0,Brand,Model,Front Camera (MP),Have Pop-Up Front Camera,Rear Camera (MP),Processor,Battery Capacity (mAh),Display Size (inches),RAM (GB),ROM (GB),Price (Rs.)
0,SAMSUNG,S24 Ultra,40.0,0,108.0,Exynos,5000,6.8,8.0,128,101515
1,SAMSUNG,S24 Plus,10.0,0,50.0,Exynos,4500,6.6,8.0,128,57575
2,SAMSUNG,Zero,32.0,0,12.0,Snapdragon 888 5G,4500,6.41,8.0,256,34242
3,SAMSUNG,S24,10.0,0,50.0,Exynos,3700,6.1,8.0,128,75757
4,SAMSUNG,A05s,5.0,0,50.0,Exynos,5000,6.5,4.0,64,7575


# Model Building and Testing

In [17]:
X = df.drop(columns = 'Price (Rs.)')
y = df['Price (Rs.)']

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [19]:
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor

In [20]:
df_encod = df.copy()

object_features = df_encod.select_dtypes(include = "object").columns.tolist()
print(len(object_features))
print(object_features)

3
['Brand', 'Model', 'Processor']


In [21]:
ohe = OneHotEncoder()
ohe.fit(X[['Brand', 'Model', 'Processor']])

In [22]:
column_trans = make_column_transformer((OneHotEncoder(categories = ohe.categories_),['Brand', 'Model', 'Processor']),
                                       remainder = 'passthrough')

In [23]:
xgbr = XGBRegressor()

In [24]:
pipe = make_pipeline(column_trans,xgbr)

In [25]:
pipe.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [26]:
y_pred = pipe.predict(X_test)

In [27]:
y_pred

array([ 61647.22  ,   9572.451 ,  15124.022 ,  23871.76  ,  18721.682 ,
        10811.185 ,  29694.723 ,  12041.449 ,   6507.8286,  23231.215 ,
        15235.904 ,  20152.395 ,  67090.305 ,  66085.836 ,  16971.34  ,
        25106.85  ,  17029.936 ,  20587.932 ,  26177.281 ,  19873.66  ,
        26031.818 ,  23548.244 ,  74841.125 ,  50357.555 ,  24689.99  ,
        25618.68  ,  18306.66  ,  23260.643 ,  20458.223 ,  17502.43  ,
        13886.838 ,  25957.754 ,  40761.805 ,  16786.588 ,  27083.527 ,
        51328.43  ,  23823.969 ,  13612.62  ,  16556.543 ,  73745.555 ,
        29536.584 ,  16672.207 ,  18014.49  ,   9572.451 , 106094.055 ,
        29889.342 ,  17045.678 ,  45119.95  ,  50400.176 ,  39822.46  ,
        79033.68  ,  24607.406 ,   7457.995 ,  33537.33  ,  19750.586 ,
        14652.139 ,  20426.643 ,  12958.322 ,  22786.64  ,  26849.77  ,
        18872.738 ,  15809.304 ,  54171.203 ,  39738.664 ,  18681.262 ,
        18281.59  ,   7826.641 ,  76432.49  ,  19896.424 ,  1201

In [28]:
r2_score(y_test, y_pred)

0.9051141738891602

In [29]:
import pickle

In [30]:
pickle.dump(pipe, open('smartphone_xgbr_model_r2_902_v1.pkl', 'wb'))