In [304]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, PowerTransformer, StandardScaler
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error

import scipy.stats as stats
from scipy.stats import kruskal

from category_encoders import TargetEncoder

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

import optuna
import pickle

In [217]:
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [218]:
df = pd.read_csv("smartphones_outliers_treated.csv")

In [219]:
df.sample(5)

Unnamed: 0,price,rating,spec_score,processor_brand,processor_core,clock_speed,has_ir,has_nfc,has_5g,ram_gb,rom_gb,brand,battery_capacity_mah,fast_charging_watt,screen_size_inch,foldable_display,display_refresh_rate,ppi,num_rear_cameras,num_front_cameras,rear_primary_mp,front_primary_mp,expandable_memory_supported,expandable_memory_limit,screen_type
1466,14498.0,4.1,81,snapdragon,octa,2.3,1,0,0,6.0,128.0,Xiaomi,5020.0,33.0,6.67,0,120.0,394.57,4,1,64.0,16.0,1,512.0,SUPER AMOLED
7,49999.0,4.0,89,snapdragon,octa,3.2,1,1,1,12.0,256.0,Meizu,4800.0,65.0,6.79,0,120.0,511.46,3,1,50.0,32.0,0,0.0,OLED
1377,24990.0,4.05,82,snapdragon,octa,2.2,0,0,1,8.0,256.0,Oppo,5000.0,67.0,6.72,0,120.0,391.64,3,1,64.0,32.0,1,1024.0,LCD
2885,17999.0,4.75,81,snapdragon,octa,2.8,0,0,0,6.0,128.0,Honor,5000.0,0.0,6.7,0,90.0,394.44,2,1,108.0,50.0,0,0.0,AMOLED
2220,7990.0,4.35,57,unisoc,octa,1.6,0,0,0,2.0,64.0,Nokia,4000.0,0.0,6.3,0,90.0,278.5,1,1,8.0,5.0,1,256.0,LCD


In [220]:
df.drop_duplicates(inplace = True)

In [221]:
df[df['processor_core'] == 'nine']

Unnamed: 0,price,rating,spec_score,processor_brand,processor_core,clock_speed,has_ir,has_nfc,has_5g,ram_gb,rom_gb,brand,battery_capacity_mah,fast_charging_watt,screen_size_inch,foldable_display,display_refresh_rate,ppi,num_rear_cameras,num_front_cameras,rear_primary_mp,front_primary_mp,expandable_memory_supported,expandable_memory_limit,screen_type
3212,47999.0,4.1,78,tensor,nine,3.0,0,1,1,8.0,256.0,Google,4575.0,30.0,6.2,0,120.0,424.48,2,1,50.0,5.0,0,0.0,OLED
3235,63999.0,4.5,77,tensor,nine,3.0,0,1,1,8.0,128.0,Google,4575.0,30.0,6.2,0,120.0,424.48,2,1,50.0,5.0,0,0.0,OLED


In [222]:
# Removing the above two rows as there are two phones only.
df.drop(index = [3212, 3235], inplace = True)

In [223]:
# Make the spec_score column a categorical one by quantile binning
df['spec_score_cat'] = pd.qcut(df['spec_score'], q = 3, labels = ['Low', 'Mid', 'High'])

In [224]:
print(df['spec_score'].quantile(0.33), df['spec_score'].quantile(0.66), df['spec_score'].max())

74.0 83.0 99


In [225]:
df['rating'].describe()

count    4055.000000
mean        4.357719
std         0.232680
min         3.000000
25%         4.150000
50%         4.350000
75%         4.550000
max         4.750000
Name: rating, dtype: float64

In [226]:
bins = [0, 4.15, 4.55, 5]
labels = ['Low', 'Medium', 'High']
df['rating_category'] = pd.cut(df['rating'], bins=bins, labels=labels, right=False)

In [227]:
df['rating_category'].value_counts()

rating_category
Medium    2072
High      1156
Low        827
Name: count, dtype: int64

In [228]:
# Checking the relationship bewtween rating category and price

In [229]:
for category in df['rating_category'].unique():
    data = df[df["rating_category"] == category]['price']

    stat, p_value = stats.shapiro(data)
    print(f"score: {category} - Shapiro-Wilk Test p-value: {p_value}")

score: Medium - Shapiro-Wilk Test p-value: 3.1031902408325807e-53
score: Low - Shapiro-Wilk Test p-value: 3.920217479686296e-38
score: High - Shapiro-Wilk Test p-value: 9.153676858495966e-43


In [230]:
# We can see above that assumptions of ANOVA is violated. So, let's apply kruskal test to check relationship between rating_category and price.

In [231]:
grouped_prices = [df[df['rating_category'] == category]['price'].to_list() for category in df['rating_category'].unique()]
stat, p_value = kruskal(*grouped_prices)
print("Kruskal-Wallis Test p-value:", p_value)

Kruskal-Wallis Test p-value: 0.019470250908200735


In [232]:
# We can see that there is significant relationship between price and rating_category.

In [233]:
# dropping the columns - rating and spec_score

In [234]:
df.drop(columns = ['rating', 'spec_score'], inplace = True)

In [235]:
df.columns

Index(['price', 'processor_brand', 'processor_core', 'clock_speed', 'has_ir',
       'has_nfc', 'has_5g', 'ram_gb', 'rom_gb', 'brand',
       'battery_capacity_mah', 'fast_charging_watt', 'screen_size_inch',
       'foldable_display', 'display_refresh_rate', 'ppi', 'num_rear_cameras',
       'num_front_cameras', 'rear_primary_mp', 'front_primary_mp',
       'expandable_memory_supported', 'expandable_memory_limit', 'screen_type',
       'spec_score_cat', 'rating_category'],
      dtype='object')

In [236]:
X = df.drop(columns = 'price')
y = df['price']

In [237]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Model selection using all the features

### Testing different models after encoding the categorical columns using target encoding only.

In [174]:
cols =  ['processor_brand', 'processor_core', 'brand', 'screen_type', 'spec_score_cat', 'rating_category']

In [175]:
preprocessor = ColumnTransformer(transformers = [
    ('target_encoder', TargetEncoder(), cols)
], remainder = 'passthrough')

In [176]:
def test_model(preprocessor, model):
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='r2')
    print("Average R2 Score:", scores.mean())
    print("Standard Deviaton of scores:", scores.std())

In [177]:
test_model(preprocessor, DecisionTreeRegressor())

Average R2 Score: 0.7548168870922597
Standard Deviaton of scores: 0.05103646688743695


In [178]:
test_model(preprocessor, RandomForestRegressor())

Average R2 Score: 0.8543800574578151
Standard Deviaton of scores: 0.038630452234504994


In [179]:
test_model(preprocessor, XGBRegressor())

Average R2 Score: 0.8579676483021
Standard Deviaton of scores: 0.0293011564445927


In [180]:
test_model(preprocessor, GradientBoostingRegressor())

Average R2 Score: 0.8542565201006447
Standard Deviaton of scores: 0.034066332572567165


In [181]:
# Now, we are going to try SVR with 'rbf' kernel as there is high multicollinearity among the columns of our data. So we can't use linear kernel.
# But we have to transform our target column i.e., price first and also, scale our data.
# We will use TransformedTargetRegressor class under our main pipeline to automate the transformation of target column.
# The main advantage of using the TransformedTargetRegressor class is that the pipeline will predict and give the actual price, rather than the transformed price.

In [182]:
preprocessor_svr = Pipeline([
    ('target_encoding', ColumnTransformer([
        ('target_encoder', TargetEncoder(), cols)
    ], remainder = 'passthrough')),
    ('scaler', StandardScaler())
])

In [183]:
pipeline = Pipeline([
    ('preprocessor', preprocessor_svr),
    ('model', TransformedTargetRegressor(
        regressor = SVR(kernel = 'rbf'),
        transformer = PowerTransformer()
    ))
])

# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='r2')
print("Average R2 Score:", scores.mean())
print("Standard Deviaton of scores:", scores.std())

Average R2 Score: 0.8039342526764427
Standard Deviaton of scores: 0.04826712939540048


### Testing different models after encoding the categorical columns using target encoding, ordinal encoding and OneHotEncoding.

In [184]:
preprocessor = ColumnTransformer(transformers = [
    ('target_encoder', TargetEncoder(), ['brand', 'processor_brand']),
    ('ordinal_encoder', OrdinalEncoder(categories = [['Low', 'Mid', 'High'], ['Low', 'Medium', 'High']]), ['spec_score_cat', 'rating_category']),
    ('ohe', OneHotEncoder(drop = 'first', sparse_output = False, dtype = np.int32), ['processor_core', 'screen_type'])
], remainder = 'passthrough')

In [185]:
test_model(preprocessor, DecisionTreeRegressor())

Average R2 Score: 0.7536577822827105
Standard Deviaton of scores: 0.06290972678650214


In [186]:
test_model(preprocessor, RandomForestRegressor())

Average R2 Score: 0.8560865064075998
Standard Deviaton of scores: 0.03919832221657769


In [187]:
test_model(preprocessor, XGBRegressor())

Average R2 Score: 0.8561412489098004
Standard Deviaton of scores: 0.03252761944210712


In [188]:
test_model(preprocessor, GradientBoostingRegressor())

Average R2 Score: 0.8548233808636281
Standard Deviaton of scores: 0.03483839911080003


In [189]:
preprocessor_svr = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler())
])

In [190]:
pipeline = Pipeline([
    ('preprocessor', preprocessor_svr),
    ('model', TransformedTargetRegressor(
        regressor = SVR(kernel = 'rbf'),
        transformer = PowerTransformer()
    ))
])

# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='r2')
print("Average R2 Score:", scores.mean())
print("Standard Deviaton of scores:", scores.std())

Average R2 Score: 0.7964734918766324
Standard Deviaton of scores: 0.04388391254419991


In [191]:
X_train.head()

Unnamed: 0,processor_brand,processor_core,clock_speed,has_ir,has_nfc,has_5g,ram_gb,rom_gb,brand,battery_capacity_mah,fast_charging_watt,screen_size_inch,foldable_display,display_refresh_rate,ppi,num_rear_cameras,num_front_cameras,rear_primary_mp,front_primary_mp,expandable_memory_supported,expandable_memory_limit,screen_type,spec_score_cat,rating_category
3602,unisoc,octa,1.8,0,1,0,4.0,128.0,Realme,5000.0,45.0,6.75,0,90.0,259.93,2,1,50.0,8.0,1,1024.0,LCD,Low,Medium
436,dimensity,octa,3.35,1,1,1,12.0,512.0,Oppo,5800.0,80.0,6.83,0,120.0,450.28,3,1,50.0,50.0,0,0.0,AMOLED,High,Medium
3485,dimensity,octa,2.6,0,0,1,8.0,256.0,Lava,5000.0,33.0,6.67,0,120.0,394.57,3,1,64.0,32.0,0,0.0,AMOLED,High,Medium
3651,unisoc,octa,1.8,0,0,0,4.0,64.0,Realme,5000.0,10.0,6.74,0,90.0,260.32,1,1,8.0,5.0,1,1024.0,LCD,Low,Medium
3975,snapdragon,octa,3.2,1,1,1,8.0,256.0,OnePlus,5500.0,100.0,6.78,0,120.0,450.42,3,1,50.0,16.0,0,0.0,AMOLED,High,High


# Model selection by dropping less important features

In [192]:
# Now, let's drop those features which had very less importances and see if this affects the performance of our models or not.

In [194]:
features_to_drop = [
    'expandable_memory_limit',
    'has_ir',
    'processor_core',
    'expandable_memory_supported',
    'rating_category',
    'num_front_cameras'
]

# Drop them from X_train and X_test
X_train = X_train.drop(columns = features_to_drop)
X_test = X_test.drop(columns = features_to_drop)

### Testing different models after encoding the categorical columns using target encoding only

In [195]:
cols =  ['processor_brand', 'brand', 'screen_type', 'spec_score_cat']

In [196]:
preprocessor = ColumnTransformer(transformers = [
    ('target_encoder', TargetEncoder(), cols)
], remainder = 'passthrough')

In [197]:
def test_model(preprocessor, model):
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='r2')
    print("Average R2 Score:", scores.mean())
    print("Standard Deviaton of scores:", scores.std())

In [198]:
test_model(preprocessor, DecisionTreeRegressor())

Average R2 Score: 0.7617458309465082
Standard Deviaton of scores: 0.059192054663523036


In [199]:
test_model(preprocessor, RandomForestRegressor())

Average R2 Score: 0.856411320184761
Standard Deviaton of scores: 0.03776177958403921


In [200]:
test_model(preprocessor, XGBRegressor())

Average R2 Score: 0.8554775885916561
Standard Deviaton of scores: 0.02894281009790612


In [201]:
test_model(preprocessor, GradientBoostingRegressor())

Average R2 Score: 0.8545236169133595
Standard Deviaton of scores: 0.03476492942503379


In [202]:
preprocessor_svr = Pipeline([
    ('target_encoding', ColumnTransformer([
        ('target_encoder', TargetEncoder(), cols)
    ], remainder = 'passthrough')),
    ('scaler', StandardScaler())
])

In [203]:
pipeline = Pipeline([
    ('preprocessor', preprocessor_svr),
    ('model', TransformedTargetRegressor(
        regressor = SVR(kernel = 'rbf'),
        transformer = PowerTransformer()
    ))
])

# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='r2')
print("Average R2 Score:", scores.mean())
print("Standard Deviaton of scores:", scores.std())

Average R2 Score: 0.8178201724018155
Standard Deviaton of scores: 0.035631575818977676


### Testing different models after encoding the categorical columns using target encoding, ordinal encoding and OneHotEncoding.

In [206]:
preprocessor = ColumnTransformer(transformers = [
    ('target_encoder', TargetEncoder(), ['brand', 'processor_brand']),
    ('ordinal_encoder', OrdinalEncoder(categories = [['Low', 'Mid', 'High']]), ['spec_score_cat']),
    ('ohe', OneHotEncoder(drop = 'first', sparse_output = False, dtype = np.int32), ['screen_type'])
], remainder = 'passthrough')

In [207]:
test_model(preprocessor, DecisionTreeRegressor())

Average R2 Score: 0.753952533638178
Standard Deviaton of scores: 0.048310616423494254


In [208]:
test_model(preprocessor, RandomForestRegressor())

Average R2 Score: 0.8572613206861763
Standard Deviaton of scores: 0.03853483523286355


In [209]:
test_model(preprocessor, XGBRegressor())

Average R2 Score: 0.8532320972345928
Standard Deviaton of scores: 0.033482100186888096


In [210]:
test_model(preprocessor, GradientBoostingRegressor())

Average R2 Score: 0.8535402014181308
Standard Deviaton of scores: 0.03592226727336741


In [211]:
preprocessor_svr = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler())
])

In [212]:
pipeline = Pipeline([
    ('preprocessor', preprocessor_svr),
    ('model', TransformedTargetRegressor(
        regressor = SVR(kernel = 'rbf'),
        transformer = PowerTransformer()
    ))
])

# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='r2')
print("Average R2 Score:", scores.mean())
print("Standard Deviaton of scores:", scores.std())

Average R2 Score: 0.810978286300655
Standard Deviaton of scores: 0.03558089340164385


# Conclusion
- After removing the less important features, the performance of each model either remained consistent or showed slight improvement.
- This indicates that a lower-dimensional dataset can be as effective as, or even more effective than, a higher-dimensional one.
- Additionally, the performance of all models using only Target Encoding for categorical columns was comparable to the performance achieved using a combination of One-Hot Encoding, Ordinal Encoding, and Target Encoding.
- Therefore, Target Encoding will be preferred for categorical variables, as it results in a lower-dimensional and more efficient dataset without compromising performance.
- Among the models tested, Random Forest, XGBoost, and Gradient Boosting consistently demonstrated similar levels of performance. Hence, hyperparameter tuning will be conducted for all three models to further optimize their performance.

In [238]:
df = df.drop(columns = features_to_drop)

In [239]:
df.sample(5)

Unnamed: 0,price,processor_brand,clock_speed,has_nfc,has_5g,ram_gb,rom_gb,brand,battery_capacity_mah,fast_charging_watt,screen_size_inch,foldable_display,display_refresh_rate,ppi,num_rear_cameras,rear_primary_mp,front_primary_mp,screen_type,spec_score_cat
1086,14994.0,snapdragon,2.0,0,1,8.0,128.0,Vivo,5000.0,18.0,6.58,0,90.0,401.08,2,48.0,8.0,LCD,Mid
3679,11499.0,dimensity,2.4,1,1,4.0,128.0,Samsung,5000.0,25.0,6.7,0,90.0,384.66,3,50.0,13.0,SUPER AMOLED,Mid
2771,7990.0,helio,2.3,0,0,4.0,64.0,Motorola,5000.0,10.0,6.5,0,90.0,269.93,2,50.0,8.0,LCD,Low
3172,34998.0,dimensity,2.8,1,1,12.0,512.0,Xiaomi,5000.0,120.0,6.67,0,120.0,445.84,3,200.0,16.0,AMOLED,High
2456,32990.0,snapdragon,3.2,1,1,12.0,256.0,Honor,5800.0,33.0,6.78,0,120.0,430.55,2,108.0,8.0,AMOLED,High


In [242]:
X = df.drop(columns = 'price')
y = df['price']

In [243]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

### Hyperparameter tuning of XGBoost

In [288]:
cols =  ['processor_brand', 'brand', 'screen_type', 'spec_score_cat']

preprocessor = ColumnTransformer(transformers = [
    ('target_encoder', TargetEncoder(), cols)
], remainder = 'passthrough')

X_train_transformed = preprocessor.fit_transform(X_train, y_train)
X_test_transformed = preprocessor.transform(X_test)

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'tree_method': 'auto',
        'objective': 'reg:squarederror',
        'random_state': 42,
    }

    model = XGBRegressor(**params)
    model.fit(X_train_transformed, y_train)

    preds = model.predict(X_test_transformed)
    r2 = r2_score(y_test, preds)

    return r2

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective, n_trials=50)

print("Best trial:")
print(f"  Params: {study_xgb.best_trial.params}")
print(f"  R² score: {study_xgb.best_value}")

[I 2025-04-26 18:18:04,909] A new study created in memory with name: no-name-840c81e0-6098-487d-8df8-c32a11e25214
[I 2025-04-26 18:18:05,423] Trial 0 finished with value: 0.854986734442029 and parameters: {'n_estimators': 518, 'max_depth': 10, 'learning_rate': 0.011226116782546198, 'subsample': 0.9047576059554648, 'colsample_bytree': 0.9489439174926683, 'gamma': 3.8603167699615604, 'reg_alpha': 0.2289457280148379, 'reg_lambda': 0.652800141669396, 'min_child_weight': 10}. Best is trial 0 with value: 0.854986734442029.
[I 2025-04-26 18:18:05,649] Trial 1 finished with value: 0.8506645069474075 and parameters: {'n_estimators': 766, 'max_depth': 4, 'learning_rate': 0.21014700048066756, 'subsample': 0.5506739747845623, 'colsample_bytree': 0.4151571136765115, 'gamma': 3.340884766168407, 'reg_alpha': 0.0059756935550785055, 'reg_lambda': 0.789777137657185, 'min_child_weight': 5}. Best is trial 0 with value: 0.854986734442029.
[I 2025-04-26 18:18:05,819] Trial 2 finished with value: 0.859901893

Best trial:
  Params: {'n_estimators': 271, 'max_depth': 9, 'learning_rate': 0.04797380284223506, 'subsample': 0.846307927391529, 'colsample_bytree': 0.6252424928201546, 'gamma': 4.3050954528712, 'reg_alpha': 0.09154105086066389, 'reg_lambda': 0.010837945807626728, 'min_child_weight': 4}
  R² score: 0.8858425300069908


In [289]:
best_xgb_model = XGBRegressor(**study.best_trial.params)

In [290]:
def test_model(model):
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='r2')
    print("Average R2 Score:", scores.mean())
    print("Standard Deviaton of scores:", scores.std())

In [291]:
test_model(best_xgb_model)

Average R2 Score: 0.8650689516861361
Standard Deviaton of scores: 0.03695960457483207


In [292]:
def get_mae_mape(model):
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model),
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100  # Mean Absolute Percentage Error
    print(f"MAE: ₹{mae:.2f}")
    print(f"MAPE: {mape:.2f}%")

In [293]:
get_mae_mape(best_xgb_model)

MAE: ₹4762.19
MAPE: 19.02%


### Hyperparameter tuning of RandomForestRegressor

In [280]:
cols =  ['processor_brand', 'brand', 'screen_type', 'spec_score_cat']

preprocessor = ColumnTransformer(transformers = [
    ('target_encoder', TargetEncoder(), cols)
], remainder = 'passthrough')

X_train_transformed = preprocessor.fit_transform(X_train, y_train)
X_test_transformed = preprocessor.transform(X_test)

def objective_rf(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'random_state': 42,
    }

    model = RandomForestRegressor(**params)
    model.fit(X_train_transformed, y_train)

    preds = model.predict(X_test_transformed)
    r2 = r2_score(y_test, preds)

    return r2

study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf, n_trials=50)

print("Best trial for Random Forest:")
print(f"  Params: {study_rf.best_trial.params}")
print(f"  R² score: {study_rf.best_value}")

[I 2025-04-26 18:11:53,512] A new study created in memory with name: no-name-86663e6c-7c95-4cd5-b5de-9f642028764f
[I 2025-04-26 18:11:58,807] Trial 0 finished with value: 0.7519459417626171 and parameters: {'n_estimators': 366, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None, 'bootstrap': False}. Best is trial 0 with value: 0.7519459417626171.
[I 2025-04-26 18:12:00,619] Trial 1 finished with value: 0.8650458536595053 and parameters: {'n_estimators': 514, 'max_depth': 14, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 1 with value: 0.8650458536595053.
[I 2025-04-26 18:12:01,885] Trial 2 finished with value: 0.843645309716202 and parameters: {'n_estimators': 415, 'max_depth': 15, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 1 with value: 0.8650458536595053.
[I 2025-04-26 18:12:03,103] Trial 3 finished with value: 0.8491324793175744 

Best trial for Random Forest:
  Params: {'n_estimators': 991, 'max_depth': 19, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': False}
  R² score: 0.8668889362705167


In [281]:
best_rf_model = RandomForestRegressor(**study_rf.best_trial.params)

In [282]:
test_model(best_rf_model)

Average R2 Score: 0.8647599571907325
Standard Deviaton of scores: 0.035176924180237


In [283]:
get_mae_mape(best_rf_model)

MAE: ₹4732.42
MAPE: 18.53%


### Hyperparameter tuning of GradientBoostingRegressor

In [284]:
cols =  ['processor_brand', 'brand', 'screen_type', 'spec_score_cat']

preprocessor = ColumnTransformer(transformers = [
    ('target_encoder', TargetEncoder(), cols)
], remainder = 'passthrough')

X_train_transformed = preprocessor.fit_transform(X_train, y_train)
X_test_transformed = preprocessor.transform(X_test)

def objective_gb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'random_state': 42,
    }

    model = GradientBoostingRegressor(**params)
    model.fit(X_train_transformed, y_train)

    preds = model.predict(X_test_transformed)
    r2 = r2_score(y_test, preds)

    return r2

study_gb = optuna.create_study(direction='maximize')
study_gb.optimize(objective_gb, n_trials=50)

print("Best trial for Gradient Boosting:")
print(f"  Params: {study_gb.best_trial.params}")
print(f"  R² score: {study_gb.best_value}")

[I 2025-04-26 18:15:06,024] A new study created in memory with name: no-name-7ba8001c-24fc-4aea-88e6-2e4049ac43c8
[I 2025-04-26 18:15:11,191] Trial 0 finished with value: 0.8510084794365662 and parameters: {'n_estimators': 672, 'max_depth': 9, 'learning_rate': 0.13115130248354298, 'subsample': 0.9036887785655201, 'max_features': None}. Best is trial 0 with value: 0.8510084794365662.
[I 2025-04-26 18:15:13,477] Trial 1 finished with value: 0.8445176828377163 and parameters: {'n_estimators': 809, 'max_depth': 3, 'learning_rate': 0.19495620273313233, 'subsample': 0.8857188978003381, 'max_features': None}. Best is trial 0 with value: 0.8510084794365662.
[I 2025-04-26 18:15:13,752] Trial 2 finished with value: 0.86595981549852 and parameters: {'n_estimators': 236, 'max_depth': 3, 'learning_rate': 0.15065711545221488, 'subsample': 0.8319896416924288, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.86595981549852.
[I 2025-04-26 18:15:16,527] Trial 3 finished with value: 0.8435365229403

Best trial for Gradient Boosting:
  Params: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.10423378724954513, 'subsample': 0.9135222553291025, 'max_features': 'sqrt'}
  R² score: 0.8822356172737136


In [285]:
best_gb_model = GradientBoostingRegressor(**study_gb.best_trial.params)

In [286]:
test_model(best_gb_model)

Average R2 Score: 0.8662889706567432
Standard Deviaton of scores: 0.04030737167235898


In [287]:
get_mae_mape(best_gb_model)

MAE: ₹4768.96
MAPE: 18.93%


In [295]:
voting_model = VotingRegressor([
    ('rf', best_rf_model),
    ('xgb', best_xgb_model),
    ('gbr', best_gb_model)
])

In [296]:
test_model(voting_model)

Average R2 Score: 0.8716882035143154
Standard Deviaton of scores: 0.03623705154905132


In [297]:
get_mae_mape(voting_model)

MAE: ₹4624.98
MAPE: 18.42%


In [299]:
X_train.sample(5)

Unnamed: 0,processor_brand,clock_speed,has_nfc,has_5g,ram_gb,rom_gb,brand,battery_capacity_mah,fast_charging_watt,screen_size_inch,foldable_display,display_refresh_rate,ppi,num_rear_cameras,rear_primary_mp,front_primary_mp,screen_type,spec_score_cat
480,snapdragon,1.8,0,0,3.0,32.0,Samsung,5000.0,15.0,6.5,0,90.0,269.93,3,13.0,5.0,LCD,Low
2480,snapdragon,2.2,1,1,8.0,256.0,Motorola,5000.0,30.0,6.7,0,120.0,392.81,2,50.0,32.0,OLED,High
3643,snapdragon,4.32,1,1,12.0,256.0,Iqoo,6000.0,120.0,6.82,0,144.0,510.25,3,50.0,32.0,AMOLED,High
3883,dimensity,3.25,1,1,12.0,512.0,Poco,5500.0,100.0,6.67,0,144.0,513.44,3,50.0,32.0,OLED,High
3551,snapdragon,1.4,0,0,3.0,32.0,Huawei,4000.0,0.0,5.5,0,90.0,267.02,1,12.0,8.0,LCD,Low


In [303]:
voting_model = VotingRegressor([
    ('rf', best_rf_model),
    ('xgb', best_xgb_model),
    ('gbr', best_gb_model)
])

cols =  ['processor_brand', 'brand', 'screen_type', 'spec_score_cat']

preprocessor = ColumnTransformer(transformers = [
    ('target_encoder', TargetEncoder(), cols)
], remainder = 'passthrough')

pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', voting_model)
    ])

pipeline.fit(X, y)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [1]:
with open('smartphone_price_predictor.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

NameError: name 'pickle' is not defined

In [308]:
with open('df.pkl', 'wb') as f:
    pickle.dump(df, f)