In [120]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [121]:
# Read the cleaned smart-watch dataset into a DataFrame.
# NOTE(review): this is an absolute Colab-style path, so the notebook only runs
# where the CSV exists at exactly this location.
df = pd.read_csv(filepath_or_buffer=r"/content/Cleaned Smart-Watches data.csv")

In [122]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Brand,Model,Operating System,Bluetooth,Wi-Fi,Cellular,Display Type,Display Size (inches),Resolution,Water Resistance (meters),Battery Life (days),Heart Rate Monitor,GPS,NFC,Price (INR)
0,Apple,Watch Series 7,Watch OS,1,1,1,Retina,1.9,396 x 484,50.0,18.0,1,1,1,33117
1,Samsung,Galaxy Watch 4,Wear OS,1,1,1,AMOLED,1.4,450 x 450,50.0,40.0,1,1,1,20667
2,Garmin,Venu 2,Garmin OS,1,1,0,AMOLED,1.3,416 x 416,50.0,11.0,1,1,0,33117
3,Fitbit,Versa 3,Fitbit OS,1,1,0,AMOLED,1.58,336 x 336,50.0,6.0,1,1,1,19007
4,Fossil,Gen 6,Wear OS,1,1,0,AMOLED,1.28,416 x 416,30.0,24.0,1,1,1,24817


# Model Building and Testing

In [123]:
# Separate the predictors (every column except the price) from the target.
# NOTE(review): X still contains 'Unnamed: 0', which in the preview above counts
# 0, 1, 2, ... and looks like a saved row index rather than a real feature —
# confirm whether it should be dropped before modelling.
X = df.drop('Price (INR)', axis=1)
y = df.loc[:, 'Price (INR)']

In [124]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [125]:
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import ElasticNet

In [126]:
# Work on a copy so the loaded frame itself is not touched.
df_encod = df.copy()

# Collect the names of the object-dtype columns and report how many there are.
object_features = list(df_encod.select_dtypes(include="object").columns)
print(len(object_features))
print(object_features)

5
['Brand', 'Model', 'Operating System', 'Display Type', 'Resolution']


In [127]:
# Fit a standalone encoder on the text columns of the full feature frame X;
# its learned categories_ are reused when the pipeline's encoder is built.
ohe = OneHotEncoder()
ohe.fit(X.loc[:, ['Brand', 'Model', 'Operating System', 'Display Type', 'Resolution']])

In [128]:
# One-hot encode the five text columns with the pre-computed category lists and
# pass every remaining column through unchanged.
column_trans = make_column_transformer(
    (
        OneHotEncoder(categories=ohe.categories_),
        ['Brand', 'Model', 'Operating System', 'Display Type', 'Resolution'],
    ),
    remainder='passthrough',
)

In [129]:
# Baseline ElasticNet regressor, constructed with no arguments (library defaults).
en = ElasticNet()

In [130]:
# Chain the column transformer and the regressor into one estimator so the
# preprocessing is applied as part of fit and predict.
pipe = make_pipeline(column_trans,en)

In [131]:
# Fit the baseline pipeline on the training split.
pipe.fit(X=X_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [132]:
# Predict prices for the held-out rows with the baseline pipeline.
y_pred = pipe.predict(X=X_test)

In [133]:
# Display the baseline pipeline's test-set predictions.
y_pred

array([23271.93137844, 22594.69113964, 23535.45871827, 32151.81208393,
       23955.61439871, 26905.40164947, 32683.60623621, 20081.36324817,
       22716.67423872, 26046.83383336, 21553.73864254, 26896.05932153,
       23876.38619075, 20180.65563759, 21076.1723443 , 29001.19183799,
       27062.00450143, 24522.58280435, 26913.1403561 , 30392.12263299,
       20016.22633548, 24837.08466883, 27015.96891811, 36909.59799639,
       23001.67515798, 26913.1403561 , 23768.8903797 , 22589.35181801,
       35800.4064193 , 24997.09629684, 24625.33805712, 30544.91447944,
       23535.45871827, 23313.06528901, 23082.11343275, 22685.14498708,
       23265.68561631, 23372.38695331, 43410.44483224, 23265.68561631,
       25008.61891664, 23172.92328252, 31706.83623464, 21947.01819298,
       27101.63984216, 21714.42049949, 25513.26376857, 23116.59905846,
       25160.18583046, 25746.75664778, 21294.65096609, 22062.32802639,
       23023.0953874 , 21714.42049949, 24979.48194544, 21530.56014447,
      

In [134]:
# R² of the baseline predictions against the held-out prices.
r2_score(y_true=y_test, y_pred=y_pred)

0.34421010281668873

### Improving the model's R² score

In [135]:
# Libraries used by the hyperparameter search
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score
from scipy.stats import uniform

# Text columns that get one-hot encoded; all other columns are passed through.
categorical_cols = ['Brand', 'Model', 'Operating System', 'Display Type', 'Resolution']

# Learn the category lists from the full feature frame X, then build the
# pipeline's encoder with those fixed categories.
ohe = OneHotEncoder().fit(X[categorical_cols])
column_trans = make_column_transformer(
    (OneHotEncoder(categories=ohe.categories_, sparse_output=True), categorical_cols),
    remainder='passthrough',
)

# Scale without centering (with_mean=False) so the sparse encoder output can be
# scaled as-is.
scaler = StandardScaler(with_mean=False)

# Untuned ElasticNet; the search below sets alpha and l1_ratio on clones of it.
en = ElasticNet()

# Preprocessing + scaling + model as one estimator.
pipe = make_pipeline(column_trans, scaler, en)

# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale], so
# alpha is drawn from [0.001, 100.001] and l1_ratio from [0, 1].
param_distributions = dict(
    elasticnet__alpha=uniform(loc=0.001, scale=100),
    elasticnet__l1_ratio=uniform(loc=0, scale=1),
)

# 100 random candidates, each scored by 5-fold cross-validated R², run on all
# available cores with a fixed seed for reproducibility.
random_search = RandomizedSearchCV(
    pipe,
    param_distributions,
    n_iter=100,
    scoring='r2',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Score the best candidate found by the search on the held-out test split.
y_pred = random_search.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
print("R² Score:", test_r2)

# Report the winning hyperparameters.
print("Best Params from RandomizedSearchCV:", random_search.best_params_)

# Cross-validate the best estimator on the training split and report the mean R².
cv_scores = cross_val_score(
    random_search.best_estimator_,
    X_train,
    y_train,
    scoring='r2',
    cv=5,
)
mean_cv_r2 = cv_scores.mean()
print("Cross-validated R² Score:", mean_cv_r2)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
R² Score: 0.8723681490178629
Best Params from RandomizedSearchCV: {'elasticnet__alpha': 7.456064367977082, 'elasticnet__l1_ratio': 0.9868869366005173}
Cross-validated R² Score: 0.6892376007426172


In [140]:
# Step 1: Import necessary libraries
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score

# Step 2: Define OneHotEncoder and fit on categorical columns
ohe = OneHotEncoder()
ohe.fit(X[['Brand', 'Model', 'Operating System', 'Display Type', 'Resolution']])

# Step 3: Create the ColumnTransformer
column_trans = make_column_transformer(
    (OneHotEncoder(categories=ohe.categories_), ['Brand', 'Model', 'Operating System', 'Display Type', 'Resolution']),
    remainder='passthrough'
)

# Step 4: Define the ElasticNet model with best parameters
# The tuned values are read from the completed search instead of being copied
# by hand from its printed output, so they cannot drift out of sync if the
# search is re-run with different data or settings.
best_alpha = random_search.best_params_['elasticnet__alpha']
best_l1_ratio = random_search.best_params_['elasticnet__l1_ratio']
en_best = ElasticNet(alpha=best_alpha, l1_ratio=best_l1_ratio)

# Step 5: Create the pipeline including the ColumnTransformer and ElasticNet
# (same encoder -> scaler -> model layout that the search tuned)
pipe_best = make_pipeline(column_trans, StandardScaler(with_mean=False), en_best)

# Step 6: Fit the pipeline on the training data
pipe_best.fit(X_train, y_train)

# Step 7: Make predictions on the test set
y_pred_best = pipe_best.predict(X_test)

# Step 8: Evaluate the performance
r2 = r2_score(y_test, y_pred_best)
print("Final R² Score with Best Parameters:", r2)

Final R² Score with Best Parameters: 0.8723681490178629


In [142]:
# Display y_pred. Assuming the cells ran in the order shown, this is the array
# assigned from random_search.predict(X_test) in the tuning cell, not the
# y_pred_best computed in the cell directly above.
y_pred

array([ 22068.46569422,  22296.24582716,  25841.38597393, 126387.23903229,
        16595.63699068,  30321.6175884 ,  39459.25307799,  16517.97259061,
        24437.15088966,  31213.46598163,  24525.18184768,  23356.51331484,
        23495.9057507 ,  10638.21279246,  15028.62726991,  35607.30491588,
        23471.39638186,  22094.97416257,  17256.29403755,  49170.42820991,
        20784.96558446,  40009.31486538,  23709.86631132,  61687.13349638,
        14717.59649362,  17256.29403755,  19155.43365406,  22006.4980748 ,
        51874.33335396,  27944.92577957,  24424.37279205,  18604.45393771,
        25841.38597393,  23419.54333285,  23727.52989838,  24944.28161865,
        18260.64366645,  25959.42733584,  64811.10023864,  18260.64366645,
        14473.20784716,  22046.14158361,  48362.81945079,  14187.80971347,
        25264.29534727,  12428.7761677 ,  40064.61524349,  24061.8716477 ,
        25449.42849192,  23528.40990311,  22256.75421547,  12543.50093869,
        16535.78515685,  

In [143]:
# R² of the current y_pred against the held-out prices.
r2_score(y_true=y_test, y_pred=y_pred)

0.8723681490178629

In [144]:
import pickle

In [145]:
# Persist the fitted, tuned pipeline.
# `pipe` is only the template that was handed to RandomizedSearchCV; the search
# fits clones of it, so `pipe` itself is never fitted and a pickle of it could
# not predict after loading. `pipe_best` is the pipeline that was fitted on the
# training split with the tuned hyperparameters.
# The `with` block closes the file handle as soon as the dump completes.
with open('smart-watch_en_model_r2_872_v1.pkl', 'wb') as model_file:
    pickle.dump(pipe_best, model_file)