All in 1 using sklearn pipline

In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
#Modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
np.random.seed(7)

data = pd.read_csv('Cell_data.csv')
data = data.drop('Unnamed: 0' , axis=1)
data.dropna(subset = ['Price'], inplace = True)
data['Price']= data['Price'].str.replace('$','')
data['Price']= data['Price'].str.replace(',','')
data['Price']= data['Price'].astype(int)
#Define different features and Transformer pipline
cat_features         = ['Name','Brand','Model','Touchscreen','Operating system','Wi-Fi','Bluetooth','GPS','3G','4G/ LTE']
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'missing')),
    ('onehot', OneHotEncoder(handle_unknown= 'ignore'))
])
###################111111111111111##########################
Battery_features     = ['Battery capacity (mAh)']
Battery_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy= 'constant', fill_value = 2500 ) )
###########################################################
])
Screen_features      = ['Screen size (inches)']
Screen_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy= 'mean') )
])
##########################################################
Resolutionx_features = ['Resolution x']
Resolutionx_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy= 'constant', fill_value = 980 ) )
])
##############################################################
Resolutiony_features = ['Resolution y']
Resolutiony_transformer  =  Pipeline(steps = [
    ('imputer', SimpleImputer(strategy= 'constant', fill_value = 1290 ) )
]) 
######################################################################
Processor_features   =  ['Processor'] 
Processor_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy= 'constant', fill_value = 1950 ) )
])
###############################################################
RAM_features         =  ['RAM (MB)']
RAM_transformer =  Pipeline(steps = [
    ('imputer', SimpleImputer(strategy= 'constant', fill_value = 3500 ) )
])
################################################################
Internal_features    =  ['Internal storage (GB)']
Internal_transformer =  Pipeline(steps = [
    ('imputer', SimpleImputer(strategy= 'constant', fill_value = 128 ) )
])
################################################################
Rear_features        =  ['Rear camera'] 
Rear_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy= 'constant', fill_value = 48 ) )
])
################################################################
Front_features       =  ['Front camera']
Front_transformer =  Pipeline(steps = [
    ('imputer', SimpleImputer(strategy= 'constant', fill_value = 24 ) )
])
##################################################################
SIMs_features        =  ['Number of SIMs']
SIMs_transformer  =  Pipeline(steps = [
    ('imputer', SimpleImputer(strategy= 'constant', fill_value =2 ) )
])
#################################################################

#setup the preprocessing setup (fill the missing values, convert to numbers)
preprocessing = ColumnTransformer(
    transformers = [
        ('cat', cat_transformer, cat_features),
        ('Battery', Battery_transformer, Battery_features),
        ('Screen',Screen_transformer,Screen_features),
        ('Resolutionx',Resolutionx_transformer,Resolutiony_features),
        ('Resolutiony',Resolutiony_transformer, Resolutiony_features ),
        ('Processor',Processor_transformer,Processor_features),
        ('RAM',RAM_transformer, RAM_features),
        ('Internal',Internal_transformer,Internal_features),
        ('Rear',Rear_transformer,Rear_features),
        ('Front',Front_transformer, Front_features),
        ('SIMs', SIMs_transformer,SIMs_features)
    ]
    
)
model = Pipeline(steps=[
    ('preprocessing',preprocessing),
    ('model', RandomForestRegressor())
])

#split data
x= data.drop('Price', axis=1)
y= data['Price']
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.20)
#now run the fit and evalute evalute model
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.4891705441389175

In [87]:
#Tuning Hyper parametrs using GridSearchCV by Pipeline

hparams3 = {
    'preprocessing__Screen__imputer__strategy':['mean', 'median'],
    'model__max_depth' :[10,20],
    'model__n_estimators' : [100, 500,1000],
    'model__min_samples_split': [2,4],
    'model__min_samples_leaf': [1,2],
    'model__max_features': ['auto','sqrt']
}
my_gscv_model = GridSearchCV(model, hparams3, cv=5, verbose=2)
my_gscv_model.fit(x_train,y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV] END model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessing__Screen__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessing__Screen__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessing__Screen__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessing__Screen__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators

240 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Usama Zafar\Desktop\ml_project\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Usama Zafar\Desktop\ml_project\venv\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Usama Zafar\Desktop\ml_project\venv\Lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "

In [88]:
my_gscv_model.score(x_test, y_test)

0.5725433861789126