# 1. Load dataset

#### In this notebook, we are building pipeline, doing k-fold cross validation and performing hyperparameter tuning.
#### Dataset: mobile phone dataset (mobile_train.csv).

In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

In [2]:
df = pd.read_csv('mobile_train.csv')

In [3]:
# Split the dataset into train and test
X = df.drop('price_range', axis=1)
Y = df['price_range']
seed = 7
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)

# 2. Tuning

#### Build a random forest classifier model and perform hyperparameter tuning using grid search. Also applying 5-fold cross validation while doing searching. Following values will be used for the tuning:
- n_estimators - 100, 200, 300
- max_depth - 5, 7, 9, 11, 13
- criterion - gini, entropy

In [4]:
params = {'n_estimators': [100, 200, 300], 'max_depth': [5, 7, 9, 11, 13], 'criterion': ['gini', 'entropy']}

clf = GridSearchCV(RandomForestClassifier(), param_grid=params, cv=5)

In [5]:
clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 7, 9, 11, 13],
                         'n_estimators': [100, 200, 300]})

#### Get the best score and optimal values for hyperparameters.

In [6]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 13, 'n_estimators': 200}

In [7]:
clf.score(X_test, y_test)

0.88

#### Inspecting the values and checking if we could do better with more parameters

In [8]:
# Best parameters: n_estimators: 500, max_depth: unknown yet, criterion: entropy
params_best = {'n_estimators': [300, 400, 500, 500], 'max_depth': [13, 15, 17, 19, 21, 23, 25], 'criterion': ['gini', 'entropy']}

clf_best = GridSearchCV(RandomForestClassifier(), param_grid=params_best, cv=5)

In [9]:
# NOTE: This takes more than 2-3 min
clf_best.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [13, 15, 17, 19, 21, 23, 25],
                         'n_estimators': [300, 400, 500, 500]})

In [11]:
clf_best.best_params_

{'criterion': 'entropy', 'max_depth': 23, 'n_estimators': 400}

In [12]:
clf_best.score(X_test, y_test)

0.8775

#### Perform hyperparameter tuning using random search. Increase number of iterations if needed

In [13]:
clf = RandomizedSearchCV(RandomForestClassifier(), params_best, cv=5, random_state=42)

In [14]:
clf.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [13, 15, 17, 19, 21, 23,
                                                      25],
                                        'n_estimators': [300, 400, 500, 500]},
                   random_state=42)

In [15]:
clf.best_params_

{'n_estimators': 300, 'max_depth': 21, 'criterion': 'entropy'}

In [16]:
clf.score(X_test, y_test)

0.8825

#### Create a pipeline and add standard scaling and dimensionality reduction. We will use StandardScaler and PCA as well as performing tuning by random search. Now we have to provide values for hyperparameters of different components of your pipeline.  Tried to isolate and check the effect of scaling and dimensionality reduction on the model.

**Pipeline with StandardScalar, PCA and RandomForestClassifier**

In [17]:
pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('random_forest', RandomForestClassifier())
    ]
)

In [18]:
# If multiple preprocessing is done inside pipeline, the hyperparameters should be 
# given as: <estimator_name>__<hyper_parameter>
pipe_params = {
    'pca__n_components': [5, 7, 9, 11, 13, 15, 17, 19],
    'random_forest__n_estimators': [300, 400, 500, 600], 
    'random_forest__max_depth': [13, 15, 17, 19, 21, 23, 25],
    'random_forest__criterion': ['gini', 'entropy']
}

pipe_clf = RandomizedSearchCV(pipe, pipe_params, cv=5)

In [19]:
pipe_clf.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('pca', PCA()),
                                             ('random_forest',
                                              RandomForestClassifier())]),
                   param_distributions={'pca__n_components': [5, 7, 9, 11, 13,
                                                              15, 17, 19],
                                        'random_forest__criterion': ['gini',
                                                                     'entropy'],
                                        'random_forest__max_depth': [13, 15, 17,
                                                                     19, 21, 23,
                                                                     25],
                                        'random_forest__n_estimators': [300,
                                                                        400,

In [20]:
pipe_clf.best_params_

{'random_forest__n_estimators': 300,
 'random_forest__max_depth': 19,
 'random_forest__criterion': 'gini',
 'pca__n_components': 17}

In [21]:
pipe_clf.score(X_test, y_test)

0.6975

**Pipeline with only StandardScalar and RandomForestClassifier**

In [22]:
pipe_without_pca = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('random_forest', RandomForestClassifier())
    ]
)

In [23]:
pipe_without_pca_params = {
    'random_forest__n_estimators': [300, 400, 500, 600], 
    'random_forest__max_depth': [13, 15, 17, 19, 21, 23, 25],
    'random_forest__criterion': ['gini', 'entropy']
}

pipe_without_pca_clf = RandomizedSearchCV(pipe, pipe_without_pca_params, cv=5)

In [24]:
pipe_without_pca_clf.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('pca', PCA()),
                                             ('random_forest',
                                              RandomForestClassifier())]),
                   param_distributions={'random_forest__criterion': ['gini',
                                                                     'entropy'],
                                        'random_forest__max_depth': [13, 15, 17,
                                                                     19, 21, 23,
                                                                     25],
                                        'random_forest__n_estimators': [300,
                                                                        400,
                                                                        500,
                                                                        600]

In [25]:
pipe_without_pca_clf.best_params_

{'random_forest__n_estimators': 500,
 'random_forest__max_depth': 15,
 'random_forest__criterion': 'gini'}

In [26]:
pipe_without_pca_clf.score(X_test, y_test)

0.6925

**Pipeline with only PCA and RandomForestClassifier**

In [27]:
pipe_without_scalar = Pipeline(
    [
        ('pca', PCA()),
        ('random_forest', RandomForestClassifier())
    ]
)

In [28]:
pipe_without_scalar_params = {
    'pca__svd_solver': ['auto', 'full', 'arpack', 'randomized'],
    'pca__n_components': [5, 7, 9, 11, 13, 15, 17, 19],
    'random_forest__n_estimators': [300, 400, 500, 600], 
    'random_forest__max_depth': [13, 15, 17, 19, 21, 23, 25],
    'random_forest__criterion': ['gini', 'entropy']
}

pipe_without_scalar_clf = RandomizedSearchCV(pipe, pipe_without_scalar_params, cv=5)

In [29]:
pipe_without_scalar_clf.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('pca', PCA()),
                                             ('random_forest',
                                              RandomForestClassifier())]),
                   param_distributions={'pca__n_components': [5, 7, 9, 11, 13,
                                                              15, 17, 19],
                                        'pca__svd_solver': ['auto', 'full',
                                                            'arpack',
                                                            'randomized'],
                                        'random_forest__criterion': ['gini',
                                                                     'entropy'],
                                        'random_forest__max_depth': [13, 15, 17,
                                                                     19, 21, 23,
      

In [30]:
pipe_without_scalar_clf.best_params_

{'random_forest__n_estimators': 600,
 'random_forest__max_depth': 15,
 'random_forest__criterion': 'entropy',
 'pca__svd_solver': 'arpack',
 'pca__n_components': 17}

In [31]:
pipe_without_scalar_clf.score(X_test, y_test)

0.68

## Conclusion

1. Model performed best when no preprocessing done, such as PCA and Scaling (score: 0.8825)
2. Model performed worst when only Scaling done before Classification (score: 0.6925)
3. Model with both Scaling and PCA did slightly better than that of PCA only (score: 0.6975, 0.68, respectively)