# Finding the model that best predicts the pH value, given the values of the red, green and blue attributes

pH recognition | downloaded from: https://www.kaggle.com/robjan/ph-recognition

1. Here, the common Python packages are imported

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

2. In this section, the dataset is read. It is important to know that the label column is sorted, so it is necessary to shuffle the rows

In [None]:
# Location of the pH colour dataset (Colab-mounted Google Drive path).
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/01_Development and codes/PH recognition | Kaggle/pH Data.csv'

# Raw data exactly as stored in the CSV file.
df_ph = pd.read_csv(DATA_PATH)

# The rows are ordered by label, so shuffle all of them (fixed seed for
# repeatability) and rebuild a clean 0..n-1 index.
df_ph_sh = df_ph.sample(frac=1, random_state=7).reset_index(drop=True)

# Quick look at the raw frame: first rows, column summary, basic statistics.
display(df_ph.head(5))
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df_ph.info()
print(df_ph.describe())

3. Here we split the data into the target to predict and the feature set; after that, the data is split into a training set and a test set

In [None]:
# Target: the 'label' column; features: every remaining column of the shuffled frame.
y = df_ph_sh['label']
X = df_ph_sh.drop(columns=['label'])

# Hold out 21% of the rows as the test set (fixed seed for a repeatable split).
X_tr, X_ts, y_tr, y_ts = train_test_split(
    X,
    y,
    test_size=0.21,
    random_state=4,
)

# GRADIENT BOOSTING CLASSIFIER WITHOUT PREPROCESSING

In [None]:
# Baseline: gradient boosting trained directly on the unscaled features.
GrBC_npp = GradientBoostingClassifier(
    learning_rate=1.5,
    n_estimators=2,
    max_depth=3,
    random_state=7,
    warm_start=True,
)
GrBC_npp.fit(X_tr, y_tr)

# Predictions on the held-out rows, then accuracy on both partitions.
y_pred_GrBC_npp = GrBC_npp.predict(X_ts)
GrBC_npp_train_acc = GrBC_npp.score(X_tr, y_tr)
GrBC_npp_test_acc = accuracy_score(y_ts, y_pred_GrBC_npp)

print('Score on the TRAIN set: ', GrBC_npp_train_acc)
print('Score on the TEST set: ', GrBC_npp_test_acc)
print('Feature importances: ', GrBC_npp.feature_importances_)
# Optional diagnostic, disabled on purpose:
#print('Train score: ' , GrBC_npp.train_score_)

Score on the TRAIN set:  0.5553398058252427
Score on the TEST set:  0.4492753623188406
Feature importances:  [0.30001758 0.44367292 0.2563095 ]


# GRADIENT BOOSTING CLASSIFIER WITH PREPROCESSING

In [None]:
# Same gradient-boosting settings as the baseline, preceded by a standardisation step.
GrBC_scale_step = ('Scale', StandardScaler())
GrBC_model_step = (
    'GrBC',
    GradientBoostingClassifier(
        learning_rate=1.5,
        n_estimators=2,
        max_depth=3,
        random_state=7,
        warm_start=True,
    ),
)
GrBC_Pipeline = Pipeline([GrBC_scale_step, GrBC_model_step])

GrBC_Pipeline.fit(X_tr, y_tr)
y_pred_GrBC_Pipeline = GrBC_Pipeline.predict(X_ts)

# Accuracy on the training rows and on the held-out rows.
print('Score on the TRAIN set: ', GrBC_Pipeline.score(X_tr, y_tr))
print('Score on the TEST set: ', accuracy_score(y_ts, y_pred_GrBC_Pipeline))

Score on the TRAIN set:  0.5553398058252427
Score on the TEST set:  0.4492753623188406


# GRADIENT BOOSTING CLASSIFIER WITH GRIDSEARCH (PREPROCESSED)

In [None]:
# Candidate values for the 'GrBC' step of the pipeline (step name + '__' + parameter).
param_grid = dict(
    GrBC__learning_rate=[0.1, 0.105],
    GrBC__n_estimators=[40, 41],
    GrBC__max_depth=[5, 6],
)

# Exhaustive search over the grid, 10-fold cross-validation, scored by accuracy;
# the best combination is refitted on the full training set.
gridSCV = GridSearchCV(
    estimator=GrBC_Pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    return_train_score=True,
)
gridSCV.fit(X_tr, y_tr)

# Search summary, printed as "caption value" lines.
# (gridSCV.cv_results_ holds the full per-candidate table if it is needed.)
grid_report = [
    ("Complete details of the model:  ", gridSCV.best_estimator_),
    ("Best scorer:                    ", gridSCV.best_score_),
    ("Best Hyperparameters choosen:   ", gridSCV.best_params_),
    ("Best Index:                     ", gridSCV.best_index_),
    ("Scorer:                         ", gridSCV.scorer_),
    ("number of splits:               ", gridSCV.n_splits_),
    ("refit_time:                     ", gridSCV.refit_time_),
]
for caption, value in grid_report:
    print(caption, value)


# BM: best model -- the refitted winner of the search.
BMGrid_GrBC_Pipeline = gridSCV.best_estimator_
y_pred_GridBMGrBC = BMGrid_GrBC_Pipeline.predict(X_ts)

print('\n\n')
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
print('Score on the TRAIN set:         ', BMGrid_GrBC_Pipeline.score(X_tr, y_tr))
print('Score on the TEST set:          ', accuracy_score(y_ts, y_pred_GridBMGrBC))

Complete details of the model:   Pipeline(memory=None,
         steps=[('Scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('GrBC',
                 GradientBoostingClassifier(ccp_alpha=0.0,
                                            criterion='friedman_mse', init=None,
                                            learning_rate=0.105,
                                            loss='deviance', max_depth=6,
                                            max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=40,
       

# GRADIENT BOOSTING CLASSIFIER WITH CROSSVALIDATION (GRIDSEARCHED)

In [None]:
# 15-fold cross-validated accuracy of the grid-search winner on the training data.
scores = cross_val_score(
    estimator=BMGrid_GrBC_Pipeline,
    X=X_tr,
    y=y_tr,
    cv=15,
    scoring='accuracy',
)

# One line per fold, followed by the aggregate statistics.
print('Scores: ')
for fold_score in scores:
    print(fold_score)
print('\n')
print("Mean:", scores.mean())
print("Standard deviation:", scores.std())
print('\n')

# Accuracy of the same (already refitted) model on the held-out test rows.
y_pred_cv_test = BMGrid_GrBC_Pipeline.predict(X_ts)
print('Score on the TEST set:          ', accuracy_score(y_ts, y_pred_cv_test))

Scores: 
0.8857142857142857
0.8
0.7142857142857143
0.8571428571428571
0.7142857142857143
0.7647058823529411
0.7647058823529411
0.7647058823529411
0.6470588235294118
0.8235294117647058
0.7941176470588235
0.7058823529411765
0.7352941176470589
0.6470588235294118
0.5882352941176471


Mean: 0.7471148459383753
Standard deviation: 0.07846445880759877


Score on the TEST set:           0.7028985507246377


# GRADIENT BOOSTING CLASSIFIER WITH PREPROCESSING, RANDOMSEARCH

In [None]:
# Candidate values for the 'GrBC' step of the pipeline (step name + '__' + parameter).
param_grid = {
    'GrBC__learning_rate': [0.1, 0.105, 0.11],
    'GrBC__n_estimators': [39, 40, 41],
    'GrBC__max_depth': [6, 7, 8],
}

# Randomized search: 15 of the 27 combinations, 10-fold CV, scored by accuracy.
# random_state pins which combinations are drawn; without it the sampled
# candidates (and therefore the reported best model) change on every run.
randSCV = RandomizedSearchCV(
    estimator=GrBC_Pipeline,
    param_distributions=param_grid,
    n_iter=15,
    scoring='accuracy',
    cv=10,
    refit=True,
    return_train_score=True,
    random_state=7,
)
randSCV.fit(X_tr, y_tr)

#print(randSCV.cv_results_)
print("Complete details of the model:  ", randSCV.best_estimator_)
print("Best scorer:                    ", randSCV.best_score_)
print("Best Hyperparameters choosen:   ", randSCV.best_params_)
print("Best Index:                     ", randSCV.best_index_)
print("Scorer:                         ", randSCV.scorer_)
print("number of splits:               ", randSCV.n_splits_)
print("refit_time:                     ", randSCV.refit_time_)


# BM: best model -- the winner of the search, refitted on the whole training set.
BMRand_GrBC_Pipeline = randSCV.best_estimator_
y_pred_RandBMGrBC = BMRand_GrBC_Pipeline.predict(X_ts)

print('\n\n')
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
print('Score on the TRAIN set:         ', BMRand_GrBC_Pipeline.score(X_tr, y_tr))
print('Score on the TEST set:          ', accuracy_score(y_ts, y_pred_RandBMGrBC))

Complete details of the model:   Pipeline(memory=None,
         steps=[('Scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('GrBC',
                 GradientBoostingClassifier(ccp_alpha=0.0,
                                            criterion='friedman_mse', init=None,
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=7, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=41,
                                            n_iter_no

# RANDOM FOREST CLASSIFIER WITHOUT PREPROCESSING

In [None]:
# Baseline: a small random forest trained directly on the unscaled features.
RFC_npp = RandomForestClassifier(
    n_estimators=10,
    max_depth=3,
    random_state=7,
    warm_start=True,
)
RFC_npp.fit(X_tr, y_tr)

# Predictions on the held-out rows, then accuracy on both partitions.
y_pred = RFC_npp.predict(X_ts)
RFC_npp_train_acc = RFC_npp.score(X_tr, y_tr)
RFC_npp_test_acc = accuracy_score(y_ts, y_pred)

print('Score on the TRAIN set: ', RFC_npp_train_acc)
print('Score on the TEST set: ', RFC_npp_test_acc)
print('Feature importances: ', RFC_npp.feature_importances_)

Score on the TRAIN set:  0.6427184466019418
Score on the TEST set:  0.6014492753623188
Feature importances:  [0.34404791 0.35462571 0.30132638]


# RANDOM FOREST CLASSIFIER WITH PREPROCESSING, RANDOMIZEDSEARCH CV, CROSSVALIDATION

In [None]:
# Standardise the features, then classify with a random forest.
RFC_Pipeline = Pipeline([
    ('Scale', StandardScaler()),
    ('RFC', RandomForestClassifier(random_state=7, warm_start=True)),
])

# Candidate values for the 'RFC' step of the pipeline (step name + '__' + parameter).
param_grid = {
    'RFC__n_estimators': [38, 39, 40, 41, 42],
    'RFC__max_depth': [3, 4, 5, 6],
}

# Randomized search: 12 of the 20 combinations, 10-fold CV, scored by accuracy.
# random_state pins which combinations are drawn; without it the sampled
# candidates (and therefore the reported best model) change on every run.
# NOTE: this rebinds the name `randSCV`.
randSCV = RandomizedSearchCV(
    estimator=RFC_Pipeline,
    param_distributions=param_grid,
    n_iter=12,
    scoring='accuracy',
    cv=10,
    refit=True,
    return_train_score=True,
    random_state=7,
)

randSCV.fit(X_tr, y_tr)

#print(randSCV.cv_results_)
print("Complete details of the model:  ", randSCV.best_estimator_)
print("Best scorer:                    ", randSCV.best_score_)
print("Best Hyperparameters choosen:   ", randSCV.best_params_)
print("Best Index:                     ", randSCV.best_index_)
print("Scorer:                         ", randSCV.scorer_)
print("number of splits:               ", randSCV.n_splits_)
print("refit_time:                     ", randSCV.refit_time_)


# BM: best model -- the winner of the search, refitted on the whole training set.
BMRand_RFC_Pipeline = randSCV.best_estimator_

# 15-fold cross-validated accuracy of the winner on the training data.
scores = cross_val_score(estimator=BMRand_RFC_Pipeline, X=X_tr, y=y_tr, cv=15, scoring='accuracy')

# Predictions of the refitted winner on the held-out test rows.
y_pred_RandBMRFC = BMRand_RFC_Pipeline.predict(X_ts)

print('++++++++++++++++Scores: ')
for fold_score in scores:
    print(fold_score)
print('\n')
print("Mean:", scores.mean())
print("Standard deviation:", scores.std())
print('\n')
print('Score on the TEST set:          ', accuracy_score(y_ts, y_pred_RandBMRFC))

Complete details of the model:   Pipeline(memory=None,
         steps=[('Scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('RFC',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=6, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=42, n_jobs=None,
                                        oob_score=False, random_state=7,
                                        verbose=0, warm_start=True))],
         verbose=Fals

# LOGISTIC REGRESSION, PREPROCESSING, RANDOMSEARCH AND CROSS VALIDATION

In [None]:
# Standardise the features, then fit a logistic regression (newton-cg solver).
LR_scale_step = ('Scale', StandardScaler())
LR_model_step = ('LR', LogisticRegression(solver='newton-cg', random_state=7))
LR_pipeline = Pipeline([LR_scale_step, LR_model_step])

LR_pipeline.fit(X_tr, y_tr)

# 15-fold cross-validated accuracy on the training data.
LR_scores = cross_val_score(
    estimator=LR_pipeline,
    X=X_tr,
    y=y_tr,
    cv=15,
    scoring='accuracy',
)

# Predictions of the fitted pipeline on the held-out test rows.
y_pred_LR = LR_pipeline.predict(X_ts)

# One line per fold, followed by the aggregate statistics and the test accuracy.
print('++++++++++++++++Scores: ')
for fold_score in LR_scores:
    print(fold_score)
print('\n')
print("Mean:", LR_scores.mean())
print("Standard deviation:", LR_scores.std())
print('\n')
print('Score on the TEST set:          ', accuracy_score(y_ts, y_pred_LR))

++++++++++++++++Scores: 
0.6285714285714286
0.6857142857142857
0.5714285714285714
0.5714285714285714
0.42857142857142855
0.5294117647058824
0.7352941176470589
0.5588235294117647
0.5882352941176471
0.5
0.6176470588235294
0.5882352941176471
0.5
0.5
0.5


Mean: 0.5668907563025211
Standard deviation: 0.07672951516079382


Score on the TEST set:           0.4420289855072464


# SUPPORT VECTOR CLASSIFIER, PREPROCESSING, RANDOMSEARCH AND CROSS VALIDATION

In [None]:
# Standardise the features, then classify with a polynomial-kernel SVM.
SVC_pipeline = Pipeline([
    ('Scale', StandardScaler()),
    ('SVC', SVC(kernel='poly', random_state=7)),
])

# Candidate values for the 'SVC' step of the pipeline (step name + '__' + parameter).
param_grid = {
    'SVC__degree': [4, 5, 6],
    'SVC__coef0': [8, 9, 10, 11],
}

# Randomized search: 7 of the 12 combinations, 10-fold CV, scored by accuracy.
# random_state pins which combinations are drawn; without it the sampled
# candidates (and therefore the reported best model) change on every run.
SVC_pipeline_randSCV = RandomizedSearchCV(
    estimator=SVC_pipeline,
    param_distributions=param_grid,
    n_iter=7,
    scoring='accuracy',
    cv=10,
    refit=True,
    return_train_score=True,
    random_state=7,
)

SVC_pipeline_randSCV.fit(X_tr, y_tr)

#print(SVC_pipeline_randSCV.cv_results_)
print("Complete details of the model:  ", SVC_pipeline_randSCV.best_estimator_)
print("Best scorer:                    ", SVC_pipeline_randSCV.best_score_)
print("Best Hyperparameters choosen:   ", SVC_pipeline_randSCV.best_params_)
print("Best Index:                     ", SVC_pipeline_randSCV.best_index_)
print("Scorer:                         ", SVC_pipeline_randSCV.scorer_)
print("number of splits:               ", SVC_pipeline_randSCV.n_splits_)
print("refit_time:                     ", SVC_pipeline_randSCV.refit_time_)


# BM: best model -- the winner of the search, refitted on the whole training set.
BM_Rand_SVC_Pipeline = SVC_pipeline_randSCV.best_estimator_

# 15-fold cross-validated accuracy of the winner on the training data.
scores = cross_val_score(estimator=BM_Rand_SVC_Pipeline, X=X_tr, y=y_tr, cv=15, scoring='accuracy')

# Predictions of the refitted winner on the held-out test rows.
y_pred = BM_Rand_SVC_Pipeline.predict(X_ts)

print('++++++++++++++++Scores: ')
for fold_score in scores:
    print(fold_score)
print('\n')
print("Mean:", scores.mean())
print("Standard deviation:", scores.std())
print('\n')
print('Score on the TEST set:          ', accuracy_score(y_ts, y_pred))

Complete details of the model:   Pipeline(memory=None,
         steps=[('Scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('SVC',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=10, decision_function_shape='ovr', degree=5,
                     gamma='scale', kernel='poly', max_iter=-1,
                     probability=False, random_state=7, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)
Best scorer:                     0.7493589743589744
Best Hyperparameters choosen:    {'SVC__degree': 5, 'SVC__coef0': 10}
Best Index:                      2
Scorer:                          make_scorer(accuracy_score)
number of splits:                10
refit_time:                      0.5589654445648193
++++++++++++++++Scores: 
0.8285714285714286
0.8
0.6
0.8571428571428571
0.7428571428571429
0.7352941176470589
0.8235294117647058
0.7647058823529411
0.5

# ADABOOST CLASSIFIER, PREPROCESSING, RANDOMSEARCH AND CROSS VALIDATION

In [None]:
# Standardise the features, then classify with AdaBoost.
ADACL_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ada_c', AdaBoostClassifier(random_state=7)),
])

# Candidate values for the 'ada_c' step of the pipeline (step name + '__' + parameter).
param_grid = {
    'ada_c__n_estimators': [30, 35, 40, 45, 50],
    'ada_c__learning_rate': [0.4, 0.45, 0.5],
}

# Randomized search: 10 of the 15 combinations, 10-fold CV, scored by accuracy.
# random_state pins which combinations are drawn; without it the sampled
# candidates (and therefore the reported best model) change on every run.
ADACL_pipeRSCV = RandomizedSearchCV(
    estimator=ADACL_pipeline,
    param_distributions=param_grid,
    n_iter=10,
    scoring='accuracy',
    cv=10,
    refit=True,
    return_train_score=True,
    random_state=7,
)
ADACL_pipeRSCV.fit(X_tr, y_tr)

print("Complete details of the model:  ", ADACL_pipeRSCV.best_estimator_)
print("Best scorer:                    ", ADACL_pipeRSCV.best_score_)
print("Best Hyperparameters choosen:   ", ADACL_pipeRSCV.best_params_)
print("Best Index:                     ", ADACL_pipeRSCV.best_index_)
print("Scorer:                         ", ADACL_pipeRSCV.scorer_)
print("number of splits:               ", ADACL_pipeRSCV.n_splits_)
print("refit_time:                     ", ADACL_pipeRSCV.refit_time_)


# BM: best model -- the winner of the search, refitted on the whole training set.
ADACL_bestm = ADACL_pipeRSCV.best_estimator_

# 15-fold cross-validated accuracy of the winner on the training data.
scores = cross_val_score(estimator=ADACL_bestm, X=X_tr, y=y_tr, cv=15, scoring='accuracy')

# Predictions of the refitted winner on the held-out test rows.
y_pred = ADACL_bestm.predict(X_ts)

print('++++++++++++++++Scores: ')
for fold_score in scores:
    print(fold_score)
print('\n')
print("Mean:", scores.mean())
print("Standard deviation:", scores.std())
print('\n')
print('Score on the TEST set:          ', accuracy_score(y_ts, y_pred))

Complete details of the model:   Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ada_c',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=0.45, n_estimators=40,
                                    random_state=7))],
         verbose=False)
Best scorer:                     0.4153469079939668
Best Hyperparameters choosen:    {'ada_c__n_estimators': 40, 'ada_c__learning_rate': 0.45}
Best Index:                      9
Scorer:                          make_scorer(accuracy_score)
number of splits:                10
refit_time:                      0.0789341926574707
++++++++++++++++Scores: 
0.37142857142857144
0.34285714285714286
0.3142857142857143
0.37142857142857144
0.42857142857142855
0.29411764705882354
0.3235294117647059
0.35294117647058826
0.4411764705882353
0.47058823529411764
0.38235294117647056
0.382352941176470