## Machine Learning Foundations
# Hyperparameter Tuning

Sumudu Tennakoon, PhD
<hr>

To learn more about Python, refeer to the following websites

* Python : www.python.org

To learn more about the Python packages we explore in this notebook, refeer to the following websites

* NumPy : www.numpy.org
* Matplotlib : www.matplotlib.org
* Pandas : https://pandas.pydata.org
* Scikit-Learn : https://scikit-learn.org/
* Seaborn: https://seaborn.pydata.org/
* StatsModel : https://www.statsmodels.org

In [1]:
import pandas as pd
import numpy as np

#train test split
from sklearn.model_selection import train_test_split

# Classfiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Parameter Serach Methods
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

## 1. Prepare Dataset

### Get Dataset

In [2]:
file_name = 'https://raw.githubusercontent.com/SumuduTennakoon/MLFoundations/main/Datasets/income_data.csv'

# Load CSV File
data = pd.read_csv(file_name)
data.sample(5)

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
29741,4923,31,Private,168854,Some-college,10,Never-married,Sales,Not-in-family,White,Male,0.0,1504.0,40.0,United-States,<=50K.
34069,9251,72,?,235014,Assoc-voc,11,Widowed,?,Not-in-family,White,Female,0.0,2465.0,40.0,United-States,<=50K.
15384,15390,59,Private,98361,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
25848,1030,23,Private,211345,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0.0,0.0,40.0,Nicaragua,<=50K.
33908,9090,28,Private,252424,Assoc-voc,11,Never-married,Transport-moving,Own-child,Black,Male,0.0,0.0,40.0,Cambodia,<=50K.


### Pre-process Dataset

In [3]:
# Drop unnecessry columns and missing rows
data.drop(labels='Unnamed: 0', axis=1, inplace=True)
data.dropna(how='any', axis=0, inplace=True)

In [4]:
def merge_marital_statuss_catergory(education):
    if education in (' Married-civ-spouse',' Married-spouse-absent', ' Married-AF-spouse'):
        return 2 #'Married'
    elif education in (' Never-married', ' Divorced', ' Widowed',' Separated'):   
        return 1 #'Single'
    else:
        return None
        
data['marital_status'] = data['marital_status'].apply(merge_marital_statuss_catergory)
pd.crosstab(data['marital_status'], data['class'], margins=True, margins_name='Total')

class,<=50K,<=50K.,>50K,>50K.,Total
marital_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,12268,8124,842,526,21760
2,6599,4306,5109,3320,19334
Total,18867,12430,5951,3846,41094


### Prepare Target (y) Variable

In [5]:
# Correct class labeling
data['class'] = data['class'].replace(' >50K.', ' >50K')
data['class'] = data['class'].replace(' <=50K.', ' <=50K')
data['y_act'] = np.where(data['class']==' >50K',1,0)
data['y_act'].value_counts()

0    31297
1     9797
Name: y_act, dtype: int64

### Prepare Feature dataset

In [6]:
X_variables = ['age',  'hours_per_week', 'education_num', 'marital_status']
y_varibale = 'y_act'

X = data[X_variables]

print(X.head())
print('\n')
print(X.describe())

   age  hours_per_week  education_num  marital_status
0   39            40.0             13               1
1   50            13.0             13               2
2   38            40.0              9               1
3   53            40.0              7               2
4   28            40.0             13               2


                age  hours_per_week  education_num  marital_status
count  41094.000000    41094.000000   41094.000000    41094.000000
mean      38.669757       40.404585      10.079355        1.470482
std       13.748166       12.371882       2.560937        0.499134
min       17.000000        1.000000       1.000000        1.000000
25%       28.000000       40.000000       9.000000        1.000000
50%       37.000000       40.000000      10.000000        1.000000
75%       48.000000       45.000000      12.000000        2.000000
max       90.000000       99.000000      16.000000        2.000000


### Apply `StandardScaler`

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)

X = pd.DataFrame(scaler.transform(X), columns=X.columns)

y = data[y_varibale]
print(X.head())
print('\n')
print(X.describe())

        age  hours_per_week  education_num  marital_status
0  0.024021       -0.032702       1.140474       -0.942609
1  0.824138       -2.215097       1.140474        1.060886
2 -0.048717       -0.032702      -0.421474       -0.942609
3  1.042351       -0.032702      -1.202448        1.060886
4 -0.776095       -0.032702       1.140474        1.060886


                age  hours_per_week  education_num  marital_status
count  4.109400e+04    4.109400e+04   4.109400e+04    4.109400e+04
mean   1.735983e-16   -1.655582e-17  -2.334240e-16   -1.850102e-16
std    1.000012e+00    1.000012e+00   1.000012e+00    1.000012e+00
min   -1.576212e+00   -3.185050e+00  -3.545369e+00   -9.426086e-01
25%   -7.760953e-01   -3.270234e-02  -4.214738e-01   -9.426086e-01
50%   -1.214546e-01   -3.270234e-02  -3.098695e-02   -9.426086e-01
75%    6.786619e-01    3.714448e-01   7.499868e-01    1.060886e+00
max    3.733652e+00    4.736234e+00   2.311934e+00    1.060886e+00


### Variable Correlation with Target (y)

In [8]:
correlation = pd.concat([X, y], axis=1).corr()[['y_act']].drop('y_act')
correlation.style.bar(align='mid', color=['#d65f5f', '#5fba7d'])

Unnamed: 0,y_act
age,0.139521
hours_per_week,0.137008
education_num,0.201737
marital_status,0.264779


### Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print(F"Train sample size = {len(X_train)}")
print(F"Test sample size  = {len(X_test)}")

Train sample size = 28765
Test sample size  = 12329


## 2. Model Training

### Train Logistic Regression Model

In [10]:
from sklearn.linear_model import LogisticRegression

features_to_model = X_variables

# Create Model
model = LogisticRegression()

# Train
model.fit(X_train, y_train)

# Feature importance/Coefficients
coefficients = model.coef_[0]
intercept = model.intercept_[0]
feature_profile = pd.DataFrame({"feature":features_to_model, "coefficients":coefficients})
print("feature_profile:\n", feature_profile)
print("intercept:", intercept)
print('\n')
print("Model Parameters:\n", pd.Series(model.get_params()))

feature_profile:
           feature  coefficients
0             age      0.413008
1  hours_per_week      0.414359
2   education_num      0.969832
3  marital_status      1.159059
intercept: -1.8842029313000297


Model Parameters:
 C                       1.0
class_weight           None
dual                  False
fit_intercept          True
intercept_scaling         1
l1_ratio               None
max_iter                100
multi_class            auto
n_jobs                 None
penalty                  l2
random_state           None
solver                lbfgs
tol                  0.0001
verbose                   0
warm_start            False
dtype: object


### Predict

In [11]:
# Predict
y_pred = model.predict(X_test[features_to_model])
y_pred_prob = model.predict_proba(X_test[features_to_model])[:,1]
print(y_pred_prob[:5])

test_result = pd.DataFrame(data={'y_act':y_test.values, 'y_pred':y_pred, 'y_pred_prob':y_pred_prob})
test_result.sample(10)

[0.3644135  0.05197569 0.13738742 0.22635666 0.62836357]


Unnamed: 0,y_act,y_pred,y_pred_prob
9670,0,0,0.094077
9534,1,1,0.544257
1640,1,0,0.333838
2033,0,0,0.002941
9560,0,0,0.392672
9319,0,0,0.049164
420,0,0,0.043474
4925,1,0,0.491308
1892,0,0,0.270104
12128,1,0,0.140668


### Model Evaluation

In [12]:
from sklearn import metrics

cfm = pd.crosstab(test_result['y_act'], test_result['y_pred'], margins=True)
print("Confusion Matrix:\n", cfm)
print('\n')

# Model evaluation
# Use Scikit-Learn function (lgr = Logistic Regression)
acuracy_lgr = metrics.accuracy_score(test_result['y_act'], test_result['y_pred']) 
print("acuracy_lgr:", acuracy_lgr)

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
f1_score_lgr = metrics.f1_score(test_result['y_act'], test_result['y_pred'], average=None)  #weighted accounts for label imbalance.
print("f1_score_lgr:",f1_score_lgr)
print('\n')

from sklearn.metrics import classification_report
print("classification_report:\n",classification_report(test_result['y_act'], test_result['y_pred']))

Confusion Matrix:
 y_pred      0     1    All
y_act                     
0        8619   704   9323
1        1540  1466   3006
All     10159  2170  12329


acuracy_lgr: 0.817990104631357
f1_score_lgr: [0.88481675 0.56646059]


classification_report:
               precision    recall  f1-score   support

           0       0.85      0.92      0.88      9323
           1       0.68      0.49      0.57      3006

    accuracy                           0.82     12329
   macro avg       0.76      0.71      0.73     12329
weighted avg       0.81      0.82      0.81     12329



## Train Decision Tree

In [13]:
from sklearn.linear_model import LogisticRegression

features_to_model = X_variables

# Create Model
model = DecisionTreeClassifier()

# Train
model.fit(X_train, y_train)

# Feature importance/Coefficients
importance = model.feature_importances_
feature_profile = pd.DataFrame({"feature":features_to_model, "importance":importance})
print("feature_profile:\n", feature_profile)
print('\n')
print("Model Parameters:\n", pd.Series(model.get_params()))

feature_profile:
           feature  importance
0             age    0.240750
1  hours_per_week    0.185898
2   education_num    0.247762
3  marital_status    0.325590


Model Parameters:
 ccp_alpha                    0.0
class_weight                None
criterion                   gini
max_depth                   None
max_features                None
max_leaf_nodes              None
min_impurity_decrease        0.0
min_samples_leaf               1
min_samples_split              2
min_weight_fraction_leaf     0.0
random_state                None
splitter                    best
dtype: object


### Model Evaluation

In [14]:
# Create a reusable function
def evaluate_model(model, X_test, features_to_model):
    # Predict
    y_pred = model.predict(X_test[features_to_model])
    y_pred_prob = model.predict_proba(X_test[features_to_model])[:,1]
    print(y_pred_prob[:5])

    test_result = pd.DataFrame(data={'y_act':y_test.values, 'y_pred':y_pred, 'y_pred_prob':y_pred_prob})
    print(test_result.sample(10))
    print('\n')

    from sklearn import metrics

    cfm = pd.crosstab(test_result['y_act'], test_result['y_pred'], margins=True)
    print("Confusion Matrix:\n", cfm)
    print('\n')

    # Model evaluation
    # Use Scikit-Learn function (lgr = Logistic Regression)
    acuracy = metrics.accuracy_score(test_result['y_act'], test_result['y_pred']) 
    print("acuracy:", acuracy)

    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
    f1_score = metrics.f1_score(test_result['y_act'], test_result['y_pred'], average=None)  #weighted accounts for label imbalance.
    print("f1_score:",f1_score)
    print('\n')

    from sklearn.metrics import classification_report
    print("classification_report:\n",classification_report(test_result['y_act'], test_result['y_pred']))

evaluate_model(model, X_test, features_to_model)

[0.27906977 0.         0.         0.         0.54545455]
       y_act  y_pred  y_pred_prob
11822      0       0     0.000000
4663       0       1     0.645161
3596       0       0     0.000000
8858       0       1     0.666667
9014       0       0     0.000000
1044       0       0     0.200000
10272      0       0     0.000000
3412       0       0     0.000000
7919       0       0     0.028986
9309       0       0     0.000000


Confusion Matrix:
 y_pred     0     1    All
y_act                    
0       8373   950   9323
1       1560  1446   3006
All     9933  2396  12329


acuracy: 0.7964149566063752
f1_score: [0.86965102 0.53535728]


classification_report:
               precision    recall  f1-score   support

           0       0.84      0.90      0.87      9323
           1       0.60      0.48      0.54      3006

    accuracy                           0.80     12329
   macro avg       0.72      0.69      0.70     12329
weighted avg       0.78      0.80      0.79     12329



## 3. Seraching for Optimum Hyperparameters

### 3.1 `GridSearchCV`

#### Logistic Regression

In [15]:
from sklearn.model_selection import GridSearchCV

# Define Hyperparameter Grid
param_grid = {'C': [-1, 0.5, 1, 2, 5, 10]}
  
# Create model object
model = LogisticRegression()
  
# Create GridSearchCV object
model_cv = GridSearchCV(model, param_grid, cv=5, scoring='f1')
  
model_cv.fit(X_train[features_to_model], y_train)
  
# Print the tuned parameters and score
print("Tuned Model Parameters: {}".format(model_cv.best_params_))
print("Best model score: {}".format(model_cv.best_score_))

Tuned Model Parameters: {'C': 10}
Best model score: 0.5601132349519123


5 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Python\python-3.9.4.amd64\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Python\python-3.9.4.amd64\lib\site-packages\sklearn\linear_model\_logistic.py", line 1464, in fit
    raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
ValueError: Penalty term must be positive; got (C=-1)



#### Get The Best Model

In [16]:
model = model_cv.best_estimator_

# Feature importance/Coefficients
coefficients = model.coef_[0]
intercept = model.intercept_[0]
feature_profile = pd.DataFrame({"feature":features_to_model, "coefficients":coefficients})
print("feature_profile:\n", feature_profile)
print("intercept:", intercept)
print('\n')
print("Model Parameters:\n", pd.Series(model.get_params()))

# Evaluate Model
evaluate_model(model, X_test, features_to_model)


feature_profile:
           feature  coefficients
0             age      0.413143
1  hours_per_week      0.414494
2   education_num      0.970269
3  marital_status      1.159556
intercept: -1.884691330044252


Model Parameters:
 C                        10
class_weight           None
dual                  False
fit_intercept          True
intercept_scaling         1
l1_ratio               None
max_iter                100
multi_class            auto
n_jobs                 None
penalty                  l2
random_state           None
solver                lbfgs
tol                  0.0001
verbose                   0
warm_start            False
dtype: object
[0.36441803 0.05192725 0.13732913 0.22632999 0.62849444]
       y_act  y_pred  y_pred_prob
9845       0       0     0.011892
10674      1       1     0.679915
5850       0       0     0.212903
2729       0       0     0.145997
7184       0       0     0.025069
9002       1       1     0.770055
3884       0       0     0.077954
6082    

#### Descision Tree

In [17]:
from sklearn.model_selection import GridSearchCV

# Define Hyperparameter Grid
param_grid = {"max_depth": [5, 10],
              "min_samples_leaf": [2, 10, 100],
              "min_samples_split": [2, 10, 100]
             }
  
# Create model object
model = DecisionTreeClassifier()
 
# Create GridSearchCV object
model_cv = GridSearchCV(model, param_grid, cv=3, scoring='f1')
  
model_cv.fit(X_train[features_to_model], y_train)
  
# Print the tuned parameters and score
print("Tuned Model Parameters: {}".format(model_cv.best_params_))
print("Best model score: {}".format(model_cv.best_score_))

Tuned Model Parameters: {'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 100}
Best model score: 0.598803082673162


#### Get Best Model

In [18]:
model = model_cv.best_estimator_

# Feature importance/Coefficients
importance = model.feature_importances_
feature_profile = pd.DataFrame({"feature":features_to_model, "importance":importance})
print("feature_profile:\n", feature_profile)
print('\n')
print("Model Parameters:\n", pd.Series(model.get_params()))

# Evaluate Model
evaluate_model(model, X_test, features_to_model)


feature_profile:
           feature  importance
0             age    0.102202
1  hours_per_week    0.077442
2   education_num    0.325678
3  marital_status    0.494677


Model Parameters:
 ccp_alpha                    0.0
class_weight                None
criterion                   gini
max_depth                     10
max_features                None
max_leaf_nodes              None
min_impurity_decrease        0.0
min_samples_leaf              10
min_samples_split            100
min_weight_fraction_leaf     0.0
random_state                None
splitter                    best
dtype: object
[0.38504937 0.         0.08333333 0.02247191 0.79273504]
       y_act  y_pred  y_pred_prob
5699       1       0     0.406103
152        0       0     0.000000
4625       1       0     0.333333
4440       1       0     0.385049
5905       1       1     0.792735
11186      1       1     0.521073
2073       0       0     0.000000
6020       1       0     0.469136
11211      0       0     0.000000
1089

### 3.2 `RandomizedSearchCV`

#### Descicion Tree

In [19]:
from sklearn.model_selection import RandomizedSearchCV
    
# Define Hyperparameter Grid
param_grid = {"max_depth": [3, 10, 100],
              "min_samples_leaf": [5, 10, 50, 100, 200],
              "min_samples_split": [5, 10, 20, 100]
             }
  
# Create model object
model = DecisionTreeClassifier()
  
# Create RandomizedSearchCV object
model_cv = RandomizedSearchCV(model, param_grid, cv=5, scoring='precision')
  
model_cv.fit(X_train[features_to_model], y_train)
  
# Print the tuned parameters and score
print("Tuned Model Parameters: {}".format(model_cv.best_params_))
print("Best model score: {}".format(model_cv.best_score_))

Tuned Model Parameters: {'min_samples_split': 20, 'min_samples_leaf': 10, 'max_depth': 3}
Best model score: 0.7059251771725805


In [20]:
model = model_cv.best_estimator_

# Feature importance/Coefficients
importance = model.feature_importances_
feature_profile = pd.DataFrame({"feature":features_to_model, "importance":importance})
print("feature_profile:\n", feature_profile)
print('\n')
print("Model Parameters:\n", pd.Series(model.get_params()))

# Evaluate Model
evaluate_model(model, X_test, features_to_model)


feature_profile:
           feature  importance
0             age    0.000000
1  hours_per_week    0.041041
2   education_num    0.348718
3  marital_status    0.610240


Model Parameters:
 ccp_alpha                    0.0
class_weight                None
criterion                   gini
max_depth                      3
max_features                None
max_leaf_nodes              None
min_impurity_decrease        0.0
min_samples_leaf              10
min_samples_split             20
min_weight_fraction_leaf     0.0
random_state                None
splitter                    best
dtype: object
[0.35927236 0.02719503 0.11867583 0.35927236 0.72826981]
       y_act  y_pred  y_pred_prob
10754      1       1     0.728270
7095       1       0     0.118676
2164       0       0     0.027195
3780       1       1     0.728270
3378       0       0     0.027195
2817       0       0     0.359272
11881      0       0     0.359272
3710       0       0     0.359272
4736       0       0     0.126486
5772

#### Random Forest Classifier

In [21]:
from sklearn.model_selection import RandomizedSearchCV
    
# Define Hyperparameter Grid
param_grid = {"max_depth": [3, 10, 100],
              "min_samples_leaf": [5, 10, 50, 100, 200],
              "min_samples_split": [5, 10, 20, 100]
             }
  
# Create model object
model = RandomForestClassifier()
  
# Create RandomizedSearchCV object
model_cv = RandomizedSearchCV(model, param_grid, cv=5, scoring='f1')
  
model_cv.fit(X_train[features_to_model], y_train)
  
# Print the tuned parameters and score
print("Tuned Model Parameters: {}".format(model_cv.best_params_))
print("Best model score: {}".format(model_cv.best_score_))

Tuned Model Parameters: {'min_samples_split': 20, 'min_samples_leaf': 10, 'max_depth': 100}
Best model score: 0.5820416079423019


In [22]:
model = model_cv.best_estimator_

# Feature importance/Coefficients
importance = model.feature_importances_
feature_profile = pd.DataFrame({"feature":features_to_model, "importance":importance})
print("feature_profile:\n", feature_profile)
print('\n')
print("Model Parameters:\n", pd.Series(model.get_params()))

# Evaluate Model
evaluate_model(model, X_test, features_to_model)

feature_profile:
           feature  importance
0             age    0.191971
1  hours_per_week    0.113005
2   education_num    0.296556
3  marital_status    0.398468


Model Parameters:
 bootstrap                    True
ccp_alpha                     0.0
class_weight                 None
criterion                    gini
max_depth                     100
max_features                 auto
max_leaf_nodes               None
max_samples                  None
min_impurity_decrease         0.0
min_samples_leaf               10
min_samples_split              20
min_weight_fraction_leaf      0.0
n_estimators                  100
n_jobs                       None
oob_score                   False
random_state                 None
verbose                         0
warm_start                  False
dtype: object
[0.31810977 0.02420742 0.14910756 0.02311026 0.66006083]
       y_act  y_pred  y_pred_prob
9452       0       0     0.017286
7006       0       0     0.019903
12029      0       0     0

<hr>
Last update 2022-03-21 by Sumudu Tennakoon

<a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License</a>.