# Machine Learning Foundations
Sumudu Tennakoon, PhD

# Hyperparameter Tuning

To learn more about Python, refer to the following website:

* Python : www.python.org

To learn more about the Python packages we explore in this notebook, refer to the following websites:

* NumPy : www.numpy.org
* Matplotlib : www.matplotlib.org
* Pandas : https://pandas.pydata.org
* Scikit-Learn : https://scikit-learn.org/
* Seaborn: https://seaborn.pydata.org/
* Statsmodels : https://www.statsmodels.org

In [1]:
import pandas as pd
import numpy as np

#train test split
from sklearn.model_selection import train_test_split

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Parameter Search Methods
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

## 1. Prepare Dataset

### Get Dataset

In [19]:
# URL of the income dataset CSV.
file_name = 'https://raw.githubusercontent.com/SumuduTennakoon/MLFoundations/main/Datasets/income_data.csv'

# Read the CSV into a DataFrame and preview five randomly chosen rows.
data = pd.read_csv(filepath_or_buffer=file_name)
data.sample(n=5)

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
21299,21308,18,Private,137363,12th,8,Never-married,Other-service,Own-child,White,Female,0.0,0.0,20.0,United-States,<=50K
11462,11465,49,Private,278322,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
33063,8245,24,Private,26668,Bachelors,13,Married-civ-spouse,Adm-clerical,Wife,White,Female,0.0,0.0,40.0,Puerto-Rico,<=50K.
30417,5599,37,Local-gov,105803,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
37804,12988,45,Private,226081,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.


### Pre-process Dataset

In [20]:
# Drop the unnecessary 'Unnamed: 0' column and every row with a missing value.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second run no longer raises KeyError
# because the column is already gone.
# NOTE: dropna keeps the original row labels, so the index may contain gaps
# afterwards if any rows were removed.
data = (
    data
    .drop(columns='Unnamed: 0', errors='ignore')
    .dropna(how='any', axis=0)
)

In [21]:
def merge_marital_statuss_catergory(education):
    """Collapse a raw marital-status label into a binary numeric category.

    Despite its name, the `education` argument receives a marital-status
    label (the parameter name is kept for backward compatibility).

    Surrounding whitespace is ignored, so both the leading-space form
    (' Never-married') and the stripped form ('Never-married') are recognised.

    Parameters
    ----------
    education : str
        Raw marital-status label.

    Returns
    -------
    int or None
        2 for the married categories, 1 for the single categories, and None
        for anything else (unknown labels or non-string values such as NaN).
    """
    married = {'Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'}
    single = {'Never-married', 'Divorced', 'Widowed', 'Separated'}

    # Non-string inputs (NaN, already-encoded numbers, ...) cannot match a label.
    if not isinstance(education, str):
        return None

    label = education.strip()
    if label in married:
        return 2  # 'Married'
    if label in single:
        return 1  # 'Single'
    return None
        
# Recode marital_status with the merge function, then cross-tabulate the
# recoded values against the income class (with row/column totals).
data['marital_status'] = data['marital_status'].apply(merge_marital_statuss_catergory)
pd.crosstab(
    index=data['marital_status'],
    columns=data['class'],
    margins=True,
    margins_name='Total',
)

class,<=50K,<=50K.,>50K,>50K.,Total
marital_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,12268,8124,842,526,21760
2,6599,4306,5109,3320,19334
Total,18867,12430,5951,3846,41094


### Prepare Target (y) Variable

In [22]:
# Correct class labeling
data['class'] = data['class'].replace(' >50K.', ' >50K')
data['class'] = data['class'].replace(' <=50K.', ' <=50K')
data['y_act'] = np.where(data['class']==' >50K',1,0)
data['y_act'].value_counts()

0    31297
1     9797
Name: y_act, dtype: int64

### Prepare Feature dataset

In [23]:
# Names of the model input columns and of the target column.
X_variables = [
    'age',
    'hours_per_week',
    'education_num',
    'marital_status',
]
y_varibale = 'y_act'

# Feature matrix: only the selected input columns.
X = data.loc[:, X_variables]

# Preview the first rows, then the summary statistics.
print(X.head(), '\n', X.describe(), sep='\n')

   age  hours_per_week  education_num  marital_status
0   39            40.0             13               1
1   50            13.0             13               2
2   38            40.0              9               1
3   53            40.0              7               2
4   28            40.0             13               2


                age  hours_per_week  education_num  marital_status
count  41094.000000    41094.000000   41094.000000    41094.000000
mean      38.669757       40.404585      10.079355        1.470482
std       13.748166       12.371882       2.560937        0.499134
min       17.000000        1.000000       1.000000        1.000000
25%       28.000000       40.000000       9.000000        1.000000
50%       37.000000       40.000000      10.000000        1.000000
75%       48.000000       45.000000      12.000000        2.000000
max       90.000000       99.000000      16.000000        2.000000


### Apply `StandardScaler`

In [24]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split further down, so test rows influence the scaling
# statistics. Consider fitting on the training portion only.
scaler = StandardScaler()
scaler.fit(X)

# Keep X's original row labels. `scaler.transform` returns a bare array, and
# without `index=X.index` the new frame would get a fresh 0..n-1 index that no
# longer matches `y` (taken from `data`, whose labels can have gaps after rows
# were dropped). Index-aligned operations such as pd.concat([X, y], axis=1)
# would then pair features with the wrong targets.
X = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

y = data[y_varibale]
print(X.head())
print('\n')
print(X.describe())

        age  hours_per_week  education_num  marital_status
0  0.024021       -0.032702       1.140474       -0.942609
1  0.824138       -2.215097       1.140474        1.060886
2 -0.048717       -0.032702      -0.421474       -0.942609
3  1.042351       -0.032702      -1.202448        1.060886
4 -0.776095       -0.032702       1.140474        1.060886


                age  hours_per_week  education_num  marital_status
count  4.109400e+04    4.109400e+04   4.109400e+04    4.109400e+04
mean   1.735983e-16   -1.655582e-17  -2.334240e-16   -1.850102e-16
std    1.000012e+00    1.000012e+00   1.000012e+00    1.000012e+00
min   -1.576212e+00   -3.185050e+00  -3.545369e+00   -9.426086e-01
25%   -7.760953e-01   -3.270234e-02  -4.214738e-01   -9.426086e-01
50%   -1.214546e-01   -3.270234e-02  -3.098695e-02   -9.426086e-01
75%    6.786619e-01    3.714448e-01   7.499868e-01    1.060886e+00
max    3.733652e+00    4.736234e+00   2.311934e+00    1.060886e+00


### Variable Correlation with Target (y)

In [25]:
# Correlation of each feature with the target.
# The target is attached by position (`to_numpy()`), not by index label: X and
# y hold the same rows in the same order, but their indexes are not guaranteed
# to match (X is rebuilt from the scaler output), and an index-aligned
# pd.concat would pair features with the wrong targets.
correlation = (
    X.assign(y_act=y.to_numpy())
    .corr()[['y_act']]
    .drop('y_act')
)
correlation.style.bar(align='mid', color=['#d65f5f', '#5fba7d'])

Unnamed: 0,y_act
age,0.139521
hours_per_week,0.137008
education_num,0.201737
marital_status,0.264779


### Train Test Split

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Report the size of each partition.
print(F"Train sample size = {len(X_train)}")
print(F"Test sample size  = {len(X_test)}")

Train sample size = 28765
Test sample size  = 12329


## 2. Model Training

### Train Logistic Regression Model

In [9]:
from sklearn.linear_model import LogisticRegression

features_to_model = X_variables

# Create Model
model = LogisticRegression()

# Train on exactly the columns listed in features_to_model so that the fitted
# coefficients line up with the feature names reported below and with the
# columns used at prediction time (X_test[features_to_model]).
model.fit(X_train[features_to_model], y_train)

# Feature importance/Coefficients
coefficients = model.coef_[0]
intercept = model.intercept_[0]
feature_profile = pd.DataFrame({"feature":features_to_model, "coefficients":coefficients})
print("feature_profile:\n", feature_profile)
print("intercept:", intercept)
print('\n')
print("Model Parameters:\n", pd.Series(model.get_params()))

feature_profile:
           feature  coefficients
0             age      0.616352
1  hours_per_week      0.521773
2   education_num      0.879220
intercept: -1.5222134454079626


Model Parameters:
 C                         1
class_weight           None
dual                  False
fit_intercept          True
intercept_scaling         1
l1_ratio               None
max_iter                100
multi_class            auto
n_jobs                 None
penalty                  l2
random_state           None
solver                lbfgs
tol                  0.0001
verbose                   0
warm_start            False
dtype: object


### Predict

In [11]:
# Predicted class labels and second-column (class 1) probabilities for the
# test rows.
y_pred = model.predict(X_test[features_to_model])
y_pred_prob = model.predict_proba(X_test[features_to_model])[:, 1]
print(y_pred_prob[:5])

# Put actual and predicted values side by side and preview ten random rows.
test_result = pd.DataFrame(
    data={
        'y_act': y_test.values,
        'y_pred': y_pred,
        'y_pred_prob': y_pred_prob,
    }
)
test_result.sample(n=10)

[0.2436276  0.26008232 0.06662007 0.08637721 0.39046274]


Unnamed: 0,y_act,y_pred,y_pred_prob
8874,0,0,0.105785
1160,1,0,0.089981
747,0,0,0.240813
11689,1,0,0.177062
6971,0,0,0.203912
2044,0,1,0.628458
6305,1,0,0.426148
505,0,0,0.387305
9995,0,0,0.05839
10265,0,0,0.056114


### Model Evaluation

In [12]:
from sklearn import metrics
from sklearn.metrics import classification_report

# Confusion matrix (rows: actual, columns: predicted) with marginal totals.
cfm = pd.crosstab(
    index=test_result['y_act'],
    columns=test_result['y_pred'],
    margins=True,
)
print("Confusion Matrix:\n", cfm)
print('\n')

# Accuracy via scikit-learn (lgr = Logistic Regression).
acuracy_lgr = metrics.accuracy_score(test_result['y_act'], test_result['y_pred'])
print("acuracy_lgr:", acuracy_lgr)

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
# average=None yields one F1 score per class; average='weighted' would instead
# combine them, weighting each class by its support.
f1_score_lgr = metrics.f1_score(
    test_result['y_act'],
    test_result['y_pred'],
    average=None,
)
print("f1_score_lgr:",f1_score_lgr)
print('\n')

print("classification_report:\n",classification_report(test_result['y_act'], test_result['y_pred']))

Confusion Matrix:
 y_pred      0     1    All
y_act                     
0        8746   577   9323
1        2087   919   3006
All     10833  1496  12329


acuracy_lgr: 0.7839240814340174
f1_score_lgr: [0.86783092 0.40826299]


classification_report:
               precision    recall  f1-score   support

           0       0.81      0.94      0.87      9323
           1       0.61      0.31      0.41      3006

    accuracy                           0.78     12329
   macro avg       0.71      0.62      0.64     12329
weighted avg       0.76      0.78      0.76     12329



## Train Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

features_to_model = X_variables

# Create Model (fixed random_state so the fitted tree is repeatable between runs)
model = DecisionTreeClassifier(random_state=0)

# Train on exactly the columns listed in features_to_model so that the
# importances line up with the feature names reported below and with the
# columns used at prediction time.
model.fit(X_train[features_to_model], y_train)

# Feature importance/Coefficients
importance = model.feature_importances_
feature_profile = pd.DataFrame({"feature":features_to_model, "importance":importance})
print("feature_profile:\n", feature_profile)
print('\n')
print("Model Parameters:\n", pd.Series(model.get_params()))

### Model Evaluation

In [None]:
# Create a reusable function
def evaluate_model(model, X_test, features_to_model, y_true=None):
    """Print predictions and evaluation metrics for a fitted classifier.

    Parameters
    ----------
    model : fitted estimator
        Must provide `predict` and `predict_proba`; the second probability
        column is reported as `y_pred_prob`.
    X_test : pandas.DataFrame
        Test features; only the columns in `features_to_model` are used.
    features_to_model : list of str
        Columns of `X_test` passed to the model.
    y_true : array-like, optional
        Actual labels for the rows of `X_test`. When omitted, the
        notebook-level `y_test` is used, which keeps existing three-argument
        calls working.

    Returns
    -------
    None
        All results are reported via `print`.
    """
    from sklearn import metrics
    from sklearn.metrics import classification_report

    if y_true is None:
        # Backward-compatible fallback to the notebook-level test labels.
        y_true = y_test

    # Predict
    X_eval = X_test[features_to_model]
    y_pred = model.predict(X_eval)
    y_pred_prob = model.predict_proba(X_eval)[:,1]
    print(y_pred_prob[:5])

    test_result = pd.DataFrame(data={'y_act':np.asarray(y_true), 'y_pred':y_pred, 'y_pred_prob':y_pred_prob})
    # sample() raises when asked for more rows than exist, so cap the preview.
    print(test_result.sample(min(10, len(test_result))))
    print('\n')

    cfm = pd.crosstab(test_result['y_act'], test_result['y_pred'], margins=True)
    print("Confusion Matrix:\n", cfm)
    print('\n')

    # Model evaluation using scikit-learn metric functions
    acuracy = metrics.accuracy_score(test_result['y_act'], test_result['y_pred'])
    print("acuracy:", acuracy)

    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
    # average=None returns one F1 score per class (no averaging).
    f1_score = metrics.f1_score(test_result['y_act'], test_result['y_pred'], average=None)
    print("f1_score:",f1_score)
    print('\n')

    print("classification_report:\n",classification_report(test_result['y_act'], test_result['y_pred']))

# Evaluate the estimator currently bound to `model` on the test features.
evaluate_model(model, X_test, features_to_model)

## 3. Searching for Optimum Hyperparameters

### 3.1 `GridSearchCV`

#### Logistic Regression

In [None]:
from sklearn.model_selection import GridSearchCV

# Define Hyperparameter Grid
# C is the inverse regularisation strength and must be positive; the previous
# grid contained -1, which is not a valid value and cannot be fitted.
param_grid = {'C': [0.1, 0.5, 1, 2, 5, 10]}

# Create model object
model = LogisticRegression()

# Create GridSearchCV object (5-fold cross-validation, scored by F1)
model_cv = GridSearchCV(model, param_grid, cv=5, scoring='f1')

model_cv.fit(X_train[features_to_model], y_train)

# Print the tuned parameters and score
print("Tuned Model Parameters: {}".format(model_cv.best_params_))
print("Best model score: {}".format(model_cv.best_score_))

#### Get Best Model

In [None]:
# Use the best estimator found by the search from here on.
model = model_cv.best_estimator_

# Per-feature coefficients and the intercept of the selected model
coefficients = model.coef_[0]
intercept = model.intercept_[0]
feature_profile = pd.DataFrame(
    {
        "feature": features_to_model,
        "coefficients": coefficients,
    }
)
print("feature_profile:\n", feature_profile)
print("intercept:", intercept)
print('\n')
print("Model Parameters:\n", pd.Series(model.get_params()))

# Report test-set metrics for the selected model
evaluate_model(model, X_test, features_to_model)


#### Decision Tree

In [None]:
from sklearn.model_selection import GridSearchCV

# Define Hyperparameter Grid
param_grid = {"max_depth": [5, 10],
              "min_samples_leaf": [2, 10, 100],
              "min_samples_split": [2, 10, 100]
             }

# Create model object
# A fixed random_state makes the fitted trees, and therefore the selected
# parameters and scores, repeatable between runs.
model = DecisionTreeClassifier(random_state=0)

# Create GridSearchCV object (3-fold cross-validation, scored by F1)
model_cv = GridSearchCV(model, param_grid, cv=3, scoring='f1')

model_cv.fit(X_train[features_to_model], y_train)

# Print the tuned parameters and score
print("Tuned Model Parameters: {}".format(model_cv.best_params_))
print("Best model score: {}".format(model_cv.best_score_))

#### Get Best Model

In [None]:
# Use the best estimator found by the search from here on.
model = model_cv.best_estimator_

# Importance assigned to each feature by the selected tree
importance = model.feature_importances_
feature_profile = pd.DataFrame(
    {
        "feature": features_to_model,
        "importance": importance,
    }
)
print("feature_profile:\n", feature_profile)
print('\n')
print("Model Parameters:\n", pd.Series(model.get_params()))

# Report test-set metrics for the selected model
evaluate_model(model, X_test, features_to_model)


### 3.2 `RandomizedSearchCV`

#### Decision Tree

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Define Hyperparameter Grid
param_grid = {"max_depth": [3, 10, 100],
              "min_samples_leaf": [5, 10, 50, 100, 200],
              "min_samples_split": [5, 10, 20, 100]
             }

# Create model object (seeded so each fitted tree is repeatable)
model = DecisionTreeClassifier(random_state=0)

# Create RandomizedSearchCV object
# The search samples parameter combinations at random; seeding it keeps the
# sampled candidates, and therefore the reported best parameters, the same
# on every run.
model_cv = RandomizedSearchCV(model, param_grid, cv=5, scoring='precision', random_state=0)

model_cv.fit(X_train[features_to_model], y_train)

# Print the tuned parameters and score
print("Tuned Model Parameters: {}".format(model_cv.best_params_))
print("Best model score: {}".format(model_cv.best_score_))

In [None]:
# Use the best estimator found by the randomized search from here on.
model = model_cv.best_estimator_

# Importance assigned to each feature by the selected tree
importance = model.feature_importances_
feature_profile = pd.DataFrame(
    {
        "feature": features_to_model,
        "importance": importance,
    }
)
print("feature_profile:\n", feature_profile)
print('\n')
print("Model Parameters:\n", pd.Series(model.get_params()))

# Report test-set metrics for the selected model
evaluate_model(model, X_test, features_to_model)


#### Random Forest Classifier

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Define Hyperparameter Grid
param_grid = {"max_depth": [3, 10, 100],
              "min_samples_leaf": [5, 10, 50, 100, 200],
              "min_samples_split": [5, 10, 20, 100]
             }

# Create model object (seeded so each fitted forest is repeatable)
model = RandomForestClassifier(random_state=0)

# Create RandomizedSearchCV object
# The search samples parameter combinations at random; seeding it keeps the
# sampled candidates, and therefore the reported best parameters, the same
# on every run.
model_cv = RandomizedSearchCV(model, param_grid, cv=5, scoring='f1', random_state=0)

model_cv.fit(X_train[features_to_model], y_train)

# Print the tuned parameters and score
print("Tuned Model Parameters: {}".format(model_cv.best_params_))
print("Best model score: {}".format(model_cv.best_score_))

In [None]:
# Use the best estimator found by the randomized search from here on.
model = model_cv.best_estimator_

# Importance assigned to each feature by the selected forest
importance = model.feature_importances_
feature_profile = pd.DataFrame(
    {
        "feature": features_to_model,
        "importance": importance,
    }
)
print("feature_profile:\n", feature_profile)
print('\n')
print("Model Parameters:\n", pd.Series(model.get_params()))

# Report test-set metrics for the selected model
evaluate_model(model, X_test, features_to_model)

<hr>
Last update 2022-03-21 by Sumudu Tennakoon

<a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License</a>.