## Tunability: Importance of Hyperparameters of ML Algorithms

In [115]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, train_test_split

In [67]:
# Read the four CSV datasets used in this notebook; the order of the paths
# matches the order of the names on the left-hand side.
data_banana, data_fitness, data_heart, data_HR = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/banana_quality.csv",
        "../data/fitness_dataset.csv",
        "../data/heart.csv",
        "../data/recruitment_data.csv",
    )
]

In [69]:
# Print a summary of the banana dataset: column names, non-null counts and dtypes.
data_banana.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Size         8000 non-null   float64
 1   Weight       8000 non-null   float64
 2   Sweetness    8000 non-null   float64
 3   Softness     8000 non-null   float64
 4   HarvestTime  8000 non-null   float64
 5   Ripeness     8000 non-null   float64
 6   Acidity      8000 non-null   float64
 7   Quality      8000 non-null   object 
dtypes: float64(7), object(1)
memory usage: 500.1+ KB


In [71]:
# Print a summary of the fitness dataset: column names, non-null counts and dtypes.
# The non-null counts are where missing values (if any) become visible.
data_fitness.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                2000 non-null   int64  
 1   height_cm          2000 non-null   int64  
 2   weight_kg          2000 non-null   int64  
 3   heart_rate         2000 non-null   float64
 4   blood_pressure     2000 non-null   float64
 5   sleep_hours        1840 non-null   float64
 6   nutrition_quality  2000 non-null   float64
 7   activity_index     2000 non-null   float64
 8   smokes             2000 non-null   object 
 9   gender             2000 non-null   object 
 10  is_fit             2000 non-null   int64  
dtypes: float64(5), int64(4), object(2)
memory usage: 172.0+ KB


In [73]:
# Print a summary of the heart dataset: column names, non-null counts and dtypes.
data_heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [75]:
# Print a summary of the recruitment dataset: column names, non-null counts and dtypes.
data_HR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  1500 non-null   int64  
 1   Gender               1500 non-null   int64  
 2   EducationLevel       1500 non-null   int64  
 3   ExperienceYears      1500 non-null   int64  
 4   PreviousCompanies    1500 non-null   int64  
 5   DistanceFromCompany  1500 non-null   float64
 6   InterviewScore       1500 non-null   int64  
 7   SkillScore           1500 non-null   int64  
 8   PersonalityScore     1500 non-null   int64  
 9   RecruitmentStrategy  1500 non-null   int64  
 10  HiringDecision       1500 non-null   int64  
dtypes: float64(1), int64(10)
memory usage: 129.0 KB


In [77]:
# Separate every dataset into a feature frame (X_*) and its label column (y_*).
# The label column is removed from the features so it cannot leak into training.
X_banana = data_banana.drop(columns='Quality')
y_banana = data_banana['Quality']

X_fitness = data_fitness.drop(columns='is_fit')
y_fitness = data_fitness['is_fit']

X_heart = data_heart.drop(columns='HeartDisease')
y_heart = data_heart['HeartDisease']

X_HR = data_HR.drop(columns='HiringDecision')
y_HR = data_HR['HiringDecision']

In [79]:
# Encode the banana quality label as 0/1.
# The labels are read from the untouched data_banana frame rather than from
# y_banana itself, which keeps this cell idempotent: re-mapping an already
# encoded y_banana would send the 0/1 values through the dict again and turn
# every label into NaN (Series.map yields NaN for keys missing from the dict).
y_banana = data_banana['Quality'].map({'Bad': 0, 'Good': 1})

# Fail loudly if the column contains a label other than 'Bad' or 'Good'.
assert y_banana.notna().all(), "unexpected label in data_banana['Quality']"

In [105]:
# Turn the textual smoking flag into integers so the column is picked up as a
# numeric feature. `replace` leaves any value that is not 'yes'/'no' unchanged,
# so such values must themselves be castable to int for `astype` to succeed.
smokes_encoding = {'yes': 1, 'no': 0}
smokes_replaced = X_fitness['smokes'].replace(smokes_encoding)
X_fitness['smokes'] = smokes_replaced.astype(int)

In [83]:
# Preprocessing for categorical columns: one-hot encode them.
# handle_unknown='ignore' makes categories that were not seen during fit
# encode as all zeros instead of raising at transform time.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
cat_pipeline = Pipeline(steps=[('one-hot', one_hot_encoder)])

In [85]:
# Preprocessing for numeric columns: fill missing values first, then rescale.
# Both steps run with their default settings.
numeric_imputer = SimpleImputer()
numeric_scaler = MinMaxScaler()
num_pipeline = Pipeline(steps=[
    ('impute', numeric_imputer),
    ('scale', numeric_scaler),
])

In [87]:
# Route columns to the matching preprocessing branch based on their dtype:
# numeric dtypes go through num_pipeline, object (string) columns through
# cat_pipeline.
select_numeric = make_column_selector(dtype_include=np.number)
select_categorical = make_column_selector(dtype_include=np.object_)

col_transformer = ColumnTransformer(transformers=[
    ('num-pipeline', num_pipeline, select_numeric),
    ('cat-pipeline', cat_pipeline, select_categorical),
])

In [91]:
# Full modelling pipeline: dtype-based preprocessing followed by a random forest.
# The forest is seeded because it is a stochastic learner; without a fixed
# random_state every run of the notebook produces different scores. 123 is the
# same seed this notebook uses for its train/test splits.
model_pipeline = Pipeline(steps=[
    ('preprocessing', col_transformer),
    ('model', RandomForestClassifier(random_state=123))
])

In [95]:
# Baseline for the banana data: cross-validate the untuned pipeline with the
# default cv setting; the result holds fit times, score times and per-fold test scores.
cross_validate(model_pipeline, X_banana, y_banana)

{'fit_time': array([1.57075453, 1.66700673, 1.57266498, 1.62773442, 1.5970397 ]),
 'score_time': array([0.07028484, 0.03342199, 0.02040911, 0.03024554, 0.02393603]),
 'test_score': array([0.959375, 0.95875 , 0.95875 , 0.9775  , 0.9625  ])}

In [97]:
# Baseline for the heart data: cross-validate the untuned pipeline with the
# default cv setting; the result holds fit times, score times and per-fold test scores.
cross_validate(model_pipeline, X_heart, y_heart)

{'fit_time': array([0.24624944, 0.22908688, 0.24077225, 0.22376537, 0.22249675]),
 'score_time': array([0.01804018, 0.02588916, 0.01509261, 0.01696372, 0.01714921]),
 'test_score': array([0.88586957, 0.83152174, 0.8423913 , 0.83060109, 0.7704918 ])}

In [109]:
# Baseline for the fitness data: cross-validate the untuned pipeline with the
# default cv setting; the result holds fit times, score times and per-fold test scores.
cross_validate(model_pipeline, X_fitness, y_fitness)

{'fit_time': array([0.47198176, 0.45459151, 0.41576147, 0.44129705, 0.4324882 ]),
 'score_time': array([0.01353693, 0.01581407, 0.01776218, 0.01798964, 0.03025007]),
 'test_score': array([0.7875, 0.755 , 0.75  , 0.7925, 0.8075])}

In [111]:
# Baseline for the recruitment data: cross-validate the untuned pipeline with the
# default cv setting; the result holds fit times, score times and per-fold test scores.
cross_validate(model_pipeline, X_HR, y_HR)

{'fit_time': array([0.29161882, 0.30934834, 0.3479414 , 0.30029321, 0.26994467]),
 'score_time': array([0.02298927, 0.02048898, 0.01734543, 0.0180738 , 0.01153278]),
 'test_score': array([0.95666667, 0.96333333, 0.97      , 0.95666667, 0.79333333])}

In [119]:
# Hold out 30% of each dataset as a test set. The settings are shared so all
# four splits use the same proportion and the same fixed seed.
split_options = {'test_size': 0.3, 'random_state': 123}

X_banana_train, X_banana_test, y_banana_train, y_banana_test = train_test_split(
    X_banana, y_banana, **split_options
)
X_fitness_train, X_fitness_test, y_fitness_train, y_fitness_test = train_test_split(
    X_fitness, y_fitness, **split_options
)
X_heart_train, X_heart_test, y_heart_train, y_heart_test = train_test_split(
    X_heart, y_heart, **split_options
)
X_HR_train, X_HR_test, y_HR_train, y_HR_test = train_test_split(
    X_HR, y_HR, **split_options
)