## Tunability Importance of Hyperparameters of ML Algorithms

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, train_test_split

In [None]:
data_banana = pd.read_csv("../data/banana_quality.csv")
data_fitness = pd.read_csv("../data/fitness_dataset.csv")
data_heart = pd.read_csv("../data/heart.csv")
data_HR = pd.read_csv("../data/recruitment_data.csv")
data_ecommerce = pd.read_csv("../data/ecommerce.csv")
data_placement = pd.read_csv("../data/placementdata.csv")
data_insurance = pd.read_csv("../data/TravelInsurancePrediction.csv")
data_churn = pd.read_csv("../data/Churn_Modelling.csv")


In [8]:
data_banana.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Size         8000 non-null   float64
 1   Weight       8000 non-null   float64
 2   Sweetness    8000 non-null   float64
 3   Softness     8000 non-null   float64
 4   HarvestTime  8000 non-null   float64
 5   Ripeness     8000 non-null   float64
 6   Acidity      8000 non-null   float64
 7   Quality      8000 non-null   object 
dtypes: float64(7), object(1)
memory usage: 500.1+ KB


In [71]:
data_fitness.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                2000 non-null   int64  
 1   height_cm          2000 non-null   int64  
 2   weight_kg          2000 non-null   int64  
 3   heart_rate         2000 non-null   float64
 4   blood_pressure     2000 non-null   float64
 5   sleep_hours        1840 non-null   float64
 6   nutrition_quality  2000 non-null   float64
 7   activity_index     2000 non-null   float64
 8   smokes             2000 non-null   object 
 9   gender             2000 non-null   object 
 10  is_fit             2000 non-null   int64  
dtypes: float64(5), int64(4), object(2)
memory usage: 172.0+ KB


In [73]:
data_heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [75]:
data_HR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  1500 non-null   int64  
 1   Gender               1500 non-null   int64  
 2   EducationLevel       1500 non-null   int64  
 3   ExperienceYears      1500 non-null   int64  
 4   PreviousCompanies    1500 non-null   int64  
 5   DistanceFromCompany  1500 non-null   float64
 6   InterviewScore       1500 non-null   int64  
 7   SkillScore           1500 non-null   int64  
 8   PersonalityScore     1500 non-null   int64  
 9   RecruitmentStrategy  1500 non-null   int64  
 10  HiringDecision       1500 non-null   int64  
dtypes: float64(1), int64(10)
memory usage: 129.0 KB


In [48]:
data_ecommerce.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   10999 non-null  int64 
 1   Warehouse_block      10999 non-null  object
 2   Mode_of_Shipment     10999 non-null  object
 3   Customer_care_calls  10999 non-null  int64 
 4   Customer_rating      10999 non-null  int64 
 5   Cost_of_the_Product  10999 non-null  int64 
 6   Prior_purchases      10999 non-null  int64 
 7   Product_importance   10999 non-null  object
 8   Gender               10999 non-null  object
 9   Discount_offered     10999 non-null  int64 
 10  Weight_in_gms        10999 non-null  int64 
 11  Reached.on.Time_Y.N  10999 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 1.0+ MB


In [102]:
data_placement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   StudentID                  10000 non-null  int64  
 1   CGPA                       10000 non-null  float64
 2   Internships                10000 non-null  int64  
 3   Projects                   10000 non-null  int64  
 4   Workshops/Certifications   10000 non-null  int64  
 5   AptitudeTestScore          10000 non-null  int64  
 6   SoftSkillsRating           10000 non-null  float64
 7   ExtracurricularActivities  10000 non-null  object 
 8   PlacementTraining          10000 non-null  object 
 9   SSC_Marks                  10000 non-null  int64  
 10  HSC_Marks                  10000 non-null  int64  
 11  PlacementStatus            10000 non-null  object 
dtypes: float64(2), int64(7), object(3)
memory usage: 937.6+ KB


In [123]:
data_insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987 entries, 0 to 1986
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           1987 non-null   int64 
 1   Age                  1987 non-null   int64 
 2   Employment Type      1987 non-null   object
 3   GraduateOrNot        1987 non-null   object
 4   AnnualIncome         1987 non-null   int64 
 5   FamilyMembers        1987 non-null   int64 
 6   ChronicDiseases      1987 non-null   int64 
 7   FrequentFlyer        1987 non-null   object
 8   EverTravelledAbroad  1987 non-null   object
 9   TravelInsurance      1987 non-null   int64 
dtypes: int64(6), object(4)
memory usage: 155.4+ KB


In [130]:
data_churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [None]:
y_banana = data_banana.loc[:,'Quality']
X_banana = data_banana.drop(['Quality'], axis = 1)
y_fitness = data_fitness.loc[:, 'is_fit']
X_fitness = data_fitness.drop(['is_fit'], axis = 1)
y_heart = data_heart.loc[:,'HeartDisease']
X_heart = data_heart.drop(['HeartDisease'], axis = 1)
y_HR = data_HR.loc[:, 'HiringDecision']
X_HR = data_HR.drop(['HiringDecision'], axis = 1)
y_ecommerce = data_ecommerce.loc[:,'Reached.on.Time_Y.N']
X_ecommerce = data_ecommerce.drop(['Reached.on.Time_Y.N'], axis = 1)
y_placement = data_placement.loc[:,'PlacementStatus']
X_placement = data_placement.drop(['PlacementStatus'], axis = 1)
y_insurance = data_insurance.loc[:,'TravelInsurance']
X_insurance = data_insurance.drop(['TravelInsurance'], axis = 1)
y_churn = data_churn.loc[:,'Exited']
X_churn = data_churn.drop(['Exited'], axis = 1)

In [20]:
y_banana = y_banana.map({'Bad':0, 'Good':1})

In [21]:
X_fitness['smokes'] = X_fitness['smokes'].replace({'yes':1, 'no':0}).astype(int)

In [56]:
X_ecommerce = X_ecommerce.drop(['ID'], axis = 1)
X_ecommerce['Gender'] = X_ecommerce['Gender'].replace({'M':1, 'F':0}).astype(int)

  X_ecommerce['Gender'] = X_ecommerce['Gender'].replace({'M':1, 'F':0}).astype(int)


In [106]:
X_placement = X_placement.drop(['StudentID'], axis = 1)
X_placement['ExtracurricularActivities'] = X_placement['ExtracurricularActivities'].replace({'Yes':1, 'No':0}).astype(int)
X_placement['PlacementTraining'] = X_placement['PlacementTraining'].replace({'Yes':1, 'No':0}).astype(int)
y_placement = y_placement.replace({'Placed':1, 'NotPlaced':0}).astype(int)

  X_placement['ExtracurricularActivities'] = X_placement['ExtracurricularActivities'].replace({'Yes':1, 'No':0}).astype(int)
  X_placement['PlacementTraining'] = X_placement['PlacementTraining'].replace({'Yes':1, 'No':0}).astype(int)
  y_placement = y_placement.replace({'Placed':1, 'NotPlaced':0}).astype(int)


In [127]:
X_insurance = X_insurance.drop(['Unnamed: 0'], axis = 1)
X_insurance['GraduateOrNot'] = X_insurance['GraduateOrNot'].replace({'Yes':1, 'No':0}).astype(int)
X_insurance['FrequentFlyer'] = X_insurance['FrequentFlyer'].replace({'Yes':1, 'No':0}).astype(int)
X_insurance['EverTravelledAbroad'] = X_insurance['EverTravelledAbroad'].replace({'Yes':1, 'No':0}).astype(int)


  X_insurance['GraduateOrNot'] = X_insurance['GraduateOrNot'].replace({'Yes':1, 'No':0}).astype(int)
  X_insurance['FrequentFlyer'] = X_insurance['FrequentFlyer'].replace({'Yes':1, 'No':0}).astype(int)
  X_insurance['EverTravelledAbroad'] = X_insurance['EverTravelledAbroad'].replace({'Yes':1, 'No':0}).astype(int)


In [134]:
X_churn = X_churn.drop(['RowNumber', 'CustomerId', 'Surname'], axis = 1)
X_churn['Gender'] = X_churn['Gender'].replace({'Female':1, 'Male':0}).astype(int)

  X_churn['Gender'] = X_churn['Gender'].replace({'Female':1, 'Male':0}).astype(int)


In [42]:
cat_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

In [43]:
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer()),
    ('scale', MinMaxScaler())
])

In [44]:
col_transformer = ColumnTransformer(transformers=[
    ('num-pipeline',num_pipeline, make_column_selector(dtype_include=np.number)),
    ('cat-pipeline',cat_pipeline, make_column_selector(dtype_include=np.object_))
])

In [45]:
model_pipeline = Pipeline(steps=[
    ('preprocessing', col_transformer),
    ('model', RandomForestClassifier())
])

In [95]:
cross_validate(model_pipeline, X_banana, y_banana)

{'fit_time': array([1.57075453, 1.66700673, 1.57266498, 1.62773442, 1.5970397 ]),
 'score_time': array([0.07028484, 0.03342199, 0.02040911, 0.03024554, 0.02393603]),
 'test_score': array([0.959375, 0.95875 , 0.95875 , 0.9775  , 0.9625  ])}

In [97]:
cross_validate(model_pipeline, X_heart, y_heart)

{'fit_time': array([0.24624944, 0.22908688, 0.24077225, 0.22376537, 0.22249675]),
 'score_time': array([0.01804018, 0.02588916, 0.01509261, 0.01696372, 0.01714921]),
 'test_score': array([0.88586957, 0.83152174, 0.8423913 , 0.83060109, 0.7704918 ])}

In [109]:
cross_validate(model_pipeline, X_fitness, y_fitness)

{'fit_time': array([0.47198176, 0.45459151, 0.41576147, 0.44129705, 0.4324882 ]),
 'score_time': array([0.01353693, 0.01581407, 0.01776218, 0.01798964, 0.03025007]),
 'test_score': array([0.7875, 0.755 , 0.75  , 0.7925, 0.8075])}

In [111]:
cross_validate(model_pipeline, X_HR, y_HR)

{'fit_time': array([0.29161882, 0.30934834, 0.3479414 , 0.30029321, 0.26994467]),
 'score_time': array([0.02298927, 0.02048898, 0.01734543, 0.0180738 , 0.01153278]),
 'test_score': array([0.95666667, 0.96333333, 0.97      , 0.95666667, 0.79333333])}

In [58]:
cross_validate(model_pipeline, X_ecommerce, y_ecommerce)

{'fit_time': array([0.79782677, 0.76621556, 0.67122507, 0.63390923, 0.73301721]),
 'score_time': array([0.03327894, 0.02911687, 0.03783584, 0.02770805, 0.03540778]),
 'test_score': array([0.79545455, 0.79136364, 0.58545455, 0.43      , 0.43656207])}

In [107]:
cross_validate(model_pipeline, X_placement, y_placement)

{'fit_time': array([1.00394559, 0.86944604, 0.93291593, 1.58191514, 0.84842706]),
 'score_time': array([0.03828955, 0.05084991, 0.05010104, 0.05006003, 0.03386164]),
 'test_score': array([0.7775, 0.781 , 0.793 , 0.7935, 0.7895])}

In [128]:
cross_validate(model_pipeline, X_insurance, y_insurance)

{'fit_time': array([0.30760598, 0.32146525, 0.20015931, 0.20245862, 0.29564524]),
 'score_time': array([0.01823759, 0.02246642, 0.01369691, 0.01359057, 0.0131619 ]),
 'test_score': array([0.78140704, 0.81155779, 0.80604534, 0.76826196, 0.77581864])}

In [135]:
cross_validate(model_pipeline, X_churn, y_churn)

{'fit_time': array([1.1972549 , 1.28940797, 2.01117802, 1.21680951, 1.21700144]),
 'score_time': array([0.03935695, 0.06498218, 0.03341627, 0.03248978, 0.0332098 ]),
 'test_score': array([0.8645, 0.874 , 0.8645, 0.8655, 0.855 ])}

In [119]:
X_banana_train, X_banana_test, y_banana_train, y_banana_test = train_test_split(X_banana, y_banana, test_size=0.3, random_state=123)
X_fitness_train, X_fitness_test, y_fitness_train, y_fitness_test = train_test_split(X_fitness, y_fitness, test_size=0.3, random_state=123)
X_heart_train, X_heart_test, y_heart_train, y_heart_test = train_test_split(X_heart, y_heart, test_size=0.3, random_state=123)
X_HR_train, X_HR_test, y_HR_train, y_HR_test = train_test_split(X_HR, y_HR, test_size=0.3, random_state=123)