## Tunability Importance of Hyperparameters of ML Algorithms

In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint, uniform, loguniform
import xgboost
from xgboost import XGBClassifier

In [3]:
data_fitness = pd.read_csv("../data/fitness_dataset.csv")
data_heart = pd.read_csv("../data/heart.csv")
data_ecommerce = pd.read_csv("../data/ecommerce.csv")
data_placement = pd.read_csv("../data/placementdata.csv")
data_insurance = pd.read_csv("../data/TravelInsurancePrediction.csv")
data_churn = pd.read_csv("../data/Churn_Modelling.csv")


In [4]:
data_fitness.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                2000 non-null   int64  
 1   height_cm          2000 non-null   int64  
 2   weight_kg          2000 non-null   int64  
 3   heart_rate         2000 non-null   float64
 4   blood_pressure     2000 non-null   float64
 5   sleep_hours        1840 non-null   float64
 6   nutrition_quality  2000 non-null   float64
 7   activity_index     2000 non-null   float64
 8   smokes             2000 non-null   object 
 9   gender             2000 non-null   object 
 10  is_fit             2000 non-null   int64  
dtypes: float64(5), int64(4), object(2)
memory usage: 172.0+ KB


In [5]:
data_heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [6]:
data_ecommerce.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   10999 non-null  int64 
 1   Warehouse_block      10999 non-null  object
 2   Mode_of_Shipment     10999 non-null  object
 3   Customer_care_calls  10999 non-null  int64 
 4   Customer_rating      10999 non-null  int64 
 5   Cost_of_the_Product  10999 non-null  int64 
 6   Prior_purchases      10999 non-null  int64 
 7   Product_importance   10999 non-null  object
 8   Gender               10999 non-null  object
 9   Discount_offered     10999 non-null  int64 
 10  Weight_in_gms        10999 non-null  int64 
 11  Reached.on.Time_Y.N  10999 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 1.0+ MB


In [7]:
data_placement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   StudentID                  10000 non-null  int64  
 1   CGPA                       10000 non-null  float64
 2   Internships                10000 non-null  int64  
 3   Projects                   10000 non-null  int64  
 4   Workshops/Certifications   10000 non-null  int64  
 5   AptitudeTestScore          10000 non-null  int64  
 6   SoftSkillsRating           10000 non-null  float64
 7   ExtracurricularActivities  10000 non-null  object 
 8   PlacementTraining          10000 non-null  object 
 9   SSC_Marks                  10000 non-null  int64  
 10  HSC_Marks                  10000 non-null  int64  
 11  PlacementStatus            10000 non-null  object 
dtypes: float64(2), int64(7), object(3)
memory usage: 937.6+ KB


In [8]:
data_insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987 entries, 0 to 1986
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           1987 non-null   int64 
 1   Age                  1987 non-null   int64 
 2   Employment Type      1987 non-null   object
 3   GraduateOrNot        1987 non-null   object
 4   AnnualIncome         1987 non-null   int64 
 5   FamilyMembers        1987 non-null   int64 
 6   ChronicDiseases      1987 non-null   int64 
 7   FrequentFlyer        1987 non-null   object
 8   EverTravelledAbroad  1987 non-null   object
 9   TravelInsurance      1987 non-null   int64 
dtypes: int64(6), object(4)
memory usage: 155.4+ KB


In [9]:
data_churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [10]:
y_fitness = data_fitness.loc[:, 'is_fit']
X_fitness = data_fitness.drop(['is_fit'], axis = 1)
y_heart = data_heart.loc[:,'HeartDisease']
X_heart = data_heart.drop(['HeartDisease'], axis = 1)
y_ecommerce = data_ecommerce.loc[:,'Reached.on.Time_Y.N']
X_ecommerce = data_ecommerce.drop(['Reached.on.Time_Y.N'], axis = 1)
y_placement = data_placement.loc[:,'PlacementStatus']
X_placement = data_placement.drop(['PlacementStatus'], axis = 1)
y_insurance = data_insurance.loc[:,'TravelInsurance']
X_insurance = data_insurance.drop(['TravelInsurance'], axis = 1)
y_churn = data_churn.loc[:,'Exited']
X_churn = data_churn.drop(['Exited'], axis = 1)

In [11]:
X_fitness['smokes'] = X_fitness['smokes'].replace({'yes':1, 'no':0}).astype(int)

In [12]:
X_ecommerce = X_ecommerce.drop(['ID'], axis = 1)
X_ecommerce['Gender'] = X_ecommerce['Gender'].replace({'M':1, 'F':0}).astype(int)

  X_ecommerce['Gender'] = X_ecommerce['Gender'].replace({'M':1, 'F':0}).astype(int)


In [13]:
X_placement = X_placement.drop(['StudentID'], axis = 1)
X_placement['ExtracurricularActivities'] = X_placement['ExtracurricularActivities'].replace({'Yes':1, 'No':0}).astype(int)
X_placement['PlacementTraining'] = X_placement['PlacementTraining'].replace({'Yes':1, 'No':0}).astype(int)
y_placement = y_placement.replace({'Placed':1, 'NotPlaced':0}).astype(int)

  X_placement['ExtracurricularActivities'] = X_placement['ExtracurricularActivities'].replace({'Yes':1, 'No':0}).astype(int)
  X_placement['PlacementTraining'] = X_placement['PlacementTraining'].replace({'Yes':1, 'No':0}).astype(int)
  y_placement = y_placement.replace({'Placed':1, 'NotPlaced':0}).astype(int)


In [14]:
X_insurance = X_insurance.drop(['Unnamed: 0'], axis = 1)
X_insurance['GraduateOrNot'] = X_insurance['GraduateOrNot'].replace({'Yes':1, 'No':0}).astype(int)
X_insurance['FrequentFlyer'] = X_insurance['FrequentFlyer'].replace({'Yes':1, 'No':0}).astype(int)
X_insurance['EverTravelledAbroad'] = X_insurance['EverTravelledAbroad'].replace({'Yes':1, 'No':0}).astype(int)


  X_insurance['GraduateOrNot'] = X_insurance['GraduateOrNot'].replace({'Yes':1, 'No':0}).astype(int)
  X_insurance['FrequentFlyer'] = X_insurance['FrequentFlyer'].replace({'Yes':1, 'No':0}).astype(int)
  X_insurance['EverTravelledAbroad'] = X_insurance['EverTravelledAbroad'].replace({'Yes':1, 'No':0}).astype(int)


In [15]:
X_churn = X_churn.drop(['RowNumber', 'CustomerId', 'Surname'], axis = 1)
X_churn['Gender'] = X_churn['Gender'].replace({'Female':1, 'Male':0}).astype(int)

  X_churn['Gender'] = X_churn['Gender'].replace({'Female':1, 'Male':0}).astype(int)


In [16]:
cat_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

In [17]:
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer()),
    ('scale', MinMaxScaler())
])

In [18]:
col_transformer = ColumnTransformer(transformers=[
    ('num-pipeline',num_pipeline, make_column_selector(dtype_include=np.number)),
    ('cat-pipeline',cat_pipeline, make_column_selector(dtype_include=np.object_))
])

In [19]:
model_pipeline = Pipeline(steps=[
    ('preprocessing', col_transformer),
    ('model', RandomForestClassifier())
])

In [20]:
cross_validate(model_pipeline, X_heart, y_heart)

{'fit_time': array([0.2440064 , 0.25380373, 0.18061924, 0.17407489, 0.22447062]),
 'score_time': array([0.0212121 , 0.0120039 , 0.01260066, 0.01460385, 0.01197529]),
 'test_score': array([0.91304348, 0.81521739, 0.84782609, 0.84153005, 0.74863388])}

In [21]:
cross_validate(model_pipeline, X_fitness, y_fitness)

{'fit_time': array([0.38510299, 0.35603809, 0.35799885, 0.39331603, 0.36042356]),
 'score_time': array([0.01565456, 0.01501441, 0.0169251 , 0.01600242, 0.01505899]),
 'test_score': array([0.78  , 0.775 , 0.7525, 0.7775, 0.79  ])}

In [22]:
cross_validate(model_pipeline, X_ecommerce, y_ecommerce)

{'fit_time': array([1.13865757, 1.21855974, 1.20355248, 1.02075481, 1.00644732]),
 'score_time': array([0.03598976, 0.04397106, 0.06556582, 0.05217552, 0.05131912]),
 'test_score': array([0.79227273, 0.80545455, 0.59363636, 0.41727273, 0.42428377])}

In [23]:
cross_validate(model_pipeline, X_placement, y_placement)

{'fit_time': array([0.81025505, 0.83256054, 0.81821609, 0.83361959, 0.84982443]),
 'score_time': array([0.05111909, 0.04384136, 0.04000616, 0.04120994, 0.04199433]),
 'test_score': array([0.781, 0.782, 0.796, 0.791, 0.79 ])}

In [24]:
cross_validate(model_pipeline, X_insurance, y_insurance)

{'fit_time': array([0.21350408, 0.23848295, 0.22204638, 0.21445537, 0.22395611]),
 'score_time': array([0.01467896, 0.01608467, 0.01568985, 0.01493382, 0.0189991 ]),
 'test_score': array([0.77889447, 0.82160804, 0.80352645, 0.76574307, 0.78085642])}

In [25]:
cross_validate(model_pipeline, X_churn, y_churn)

{'fit_time': array([1.03286338, 1.03712225, 1.0444057 , 1.04102397, 1.07424164]),
 'score_time': array([0.04308367, 0.05000615, 0.03799534, 0.03968143, 0.04693437]),
 'test_score': array([0.8635, 0.871 , 0.863 , 0.867 , 0.862 ])}

In [26]:
X_fitness_train, X_fitness_test, y_fitness_train, y_fitness_test = train_test_split(X_fitness, y_fitness, test_size=0.3, random_state=123)
X_heart_train, X_heart_test, y_heart_train, y_heart_test = train_test_split(X_heart, y_heart, test_size=0.3, random_state=123)
X_placement_train, X_placement_test, y_placement_train, y_placement_test = train_test_split(X_placement, y_placement, test_size=0.3, random_state=123)
X_insurance_train, X_insurance_test, y_insurance_train, y_insurance_test = train_test_split(X_insurance, y_insurance, test_size=0.3, random_state=123)
X_churn_train, X_churn_test, y_churn_train, y_churn_test = train_test_split(X_churn, y_churn, test_size=0.3, random_state=123)

In [27]:
X_train = [X_fitness_train, X_heart_train, X_placement_train, X_insurance_train, X_churn_train]
X_test = [X_fitness_test, X_heart_test, X_placement_test, X_insurance_test, X_churn_test]
y_train = [y_fitness_train, y_heart_train, y_placement_train, y_insurance_train, y_churn_train]
y_test = [y_fitness_test, y_heart_test, y_placement_test, y_insurance_test, y_churn_test]

In [59]:
param_dtc = {
    "model__max_depth": randint(1, 30),
    "model__min_samples_split": randint(2, 20),
    "model__min_samples_leaf": randint(1, 10),
    "model__max_features": uniform(0.1, 1.0),
}

param_rf = {
    "model__n_estimators": randint(100, 2000),
    "model__max_depth": randint(5, 30),
    "model__min_samples_leaf": randint(1, 10),
    "model__max_features": uniform(0.1, 1.0),
}

param_xgb = {
    "model__n_estimators": randint(100, 5000),
    "model__learning_rate": uniform(0.01, 0.3),
    "model__max_depth": randint(1, 15),
    "model__subsample": uniform(0.1, 0.9),
}

In [30]:
results_list = []

model_pipeline.set_params(model=DecisionTreeClassifier())

for i, (X, y) in enumerate(zip(X_train, y_train)):
    search = RandomizedSearchCV(
        model_pipeline,
        param_distributions=param_dtc,
        n_iter=30,
        scoring='roc_auc',
        cv=5,
        random_state=123,
        n_jobs=-1
    )

    search.fit(X, y)

    df = pd.DataFrame(search.cv_results_)
    df['dataset'] = i
    results_list.append(df)

results_dtc = pd.concat(results_list, ignore_index=True)

In [None]:
param_cols_dtc = [c for c in results_dtc.columns if c.startswith('param_')]
results_dtc.groupby(param_cols_dtc)['mean_test_score'].mean().reset_index().sort_values('mean_test_score', ascending=False).reset_index(drop=True).head()

Unnamed: 0,param_model__max_depth,param_model__max_features,param_model__min_samples_leaf,param_model__min_samples_split,mean_test_score
0,9,0.607204,9,15,0.823954
1,5,0.498044,5,19,0.817453
2,15,0.597309,8,13,0.813268
3,4,0.498186,8,8,0.812132
4,8,0.824455,5,8,0.812127


TODO: 
Dla każdegeo datasetu wziąć rocauc z najlepszych parametrow (ten co ma teraz mean_test_score 0.82) i policzyć różnice między rocauc dla tego najlepszego ustawienia hiperparametrów a rocauc dla każdego innego ustawienia. Do tego jakiś wykres.

In [32]:
results_list = []

model_pipeline.set_params(model=RandomForestClassifier())

for i, (X, y) in enumerate(zip(X_train, y_train)):
    search = RandomizedSearchCV(
        model_pipeline,
        param_distributions=param_rf,
        n_iter=10,
        scoring='roc_auc',
        cv=5,
        random_state=123,
        n_jobs=-1
    )

    search.fit(X, y)

    df = pd.DataFrame(search.cv_results_)
    df['dataset'] = i
    results_list.append(df)

results_rf = pd.concat(results_list, ignore_index=True)

In [33]:
param_cols_rf = [c for c in results_rf.columns if c.startswith('param_')]
results_rf.groupby(param_cols_rf)['mean_test_score'].mean().reset_index().sort_values('mean_test_score', ascending=False).reset_index(drop=True).head()

Unnamed: 0,param_model__max_depth,param_model__max_features,param_model__min_samples_leaf,param_model__n_estimators,mean_test_score
0,9,0.498044,5,1941,0.861722
1,7,0.631828,9,1300,0.861139
2,18,0.386139,3,1866,0.859433
3,6,0.754721,3,688,0.85856
4,5,0.679694,1,1042,0.858379


In [60]:
results_list = []

model_pipeline.set_params(model=XGBClassifier())

for i, (X, y) in enumerate(zip(X_train, y_train)):
    search = RandomizedSearchCV(
        model_pipeline,
        param_distributions=param_xgb,
        n_iter=10,
        scoring='roc_auc',
        cv=5,
        random_state=123,
        n_jobs=-1
    )

    search.fit(X, y)

    df = pd.DataFrame(search.cv_results_)
    df['dataset'] = i
    results_list.append(df)

results_xgb = pd.concat(results_list, ignore_index=True)

In [61]:
param_cols_xgb = [c for c in results_xgb.columns if c.startswith('param_')]
results_xgb.groupby(param_cols_xgb)['mean_test_score'].mean().reset_index().sort_values('mean_test_score', ascending=False).reset_index(drop=True).head()

Unnamed: 0,param_model__learning_rate,param_model__max_depth,param_model__n_estimators,param_model__subsample,mean_test_score
0,0.142077,5,139,0.741097,0.851732
1,0.107245,14,1192,0.668613,0.840957
2,0.138359,14,1259,0.75201,0.840467
3,0.215449,10,1042,0.664585,0.837784
4,0.199293,11,937,0.490331,0.836064


TODO: ilość iteracji, Bayes