## Import libraries

In [1]:
try:
    from google.colab import drive

    IN_COLAB = True
    drive.mount('/content/drive')
    !pip install -qU python-dotenv scikit-learn xgboost==1.5.0
    %cd /content/drive/MyDrive/Best ML model ever/

except:
    IN_COLAB = False
    # Set working directory
    %cd ..
!pwd

/mnt/d/Google Drive/Best ML model ever
/mnt/d/Google Drive/Best ML model ever


In [2]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split

from src.train_search_best_params import preprocess
from src import model_dispatcher
from collections import namedtuple
import xgboost

import warnings
warnings.filterwarnings("ignore") 


## Load the data

In [3]:
# Choose the input/log file locations for the current runtime.
# Only the absence of `google.colab` selects the local branch; a failing Drive
# mount is no longer swallowed by a bare `except`.
try:
    from google.colab import drive

    IN_COLAB = True
    drive.mount('/content/drive')
    path_data = "input/dri_wat_pot_folds.csv"
    path_test = "input/dri_wat_pot_test.csv"
    path_best_models = "logs/results_all_models.csv"
except ImportError:
    IN_COLAB = False
    path_data = "./input/dri_wat_pot_folds.csv"
    path_test = "./input/dri_wat_pot_test.csv"
    path_best_models = "./logs/results_all_models.csv"

In [4]:
# Read the fold-annotated training file and shuffle its rows with a fixed seed
df = (
    pd.read_csv(path_data)
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
nb_tot_rows = len(df)
# Same frame without the target column
df_features = df.drop(columns="Potability")
df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability,kfold
0,6.799376,239.057680,7986.493239,10.365680,419.881175,373.232137,18.828594,43.440644,3.819985,1,3
1,5.498515,180.825114,21542.830030,6.707095,352.250711,419.512958,13.183432,68.904370,3.074815,1,2
2,7.386582,191.585566,26351.903770,8.426161,,505.187929,18.925674,72.649614,3.791373,1,4
3,6.783888,193.653581,13677.106440,5.171454,323.728663,477.854687,15.056064,,3.250022,0,1
4,7.137429,210.502749,17506.608800,7.304928,301.642004,304.239481,13.076007,64.230942,2.964181,1,2
...,...,...,...,...,...,...,...,...,...,...,...
2615,6.578681,203.408816,22374.824910,6.248929,399.617217,547.702137,12.097920,71.338489,4.292042,1,1
2616,8.922380,278.619448,21963.476000,8.105638,334.053693,385.874799,8.803475,,5.821826,1,1
2617,,233.063426,36640.371370,10.000218,435.115189,337.664412,16.635476,61.073710,4.849755,1,2
2618,,206.318342,18752.269680,8.202453,344.589628,510.487880,5.362371,61.911364,3.293553,0,0


# Loading best learners

In [5]:
# Widen text columns so the JSON parameter strings stay readable
pd.set_option("display.max_colwidth", 100)
# Rank every searched configuration by F1, best first
leaderboard = pd.read_csv(path_best_models).sort_values(by='f1_score', ascending=False)
leaderboard.head(10)

Unnamed: 0,model_name,preprocessing_params,model_params,accuracy,auc,f1_score
1862,xgb,"{""missing"": ""remove_rows"", ""scaling"": ""min_max"", ""add_Solids_log"": false, ""poly_degree"": 3}","{""alpha"": 0, ""lambda"": 0}",0.670588,0.690938,0.55836
1808,xgb,"{""missing"": ""remove_rows"", ""scaling"": ""standard"", ""add_Solids_log"": false, ""poly_degree"": 3}","{""alpha"": 0, ""lambda"": 0}",0.670588,0.690938,0.55836
3505,svm,"{""missing"": ""remove_rows"", ""scaling"": ""standard"", ""add_Solids_log"": true, ""poly_degree"": 2}","{""class_weight"": ""balanced"", ""kernel"": ""rbf""}",0.645201,0.67865,0.554005
3521,svm,"{""missing"": ""remove_rows"", ""scaling"": ""standard"", ""add_Solids_log"": false, ""poly_degree"": 1}","{""class_weight"": ""balanced"", ""kernel"": ""rbf""}",0.647059,0.684065,0.552465
3513,svm,"{""missing"": ""remove_rows"", ""scaling"": ""standard"", ""add_Solids_log"": true, ""poly_degree"": 3}","{""class_weight"": ""balanced"", ""kernel"": ""rbf""}",0.642724,0.676768,0.552388
3296,svm,"{""missing"": ""mean"", ""scaling"": ""min_max"", ""add_Solids_log"": false, ""poly_degree"": 3}","{""class_weight"": ""balanced"", ""kernel"": ""poly""}",0.554198,0.644483,0.551999
3584,svm,"{""missing"": ""remove_rows"", ""scaling"": ""min_max"", ""add_Solids_log"": false, ""poly_degree"": 3}","{""class_weight"": ""balanced"", ""kernel"": ""poly""}",0.647059,0.688336,0.551467
3537,svm,"{""missing"": ""remove_rows"", ""scaling"": ""standard"", ""add_Solids_log"": false, ""poly_degree"": 3}","{""class_weight"": ""balanced"", ""kernel"": ""rbf""}",0.641486,0.67618,0.551286
3576,svm,"{""missing"": ""remove_rows"", ""scaling"": ""min_max"", ""add_Solids_log"": false, ""poly_degree"": 2}","{""class_weight"": ""balanced"", ""kernel"": ""poly""}",0.650774,0.679562,0.551006
3824,svm,"{""missing"": ""knn"", ""scaling"": ""standard"", ""add_Solids_log"": false, ""poly_degree"": 3}","{""class_weight"": ""balanced"", ""kernel"": ""poly""}",0.50229,0.596897,0.551004


In [6]:
# take best performing model of each type excluding the "constant" model
# Filter first, then de-duplicate: the mask is built from `leaderboard`, so it must be
# applied to `leaderboard` itself. Applying it to the already de-duplicated frame makes
# pandas reindex a mismatched boolean key. The leaderboard is sorted by F1, so the first
# row kept per model_name is that model's best configuration.
best_models = leaderboard[leaderboard['model_name'] != 'constant'].drop_duplicates('model_name')
best_models

Unnamed: 0,model_name,preprocessing_params,model_params,accuracy,auc,f1_score
1862,xgb,"{""missing"": ""remove_rows"", ""scaling"": ""min_max"", ""add_Solids_log"": false, ""poly_degree"": 3}","{""alpha"": 0, ""lambda"": 0}",0.670588,0.690938,0.55836
3505,svm,"{""missing"": ""remove_rows"", ""scaling"": ""standard"", ""add_Solids_log"": true, ""poly_degree"": 2}","{""class_weight"": ""balanced"", ""kernel"": ""rbf""}",0.645201,0.67865,0.554005
616,dt_gini,"{""missing"": ""remove_rows"", ""scaling"": ""min_max"", ""add_Solids_log"": false, ""poly_degree"": 1}","{""class_weight"": null, ""splitter"": ""random""}",0.604334,0.596781,0.531293
1255,rf,"{""missing"": ""remove_rows"", ""scaling"": ""standard"", ""add_Solids_log"": true, ""poly_degree"": 3}","{""class_weight"": null, ""criterion"": ""gini""}",0.673065,0.704062,0.527397
951,dt_entropy,"{""missing"": ""remove_rows"", ""scaling"": ""min_max"", ""add_Solids_log"": false, ""poly_degree"": 2}","{""class_weight"": null, ""splitter"": ""best""}",0.609288,0.598242,0.527131
2695,log_reg,"{""missing"": ""remove_rows"", ""scaling"": ""standard"", ""add_Solids_log"": false, ""poly_degree"": 3}","{""penalty"": ""none"", ""solver"": ""lbfgs""}",0.670588,0.69291,0.518937
2384,extratrees,"{""missing"": ""remove_rows"", ""scaling"": ""min_max"", ""add_Solids_log"": false, ""poly_degree"": 3}","{""class_weight"": null, ""criterion"": ""entropy""}",0.67678,0.696226,0.513428
5078,knn,"{""missing"": ""remove_rows"", ""scaling"": ""standard"", ""add_Solids_log"": true, ""poly_degree"": 1}","{""n_neighbors"": 3, ""p"": 2, ""weights"": ""distance""}",0.620433,0.62358,0.493799
3022,svm_calib,"{""missing"": ""remove_rows"", ""scaling"": ""standard"", ""add_Solids_log"": true, ""poly_degree"": 3}","{""ensemble"": false, ""method"": ""isotonic""}",0.666254,0.683332,0.473013
4062,naive_bayes,"{""missing"": ""remove_rows"", ""scaling"": ""min_max"", ""add_Solids_log"": false, ""poly_degree"": 2}","{""var_smoothing"": 1e-05}",0.60743,0.600581,0.459473


In [7]:
# A base learner is a fitted preprocessing pipeline paired with the model trained on its output
Learner = namedtuple('Learner', ['pre_pipeline', 'model'])
learners = []
for idx, row in best_models.iterrows():
    model_name = row['model_name']
    # Both parameter sets are stored as JSON strings in the leaderboard file
    preprocessing_params = json.loads(row['preprocessing_params'])
    model_params = json.loads(row['model_params'])

    # preprocess data (rebuilt from the raw frame on every pass)
    print(f"\nPreprocessing... {preprocessing_params}")
    X = df.drop(["Potability", "kfold"], axis=1, errors='ignore').values
    y = df.Potability.values
    X, y, pre_pipeline = preprocess(
        X,
        y,
        model_name,
        preprocess_params=preprocessing_params,
        save=False,
        return_pipeline=True,
    )
    print(f"Preprocessing results: X of shape {X.shape}")

    # load model: base parameters from the dispatcher, then the tuned ones on top
    model_spec = model_dispatcher.models[model_name]
    model = model_spec["model"](**model_spec["base_model_params"])
    model.set_params(**model_params)
    print(model)

    # train model
    model.fit(X, y)

    # save model together with the pipeline that produced its inputs
    learners.append(Learner(pre_pipeline=pre_pipeline, model=model))


Preprocessing... {'missing': 'remove_rows', 'scaling': 'min_max', 'add_Solids_log': False, 'poly_degree': 3}
Preprocessing results: X of shape (1615, 228)
XGBClassifier(alpha=0, base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=None,
              importance_type=None, interaction_constraints=None, lambda=0,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
              predictor=None, random_state=42, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)

Preprocessing... {'missing': 'remove_rows', 'scaling': 'standard', 'add_Solids_log': True, 'poly_degree': 2}
Preprocessing results: X of shape

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Preprocessing... {'missing': 'remove_rows', 'scaling': 'min_max', 'add_Solids_log': False, 'poly_degree': 3}
Preprocessing results: X of shape (1615, 228)
ExtraTreesClassifier(criterion='entropy', n_jobs=-1, random_state=42)

Preprocessing... {'missing': 'remove_rows', 'scaling': 'standard', 'add_Solids_log': True, 'poly_degree': 1}
Preprocessing results: X of shape (1615, 10)
KNeighborsClassifier(n_jobs=-1, n_neighbors=3, weights='distance')

Preprocessing... {'missing': 'remove_rows', 'scaling': 'standard', 'add_Solids_log': True, 'poly_degree': 3}
Preprocessing results: X of shape (1615, 295)
CalibratedClassifierCV(base_estimator=LinearSVC(max_iter=2000, random_state=42),
                       ensemble=False, method='isotonic', n_jobs=-1)





Preprocessing... {'missing': 'remove_rows', 'scaling': 'min_max', 'add_Solids_log': False, 'poly_degree': 2}
Preprocessing results: X of shape (1615, 63)
GaussianNB(var_smoothing=1e-05)


# Linear Regression Stacking

In [8]:
# remove SVC because it doesn't support predict_proba method
# (matched on the printed form of the estimator, which starts with the class name)
learners = [learner for learner in learners if not str(learner.model).startswith('SVC')]

In [9]:
df_2 = pd.read_csv(path_test)

# Stratified 80/20 split of the held-out file: the larger part trains the meta-learner,
# the smaller part is kept for the final check
df_f_train, df_f_test = train_test_split(
    df_2,
    test_size=0.2,
    stratify=df_2["Potability"],
    random_state=42,
)
X_f_train = df_f_train.drop(columns=["Potability"], errors='ignore').to_numpy()
y_f_train = df_f_train["Potability"].to_numpy()
X_f_test = df_f_test.drop(columns=["Potability"], errors='ignore').to_numpy()
y_f_test = df_f_test["Potability"].to_numpy()

# Filled column by column with the base learners' predictions
df_pred_train = pd.DataFrame()
df_pred_test = pd.DataFrame()
X_f_train.shape

(524, 9)

In [10]:
# Build the level-0 feature tables: one column per base learner holding the
# probability taken from column 1 of predict_proba.
for i, (pre_pipeline, model) in enumerate(learners):
    X_train_processed, y_train_processed = pre_pipeline.transform(X_f_train, y_f_train) # TODO: case where we have different X_processed shapes
    X_test_processed, y_test_processed = pre_pipeline.transform(X_f_test, y_f_test)
    try:
        pred_train = model.predict_proba(X_train_processed)
        pred_test = model.predict_proba(X_test_processed)
    except Exception as err:
        # Catch Exception rather than everything so Ctrl-C still interrupts the loop,
        # and report the cause. A learner skipped here has no column in the tables
        # below, while the later prediction cells still iterate over every learner.
        print(f"error: {model.__class__}: {err!r}")
        continue

    print(pred_train.shape)
    df_pred_train[f'pred{i}'] = pred_train[:,1]
    df_pred_test[f'pred{i}'] = pred_test[:,1]


(319, 2)
(319, 2)
(319, 2)
(319, 2)
(319, 2)
(319, 2)
(319, 2)
(319, 2)
(319, 2)


In [11]:
# Inspect the level-0 feature table: one predN column per base learner
df_pred_train

Unnamed: 0,pred0,pred1,pred2,pred3,pred4,pred5,pred6,pred7,pred8
0,0.013048,0.0,0.40,0.0,0.006837,0.26,0.311085,0.095238,0.488135
1,0.016813,0.0,0.33,0.0,0.312238,0.25,0.000000,0.380550,0.358305
2,0.822755,0.0,0.61,1.0,0.533802,0.63,0.654820,0.466667,0.158960
3,0.044083,1.0,0.31,0.0,0.264065,0.22,0.258578,0.380550,0.069442
4,0.881141,1.0,0.60,1.0,0.626226,0.58,0.665484,0.606061,0.119728
...,...,...,...,...,...,...,...,...,...
314,0.983991,0.0,0.79,1.0,0.767570,0.62,0.000000,0.625000,0.498152
315,0.011684,0.0,0.34,0.0,0.421307,0.36,0.000000,0.380550,0.198414
316,0.456596,0.0,0.57,1.0,0.495442,0.53,0.636145,0.380550,0.042659
317,0.593889,0.0,0.42,0.0,0.223857,0.41,0.000000,0.309645,0.966709


In [12]:
# Attach the targets returned by the pipeline transform to the prediction tables.
# NOTE(review): y_train_processed / y_test_processed are whatever the LAST iteration of the
# preceding loop left behind; this presumably assumes every pipeline keeps the same rows — confirm.
df_pred_train['Potability'] = y_train_processed
df_pred_test['Potability'] = y_test_processed
df_pred_train

Unnamed: 0,pred0,pred1,pred2,pred3,pred4,pred5,pred6,pred7,pred8,Potability
0,0.013048,0.0,0.40,0.0,0.006837,0.26,0.311085,0.095238,0.488135,0
1,0.016813,0.0,0.33,0.0,0.312238,0.25,0.000000,0.380550,0.358305,1
2,0.822755,0.0,0.61,1.0,0.533802,0.63,0.654820,0.466667,0.158960,0
3,0.044083,1.0,0.31,0.0,0.264065,0.22,0.258578,0.380550,0.069442,0
4,0.881141,1.0,0.60,1.0,0.626226,0.58,0.665484,0.606061,0.119728,0
...,...,...,...,...,...,...,...,...,...,...
314,0.983991,0.0,0.79,1.0,0.767570,0.62,0.000000,0.625000,0.498152,1
315,0.011684,0.0,0.34,0.0,0.421307,0.36,0.000000,0.380550,0.198414,1
316,0.456596,0.0,0.57,1.0,0.495442,0.53,0.636145,0.380550,0.042659,1
317,0.593889,0.0,0.42,0.0,0.223857,0.41,0.000000,0.309645,0.966709,0


In [13]:
# create 5 folds
from sklearn.model_selection import StratifiedKFold

# Start with a sentinel, then stamp each row with the fold in which it is validated
df_pred_train.loc[:, "kfold"] = -1
kf = StratifiedKFold(n_splits=5)
fold_splits = kf.split(X=df_pred_train, y=df_pred_train['Potability'])
for fold_id, (train_idx, valid_idx) in enumerate(fold_splits):
    df_pred_train.loc[valid_idx, 'kfold'] = fold_id
df_pred_train

Unnamed: 0,pred0,pred1,pred2,pred3,pred4,pred5,pred6,pred7,pred8,Potability,kfold
0,0.013048,0.0,0.40,0.0,0.006837,0.26,0.311085,0.095238,0.488135,0,0
1,0.016813,0.0,0.33,0.0,0.312238,0.25,0.000000,0.380550,0.358305,1,0
2,0.822755,0.0,0.61,1.0,0.533802,0.63,0.654820,0.466667,0.158960,0,0
3,0.044083,1.0,0.31,0.0,0.264065,0.22,0.258578,0.380550,0.069442,0,0
4,0.881141,1.0,0.60,1.0,0.626226,0.58,0.665484,0.606061,0.119728,0,0
...,...,...,...,...,...,...,...,...,...,...,...
314,0.983991,0.0,0.79,1.0,0.767570,0.62,0.000000,0.625000,0.498152,1,4
315,0.011684,0.0,0.34,0.0,0.421307,0.36,0.000000,0.380550,0.198414,1,4
316,0.456596,0.0,0.57,1.0,0.495442,0.53,0.636145,0.380550,0.042659,1,4
317,0.593889,0.0,0.42,0.0,0.223857,0.41,0.000000,0.309645,0.966709,0,4


In [14]:
# fit linear regression on preds
from sklearn.linear_model import LinearRegression
from src.evaluate import calculate_metrics

# Read from df_pred_train directly. Rebinding the notebook-wide `df` here would replace
# the raw training frame that the base-learner training cell reads, so re-running that
# cell afterwards would silently fit on the prediction table instead.
final_predictions = []  # one vector of hold-out predictions per fold model
scores = []             # validation metrics per fold
metalearners = []       # fitted LinearRegression per fold

# The hold-out meta-features do not depend on the fold, so build them once
xtest = df_pred_test.drop(["Potability"], axis=1)

for fold in range(5):
    xtrain = df_pred_train[df_pred_train.kfold != fold].reset_index(drop=True)
    xvalid = df_pred_train[df_pred_train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.Potability
    yvalid = xvalid.Potability

    # Keep only the base-learner prediction columns as features
    xtrain = xtrain.drop(["Potability", "kfold"], axis=1)
    xvalid = xvalid.drop(["Potability", "kfold"], axis=1)

    model = LinearRegression()
    model.fit(xtrain, ytrain)
    print(xtrain.head())
    print(ytrain)
    metalearners.append(model)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)

    final_predictions.append(test_preds)
    score = calculate_metrics(yvalid, preds_valid)
    print(fold, score)
    scores.append(score)

      pred0  pred1  pred2  pred3     pred4  pred5     pred6     pred7  \
0  0.293031    1.0   0.32    0.0  0.223038   0.30  0.330968  0.309645   
1  0.004630    0.0   0.27    0.0  0.363435   0.21  0.302515  0.380550   
2  0.128731    1.0   0.33    1.0  0.449317   0.41  0.679241  0.380550   
3  0.051113    0.0   0.33    0.0  0.256540   0.29  0.000000  0.451613   
4  0.830738    1.0   0.50    1.0  0.766601   0.59  0.701215  0.756757   

      pred8  
0  0.019417  
1  0.079529  
2  0.096825  
3  0.086390  
4  0.564191  
0      0
1      0
2      0
3      0
4      1
      ..
250    1
251    1
252    1
253    0
254    0
Name: Potability, Length: 255, dtype: int64
0 {'accuracy': 0.71875, 'f1_score': 0.5909090909090909, 'auc': 0.7651282051282052}
      pred0  pred1  pred2  pred3     pred4  pred5     pred6     pred7  \
0  0.013048    0.0   0.40    0.0  0.006837   0.26  0.311085  0.095238   
1  0.016813    0.0   0.33    0.0  0.312238   0.25  0.000000  0.380550   
2  0.822755    0.0   0.61    1.0

In [15]:
# Average the fold models' hold-out outputs row by row, then score the average
pred_probs = np.column_stack(final_predictions).mean(axis=1)
print(pred_probs.shape)
calculate_metrics(df_pred_test["Potability"].to_numpy(), pred_probs)


(77,)


{'accuracy': 0.7272727272727273,
 'f1_score': 0.6181818181818182,
 'auc': 0.7580645161290323}

# Final Prediction Pipeline

In [16]:
# One column per base learner: transform the raw hold-out features with the learner's own
# pipeline, then keep column 1 of its predict_proba output
preds1 = np.column_stack([
    model.predict_proba(pre_pipeline.transform(X_f_test)[0])[:, 1]
    for pre_pipeline, model in learners
])
preds1.shape

(77, 9)

In [17]:
# Summary statistics of each base learner's class-1 probability column on the hold-out split
pd.DataFrame(preds1).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,0.411083,0.493506,0.432208,0.376623,0.419952,0.420779,0.375956,0.421455,0.423378
std,0.394947,0.503236,0.188956,0.487717,0.246519,0.171163,0.297043,0.169824,0.367363
min,0.000563,0.0,0.03,0.0,0.051627,0.07,0.0,0.183099,0.005651
25%,0.048433,0.0,0.28,0.0,0.25325,0.31,0.0,0.309645,0.100681
50%,0.284588,0.0,0.41,0.0,0.403268,0.42,0.339147,0.38055,0.229177
75%,0.882631,1.0,0.57,1.0,0.537167,0.53,0.659495,0.606061,0.829658
max,0.994848,1.0,0.87,1.0,0.982495,0.83,1.0,0.756757,0.99999


In [18]:
# Stack the per-fold hold-out outputs side by side (rows x folds)
final_preds = np.column_stack(final_predictions)
print(final_preds.shape)
# Average across folds and round to the nearest integer to get the class label
final_preds = np.rint(final_preds.mean(axis=1)).astype(int)

(77, 5)


In [19]:
# Distribution of the final integer predictions (the mean is the share of rows predicted as 1)
pd.DataFrame(final_preds).describe()

Unnamed: 0,0
count,77.0
mean,0.311688
std,0.466221
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [20]:
# Drop incomplete rows, renumber, and put the predicted class next to the features
df_f_test = (
    df_f_test
    .dropna()
    .reset_index(drop=True)
    .assign(pred=final_preds)
)
df_f_test

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability,pred
0,7.581688,180.749140,11989.246240,4.977307,328.176978,617.883513,13.561253,39.215917,4.457282,0,0
1,8.558389,198.888868,8535.402956,4.796666,321.403477,453.571163,14.351624,74.326204,3.946228,0,0
2,6.934311,228.194476,13901.644680,6.045503,281.552117,403.215091,17.811900,66.779338,4.210975,0,0
3,7.078659,232.581848,26177.789510,3.914353,334.161127,380.459918,9.633631,41.951940,3.696640,0,0
4,6.334163,197.482326,23460.821380,6.121464,303.550324,399.563786,18.822216,62.222875,4.038830,1,0
...,...,...,...,...,...,...,...,...,...,...,...
72,6.412707,204.358097,32333.841540,5.204084,345.717500,389.215295,11.145040,91.204867,5.014145,0,0
73,5.842755,202.044775,13969.261560,6.643918,290.795954,475.142956,18.189839,77.931104,4.490393,1,0
74,7.817901,221.089708,13742.145970,6.373737,287.698481,460.599214,12.704827,66.777600,4.622668,0,0
75,7.775386,193.077168,15704.482090,7.881197,324.336203,301.753476,13.378165,89.051957,3.309472,0,0


In [21]:
# number of correct classifications
df_pred_test['Potability'].eq(final_preds).sum()

56

In [22]:
class StackingClf():
    """Two-level stacking predictor assembled from already-fitted parts.

    Attributes are stored as given:
      learners     -- iterable of (pre_pipeline, model) pairs; `pre_pipeline.transform(X)`
                      is unpacked as a 2-tuple whose first item feeds `model.predict_proba`.
      metalearners -- fitted models whose `predict` takes the matrix of base-learner
                      outputs (one column per learner); their outputs are averaged.
    """

    def __init__(self, learners, metalearners):
        self.learners = learners
        self.metalearners = metalearners

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """ Predicts the averaged meta-learner score.
        Arguments : X, raw feature rows accepted by every pre_pipeline
        Returns : 1-D array with one score per row produced by the pipelines.
                  The score is the plain mean of the meta-learners' `predict` outputs;
                  nothing here clips it to [0, 1].
        Raises : ValueError when there is no learner or no meta-learner.
        """
        if len(self.learners) == 0 or len(self.metalearners) == 0:
            # Fail with a readable message instead of numpy's column_stack error
            raise ValueError("StackingClf needs at least one learner and one metalearner")

        # Level 0: column 1 of every base learner's predict_proba, side by side
        base_preds = []
        for pre_pipeline, model in self.learners:
            X_processed, _ = pre_pipeline.transform(X)
            base_preds.append(model.predict_proba(X_processed)[:, 1])
        base_preds = np.column_stack(base_preds)

        # Level 1: every meta-learner scores the level-0 matrix; average their outputs
        meta_preds = np.column_stack(
            [metalearner.predict(base_preds) for metalearner in self.metalearners]
        )
        return np.mean(meta_preds, axis=1)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """ Predicts class
        Arguments : X, raw feature rows accepted by every pre_pipeline
        Returns : 1-D integer array, the averaged score rounded to the nearest integer
        """
        preds_probs = self.predict_proba(X)
        predicted_class = np.round(preds_probs).astype(int)
        return predicted_class

In [23]:
# Bundle the fitted base learners with the five fold-wise meta-learners into one predictor
clf = StackingClf(learners, metalearners)

In [28]:
# evaluate on test
# Feature columns only; columns missing from the frame are ignored
X = df_f_test.drop(columns=["Potability", "kfold", "pred"], errors='ignore')
y = df_f_test["Potability"].to_numpy()
preds_probs = clf.predict_proba(X)
# Score against the targets of the rows without missing values
calculate_metrics(df_f_test.dropna()["Potability"].to_numpy(), preds_probs)

{'accuracy': 0.7272727272727273,
 'f1_score': 0.6181818181818182,
 'auc': 0.7580645161290323}

In [33]:
# evaluate on train
# Feature columns only; columns missing from the frame are ignored
X = df_f_train.drop(columns=["Potability", "kfold", "pred"], errors='ignore')
y = df_f_train["Potability"].to_numpy()
preds_probs = clf.predict_proba(X)
# Score against the targets of the rows without missing values
calculate_metrics(df_f_train.dropna()["Potability"].to_numpy(), preds_probs)

{'accuracy': 0.7366771159874608,
 'f1_score': 0.6074766355140188,
 'auc': 0.779463771691751}

In [26]:
from src.utils import save_file
from src import config, model_dispatcher
import pickle

# Identifiers used to build the output file names
model_name = 'stacking'
fold = -1

# save model
# save_file(clf, f"{config.SAVED_MODELS}/{model_name}/{model_name}_{fold}.bin")
stacking_model_path = f"{config.SAVED_MODELS}/{model_name}/{model_name}_{fold}.bin"
with open(stacking_model_path, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [27]:
class StackingPrePipeline():
    """Pass-through preprocessing step: `transform` hands its input back unchanged."""

    def __init__(self):
        """Nothing to configure."""

    def transform(self, X):
        """Return X exactly as received."""
        return X

# save preprocessing pipeline
# NOTE(review): the class object itself (not an instance) is handed to save_file, and its
# transform(X) returns X alone, whereas the base-learner pipelines used earlier in this notebook
# are unpacked as a 2-tuple — confirm that whatever loads this file expects that.
save_file(
    StackingPrePipeline, f"{config.SAVED_MODELS}/{model_name}/{model_name}_{fold}_preprocess.pkl")