<a href="https://colab.research.google.com/github/SIDIBEMoussa/LoanPrediction_Zindi_Competition/blob/main/Simple_feature_eng_Starter_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Africa Credit Challenge

The goals of this `starter notebook` are:

1. How to load data into a `Pandas` dataframe

2. How to `train` a basic machine learning model

3. How to `evaluate` the model

4. How to `save` your __submission__ following the required format.

<font color="red"><b>Note: This is just a starter notebook. It is by no means an ideal solution.</b></font>

In [210]:
from warnings import filterwarnings
filterwarnings('ignore')

In [211]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [212]:
import os

os.chdir("/content/drive/MyDrive/Data_Zindi_Loan/")

In [213]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

## Load and Inspect the data

In [214]:
# Loading the train dataset
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
# Display the first few rows of the datasets and their shape
train.head()

Unnamed: 0,ID,customer_id,country_id,tbl_loan_id,lender_id,loan_type,Total_Amount,Total_Amount_to_Repay,disbursement_date,due_date,duration,New_versus_Repeat,Amount_Funded_By_Lender,Lender_portion_Funded,Lender_portion_to_be_repaid,target
0,ID_266671248032267278,266671,Kenya,248032,267278,Type_1,8448.0,8448.0,2022-08-30,2022-09-06,7,Repeat Loan,120.85,0.014305,121.0,0
1,ID_248919228515267278,248919,Kenya,228515,267278,Type_1,25895.0,25979.0,2022-07-30,2022-08-06,7,Repeat Loan,7768.5,0.3,7794.0,0
2,ID_308486370501251804,308486,Kenya,370501,251804,Type_7,6900.0,7142.0,2024-09-06,2024-09-13,7,Repeat Loan,1380.0,0.2,1428.0,0
3,ID_266004285009267278,266004,Kenya,285009,267278,Type_1,8958.0,9233.0,2022-10-20,2022-10-27,7,Repeat Loan,2687.4,0.3,2770.0,0
4,ID_253803305312267278,253803,Kenya,305312,267278,Type_1,4564.0,4728.0,2022-11-28,2022-12-05,7,Repeat Loan,1369.2,0.3,1418.0,0


In [215]:
train["disbursement_date"] = pd.to_datetime(train["disbursement_date"])
test["disbursement_date"] = pd.to_datetime(test["disbursement_date"])
train["due_date"] = pd.to_datetime(train["due_date"])
test["due_date"] = pd.to_datetime(test["due_date"])

In [216]:
test.groupby("country_id").count()

Unnamed: 0_level_0,ID,customer_id,tbl_loan_id,lender_id,loan_type,Total_Amount,Total_Amount_to_Repay,disbursement_date,due_date,duration,New_versus_Repeat,Amount_Funded_By_Lender,Lender_portion_Funded,Lender_portion_to_be_repaid
country_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Ghana,3525,3525,3525,3525,3525,3525,3525,3525,3525,3525,3525,3525,3525,3525
Kenya,15069,15069,15069,15069,15069,15069,15069,15069,15069,15069,15069,15069,15069,15069


In [217]:
# Are there missing values in the train dataset ?
print(f"There are {train.isna().sum().sum()} missing values in the data.")

There are 0 missing values in the data.


In [218]:
indicator_data = pd.read_csv('economic_indicators.csv')
indicator_data.head(2)

Unnamed: 0,Country,Indicator,YR2001,YR2002,YR2003,YR2004,YR2005,YR2006,YR2007,YR2008,...,YR2014,YR2015,YR2016,YR2017,YR2018,YR2019,YR2020,YR2021,YR2022,YR2023
0,Ghana,"Inflation, consumer prices (annual %)",41.509496,9.360932,29.77298,18.042739,15.438992,11.679184,10.734267,16.49464,...,15.489616,17.14997,17.454635,12.371922,7.808765,7.14364,9.88729,9.971089,31.255895,38.106966
1,Cote d'Ivoire,"Inflation, consumer prices (annual %)",4.361529,3.077265,3.296807,1.457988,3.88583,2.467191,1.892006,6.308528,...,0.448682,1.2515,0.723178,0.685881,0.359409,-1.106863,2.425007,4.091952,5.276167,4.387117


In [219]:
kenya_indicator = indicator_data.loc[indicator_data.Country == 'Kenya']
ghana_indicator = indicator_data.loc[indicator_data.Country == 'Ghana']

In [220]:
kenya_indicator.head(1)

Unnamed: 0,Country,Indicator,YR2001,YR2002,YR2003,YR2004,YR2005,YR2006,YR2007,YR2008,...,YR2014,YR2015,YR2016,YR2017,YR2018,YR2019,YR2020,YR2021,YR2022,YR2023
2,Kenya,"Inflation, consumer prices (annual %)",5.738598,1.961308,9.815691,11.624036,10.312778,14.453734,9.75888,26.239817,...,6.878155,6.582154,6.29725,8.00565,4.689806,5.239638,5.405162,6.107936,7.659863,7.671396


In [221]:
kenya_indicator.columns = list(map(lambda x: x[2:], indicator_data.columns))
ghana_indicator.columns = list(map(lambda x: x[2:], indicator_data.columns))

In [222]:
kenya_indicator.head(1)

Unnamed: 0,untry,dicator,2001,2002,2003,2004,2005,2006,2007,2008,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
2,Kenya,"Inflation, consumer prices (annual %)",5.738598,1.961308,9.815691,11.624036,10.312778,14.453734,9.75888,26.239817,...,6.878155,6.582154,6.29725,8.00565,4.689806,5.239638,5.405162,6.107936,7.659863,7.671396


In [223]:
def get_dict(kenya_indicator):
    kenya_indicator_dict = {}
    for indicator_name in kenya_indicator.dicator:
        for year in kenya_indicator.columns[2:]:
            if year>"2020":
                kenya_indicator_dict[indicator_name+year] = kenya_indicator.loc[kenya_indicator.dicator == indicator_name, year].values[0]

    return kenya_indicator_dict

kenya_indicator_dict = get_dict(kenya_indicator)
ghana_indicator_dict = get_dict(ghana_indicator)

In [224]:
kenya_indicator_dict

{'Inflation, consumer prices (annual %)2021': 6.10793603659176,
 'Inflation, consumer prices (annual %)2022': 7.65986268272224,
 'Inflation, consumer prices (annual %)2023': 7.67139634029402,
 'Official exchange rate (LCU per US$, period average)2021': 109.637746592367,
 'Official exchange rate (LCU per US$, period average)2022': 117.865989201683,
 'Official exchange rate (LCU per US$, period average)2023': 139.846383759617,
 'Real interest rate (%)2021': 7.42758100514372,
 'Real interest rate (%)2022': 5.96844687503544,
 'Real interest rate (%)2023': 6.54651706101945,
 'Average precipitation in depth (mm per year)2021': 630.0,
 'Average precipitation in depth (mm per year)2022': nan,
 'Average precipitation in depth (mm per year)2023': nan,
 'Deposit interest rate (%)2021': 6.68697378755183,
 'Deposit interest rate (%)2022': 7.1451755018279,
 'Deposit interest rate (%)2023': 9.16769017629068,
 'Lending interest rate (%)2021': 12.0799983467507,
 'Lending interest rate (%)2022': 12.3358

In [225]:
#def enrich_data(indicator_name,disbursement_date,indicator_dict):

#    if float(indicator_name[-4:]) == float(disbursement_date.year):

#        return indicator_dict[indicator_name]
 #   else:
  #      return np.nan


In [226]:
def create_date_feature(df):
    df['year'] = df['disbursement_date'].dt.year
    df['month'] = df['disbursement_date'].dt.month
    df['day'] = df['disbursement_date'].dt.day
    df['day_of_week'] = df['disbursement_date'].dt.dayofweek
    df['quarter'] = df['disbursement_date'].dt.quarter
    return df
train = create_date_feature(train)
test = create_date_feature(test)

In [227]:
train.head()

Unnamed: 0,ID,customer_id,country_id,tbl_loan_id,lender_id,loan_type,Total_Amount,Total_Amount_to_Repay,disbursement_date,due_date,...,New_versus_Repeat,Amount_Funded_By_Lender,Lender_portion_Funded,Lender_portion_to_be_repaid,target,year,month,day,day_of_week,quarter
0,ID_266671248032267278,266671,Kenya,248032,267278,Type_1,8448.0,8448.0,2022-08-30,2022-09-06,...,Repeat Loan,120.85,0.014305,121.0,0,2022,8,30,1,3
1,ID_248919228515267278,248919,Kenya,228515,267278,Type_1,25895.0,25979.0,2022-07-30,2022-08-06,...,Repeat Loan,7768.5,0.3,7794.0,0,2022,7,30,5,3
2,ID_308486370501251804,308486,Kenya,370501,251804,Type_7,6900.0,7142.0,2024-09-06,2024-09-13,...,Repeat Loan,1380.0,0.2,1428.0,0,2024,9,6,4,3
3,ID_266004285009267278,266004,Kenya,285009,267278,Type_1,8958.0,9233.0,2022-10-20,2022-10-27,...,Repeat Loan,2687.4,0.3,2770.0,0,2022,10,20,3,4
4,ID_253803305312267278,253803,Kenya,305312,267278,Type_1,4564.0,4728.0,2022-11-28,2022-12-05,...,Repeat Loan,1369.2,0.3,1418.0,0,2022,11,28,0,4


In [228]:
def extrat_value(indicator_name, disbursement_date, indicator_dict):
    if indicator_name in indicator_dict.keys():
        year = str(disbursement_date.year)
        if year in indicator_name:
          return indicator_dict[indicator_name]
        else:
          return np.nan
    else:
      return np.nan

In [229]:
def richest_train(train,kenya_indicator):
    disbursement_date = train.disbursement_date
    kenya_master_dict = {}
    for indicator_name in kenya_indicator_dict.keys():
        kenya_master_dict[indicator_name[:-4]] = [] #this line is the problem

    for indicator_name in kenya_master_dict.keys(): # you iterate here
        for date in disbursement_date:
          year = str(date.year)
          kenya_master_dict[indicator_name].append(extrat_value(indicator_name+year, date, kenya_indicator_dict)) # and still modify it here
    return kenya_master_dict

kenya_master_dict = richest_train(train,kenya_indicator)
train_ind_data = pd.DataFrame(kenya_master_dict)
train = pd.concat([train, train_ind_data], axis=1)

In [230]:
ghana_df = test.loc[test.country_id == 'Ghana']
kenya_df = test.loc[test.country_id == 'Kenya']
ghana_master = richest_train(ghana_df, ghana_indicator)

kenya_master = richest_train(kenya_df, kenya_indicator)
ghana_master_df = pd.DataFrame(ghana_master)
kenya_master_df = pd.DataFrame(kenya_master)

In [231]:
ghana_master_df.isna().sum()

Unnamed: 0,0
"Inflation, consumer prices (annual %)",0
"Official exchange rate (LCU per US$, period average)",0
Real interest rate (%),0
Average precipitation in depth (mm per year),3506
Deposit interest rate (%),0
Lending interest rate (%),0
"Interest rate spread (lending rate minus deposit rate, %)",0
Fossil fuel energy consumption (% of total),3525
Unemployment rate,0


In [232]:
ghana_df["id"] = range(3525)
ghana_master_df["id"] = range(3525)

ghana_df = pd.merge(ghana_df, ghana_master_df, on="id", how="left")
ghana_df = ghana_df.drop("id", axis=1)

In [233]:
kenya_df["id"] = range(15069)
kenya_master_df["id"] = range(15069)

kenya_df = pd.merge(kenya_df, kenya_master_df, on="id", how="left")
kenya_df = kenya_df.drop("id", axis=1)

In [234]:
kenya_df.isna().sum()

Unnamed: 0,0
ID,0
customer_id,0
country_id,0
tbl_loan_id,0
lender_id,0
loan_type,0
Total_Amount,0
Total_Amount_to_Repay,0
disbursement_date,0
due_date,0


In [235]:
ghana_df.isna().sum()

Unnamed: 0,0
ID,0
customer_id,0
country_id,0
tbl_loan_id,0
lender_id,0
loan_type,0
Total_Amount,0
Total_Amount_to_Repay,0
disbursement_date,0
due_date,0


In [236]:
d = {'Ghana': ghana_df, 'Kenya': kenya_df}
test = pd.concat(d.values())

In [237]:
test.isna().sum()

Unnamed: 0,0
ID,0
customer_id,0
country_id,0
tbl_loan_id,0
lender_id,0
loan_type,0
Total_Amount,0
Total_Amount_to_Repay,0
disbursement_date,0
due_date,0


In [238]:
def fillna_k(train, test, col_name, value, year):
    # Fill NaN values in 'train' for rows where 'year' is 2024
    train_mask = (train["year"] == year)
    train.loc[train_mask, col_name] = train.loc[train_mask, col_name].fillna(value)

    # Fill NaN values in 'test' for rows where 'year' is 2024 AND 'country_id' is 'Kenya'
    test_mask = (test["year"] == year) & (test["country_id"] == "Kenya")
    test.loc[test_mask, col_name] = test.loc[test_mask, col_name].fillna(value)
    return train, test

train,test = fillna_k(train,test,'Inflation, consumer prices (annual %)',2.75,2024)
train,test = fillna_k(train,test,'Official exchange rate (LCU per US$, period average)',134.99,2024)
train,test = fillna_k(train,test,'Real interest rate (%)',4.35,2024)
train,test = fillna_k(train,test,'Deposit interest rate (%)',11.25,2024)
train,test = fillna_k(train,test,'Lending interest rate (%)',12,2024)
train,test = fillna_k(train,test,'Interest rate spread (lending rate minus deposit rate, %)',7,2024)
train,test = fillna_k(train,test,'Unemployment rate',5.7,2024)

In [239]:
test.isna().sum()

Unnamed: 0,0
ID,0
customer_id,0
country_id,0
tbl_loan_id,0
lender_id,0
loan_type,0
Total_Amount,0
Total_Amount_to_Repay,0
disbursement_date,0
due_date,0


In [240]:
col_to_drop = ["ID","customer_id","lender_id","tbl_loan_id", "country_id", "disbursement_date", "due_date","Fossil fuel energy consumption (% of total)","Average precipitation in depth (mm per year)"]
train = train.drop(col_to_drop, axis=1)
ids = test["ID"]
test = test.drop(col_to_drop, axis=1)

In [241]:
cate_col = train.select_dtypes(include='object').columns

train[cate_col] = train[cate_col].astype('category')
test[cate_col] = test[cate_col].astype('category')

In [242]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [243]:
train.shape,test.shape

((68654, 43), (18594, 42))

In [244]:
# pip install hmmlearn

In [245]:
from hmmlearn import hmm
import warnings

def applied_hmm(X):
    """
    Applique un HMM à chaque colonne numérique d'un DataFrame et retourne
    un nouveau DataFrame avec les états cachés prédits. Ignore les colonnes
    pour lesquelles l'HMM ne converge pas ou dont la matrice de transition
    n'est pas valide.

    Args:
        X (pd.DataFrame): Le DataFrame d'entrée.

    Returns:
        pd.DataFrame: Un DataFrame avec les états cachés pour chaque colonne.
    """
    numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
    df_hidden_states = pd.DataFrame()

    for col_name in numerical_cols:
        print(f"Traitement de la colonne : {col_name}")
        # 1. Préparation des données
        X_i = X[col_name]  # On prend la colonne courante

        # Gérer les valeurs manquantes en les remplacant par la medianne.
        if X_i.isnull().any():
            median_value = X_i.median()
            X_i.fillna(median_value, inplace=True)

        # Assurez-vous que X_i est un Series Pandas
        if not isinstance(X_i, pd.Series):
            X_i = pd.Series(X_i)

        # Convertir en tableau NumPy et remodeler
        X_values = X_i.values
        X_reshaped = X_values.reshape(-1, 1)

        # 2. Entraînement du HMM
        # Initialiser et entraîner le modèle HMM
        try:
          model = hmm.GaussianHMM(n_components=3, covariance_type="diag", n_iter=100, random_state=42)
          # Initialiser la matrice de transition à des valeurs valides
          model.transmat_ = np.array([[0.5, 0.3, 0.2], [0.3, 0.5, 0.2], [0.2, 0.3, 0.5]])
        except ValueError as e:
          print(f"Erreur lors de l'initialisation du modèle pour la colonne {col_name}: {e}")
          continue

        # Ignore les avertissements de convergence
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning, module="hmmlearn")

            # Entraîner le modèle
            try:
              model.fit(X_reshaped)
            except ValueError as e:
              print(f"Erreur lors de l'entrainement du modèle pour la colonne {col_name}: {e}")
              continue
            #Vérifier la convergence
            if not model.monitor_.converged:
                print(f"  Le modèle HMM n'a pas convergé pour la colonne : {col_name}. Colonne ignorée.")
                continue  # Passer à la colonne suivante si le modèle ne converge pas

        # 3. Prédiction des états cachés
        try:
          hidden_states = model.predict(X_reshaped)
        except ValueError as e:
          print(f"Erreur lors de la prédiction des etats cachés pour la colonne {col_name}: {e}")
          continue
        # Ajouter les états cachés au DataFrame de sortie
        df_hidden_states[f'hidden_state_{col_name}'] = hidden_states

    return df_hidden_states

In [246]:
X=train.drop('target',axis=1)
y=train['target']

In [247]:
X_enriched = applied_hmm(X)

X = pd.concat([X, X_enriched], axis=1)



Traitement de la colonne : Total_Amount




Traitement de la colonne : Total_Amount_to_Repay




Traitement de la colonne : duration




Traitement de la colonne : Amount_Funded_By_Lender




Traitement de la colonne : Lender_portion_Funded




Traitement de la colonne : Lender_portion_to_be_repaid




Traitement de la colonne : year




Erreur lors de la prédiction des etats cachés pour la colonne year: transmat_ rows must sum to 1 (got row sums of [1. 0. 1.])
Traitement de la colonne : month




Traitement de la colonne : day




Traitement de la colonne : day_of_week




Traitement de la colonne : quarter




Traitement de la colonne : Inflation, consumer prices (annual %)




Traitement de la colonne : Official exchange rate (LCU per US$, period average)




Traitement de la colonne : Real interest rate (%)




Traitement de la colonne : Deposit interest rate (%)




Traitement de la colonne : Lending interest rate (%)




Traitement de la colonne : Interest rate spread (lending rate minus deposit rate, %)




Traitement de la colonne : Unemployment rate




In [248]:
test_enriched = applied_hmm(test)
test_enriched = test_enriched.reset_index(drop=True)
test = test.reset_index(drop=True)
test = pd.concat([test, test_enriched], axis=1)



Traitement de la colonne : Total_Amount




Traitement de la colonne : Total_Amount_to_Repay




Traitement de la colonne : duration




Traitement de la colonne : Amount_Funded_By_Lender




Traitement de la colonne : Lender_portion_Funded




Traitement de la colonne : Lender_portion_to_be_repaid




Traitement de la colonne : year




Traitement de la colonne : month




Traitement de la colonne : day




Traitement de la colonne : day_of_week




Traitement de la colonne : quarter




Traitement de la colonne : Inflation, consumer prices (annual %)




Traitement de la colonne : Official exchange rate (LCU per US$, period average)




Traitement de la colonne : Real interest rate (%)




Traitement de la colonne : Deposit interest rate (%)




Traitement de la colonne : Lending interest rate (%)




Traitement de la colonne : Interest rate spread (lending rate minus deposit rate, %)




Traitement de la colonne : Unemployment rate




In [249]:
common_cols = X.columns.intersection(test.columns)
X = X[common_cols]
test = test[common_cols]

# Modelisation

In [250]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [251]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [252]:
rf = RandomForestClassifier(n_estimators=100,
                            random_state=42)
rf.fit(X_train, y_train)

In [253]:
y_predict = rf.predict(X_test)

print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     13487
           1       0.87      0.29      0.44       244

    accuracy                           0.99     13731
   macro avg       0.93      0.65      0.71     13731
weighted avg       0.99      0.99      0.98     13731



# XGBOOST

In [254]:
import xgboost as xgb

In [255]:
#y_train = y_train.squeeze()

# Convert the dataframe to a numpy array
#X_train=X_train.to_numpy()

#y_test = y_test.squeeze()

# Convert the dataframe to a numpy array
#X_test=X_test.to_numpy()


In [262]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Définir les paramètres
params = {
    'objective': 'multi:softmax', # Pour classification multi-classes
    'num_class': 2,              # Nombre de classes
    'max_depth': 4,              # Profondeur maximale de l'arbre
    'eta': 0.3,                  # Taux d'apprentissage
    'seed': 42
}

# Entraîner le modèle
model = xgb.train(params, dtrain, num_boost_round=150)

In [263]:
y_pred = model.predict(dtest)

# Évaluer la précision
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     13487
           1       0.90      0.65      0.75       244

    accuracy                           0.99     13731
   macro avg       0.95      0.82      0.88     13731
weighted avg       0.99      0.99      0.99     13731



# Optimisation

In [274]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report
from numpy import mean
from collections import Counter

class_dist = Counter(y_train)
scale_pos_weight = class_dist[0] / class_dist[1]

# Assuming X_train and y_train are your training data and labels
# X_valid and y_valid are your validation set

# Initialize the XGBoost classifier
model = xgb.XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    objective='binary:logistic',  # Specify the objective for binary classification
    eval_metric='auc',
    use_label_encoder=False,  # Avoid warning
    random_state=42,
    n_estimators=400,
    max_depth=5,
    learning_rate=0.3
)

# Define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=15, n_repeats=3, random_state=1)

# Convert data to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)

# Evaluate the model using cross-validation
scores = []
models = []  # to save each model
for train_index, test_index in cv.split(X_train, y_train):
    X_train_cv, X_test_cv = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]

    class_dist = Counter(y_train_cv)
    scale_pos_weight = class_dist[0] / class_dist[1] - 10
    model_cv = xgb.XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        objective='binary:logistic',  # Specify the objective for binary classification
        eval_metric='auc',
        use_label_encoder=False,  # Avoid warning
        random_state=42,
        n_estimators=400,
        max_depth=5,
        learning_rate=0.15
    )
    model_cv.fit(X_train_cv, y_train_cv)
    models.append(model_cv)

    y_pred_proba = model_cv.predict_proba(X_test_cv)[:, 1]
    scores.append(roc_auc_score(y_test_cv, y_pred_proba))

# Summarize performance
print('Mean ROC AUC: %.5f' % mean(scores))

Mean ROC AUC: 0.98840


In [275]:
predictions = []
for model in models:
    predictions.append(model.predict_proba(test)[:, 1])  # use all models to do the prediction

# averaging the prediction of all model
average_predictions = mean(predictions, axis=0)

# Create a DataFrame for submission
sub = pd.DataFrame({'ID': ids, 'target': average_predictions})

# Binarize the predictions based on a threshold
# If you want you can play with the value of the threshold
threshold = 0.52
# best 0.52
sub['target'] = (sub['target'] >= threshold).astype(int)

In [276]:
sub.target.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,17817
1,777


In [277]:
# save your submission
sub.to_csv('./Submissions/submission_xg_opt_with_sub_data_correction.csv', index=False)