In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys
import shap
import warnings

from matplotlib import pyplot as plt
from dotenv import load_dotenv
from imblearn.under_sampling import NearMiss
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

sys.path.append("../")

from scripts.preprocessing import preprocess_application, get_cols_missing_thresh, get_binary_column

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
# Apply the matplotlib style sheet first: a style sheet may define its own color cycle,
# so the palette has to be installed afterwards to take effect.
plt.style.use('Solarize_Light2')
# `sns.color_palette(...)` only *returns* a palette (the result was discarded, so it had
# no effect); `sns.set_palette(...)` actually makes it the default color cycle.
sns.set_palette('colorblind')

# Setting default DPI, pulling it from dotenv if it exists, setting it on 100 if not

def get_env_dpi(default: int = 100) -> int:
    """
    Read the figure DPI from the `DPI` environment variable.

    Args:
    - default : Value returned when `DPI` is unset or cannot be parsed as an integer.

    Returns:
    The integer value of `DPI`, or `default` when the variable is missing, empty or non-numeric.
    """
    raw_dpi = os.getenv('DPI')
    if raw_dpi is None:
        # Variable not defined at all (int(None) used to raise TypeError)
        return default
    try:
        return int(raw_dpi)
    except ValueError:
        # Defined but not an integer (e.g. "" or "high"): previously an uncaught crash
        return default


pc_dpi = get_env_dpi()


# Notebook created to offer a solution for overfitting:
- The other undersampling approach relies on many imputations, which can easily lead to overfitting.
- This attempt uses only the train data, which requires the fewest imputations possible.
- The NearMiss algorithm is applied to balance the classes.
- The ids (SK_ID_CURR) of the rows kept from the train data are recovered.
- We then undersample the main dataset based on these ids and conduct a feature reduction via SHAP & XGBoost.
- The preprocessing of application also imputes some NaNs, but this is minimal compared to the rest.

In [2]:
# Raw application data, read from the project's data folder.
train_csv_path = "../data/application_train.csv"
df_train = pd.read_csv(train_csv_path)


In [3]:
# Run the project preprocessing (with its NA handling switched on) while every
# warning raised inside the block is silenced; filters are restored on exit.
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore')
    df_train = preprocess_application(handle_na=True, dataframe=df_train)

print("done")


Imputation ...
done


In [4]:
# Flag every column holding at least one missing value, then keep the flagged names.
has_na = df_train.isna().any(axis=0)
na_cols = has_na[has_na].index.tolist()


#### Rather than introducing bias through imputations, we will simply drop the columns that contain NA values; this will minimize the bias

In [5]:
# Remove the columns listed in na_cols (rebinding the name instead of mutating in place).
df_train = df_train.drop(columns=na_cols)


In [6]:
# NearMiss (version 1) under-sampler, configured with the "majority" sampling
# strategy; n_jobs=-1 requests every available core.
nm1 = NearMiss(
    version=1,
    sampling_strategy="majority",
    n_jobs=-1,
)


In [7]:
# Label vector and feature matrix: X is everything except TARGET
# (so SK_ID_CURR is still one of the columns handed to the sampler).
y = df_train.loc[:, "TARGET"]
X = df_train.drop("TARGET", axis=1)

# Under-sample with the NearMiss instance configured above.
X_resampled, y_resampled = nm1.fit_resample(X, y)


In [8]:
# Identifiers of the rows that survived the under-sampling.
ids_kept = X_resampled.loc[:, "SK_ID_CURR"].values


##### While it is true that treating SK_ID_CURR as a variable might introduce bias, it is potentially less harmful than applying imputations on the whole dataset

In [9]:
# Full dataset, loaded from a pickle in the project's data folder.
# NOTE: unpickling executes code from the file; only load pickles you produced yourself.
whole_data_path = "../data/home_credit_data.pkl"
df_whole = pd.read_pickle(whole_data_path)


In [10]:
# Keep only the rows whose SK_ID_CURR is among the ids retained by the under-sampling.
kept_rows = df_whole["SK_ID_CURR"].isin(ids_kept)
df_model = df_whole.loc[kept_rows]


#### Simplification can lead to both a lower chance of overfitting and a better computation time; let's use SHAP to filter out the columns that don't contribute to an XGBoost model (default values)

In [11]:
# Separate the label from the predictors; the id column is excluded from the predictors too.
target_and_id = ["TARGET", "SK_ID_CURR"]

features = df_model.drop(columns=target_and_id)
labels = df_model["TARGET"]

# 70 % / 30 % split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=123
)


In [12]:
# XGBoost classifier left on its default hyper-parameters, fitted on the training split.
clf = XGBClassifier()
clf.fit(X=X_train, y=y_train)


In [13]:
def get_low_importance_features_shap(clf: "XGBClassifier", X: pd.DataFrame, threshold: float = 0) -> list:
    """
    Calculates SHAP values for an XGBoost classifier and returns the features that have an importance score less than or
    equal to the threshold.

    The importance score of a feature is its mean absolute SHAP value divided by the sum of the mean absolute SHAP
    values of all features, so the scores lie between 0 and 1.

    Args:
    - clf : An XGBoost classifier. (fitted)
    - X: A DataFrame containing the input features (its column labels are used as feature names).
    - threshold : A float indicating the threshold for selecting features based on SHAP values.

    Returns:
    A list of feature names that have an importance score less than or equal to the threshold based on SHAP values.

    Raises:
    ValueError if the SHAP values do not have one entry per column of X.
    """

    # Calculate SHAP values for the input features
    explainer = shap.TreeExplainer(clf)
    shap_values = explainer.shap_values(X)

    # Multi-output models may yield one (samples x features) matrix per output as a list;
    # stack them on a trailing axis so that features always live on axis 1.
    if isinstance(shap_values, list):
        shap_values = np.stack(shap_values, axis=-1)

    shap_values_abs = np.abs(np.asarray(shap_values))
    if shap_values_abs.ndim < 2 or shap_values_abs.shape[1] != X.shape[1]:
        raise ValueError(
            f"Unexpected SHAP values shape {shap_values_abs.shape} for {X.shape[1]} features"
        )

    # Calculate the mean absolute SHAP value for each feature
    # (average over samples and, when present, over model outputs)
    non_feature_axes = tuple(axis for axis in range(shap_values_abs.ndim) if axis != 1)
    shap_values_mean = shap_values_abs.mean(axis=non_feature_axes)

    # Normalize the SHAP values to get a score between 0 and 1.
    # When no feature contributes at all the sum is 0: dividing would produce NaN scores,
    # and NaN <= threshold is always False, so every score is set to 0 instead.
    shap_total = shap_values_mean.sum()
    if shap_total == 0:
        shap_values_norm = np.zeros_like(shap_values_mean)
    else:
        shap_values_norm = shap_values_mean / shap_total

    # Find the features that have an importance score less than or equal to the threshold
    low_importance_feats = list(X.columns[shap_values_norm <= threshold])

    return low_importance_feats


In [14]:
# Features whose normalized SHAP importance on the training split does not exceed 0.
zero_importance_feats = get_low_importance_features_shap(
    clf=clf,
    X=X_train,
    threshold=0,
)


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


In [15]:
# Number of features flagged for removal.
len(zero_importance_feats)


200

In [16]:
# Drop the flagged features and the id column in a single pass.
df_model = df_model.drop(columns=[*zero_importance_feats, "SK_ID_CURR"])


# With NANS

In [17]:
# Persist the reduced dataset (missing values untouched).
reduced_pickle_path = "../data/df_hc_nm.pkl"
df_model.to_pickle(reduced_pickle_path)


# Without NANS :

In [18]:
# Columns of df_model reported by the project helper `get_binary_column`
# (presumably the two-valued columns — confirm against scripts.preprocessing).
model_binaries = get_binary_column(df_model)


In [19]:
# Binaries : sentinel
# Missing values of the binary columns are replaced with -1.
# `df_model[col].fillna(..., inplace=True)` is a chained in-place assignment: it acts on a
# temporary Series, warns on recent pandas and leaves df_model untouched under
# Copy-on-Write. Filling through a {column: value} mapping updates the frame reliably.
df_model = df_model.fillna(value={col: -1 for col in model_binaries})


In [20]:
# Columns returned by the project helper for a missing-value threshold of 0.2.
missing_threshold = 0.2
missing_20 = get_cols_missing_thresh(threshold=missing_threshold, dataframe=df_model)


In [21]:
# Remove the columns listed in missing_20 (rebinding the name instead of mutating in place).
df_model = df_model.drop(columns=missing_20)


In [22]:
# Replace the remaining missing values with the mean of their column.
imputer = SimpleImputer(strategy="mean")
imputed_values = imputer.fit_transform(df_model)

# fit_transform yields a bare array: wrap it back into a frame with the original column labels.
df_imputed = pd.DataFrame(data=imputed_values, columns=df_model.columns)


In [23]:
# Persist the imputed dataset.
imputed_pickle_path = "../data/df_hc_nm_imputed.pkl"
df_imputed.to_pickle(imputed_pickle_path)
