In [None]:
import os
from pathlib import Path

import pandas as pd

# Directory holding the input CSVs. Defaults to the original author's location;
# set the CREDIT_RISK_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = Path(os.environ.get('CREDIT_RISK_DATA_DIR', 'C:/Users/kaczm/Desktop/Credit Card Risk'))

# Feature table and the separately stored target, both read from DATA_DIR.
df = pd.read_csv(DATA_DIR / 'default_of_credit_card_clients.csv')
target_variable = pd.read_csv(DATA_DIR / 'default_of_credit_card_target.csv')

In [None]:
# Attach the target read from the separate CSV to the feature table as column 'target'.
# NOTE(review): assumes target_variable holds a single column whose rows line up
# with df by index — TODO confirm against the CSV.

df['target'] = target_variable

In [None]:
# Class counts of the target column (shown in the output below).
df['target'].value_counts()

# The target is very imbalanced (23364 zeros vs 6636 ones), so SMOTE is used below to balance the training data


target
0    23364
1     6636
Name: count, dtype: int64

In [None]:
def balance_data(df):
    """Split ``df`` 80/20 and balance the training part with SMOTE.

    The 'target' column is used as the label and every other column as a
    feature. Only the training portion is resampled; the test portion is
    returned exactly as produced by the split.

    Returns:
        (X_smote, X_test, y_smote, y_test): resampled training features,
        held-out features, resampled training labels, held-out labels.
    """
    from imblearn.over_sampling import SMOTE
    from sklearn.model_selection import train_test_split

    features = df.drop(columns='target')
    labels = df['target']

    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # Resample the training split only, so the test split keeps its original class mix.
    features_balanced, labels_balanced = SMOTE(random_state=42).fit_resample(
        features_train, labels_train
    )

    return features_balanced, features_test, labels_balanced, labels_test

In [None]:
# Now I will build several models to see which one performs the best

def build_models(X_smote, X_test, y_smote, y_test):
    """Fit three classifiers on the balanced training data and return the best one.

    A random forest, a logistic regression and an XGBoost classifier are fitted
    on (X_smote, y_smote) and scored on (X_test, y_test). For each model the
    accuracy, ROC AUC, their mean, recall, precision and F1 are printed.

    The logistic regression is wrapped in a pipeline with a StandardScaler and
    given a larger ``max_iter``: on the raw, unscaled features the default
    solver stops at its iteration limit without converging.

    Args:
        X_smote: training features (already balanced by the caller).
        X_test: held-out features.
        y_smote: training labels matching ``X_smote``.
        y_test: held-out labels matching ``X_test``.

    Returns:
        The fitted estimator with the highest (accuracy + AUC) / 2 on the
        held-out data.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    import xgboost as xgb
    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

    models = [
        RandomForestClassifier(random_state=42),
        # Scaling + more iterations lets the solver converge; tree models need neither.
        make_pipeline(StandardScaler(), LogisticRegression(random_state=42, max_iter=1000)),
        xgb.XGBClassifier(random_state=42),
    ]

    # -inf so that the first scored model always becomes the initial best.
    best_model_score = float('-inf')
    best_model = None

    # NOTE(review): the winner is chosen on the same held-out data that is
    # reported, so the printed best score is likely optimistic.
    for model in models:
        model.fit(X_smote, y_smote)
        y_pred = model.predict(X_test)
        # Probability of the positive class (column 1) is what ROC AUC needs.
        y_proba = model.predict_proba(X_test)[:, 1]

        accuracy = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_proba)
        average_score = (accuracy + auc) / 2

        print(model)
        print('Accuracy:', accuracy)
        print('AUC:', auc)
        print('Average Score (Accuracy + AUC) / 2:', average_score)
        print('Recall:', recall_score(y_test, y_pred))
        print('Precision:', precision_score(y_test, y_pred))
        print('F1:', f1_score(y_test, y_pred))

        if average_score > best_model_score:
            best_model_score = average_score
            best_model = model

    print('Best model based on average score (Accuracy + AUC) / 2:', best_model)
    print('Best model average score:', best_model_score)
    return best_model

# Example usage:
# best_model = build_models(X_smote, X_test, y_smote, y_test)


In [None]:
# Now balance data and run the models

# SMOTE is applied to the training split only; X_test / y_test come straight from the split.
X_smote, X_test, y_smote, y_test = balance_data(df)

# Fit and compare the classifiers on the SMOTE-balanced training data.
best_model_smote= build_models(X_smote, X_test, y_smote, y_test)


RandomForestClassifier(random_state=42)
Accuracy: 0.7935
AUC: 0.7541261166867702
Average Score (Accuracy + AUC) / 2: 0.7738130583433851
Recall: 0.48743335872048743
Precision: 0.5306799336650083
F1: 0.5081381500595474
LogisticRegression(random_state=42)
Accuracy: 0.7311666666666666
AUC: 0.7437070109006599
Average Score (Accuracy + AUC) / 2: 0.7374368387836633
Recall: 0.6260472201066261
Precision: 0.4228395061728395
F1: 0.5047589806570464


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
Accuracy: 0.7955
AUC: 0.7565485126740505
Average Score (Accuracy + AUC) / 2: 0.7760242563370252
Recall: 0.43716679360243715
Precision: 0.5404896421845574
F1: 0.48336842105263156
Best model based on average score (Accuracy + AUC) / 2: XGBClassif

In [None]:
# So the best model with SMOTE was XGBoost, with an average score of 0.7760242563370252.

# However, we still need to build models using under- and over-sampling as well as feature selection, so first define a feature selection function.

def feature_selection_chi_test(df, k=10):
    """Rank features by chi-squared score against 'target' and keep the top ``k``.

    Every column except 'target' is min-max scaled to [0, 1] (chi2 requires
    non-negative inputs), scored with sklearn's ``chi2``, and the ``k``
    highest-scoring features are printed.

    Args:
        df: DataFrame containing the feature columns and a 'target' column.
        k: number of top-scoring features to keep (default 10).

    Returns:
        DataFrame with the selected feature columns of ``df``, holding their
        original (unscaled, unshifted) values, ordered by descending score.
    """
    from sklearn.feature_selection import chi2
    from sklearn.preprocessing import MinMaxScaler

    X = df.drop('target', axis=1)
    y = df['target']

    # Min-max scaling already maps each column to [0, 1], so no extra shift is
    # needed to satisfy chi2, and X itself keeps the original feature values.
    X_scaled = MinMaxScaler().fit_transform(X)

    # NOTE(review): scores are computed on every row passed in; if the caller
    # passes the full dataset, rows later used for testing influence the selection.
    scores, _ = chi2(X_scaled, y)

    feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': scores})
    top_features = feature_scores.nlargest(k, 'Score')
    print(top_features)

    return X[top_features['Feature'].values]


# Now select the 10 features with the highest chi-squared scores (computed on the full df)

X_selected = feature_selection_chi_test(df)




    

                Feature        Score
26         Total_Delays  2298.174644
23  Payment_Consistency   340.269930
30       Cluster_kmeans   218.829117
5                    X6   201.517549
6                    X7   160.027300
7                    X8   129.692706
28     On_Time_Payments   116.272446
8                    X9   108.131709
9                   X10    92.598810
27  Total_Payments_Done    82.647000


In [None]:
# now define a function to balance data using under sampling

def under_sample(df):
    """Split ``df`` 80/20 and balance the training part by random under-sampling.

    'target' is the label column; all remaining columns are features. The
    test portion is returned as produced by the split, without resampling.

    Returns:
        (X_rus, X_test, y_rus, y_test): under-sampled training features,
        held-out features, under-sampled training labels, held-out labels.
    """
    from imblearn.under_sampling import RandomUnderSampler
    from sklearn.model_selection import train_test_split

    predictors = df.drop(columns=['target'])
    outcome = df['target']

    split = train_test_split(predictors, outcome, test_size=0.2, random_state=42)
    train_predictors, test_predictors, train_outcome, test_outcome = split

    # Only the training split is resampled.
    sampler = RandomUnderSampler(random_state=42)
    train_predictors_rus, train_outcome_rus = sampler.fit_resample(train_predictors, train_outcome)

    return train_predictors_rus, test_predictors, train_outcome_rus, test_outcome

# Now keep only the columns chosen by the chi-squared selection, re-attach the
# target, and under-sample the training split.

# .copy() makes df_chi_under an independent frame, so adding the target column
# below does not trigger pandas' SettingWithCopyWarning.
df_chi_under = df[X_selected.columns].copy()
# add target column to the df
df_chi_under['target'] = target_variable
X_rus, X_test, y_rus, y_test = under_sample(df_chi_under)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_chi_under['target'] = target_variable


In [None]:
# Now run the models on the under-sampled, chi-squared-selected data

best_model_rus = build_models(X_rus, X_test, y_rus, y_test)

# So the best model with undersampling and chi test was XGBOOST at 0.7535767458218308 average

RandomForestClassifier(random_state=42)
Accuracy: 0.746
AUC: 0.7359002578960034
Average Score (Accuracy + AUC) / 2: 0.7409501289480017
Recall: 0.6054836252856055
Precision: 0.4414214325374792
F1: 0.5105973025048169
LogisticRegression(random_state=42)
Accuracy: 0.755
AUC: 0.7403323122681703
Average Score (Accuracy + AUC) / 2: 0.7476661561340852
Recall: 0.6039603960396039
Precision: 0.4549627079747562
F1: 0.518979057591623
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child

In [None]:
# Now let's do oversampling and feature selection based on variance

def feature_selection_variance(df, threshold=0.1):
    """Keep the feature columns whose variance exceeds ``threshold``.

    Every column except 'target' is passed to sklearn's VarianceThreshold; the
    columns it retains are returned with their values unchanged.

    Args:
        df: DataFrame containing the feature columns and a 'target' column.
        threshold: variance cut-off handed to VarianceThreshold (default 0.1).

    Returns:
        DataFrame with the retained feature columns of ``df``.
    """
    from sklearn.feature_selection import VarianceThreshold

    X = df.drop('target', axis=1)

    # NOTE(review): variance is measured on the features as given (no scaling
    # happens here), so the cut-off depends on each column's units.
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(X)

    # get_support() is a boolean mask over X's columns marking the kept features.
    selected_features = X.columns[selector.get_support()]

    return X[selected_features]


# Now select features based on variance (computed on the full df, target excluded)

X_selected_variance = feature_selection_variance(df)


In [None]:
# build function for oversampling

def over_sample(df):
    """Split ``df`` 80/20 and balance the training part by random over-sampling.

    The 'target' column supplies the labels; the other columns are the
    features. The held-out portion is returned without any resampling.

    Returns:
        (X_ros, X_test, y_ros, y_test): over-sampled training features,
        held-out features, over-sampled training labels, held-out labels.
    """
    from imblearn.over_sampling import RandomOverSampler
    from sklearn.model_selection import train_test_split

    target_column = 'target'
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(target_column, axis=1),
        df[target_column],
        test_size=0.2,
        random_state=42,
    )

    # Resampling touches the training split only.
    X_ros, y_ros = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
    return X_ros, X_test, y_ros, y_test

In [None]:
# Now keep only the columns chosen by the variance-based selection, re-attach
# the target, balance the training data by over-sampling and run the models.

# .copy() makes df_var_over an independent frame, so adding the target column
# below does not trigger pandas' SettingWithCopyWarning.
df_var_over = df[X_selected_variance.columns].copy()
# add target column to the df
df_var_over['target'] = target_variable

X_ros, X_test, y_ros, y_test = over_sample(df_var_over)

# Now run the models

best_model_ros = build_models(X_ros, X_test, y_ros, y_test)

# So the best model with oversampling and variance based feature selection was logistic regression at 0.7505763854618217 average


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_var_over['target'] = target_variable


RandomForestClassifier(random_state=42)
Accuracy: 0.7775
AUC: 0.7201103634349584
Average Score (Accuracy + AUC) / 2: 0.7488051817174792
Recall: 0.4463061690784463
Precision: 0.490787269681742
F1: 0.46749102512963703


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=42)
Accuracy: 0.748
AUC: 0.7531527709236434
Average Score (Accuracy + AUC) / 2: 0.7505763854618217
Recall: 0.6085300837776085
Precision: 0.444629938786867
F1: 0.5138263665594855
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
Accuracy: 0.753
AUC: 0.73904884457

In [None]:
# Class selection has so far not improved the model 
# I will also test a self made grid search function to find the best model

def grid_search(df):
    """Try every model / feature-selection / balancing combination and return the best pipeline.

    ``df`` is split 80/20 into train and test parts. For each combination a
    pipeline (MinMaxScaler -> balancer -> feature selector -> model) is tuned
    with a 3-fold GridSearchCV scored by ROC AUC on the training part, then
    evaluated on the test part. Accuracy, AUC, their mean, recall, precision,
    F1, the best parameters and the elapsed time are printed per combination.

    A combination that raises during fitting or scoring is reported and
    skipped, so one failing combination does not abort the whole search.

    Args:
        df: DataFrame containing the feature columns and a 'target' column.

    Returns:
        The refitted best pipeline of the combination with the highest
        (accuracy + AUC) / 2 on the test part, or None if every combination
        failed.
    """
    import itertools
    import time

    import xgboost as xgb
    from imblearn.over_sampling import SMOTE
    from imblearn.over_sampling import RandomOverSampler
    from imblearn.pipeline import Pipeline as ImbPipeline
    from imblearn.under_sampling import RandomUnderSampler
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectKBest, chi2, VarianceThreshold, RFE, SelectFromModel
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.preprocessing import MinMaxScaler

    X = df.drop('target', axis=1)
    y = df['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # (name, estimator, parameter grid); grid keys address the pipeline's 'model' step.
    models = [
        ('RandomForest', RandomForestClassifier(random_state=42), {
            'model__n_estimators': [100],
            'model__max_depth': [20],
            'model__min_samples_split': [5],
            'model__min_samples_leaf': [4]
        }),
        ('LogisticRegression', LogisticRegression(random_state=42, solver='liblinear'), {
            'model__C': [10],
            'model__penalty': ['l1', 'l2']
        }),
        ('XGBoost', xgb.XGBClassifier(random_state=42), {
            'model__n_estimators': [100],
            'model__max_depth': [20],
            'model__learning_rate': [0.1]
        })
    ]

    feature_selection_methods = [
        ('SelectKBest', SelectKBest(score_func=chi2, k=10)),
        ('VarianceThreshold', VarianceThreshold(threshold=0.1)),
        ('RFE', RFE(LogisticRegression(random_state=42), n_features_to_select=10, step=1, verbose=1)),
        ('SelectFromModel', SelectFromModel(RandomForestClassifier(random_state=42)))
    ]

    balancing_methods = [
        ('SMOTE', SMOTE(random_state=42)),
        ('RandomUnderSampler', RandomUnderSampler(random_state=42)),
        ('RandomOverSampler', RandomOverSampler(random_state=42))
    ]

    # -inf so that the first successfully scored combination becomes the initial best.
    best_model_score = float('-inf')
    best_model = None
    best_params = None

    # product() iterates models outermost and balancing methods innermost,
    # the same order as three nested loops over these lists.
    combinations = itertools.product(models, feature_selection_methods, balancing_methods)
    for model_spec, selector_spec, balancer_spec in combinations:
        model_name, model, param_grid = model_spec
        fs_name, feature_selection_method = selector_spec
        bm_name, balancing_method = balancer_spec

        print(f'Fitting model: {model_name}, Feature Selection: {fs_name}, Balancing: {bm_name}')
        start_time = time.time()

        # Scaling comes first so chi2-based selection sees non-negative values.
        pipeline = ImbPipeline([
            ('scaler', MinMaxScaler()),
            ('balancer', balancing_method),
            ('feature_selector', feature_selection_method),
            ('model', model)
        ])

        # Tune only the model's parameters; the other steps use their fixed settings.
        grid = GridSearchCV(pipeline, param_grid, scoring='roc_auc', refit=True, cv=3, verbose=3, n_jobs=-1)
        try:
            grid.fit(X_train, y_train)

            y_pred = grid.predict(X_test)
            y_proba = grid.predict_proba(X_test)[:, 1]

            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_proba)
            average_score = (accuracy + auc) / 2

            print(f"Model: {model_name}, Feature Selection: {fs_name}, Balancing: {bm_name}")
            print('Accuracy:', accuracy)
            print('AUC:', auc)
            print('Average Score (Accuracy + AUC) / 2:', average_score)
            print('Recall:', recall_score(y_test, y_pred))
            print('Precision:', precision_score(y_test, y_pred))
            print('F1:', f1_score(y_test, y_pred))
            print('Best Params:', grid.best_params_)
            print(f"--- {time.time() - start_time} seconds ---")

            # NOTE(review): the winner is picked on the same test part that is
            # reported, so the final best score is likely optimistic.
            if average_score > best_model_score:
                print('New best model found!: ', model_name, average_score)
                best_model_score = average_score
                best_model = grid.best_estimator_
                best_params = grid.best_params_
        except Exception as e:
            # Deliberate best-effort: report the failing combination and move on.
            print(f"Failed to fit model: {model_name}, Feature Selection: {fs_name}, Balancing: {bm_name}")
            print(str(e))

    print('Best model based on average score (Accuracy + AUC) / 2:', best_model)
    print('Best model average score:', best_model_score)
    print('Best model parameters:', best_params)
    return best_model

# Run the search over every model / feature-selection / balancing combination on the full dataset
best_model = grid_search(df)




Fitting model: RandomForest, Feature Selection: SelectKBest, Balancing: SMOTE
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Model: RandomForest, Feature Selection: SelectKBest, Balancing: SMOTE
Accuracy: 0.7936666666666666
AUC: 0.7503916538606972
Average Score (Accuracy + AUC) / 2: 0.772029160263682
Recall: 0.5041888804265042
Precision: 0.5300240192153723
F1: 0.516783762685402
Best Params: {'model__max_depth': 20, 'model__min_samples_leaf': 4, 'model__min_samples_split': 5, 'model__n_estimators': 100}
--- 4.008106231689453 seconds ---
New best model found!:  RandomForest 0.772029160263682
Fitting model: RandomForest, Feature Selection: SelectKBest, Balancing: RandomUnderSampler
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Model: RandomForest, Feature Selection: SelectKBest, Balancing: RandomUnderSampler
Accuracy: 0.755
AUC: 0.7487904269575503
Average Score (Accuracy + AUC) / 2: 0.7518952134787751
Recall: 0.6009139375476009
Precision: 0.4547550432276657
F1: 0.

In [None]:
# Display the estimator returned by grid_search
best_model

In [None]:
import joblib

# Persist best_model to disk with joblib.
# NOTE(review): the destination is an absolute, machine-specific path — consider making it configurable.
joblib.dump(best_model, 'C:/Users/kaczm/Desktop/Credit Card Risk/best_model.joblib')


['C:/Users/kaczm/Desktop/Credit Card Risk/best_model.joblib']