In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.colors
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
from numpy import where

import time
import warnings
warnings.filterwarnings('ignore')

In [None]:
# importing SimpleImputer for handling missing value
from sklearn.impute import SimpleImputer
# importing MissingIndicator for handling missing value
from sklearn.impute import MissingIndicator
# importing StandardScaler for standardization
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# importing OneHotEncoder and OrdinalEncoder for encoding categorical variables
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
# importing for transformation
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
# importing PCA for dimensionality reduction
from sklearn.decomposition import PCA

from catboost import CatBoostRegressor, Pool

In [None]:
# importing pipeline for chaining model building activities
#from sklearn.pipeline import Pipeline
#from sklearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline as mp
# importing FeatureUnion for combining transformers
from sklearn.pipeline import FeatureUnion

# importing samplers for handling data imbalance
from imblearn.combine import SMOTEENN 
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler 
from imblearn.under_sampling import RandomUnderSampler 

In [None]:
# importing train_test_split for train and validation split
from sklearn.model_selection import train_test_split
# importing SelectFromModel to select features from model 
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2, f_regression

In [None]:
# importing classifiers to try with
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn import metrics

# importing metrics required for model evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score

# importing RepeatedKFold for cross validation
from sklearn.model_selection import RepeatedKFold
# importing for model evaluation
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
# importing RepeatedStratifiedKFold for model evaluation
from sklearn.model_selection import RepeatedStratifiedKFold,StratifiedKFold
# importing GridSearchCV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import RandomizedSearchCV
from yellowbrick.model_selection import ValidationCurve
import itertools
from scipy.stats import randint as sp_randInt
from scipy.stats import uniform as sp_randFloat
from scipy.stats import uniform as sp_uniform

In [None]:
# Train data: read only the first 50,000 rows of the feature file, then report
# its size and how many cells are populated / missing.
train_data = pd.read_csv(
    '../input/amex-default-prediction/train_data.csv',
    nrows=50000,
)

print(
    f'Number of rows in train data: {train_data.shape[0]}',
    f'Number of columns in train data: {train_data.shape[1]}',
    f'Number of values in train data: {train_data.count().sum()}',
    f'Number missing values in train data: {sum(train_data.isna().sum())}',
    sep='\n',
)

In [None]:
# Train labels: read the full label file, then report its size and how many
# cells are populated / missing.
train_labels = pd.read_csv(
    '../input/amex-default-prediction/train_labels.csv',
)

print(
    f'Number of rows in train labels: {train_labels.shape[0]}',
    f'Number of columns in train labels: {train_labels.shape[1]}',
    f'Number of values in train labels: {train_labels.count().sum()}',
    f'Number missing values in train labels: {sum(train_labels.isna().sum())}',
    sep='\n',
)

In [None]:
# Attach the labels to the feature rows with an inner join on customer_ID;
# feature rows whose customer_ID has no entry in train_labels are dropped.
train = pd.merge(train_data, train_labels, how="inner", on=["customer_ID"])

In [None]:
# Preview the first rows of the raw feature frame (before the label merge).
train_data.head()

In [None]:
# Summary of the train dataset: per-column dtype and non-null count.
# max_cols=200 keeps the per-column listing for frames of up to 200 columns.
train_data.info(max_cols=200, show_counts=True)

In [None]:
# Parse the S_2 column as timestamps and derive calendar features from it.
statement_dates = train["S_2"].astype('datetime64[ns]')
train["S_2"] = statement_dates

calendar_features = {
    "Day of week": statement_dates.dt.dayofweek,
    "Year": statement_dates.dt.year,
    "Month": statement_dates.dt.month,
    "Day": statement_dates.dt.day,
}
for feature_name, feature_values in calendar_features.items():
    train[feature_name] = feature_values

In [None]:
# Data Preprocessing
# The customer identifier and the raw date column are not used as features;
# remove both from `train` in place.
identifier_columns = ['customer_ID', 'S_2']
train.drop(columns=identifier_columns, inplace=True)

In [None]:
# Drop variables with missing values >=75% in the train dataframe

# Percentage of missing cells per column, computed once for the whole frame.
missing_percent = train.isnull().sum() / len(train) * 100
sparse_columns = [name for name in train.columns if missing_percent[name] >= 75]

for name in sparse_columns:
    print("Dropping column", name)

# Remove all flagged columns from `train` in one in-place operation.
train.drop(columns=sparse_columns, inplace=True)

print("Total number of columns dropped in train dataframe", len(sparse_columns))

In [None]:
# Convert the categorical variables to string labels while keeping missing
# entries as NaN. A plain astype('str') turns NaN into the literal text 'nan',
# which then looks like a regular category and is never filled by the
# categorical imputer (it is configured with missing_values=np.nan).
categorical_columns = ["B_30", "B_38", "D_114", "D_116", "D_117",
                       "D_120", "D_126", "D_68", "Day of week",
                       "Year", "Month", "Day"]

train_df = train.copy()
for column in categorical_columns:
    as_text = train[column].astype('str')
    # Restore NaN wherever the source value was missing.
    train_df[column] = as_text.where(train[column].notna(), np.nan)

In [None]:
# Separate the label column (y) from the feature columns (X).
y = train_df['target']
X = train_df.drop(columns=['target'])

print("Shape of X", X.shape)

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

for caption, part in (
    ("Shape of X_train", X_train),
    ("Shape of X_test", X_test),
    ("Shape of y_train", y_train),
    ("Shape of y_test", y_test),
):
    print(caption, part.shape)

In [None]:
# Split the feature names by dtype: object columns are treated as categorical,
# numeric columns as numerical.
categorical = X.select_dtypes('object').columns.tolist()
numerical = X.select_dtypes('number').columns.tolist()

print(f"Categorical variables (columns) are: {categorical}")
print(f"Numerical variables (columns) are: {numerical}")

In [None]:
# Categorical pipeline: fill gaps with the most frequent label, map labels to
# integer codes (categories unseen during fit become -1), then standardise.
cat_steps = [
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('encoder', OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')),
    ('scaler', StandardScaler()),
]
cat_pipe = Pipeline(steps=cat_steps)

print(cat_pipe)

In [None]:
# Numerical pipeline: fill gaps with the column median, then standardise.
# The median is used instead of 'most_frequent' because the mode of a
# continuous feature is essentially arbitrary (when every value is unique the
# imputer falls back to the smallest one), which would fill gaps with an
# extreme value.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median', missing_values=np.nan)),
    ('scaler', StandardScaler())
])
print(num_pipe)

In [None]:
# Route each column group through its own pipeline; the transformed output has
# the categorical block first, followed by the numerical block.
column_pipelines = [
    ('cat', cat_pipe, categorical),
    ('num', num_pipe, numerical),
]
preprocess = ColumnTransformer(transformers=column_pipelines)

In [None]:
# Learn the preprocessing statistics on the training split only, then apply the
# fitted transformer to both splits.
X_train = preprocess.fit_transform(X_train)
X_test = preprocess.transform(X_test)

In [None]:
# Wrap the transformed arrays in DataFrames again. The column order follows the
# ColumnTransformer definition: categorical columns first, then numerical ones.
feature_names = categorical + numerical
X_train, X_test = (
    pd.DataFrame(matrix, columns=feature_names)
    for matrix in (X_train, X_test)
)

In [None]:
# Preview the preprocessed training features.
X_train.head()

In [None]:
# Fit a small random forest (10 trees, fixed seed); its feature importances
# drive the feature selection below.
model = RandomForestClassifier(random_state=7, n_estimators=10).fit(X_train, y_train)
model

In [None]:
# Feature importances of the fitted forest, labelled by column name and sorted
# from least to most important.
importances = model.feature_importances_
fs = pd.Series(importances, index=X_train.columns).sort_values()
fs

In [None]:
# Horizontal bar chart of all feature importances on a tall (10 x 40) figure.
importance_fig, importance_ax = plt.subplots(figsize=(10, 40))
fs.plot(kind='barh', ax=importance_ax)

In [None]:
# Features whose importance exceeds the 0.007 cut-off used for selection below.
fs[fs > .007]

In [None]:
# Keep only the features whose importance exceeds the 0.007 cut-off, in both
# splits.
selected_features = fs[fs > .007].index
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [None]:
# Size of the training matrix at this point in the notebook.
X_train.shape

In [None]:
# Display the training labels.
y_train

In [None]:
# Count how many training rows fall in each class.
counter = Counter(y_train)
print(counter)

In [None]:
# Rebalance the training set: SMOTE over-sampling followed by random
# under-sampling, both seeded for reproducibility.
# NOTE(review): with both samplers left at their default strategy the
# under-sampling step presumably has nothing left to remove -- confirm.
resampler = Pipeline(steps=[
    ('o', SMOTE(random_state=42)),
    ('u', RandomUnderSampler(random_state=42)),
])
X_train, y_train = resampler.fit_resample(X_train, y_train)

# Class counts after resampling.
counter = Counter(y_train)
print(counter)

In [None]:
# Fit PCA on the training features (n_components is left at its default) and
# project both splits onto the fitted components.
pca = PCA().fit(X_train)
X_train, X_test = pca.transform(X_train), pca.transform(X_test)

In [None]:
# Amex Evaluation Metric for reference
# please refer sample notebook provided under competition page for details
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a 'target' column holding the 0/1 labels.
    y_pred : pd.DataFrame
        Frame with a 'prediction' column; rows with higher values are ranked first.

    Returns
    -------
    float
        0.5 * (normalized weighted Gini + share of positives captured in the
        top 4% of cumulative weight). Rows with target 0 weigh 20, others 1.

    Raises
    ------
    ValueError
        If the two frames do not have the same number of rows.

    Notes
    -----
    The frames are combined column-wise, which pairs rows by index label. When
    both frames carry the same labels this label pairing is kept; when the
    labels differ (e.g. shuffled labels from a train/test split next to a fresh
    0..n-1 index on the predictions) the rows are paired by position instead of
    silently producing a NaN-padded frame.
    """
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same number of rows, "
            f"got {len(y_true)} and {len(y_pred)}"
        )

    same_labels = y_true.index.equals(y_pred.index) or (
        y_true.index.isin(y_pred.index).all()
        and y_pred.index.isin(y_true.index).all()
    )
    if not same_labels:
        # Labels cannot be matched: fall back to positional pairing.
        y_true = y_true.reset_index(drop=True)
        y_pred = y_pred.reset_index(drop=True)

    def ranked_with_weights(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Combine labels and scores, rank by score (highest first), add row weights.
        ranked = (pd.concat([truth, scores], axis='columns')
                  .sort_values('prediction', ascending=False))
        ranked['weight'] = ranked['target'].apply(lambda x: 20 if x==0 else 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Share of all positives found within the first 4% of cumulative weight.
        df = ranked_with_weights(y_true, y_pred)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Weighted sum of the gap between the cumulative share of positives and
        # the cumulative share of weight, taken along the ranking.
        df = ranked_with_weights(y_true, y_pred)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Scale by the Gini obtained when the labels themselves are the scores.
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [None]:
# Look at confusion matrix 

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; each row is scaled by its own total when
        `normalize=True`.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        If True, plot row-wise proportions instead of raw values. Rows that
        sum to zero are shown as zeros.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the colours and the printed numbers agree.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # grid(None) toggles the current grid state; False switches it off explicitly.
    plt.grid(False)

    # Integer matrices are printed as-is, float matrices with two decimals.
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 # White text on dark cells, black text on light cells.
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Metrics collected during cross-validation, and the shared splitter:
# 10 stratified folds, shuffled with a fixed seed.
scoring = ['accuracy', 'precision', 'recall','f1','roc_auc']
cv = StratifiedKFold(n_splits=10, shuffle = True, random_state = 42)

In [None]:
# Candidate classifiers, each paired with the label used in the score table.
algo=[
    [KNeighborsClassifier(), 'KNeighborsClassifier'], 
    [LogisticRegression(), 'LogisticRegression'], 
    [SVC(), 'SVC'],
    [DecisionTreeClassifier(), 'DecisionTreeClassifier'],
    [GradientBoostingClassifier(), 'GradientBoostingClassifier'],
    [RandomForestClassifier(), 'RandomForestClassifier'],
    [AdaBoostClassifier(), 'AdaBoostClassifier'],
    [GaussianNB(), 'GaussianNB'],
    [XGBClassifier(), "XGBClassifier"],
    [LGBMClassifier(), "LGBMClassifier"],
    [CatBoostClassifier(), "CatBoostClassifier"]
]

# Cross-validate every candidate and keep its mean validation accuracy.
# The fitted fold estimators are never inspected, so they are not requested
# back from cross_validate (return_estimator is left at its default); this
# avoids holding one fitted model per fold per algorithm in memory.
model_scores=[]
for model, model_label in algo:
    print(model)
    scores = cross_validate(model, X_train, y_train, scoring=scoring, cv=cv,
                            n_jobs=-1, return_train_score=True)
    print('Training Score: Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f},f1-score: {:.2f}, ROC AUC: {:.2f}'.format(np.mean(scores['train_accuracy']),np.mean(scores['train_precision']), np.mean(scores['train_recall']), np.mean(scores['train_f1']), np.mean(scores['train_roc_auc'])))
    print('Validation Score: Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f},f1-score: {:.2f}, ROC AUC: {:.2f}'.format(np.mean(scores['test_accuracy']),np.mean(scores['test_precision']), np.mean(scores['test_recall']), np.mean(scores['test_f1']), np.mean(scores['test_roc_auc'])))
    model_scores.append([np.mean(scores['test_accuracy']), model_label])

print(model_scores)
# Entries are [score, label] lists, so max() picks the highest score
# (ties are broken by label).
print(f'best score = {max(model_scores)}')

In [None]:
# Raw [mean validation accuracy, classifier label] pairs collected above.
model_scores

In [None]:
# Tabulate the collected scores and list the classifiers from best to worst.
dscore = pd.DataFrame(data=model_scores, columns=['score', 'classifier'])
dscore.sort_values(by='score', ascending=False)

In [None]:
# function for display of model score via a hyper-parameter search (e.g. GridSearchCV)
def model_search_score(model_name, search=None, X=None, y=None):
    """Fit a hyper-parameter search and print its best estimator and best score.

    Parameters
    ----------
    model_name :
        Label used in the printed messages (any printable object).
    search : optional
        Search object whose ``fit(X, y)`` returns an object exposing
        ``best_estimator_`` and ``best_score_``. Defaults to the notebook-level
        ``gs``.
    X, y : optional
        Training features and labels. Default to the notebook-level
        ``X_train`` and ``y_train``.

    Returns
    -------
    None
        The outcome is only printed.
    """
    # Fall back to the notebook globals only when no explicit value is given.
    search = gs if search is None else search
    X = X_train if X is None else X
    y = y_train if y is None else y

    print("#######################################################################")
    # Name the search class actually in use instead of hard-coding it.
    print("Training and Evaluation with", type(search).__name__, "using", model_name)
    print("#######################################################################")
    result = search.fit(X, y)
    model = result.best_estimator_
    score = result.best_score_
    print ("Best Estimator for", model_name,"is", model,"with best score as",score)

In [None]:
# Hyper-parameter grid for k-nearest neighbours: neighbourhood size, vote
# weighting and distance metric.
grid_params = {
    'n_neighbors': [5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
}

# Exhaustive search over the grid using the shared CV splitter on all cores.
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_params,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the grid search configured in `gs`; the argument is only used as the
# label in the printed messages.
model_search_score(KNeighborsClassifier())

In [None]:
# Fit the chosen k-NN configuration and evaluate it on both splits.
model = KNeighborsClassifier(metric='manhattan', weights='distance')
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# The Amex metric ranks rows by score, so feed it the predicted probability of
# the positive class rather than hard 0/1 labels. Build every frame from plain
# arrays so that labels and scores share the same 0..n-1 index: y_test still
# carries the shuffled index from train_test_split, which would otherwise be
# mismatched against the freshly indexed predictions.
train_truth = pd.DataFrame({'target': np.asarray(y_train)})
test_truth = pd.DataFrame({'target': np.asarray(y_test)})
train_proba = model.predict_proba(X_train)[:, 1]
test_proba = model.predict_proba(X_test)[:, 1]
train_scores = pd.DataFrame({'prediction': train_proba})
test_scores = pd.DataFrame({'prediction': test_proba})

print("#######################################################################")
print("model training score: %.3f" % model.score(X_train, y_train))
print("model test score: %.3f" % model.score(X_test, y_test))
print("#######################################################################")
# Share of test rows predicted as the positive class.
print("Default rate:", y_pred.sum()/y_pred.shape[0])
print("Amex Evaluation Metric - Training: %.3f"% amex_metric(train_truth, train_scores))
print("Amex Evaluation Metric - Test: %.3f"% amex_metric(test_truth, test_scores))
print("ROC AUC:", roc_auc_score(y_test, test_proba))
print("#######################################################################")
confusion_mtx = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(confusion_mtx, classes = range(2))
target_names = ['Paid', 'Default']
print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
# Hard class predictions of the fitted model on the training features.
y_pred_train = model.predict(X_train)

In [None]:
# Accuracy on the training set.
accuracy_score(y_train, y_pred_train)

In [None]:
# Accuracy on the held-out test set.
accuracy_score(y_test, y_pred)