In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split

In [3]:
# Read the raw training CSV into `df` using a relative path.
# NOTE(review): df.head() shows an 'Unnamed: 0' column holding 0,1,2,... — presumably
# a row index that was written into the CSV; confirm whether index_col=0 is wanted.
path = '../../data/raw/train_data.csv'
df = pd.read_csv(path)

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,...,,,,0.002427,0.003706,0.003818,,0.000569,0.00061,0.002674
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,...,,,,0.003954,0.003167,0.005032,,0.009576,0.005492,0.009217
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,...,,,,0.003269,0.007329,0.000427,,0.003429,0.006986,0.002603
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0.002455,0.013683,1.0027,0.001373,0.117169,0.000685,0.005531,...,,,,0.006117,0.004516,0.0032,,0.008419,0.006527,0.0096
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,...,,,,0.003671,0.004946,0.008889,,0.00167,0.008126,0.009827


In [None]:
# Summarise column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Count missing values in each column.
df.isna().sum()

In [None]:
# Per-column missing-value counts; the size shown is the number of columns in df.
null_df = df.isnull().sum()
null_df.size

In [None]:
# Keep only the columns that have at least one missing value and show how many there are.
null_df = null_df.loc[null_df.gt(0)]
null_df.size

In [None]:
# Show the missing-value counts in ascending order (result is displayed, not stored).
null_df.sort_values()

In [None]:
# Names of every column that contains missing values.
null_list = null_df.index.tolist()
null_list

In [None]:
# Drop every column that has missing values.
# Rebind `df` instead of mutating it in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError. `columns=` already
# implies the column axis, so no `axis` argument is needed.
df = df.drop(columns=null_list, errors='ignore')
df.shape

In [None]:
# List the columns that remain after the drop above.
df.columns

In [None]:
# Number of distinct customers in the data.
df['customer_ID'].nunique()

In [None]:
# Largest S_2 value per customer (a Series indexed by customer_ID).
most_recent = df.groupby('customer_ID')['S_2'].max()

In [None]:
# Wrap the per-customer maximum in a DataFrame (customer_ID stays as the index) and display it.
most_recent = pd.DataFrame(most_recent)
most_recent

In [None]:
# Move customer_ID out of the index into a regular column so it can be used as a merge key.
most_recent = most_recent.reset_index()

In [None]:
# Keep only the rows of df whose (customer_ID, S_2) pair is that customer's maximum S_2.
recent_df = most_recent.merge(df, on=['customer_ID', 'S_2'])

In [None]:
# Summarise the per-customer frame produced by the merge.
recent_df.info()

In [None]:
# Read the training labels and preview them.
# NOTE(review): this path points into one user's home-directory checkout, unlike the
# relative path used for train_data.csv — presumably both files live in the same
# data/raw folder; confirm and switch to a relative path.
label_path = '~/Documents/repos/we-need-more-ram/amex_default_prediction/data/raw/train_labels.csv'
labels = pd.read_csv(label_path)
labels.head()

In [None]:
# Attach each customer's label to their most recent row and preview the result.
merged_df = recent_df.merge(labels, on='customer_ID')
merged_df.head()

In [None]:
# Remove the columns that must not reach the model.
# 'Unnamed: 0' (visible in df.head() as 0,1,2,...) looks like a row index that was
# written into the CSV; it survived the earlier steps and would otherwise be fed
# to the model as a feature, so drop it together with customer_ID, S_2 and D_63.
index_artifacts = [col for col in merged_df.columns if str(col).startswith('Unnamed:')]
merged_df = merged_df.drop(columns=['customer_ID', 'S_2', 'D_63', *index_artifacts])
merged_df.head()

In [None]:
# Separate the label column from the feature matrix.
y = merged_df['target']
X = merged_df.drop(columns=['target'])

In [None]:
# Fixed seed so the split (and the model trained later) is reproducible.
seed = 42

# Hold out 20% of the rows for validation, stratified on the label.
X_train, X_validate, y_train, y_validate = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=seed,
)

In [None]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the mean of the normalized weighted Gini (G) and the top-4% capture rate (D).

    Rows with ``target == 0`` are given weight 20 and all other rows weight 1.

    Args:
        y_true: Labels. A DataFrame with a ``target`` column; a Series or a plain
            array-like is also accepted and wrapped into such a frame.
        y_pred: Scores. A DataFrame with a ``prediction`` column; a Series or a
            plain array-like is also accepted. Array-likes carry no index, so
            they are matched to ``y_true`` by position.

    Returns:
        ``0.5 * (G + D)``.

    Note:
        pandas inputs are combined with ``pd.concat(axis='columns')``, which
        aligns on index labels, so ``y_true`` and ``y_pred`` must share an index.
    """

    def as_frame(values, column: str, index=None) -> pd.DataFrame:
        # DataFrames pass through untouched; everything else becomes a one-column frame.
        if isinstance(values, pd.DataFrame):
            return values
        if isinstance(values, pd.Series):
            # A Series keeps its own index and is only given the expected column name.
            return pd.DataFrame({column: values})
        return pd.DataFrame({column: np.asarray(values).reshape(-1)}, index=index)

    def ranked_with_weights(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, order by descending score, and attach the row weights.
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        return df

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = ranked_with_weights(y_true, y_pred)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        # Rows whose cumulative weight still fits inside the top 4% of total weight.
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = ranked_with_weights(y_true, y_pred)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Scoring the labels against themselves gives the Gini of a perfect ranking.
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    y_true = as_frame(y_true, 'target')
    # Array-like predictions inherit y_true's index so that rows pair up by position.
    y_pred = as_frame(y_pred, 'prediction', index=y_true.index)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [None]:
!pip install xgboost

In [None]:
import xgboost as xgb

In [None]:
# Logistic objective so predict() returns scores in [0, 1].
# 'logloss' is the XGBoost name for negative log-likelihood; 'log:loss' is not a known metric.
xgb_model = xgb.XGBRegressor(objective='binary:logistic', eval_metric='logloss', random_state=seed)

In [None]:
# Train the model on the training split.
xgb_model.fit(X_train, y_train)

In [None]:
# Score the training rows themselves (in-sample predictions).
y_pred = xgb_model.predict(X_train)

In [None]:
# Build the DataFrames amex_metric expects ('target' and 'prediction' columns).
# The predictions must carry y_train's index: amex_metric joins the two frames with
# pd.concat(axis='columns'), which aligns on index labels, and y_train keeps the
# shuffled labels from train_test_split. A fresh 0..n-1 index would pair scores
# with the wrong rows and introduce NaNs.
y_train_final = pd.DataFrame(y_train)
y_pred_final = pd.DataFrame(y_pred, columns=['prediction'], index=y_train.index)

In [None]:
# Competition metric on the training split (in-sample, so expect it to be optimistic).
print(amex_metric(y_train_final, y_pred_final))

In [None]:
# Turn scores into hard 0/1 labels with a 0.5 threshold and show the frame.
y_pred_final['binary_outcome'] = (y_pred_final['prediction'] >= 0.5).astype('int64')
y_pred_final

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Confusion matrix of true labels vs. thresholded predictions on the training split.
confusion_matrix(y_train_final, y_pred_final['binary_outcome'])

In [None]:
# Accuracy of the thresholded predictions on the training split.
accuracy_score(y_train_final, y_pred_final['binary_outcome'])

In [None]:
# Score the held-out validation rows.
y_val_pred = xgb_model.predict(X_validate)

In [None]:
# Competition metric on the validation split.
# amex_metric concatenates its inputs as pandas objects with 'target' and 'prediction'
# columns, so wrap the label Series and the prediction array in DataFrames that share
# y_validate's index before scoring.
y_validate_final = pd.DataFrame(y_validate)
y_val_pred_final = pd.DataFrame(y_val_pred, columns=['prediction'], index=y_validate.index)
print(amex_metric(y_validate_final, y_val_pred_final))