### Leveraging machine learning to predict whether customers are likely to default.
#### There have been recent cases of credit default, and Kowope Mart would like to have a system that profiles customers who are worthy of the card with minimal, if not zero, risk of defaulting.

# This is the Qualification Competition for the Data Science Nigeria AI Bootcamp 2020.

In [1]:
#Importing Libraries

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from IPython.display import display
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# Let pandas display up to 200 columns and 200 rows before truncating output
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Load the train and test CSV files; the label is taken from the training frame
train, test = (pd.read_csv(csv_path) for csv_path in ('Train.csv', 'Test.csv'))
target = train['default_status']
#Submission = pd.read_csv('SampleSubmission.csv')

In [None]:
# writing different functions to ensure modularity
#
class Analyzer:
    """Small collection of EDA / baseline-modelling helpers used by the notebook."""

    def __init__(self):
        pass

    def check_null(self, data):
        """Report the fraction of missing values for every feature that has any.

        Also prints the shape of the data and how many columns contain NaN.
        ** Args: data - pandas dataframe
        ** Return: pandas Series indexed by column name holding
                   (null values in the column / number of rows)
        """
        null_counts = data.isnull().sum()
        null_counts = null_counts[null_counts > 0]
        print(f"Shape: {data.shape}, Number of Columns with NaN: {len(null_counts)}")
        return null_counts / data.shape[0]

    @staticmethod
    def _importance_frame(model, data, num_features=50):
        """Build the table behind the feature-importance plot.

        ** Args: model - fitted estimator exposing ``feature_importances_``
                 data - pandas dataframe whose columns are the model's features
                 num_features - how many top features to keep
        ** Return: dataframe with 'Importance' and 'Feature' columns,
                   sorted by descending importance
        """
        frame = pd.DataFrame({
            'Importance': np.asarray(model.feature_importances_),
            'Feature': list(data.columns),
        })
        return frame.sort_values(by=['Importance'], ascending=False).head(num_features)

    def plot_feature_importances(self, model, data, num_features=50):
        """Draw a bar plot of the feature importances as scored by the model.

        ** Args: model - fitted estimator exposing ``feature_importances_``
                 data - pandas dataframe the model was trained on
                 num_features - number of top features to show
        ** Return: None (draws on a new matplotlib figure)
        """
        plt.figure(figsize=(15, 30))
        # Use the arguments that were passed in, not notebook-level globals.
        top_features = Analyzer._importance_frame(model, data, num_features)
        sns.barplot(x="Importance", y="Feature", data=top_features)
        plt.title(type(model).__name__)

    def categorical_plot(self, data, hue, cols=None):
        """Draw one count plot per categorical feature, split by ``hue``.

        ** Args: data - pandas dataframe
                 hue - name of the column used to colour the bars
                 cols - list of columns to plot; defaults to every object
                        column with fewer than 20 distinct values
        ** Return: None (shows one plot per column)
        """
        if cols is None:
            cols = [cname for cname in data.columns
                    if data[cname].dtype == 'object' and data[cname].nunique() < 20]
        for col in cols:
            # Silently skip requested columns that are not in the data.
            if col not in data.columns:
                continue
            sns.countplot(y=col, hue=hue, data=data)
            plt.show()

    def get_score(self, data, target, model=None, prefit=False):
        """Print train/validation AUC for a model on a stratified 70/30 split.

        ** Args: data - feature dataframe
                 target - labels aligned with ``data``
                 model - estimator with fit / predict_proba; a
                         LogisticRegression is used when None
                 prefit - when True, ``model`` is assumed to be fitted
                          already and is only scored
        ** Return: auc score on the validation split - float
        """
        from sklearn.metrics import roc_auc_score
        from sklearn.linear_model import LogisticRegression
        from sklearn.model_selection import train_test_split

        x_t, x_v, y_t, y_v = train_test_split(data, target, stratify=target, test_size=0.3, random_state=0)
        if model is None:
            # A freshly created default model can never be prefit.
            model = LogisticRegression()
            prefit = False
        if not prefit:
            model.fit(x_t, y_t)
        train_score = roc_auc_score(y_t, model.predict_proba(x_t)[:, 1])
        test_score = roc_auc_score(y_v, model.predict_proba(x_v)[:, 1])
        print(f"Train Score: {train_score:.4f},    Test Score: {test_score:.4f}")
        return test_score

    def scale(self, data, target, scale_type='StandardScaler'):
        """Split the data 70/30 (stratified) and scale both parts.

        The scaler is fitted on the training part only and then applied to
        the validation part.
        ** Args: data - feature dataframe
                 target - labels aligned with ``data``
                 scale_type - 'StandardScaler' or 'RobustScaler'; any other
                              value selects MinMaxScaler
        ** Return: x_train, x_val (scaled arrays), y_train, y_val
        """
        from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
        from sklearn.model_selection import train_test_split

        x_t, x_v, y_t, y_v = train_test_split(data, target, stratify=target, test_size=0.3, random_state=0)

        # Exactly one scaler is chosen, so the data is never scaled twice.
        if scale_type == 'StandardScaler':
            scaler = StandardScaler()
        elif scale_type == 'RobustScaler':
            scaler = RobustScaler()
        else:
            scaler = MinMaxScaler()

        x_t = scaler.fit_transform(x_t)
        x_v = scaler.transform(x_v)
        return x_t, x_v, y_t, y_v

In [None]:
# Single shared helper instance used by the cells below
tools = Analyzer()

In [None]:
# Replace the text labels with integer codes ('yes' -> 1, 'no' -> 0), then cast the column to int
for label, code in (('yes', 1), ('no', 0)):
    train.loc[train['default_status'] == label, 'default_status'] = code
train['default_status'] = train['default_status'].astype(int)

In [None]:
# Correlation heatmap of the training data.
# Only numeric columns are correlated: recent pandas versions raise when
# DataFrame.corr() meets a non-numeric column instead of silently skipping it.
plt.subplots(figsize=(20, 15))
sns.heatmap(train.select_dtypes(include='number').corr())

In [None]:
# Per-row count of missing values, stored as a new feature on both frames
train["Num_Cols_NaN"] = train.isnull().sum(axis=1)
test["Num_Cols_NaN"] = test.isnull().sum(axis=1)

In [None]:
# Stack train and test into one frame so preprocessing is written once;
# the 'is_train' flag (1 = train, 0 = test) lets us split them apart again later.
for frame, flag in ((train, 1), (test, 0)):
    frame['is_train'] = flag
all_data = pd.concat([train, test])

train.shape, test.shape, all_data.shape

In [None]:
# Fraction of missing values per column across train + test
tools.check_null(all_data)

# Feature Engineering

In [None]:
# Averages of related form fields.
# Only avg_risk_factors treats a missing input as 0; the other averages
# are NaN whenever any of their inputs is NaN.
risk_a = all_data['form_field1'].fillna(0)
risk_b = all_data['form_field2'].fillna(0)
all_data['avg_risk_factors'] = risk_a.add(risk_b).div(2)

severity_total = all_data['form_field3'].add(all_data['form_field4']).add(all_data['form_field5'])
all_data['avg_severity'] = severity_total.div(3)

all_data['credit_scale'] = all_data['form_field6'].add(all_data['form_field8']).div(2)
all_data['avg_tenure'] = all_data['form_field32'].add(all_data['form_field33']).div(2)

In [None]:
# Bucket applicants by form_field1 into three labels:
#   <= 3200          -> 'most default'
#   (3200, 3400]     -> 'more default'
#   anything else    -> 'less default'
# Comparisons against NaN are False, so rows with a missing form_field1 fall
# through to 'less default'. The column is computed in one vectorized step
# and assigned whole, rather than written cell by cell through chained
# indexing (which is not guaranteed to modify all_data).
ff1 = all_data['form_field1']
all_data['def_on_ff1'] = np.select(
    [ff1 <= 3200, ff1 <= 3400],
    ['most default', 'more default'],
    default='less default',
)

In [None]:
# Flag applicants as 'most often default' when their average tenure is short
# (<= 100), or when their average severity exceeds 1 and form_field1 did not
# place them in the 'less default' bucket; everyone else is 'less often default'.
# Comparisons against NaN are False, so missing tenure/severity never trigger
# the flag by themselves. The label compared against is 'less default' (with
# a space), which is the value actually stored in def_on_ff1.
short_tenure = all_data['avg_tenure'] <= 100
high_severity = all_data['avg_severity'] > 1
not_low_risk = all_data['def_on_ff1'] != 'less default'
all_data['def_on_avg'] = np.where(
    short_tenure | (high_severity & not_low_risk),
    'most often default',
    'less often default',
)

In [None]:
# Pairwise sums of selected form fields: new column name -> (left operand, right operand)
pair_sums = {
    'form_field32+33': ('form_field33', 'form_field32'),
    'form_field6+8': ('form_field6', 'form_field8'),
    'form_field17+18': ('form_field17', 'form_field18'),
    'form_field19+20': ('form_field19', 'form_field20'),
    'form_field4+46': ('form_field4', 'form_field46'),
}
for new_col, (left, right) in pair_sums.items():
    all_data[new_col] = all_data[left] + all_data[right]

In [None]:
# Label applicants by average tenure: > 150 is 'active', <= 150 is 'less active'.
# Rows whose avg_tenure is NaN match neither mask and keep a missing label.
tenure = all_data['avg_tenure']
for mask, label in ((tenure > 150, 'active'), (tenure <= 150, 'less active')):
    all_data.loc[mask, 'def_on_ten'] = label

In [None]:
# Row-wise average of form_field6 .. form_field15.
# sum() skips NaN, so a missing field contributes 0 to the total while the
# divisor stays at the full number of fields (same arithmetic as before,
# but computed for all rows at once and assigned as a whole column).
feats = [f'form_field{i}' for i in range(6, 16)]
all_data['avg_credit_feats'] = all_data[feats].sum(axis=1) / len(feats)
all_data.head(10)

In [None]:
# Row-wise average of form_field16 .. form_field27.
# sum() skips NaN, so a missing field contributes 0 to the total while the
# divisor stays at the full number of fields.
feats = [f'form_field{i}' for i in range(16, 28)]
all_data['avg_card_feats'] = all_data[feats].sum(axis=1) / len(feats)

In [None]:
# Row-wise average of form_field28 .. form_field39.
# sum() skips NaN, so a missing field contributes 0 to the total while the
# divisor stays at the full number of fields.
feats = [f'form_field{i}' for i in range(28, 40)]
all_data['avg_time_feats'] = all_data[feats].sum(axis=1) / len(feats)

In [None]:
# Row-wise average of form_field42 .. form_field46 plus form_field50
# (form_field47 is deliberately left out of this numeric average).
# sum() skips NaN, so a missing field contributes 0 to the total while the
# divisor stays at the full number of fields.
feats = [f'form_field{i}' for i in (42, 43, 44, 45, 46, 50)]
all_data['avg_other_feats'] = all_data[feats].sum(axis=1) / len(feats)

In [None]:
# Count plots of the low-cardinality text columns on the training rows, split by the label
tools.categorical_plot(all_data[all_data['is_train']==1], 'default_status')

In [None]:
#Saving the data to avoid re-running the feature engineering each time I restart the runtime

#all_data.to_csv('all_data44.csv', index=False)

In [None]:
# Reload the engineered features from the CSV snapshot (the file name matches the commented-out to_csv call in the previous cell)
all_data=pd.read_csv('all_data44.csv')

In [None]:
# Collect the text (object-dtype) columns, ignoring the ID column.
# form_field47 is removed from the list because it is encoded separately.
non_id_dtypes = all_data.drop(['Applicant_ID'], axis=1).dtypes
cat_cols = [name for name, dtype in non_id_dtypes.items() if dtype == 'object']
cat_cols.remove('form_field47')
# NOTE: cols_generated is the same list object as cat_cols, not a copy
cols_generated = cat_cols

In [None]:
#****** Encoding the categorical cols *********

## One Hot encoding
# Every text column with fewer than 10 distinct values gets indicator
# columns. Each indicator is prefixed with its source column name so that
# it stays traceable and two features sharing a category value cannot
# produce identically named columns.
one_hot_cols = [cname for cname in all_data.columns
                if all_data[cname].nunique() < 10 and all_data[cname].dtype == 'object']
if one_hot_cols:
    dummies = pd.get_dummies(all_data[one_hot_cols], prefix=one_hot_cols)
    all_data = pd.concat([all_data, dummies], axis=1)

# Label encoding
# form_field47 is replaced in place by integer codes; every generated
# categorical column gets an additional '<name>_label' column instead.
# factorize() codes missing values as -1.
all_data['form_field47'] = all_data['form_field47'].factorize()[0]
for col in cols_generated:
    all_data[col + '_label'] = all_data[col].factorize()[0]

# Count encoding
# The original text column is overwritten with how often each value occurs.
for col in cols_generated:
    all_data[col] = all_data[col].map(all_data[col].value_counts())

all_data.shape

In [None]:
# Compare imputation strategies: fill the training features with each
# candidate value and score the default model of tools.get_score on the result.

fill_w = ['mean', 'median', 'std', 0, -999]
unused_cols = ['Applicant_ID', 'default_status',
               'form_field40', 'form_field41',
               'form_field31', 'is_train',
               'form_field48', 'form_field49']
for strategy in fill_w:
    target = train['default_status']
    # Start from a fresh copy of the training rows for every strategy
    train_t = all_data.loc[all_data['is_train']==1].drop(unused_cols, axis=1)
    print('filling null with: ', strategy, '.......')
    for col in train_t.columns:
        column = train_t[col]
        # String strategies name a per-column statistic; numbers are used as-is
        filler = getattr(column, strategy)() if isinstance(strategy, str) else strategy
        train_t[col] = column.fillna(filler)
    print('Done filling null!')
    print('Getting score ......')
    tools.get_score(train_t, target)

In [None]:
# Compare imputation strategies with CatBoost: fill the training features
# with each candidate value and score a CatBoost model on the result.
catt = CatBoostClassifier(verbose=False)

fill_w = ['mean', 'median', 'std', 0, -999]
unused_cols = ['Applicant_ID', 'default_status',
               'form_field40', 'form_field41',
               'form_field31', 'is_train',
               'form_field48', 'form_field49']
for strategy in fill_w:
    target = train['default_status']
    # Start from a fresh copy of the training rows for every strategy
    train_t = all_data.loc[all_data['is_train']==1].drop(unused_cols, axis=1)
    print('filling null with: ', strategy, '.......')
    for col in train_t.columns:
        column = train_t[col]
        # String strategies name a per-column statistic; numbers are used as-is
        filler = getattr(column, strategy)() if isinstance(strategy, str) else strategy
        train_t[col] = column.fillna(filler)
    print('Done filling null!')
    print('Getting score ......')
    # Pass the CatBoost model explicitly; without it get_score falls back
    # to its default LogisticRegression and catt is never used.
    tools.get_score(train_t, target, model=catt)

In [None]:
# Replace every remaining missing value in all_data with the sentinel -999
all_data = all_data.fillna(-999)

In [None]:
# Split the combined frame back into train and test using the 'is_train' flag,
# dropping the ID, the label, the flag itself and the excluded form fields.
excluded_cols = ['Applicant_ID', 'default_status',
                 'form_field40', 'form_field41',
                 'form_field31', 'is_train',
                 'form_field48', 'form_field49']
train_rows = all_data.loc[all_data['is_train']==1]
test_rows = all_data.loc[all_data['is_train']==0]

target = train_rows['default_status']
train_data = train_rows.drop(excluded_cols, axis=1)
test_data = test_rows.drop(excluded_cols, axis=1)

In [None]:
# Train a CatBoost model with stratified 10-fold cross-validation.
# Each fold's model predicts the test data; the submitted probability is
# the mean of the per-fold predictions.

auc_list = []
predictions_df = pd.DataFrame()
predictions_df['Applicant_ID'] = test['Applicant_ID']
predictions_df['default_status'] = np.zeros(len(test))

fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for i, (train_index, test_index) in enumerate(fold.split(train_data, target), start=1):
    print('Fold number: ', i)

    x_data, x_val = train_data.iloc[train_index], train_data.iloc[test_index]
    y_data, y_val = target.iloc[train_index], target.iloc[test_index]
    x_test = test_data

    cat = CatBoostClassifier(learning_rate=0.005, eval_metric='AUC',
                             use_best_model=True, random_state=0, n_estimators=5000, max_depth=8)
    # The held-out fold is the eval set, so early stopping and
    # use_best_model are both driven by its AUC.
    cat.fit(x_data, y_data, eval_set=[(x_val, y_val)], early_stopping_rounds=300, verbose=100)

    # Score with the model fitted in this fold
    train_pred = cat.predict_proba(x_data)[:, 1]
    test_pred = cat.predict_proba(x_val)[:, 1]

    fold_auc = roc_auc_score(y_val, test_pred)
    print('auc score on train:', roc_auc_score(y_data, train_pred))
    print('auc score on test', fold_auc)
    auc_list.append(fold_auc)

    # Accumulate this fold's share of the averaged test prediction
    current_pred = cat.predict_proba(x_test)[:, 1]
    predictions_df['default_status'] += current_pred / fold.n_splits

print('Mean auc score on test: ', np.mean(auc_list))

predictions_df.to_csv('submission.csv', index=False)
predictions_df.head()