In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from itertools import cycle
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error

import warnings, gc
warnings.filterwarnings('ignore')

# Common Functions

In [4]:
# AMEX Metric
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)


In [5]:
# Function for model Evaluation
def evaluate(y_test , y_pred):
    print("MAE",mean_absolute_error(y_test,y_pred) , '\n')
    
    print("MSE",mean_squared_error(y_test,y_pred), '\n')
    
    print("RMSE",np.sqrt(mean_squared_error(y_test,y_pred)), '\n')
    
    print("RMSLE",np.log(np.sqrt(mean_squared_error(y_test,y_pred))), '\n')
    
    r2 = r2_score(y_test,y_pred)
    print("R Squared",r2, '\n')
    
    Adj_r2 = 1-(1-r2)*(5950-1)/(5950-4-1)
    print("Adjusted R Squared",Adj_r2, '\n')
    
#     accuracy = accuracy_score(y_test, y_pred, normalize=True)
#     print("Accuracy",accuracy, '\n')
    
def fillmissmeadian(dframe):
    for column in dframe.columns:
        median = dframe[column].median()
        dframe[column] = dframe[column].fillna(median)
    return dframe

def fillmissmean(dframe):
    for column in dframe.columns:
        mean = dframe[column].mean()
        dframe[column] = dframe[column].fillna(mean)
    return dframe

#Detecting outliers using the Z-scores

def detect_outliers_zscore(data):
    outliers=[]
    thres = 3
    mean = np.mean(data)
    std = np.std(data)
    # print(mean, std)
    for i in data:
        z_score = (i-mean)/std
        if (np.abs(z_score) > thres):
            outliers.append(i)
    print(outliers)
    return outliers

def replace_outliers(data):
    outliers=detect_outliers_zscore(data)
    median = np.median(data)
    for i in outliers:
        data.replace(i,median,inplace=True) #Replace with median
    return data

# Load Data

In [6]:
df_train = pd.read_feather('../input/amexfeather/train_data.ftr')
# Keep the latest statement features for each customer
df_train = df_train.groupby('customer_ID').tail(1).set_index('customer_ID')

df_test = pd.read_feather('../input/amexfeather/test_data.ftr')
# Keep the latest statement features for each customer
df_test = df_test.groupby('customer_ID').tail(1).set_index('customer_ID')

df_subm = pd.read_csv("../input/amex-default-prediction/sample_submission.csv")

In [7]:
df_train_row_count, df_train_column_count=df_train.shape
print('Total amount of rows (Train):', df_train_row_count)
print('Total amount of columns (Train):', df_train_column_count)

Total amount of rows (Train): 458913
Total amount of columns (Train): 190


In [8]:
df_test_row_count, df_test_column_count=df_test.shape
print('Total amount of rows (Test):', df_test_row_count)
print('Total amount of columns (Test):', df_test_column_count)

Total amount of rows (Test): 924621
Total amount of columns (Test): 189


# Data Preprocessing and Feature Engineering

* Handling missing values
* Categorical Encoding
* Handling outliers
* Feature Scaling


## Categorical Encoding

In [9]:
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

In [9]:
df_train[cat_cols].head()

Unnamed: 0_level_0,B_30,B_38,D_114,D_116,D_117,D_120,D_126,D_63,D_64,D_66,D_68
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.0,2.0,1.0,0.0,4.0,0.0,1.0,CR,O,,6.0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.0,2.0,1.0,0.0,-1.0,0.0,1.0,CO,O,,6.0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.0,1.0,1.0,0.0,-1.0,0.0,1.0,CO,R,,6.0
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc,0.0,2.0,1.0,0.0,6.0,0.0,1.0,CO,O,,3.0
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed,0.0,1.0,1.0,0.0,4.0,0.0,1.0,CO,O,1.0,6.0


In [10]:
df_test[cat_cols].head()

Unnamed: 0_level_0,B_30,B_38,D_114,D_116,D_117,D_120,D_126,D_63,D_64,D_66,D_68
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.0,2.0,0.0,0.0,-1.0,1.0,0.0,CR,U,,6.0
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.0,2.0,1.0,0.0,3.0,0.0,1.0,CO,O,,6.0
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.0,2.0,0.0,0.0,3.0,0.0,0.0,CR,U,1.0,4.0
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.0,3.0,0.0,0.0,4.0,0.0,1.0,CL,R,,5.0
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,1.0,6.0,0.0,0.0,-1.0,1.0,0.0,CO,R,,5.0


### Label Encoder

In [10]:
from sklearn.preprocessing import LabelEncoder
lab_enc = LabelEncoder()
for cat_feat in cat_cols:
    df_train[cat_feat] = lab_enc.fit_transform(df_train[cat_feat])
    df_test[cat_feat] = lab_enc.transform(df_test[cat_feat])

### One Hot Encoder

In [11]:
# get dummies  It converts categorical data into dummy or indicator variables.
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)

In [None]:
# from sklearn.preprocessing import OneHotEncoder
# onehot_enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
# trn_encoded_features = pd.DataFrame(onehot_enc.fit_transform(df_train[cat_cols])
# trn_encoded_features.index = df_train.index
# trn_encoded_features
# df_train = pd.concat([df_train[trn_not_cat_features], trn_encoded_features], axis = 1)
# df_train.head(5)
# for cat_feat in cat_cols:
#     df_train[cat_feat] = onehot_enc.fit_transform(df_train[cat_feat])
#     df_test[cat_feat] = onehot_enc.transform(df_test[cat_feat])

## Handling Missing Values - Data Imputation

### Fill missing values with median

In [11]:
df_train_miss = fillmissmeadian(df_train)
df_test_miss = fillmissmeadian(df_test)

### Fill missing values with mean

In [97]:
df_train_miss = fillmissmean(df_train)
df_test_miss = fillmissmean(df_test)

In [76]:
df_train_miss = df_train.fillna(value = 0, inplace = True)
df_test_miss = df_test.fillna(value = 0, inplace = True)

In [14]:
# Check for number of missing values
df_train_miss.isnull().sum()

S_2       0
P_2       0
D_39      0
B_1       0
B_2       0
         ..
D_142     0
D_143     0
D_144     0
D_145     0
target    0
Length: 190, dtype: int64

## Detecting and Handling Outliers

### Detecting Outliers with Z-score

In [None]:
# detect_outliers_zscore(df_train)

### Handling Outliers with Median Imputation

In [15]:
columns=list(df_train.columns)
columns.remove('target')
columns.remove('S_2')
columns

['P_2',
 'D_39',
 'B_1',
 'B_2',
 'R_1',
 'S_3',
 'D_41',
 'B_3',
 'D_42',
 'D_43',
 'D_44',
 'B_4',
 'D_45',
 'B_5',
 'R_2',
 'D_46',
 'D_47',
 'D_48',
 'D_49',
 'B_6',
 'B_7',
 'B_8',
 'D_50',
 'D_51',
 'B_9',
 'R_3',
 'D_52',
 'P_3',
 'B_10',
 'D_53',
 'S_5',
 'B_11',
 'S_6',
 'D_54',
 'R_4',
 'S_7',
 'B_12',
 'S_8',
 'D_55',
 'D_56',
 'B_13',
 'R_5',
 'D_58',
 'S_9',
 'B_14',
 'D_59',
 'D_60',
 'D_61',
 'B_15',
 'S_11',
 'D_62',
 'D_63',
 'D_64',
 'D_65',
 'B_16',
 'B_17',
 'B_18',
 'B_19',
 'D_66',
 'B_20',
 'D_68',
 'S_12',
 'R_6',
 'S_13',
 'B_21',
 'D_69',
 'B_22',
 'D_70',
 'D_71',
 'D_72',
 'S_15',
 'B_23',
 'D_73',
 'P_4',
 'D_74',
 'D_75',
 'D_76',
 'B_24',
 'R_7',
 'D_77',
 'B_25',
 'B_26',
 'D_78',
 'D_79',
 'R_8',
 'R_9',
 'S_16',
 'D_80',
 'R_10',
 'R_11',
 'B_27',
 'D_81',
 'D_82',
 'S_17',
 'R_12',
 'B_28',
 'R_13',
 'D_83',
 'R_14',
 'R_15',
 'D_84',
 'R_16',
 'B_29',
 'B_30',
 'S_18',
 'D_86',
 'D_87',
 'R_17',
 'R_18',
 'D_88',
 'B_31',
 'S_19',
 'R_19',
 'B_32',
 

In [None]:
for i in columns:
    print(i)
    replace_outliers(train_set[i])
    # replace_outliers(df_test[i])

## Create Features

In [12]:
features_avg = ['B_11', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_2', 
                'B_20', 'B_28', 'B_29', 'B_3', 'B_33', 'B_36', 'B_37', 'B_4', 'B_42', 
                'B_5', 'B_8', 'B_9', 'D_102', 'D_103', 'D_105', 'D_111', 'D_112', 'D_113', 
                'D_115', 'D_118', 'D_119', 'D_121', 'D_124', 'D_128', 'D_129', 'D_131', 
                'D_132', 'D_133', 'D_139', 'D_140', 'D_141', 'D_143', 'D_144', 'D_145', 
                'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 
                'D_49', 'D_50', 'D_51', 'D_52', 'D_56', 'D_58', 'D_62', 'D_70', 'D_71', 
                'D_72', 'D_74', 'D_75', 'D_79', 'D_81', 'D_83', 'D_84', 'D_88', 'D_91', 
                'P_2', 'P_3', 'R_1', 'R_10', 'R_11', 'R_13', 'R_18', 'R_19', 'R_2', 'R_26', 
                'R_27', 'R_28', 'R_3', 'S_11', 'S_12', 'S_22', 'S_23', 'S_24', 'S_26', 
                'S_27', 'S_5', 'S_7', 'S_8', ]

In [13]:
 df_avg = (df_train_miss
            .groupby('customer_ID')
              .mean()[features_avg]
              .rename(columns={f: f"{f}_avg" for f in features_avg})
             )

In [14]:
df_new_feat = pd.concat([df_train_miss, df_avg], axis=1)

In [15]:
df_new_feat.head()

Unnamed: 0_level_0,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,S_11_avg,S_12_avg,S_22_avg,S_23_avg,S_24_avg,S_26_avg,S_27_avg,S_5_avg,S_7_avg,S_8_avg
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2018-03-13,0.93457,0.009117,0.009384,1.007812,0.006104,0.13501,0.001604,0.007175,0.108276,...,0.402344,0.184082,0.917969,0.131836,0.936035,0.001281,0.928711,0.034637,0.105652,0.488281
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,2018-03-25,0.880371,0.178101,0.034698,1.003906,0.006912,0.165527,0.00555,0.00507,0.108276,...,0.36377,0.192383,0.920898,0.132812,0.930664,0.003212,0.292236,0.043915,0.208496,0.406494
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,2018-03-12,0.880859,0.009705,0.004284,0.8125,0.006451,0.164917,0.003796,0.007195,0.108276,...,0.280518,0.190918,0.302979,0.13269,0.086487,0.004704,0.333008,0.001824,0.137817,0.009186
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc,2018-03-29,0.621582,0.001082,0.012566,1.005859,0.007828,0.287842,0.004532,0.009941,0.108276,...,0.368652,0.054626,0.931641,0.132812,0.957031,0.043701,0.680664,0.022964,0.279541,0.170532
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed,2018-03-30,0.87207,0.005573,0.007679,0.815918,0.001247,0.164917,0.000231,0.005527,0.108276,...,0.32666,0.185669,0.297852,0.13208,0.081848,0.002346,0.333008,0.009354,0.137817,0.008591


### Get X and Y

In [16]:
y = df_new_feat['target']
X = df_new_feat.drop('target', axis=1)

### Split train and test sets

In [17]:
from sklearn.model_selection import train_test_split

# creating dataset split for prediction
X_train, X_test , y_train , y_test = train_test_split(X,y,test_size=0.3,random_state=42) # 80-20 split

# Checking split 
print('X_train:', X_train.shape)
print('y_train:', y_train.shape)
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

X_train: (321239, 282)
y_train: (321239,)
X_test: (137674, 282)
y_test: (137674,)


### Drop S_2 column

In [18]:
drop_cols = ['S_2'] 
X_train.drop(drop_cols, inplace=True, axis=1)
X_test.drop(drop_cols, inplace=True, axis=1)

## Feature Scaling

### Standard Sclaer, Min Max Scaler , Z score Scaler

In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

# from sklearn.preprocessing import MinMaxScaler
# min_max=MinMaxScaler()
# X_train_minmax = min_max.fit_transform(X_train)
# X_test_minmax = min_max.fit_transform(X_test)

# from sklearn.preprocessing import scale
# X_train_scale=scale(X_train)
# X_test_scale=scale(X_test)

# Model Implementation

## SVM

In [None]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
svm_model = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
svm_model.fit(X_train_scale,y_train)

In [None]:
y_pred_svm = svm_model.predict(X_test_scale)

## KNN

In [105]:
from sklearn.neighbors import KNeighborsClassifier
# Create KNN classifier
knn_model = KNeighborsClassifier(n_neighbors = 3)
# Fit the classifier to the data
knn_model.fit(X_train_scale,y_train)

KNeighborsClassifier(n_neighbors=3)

In [24]:
y_pred_knn = knn_model.predict(X_test_scale)

## XGBoost

### Finding best parameters

In [None]:
# This is commented as it takes a very long time to run, the results of randomizedsearchCV is included below
# Grid of hyperparameters to search over
#from sklearn.model_selection import RandomizedSearchCV
#param_random_gb = {'learning_rate': np.arange(0.05,0.55, 0.1),
#                   'n_estimators' : [125,150,175],
#                   'subsample' : np.arange(0.3,1.0, 0.1),
#                   'max_depth':[3,4,5]}

# Use XGBoost Classifier
#from xgboost import XGBClassifier

# Perform RandomizedSearchCV
#mse_random = RandomizedSearchCV(estimator = XGBClassifier(), param_distributions = param_random_gb, 
#                               n_iter = 10,scoring = 'neg_mean_squared_error', cv = 4, verbose = 1)

#mse_random.fit(x_train_split,y_train_split)

#print("Best parameter: ", mse_random.best_params_)
#print("Lowest RMSE: ", np.sqrt(np.abs(mse_random.best_score_)))
#Best parameter:  {'subsample': 0.5, 'n_estimators': 175, 'max_depth': 3, 'learning_rate': 0.15}
#Lowest RMSE:  0.32263831733224874

In [25]:
import xgboost
# xgb_model = xgboost.XGBClassifier(eta=0.1, nrounds=1000, max_depth=8, colsample_bytree=0.5, scale_pos_weight=1.1, booster='gbtree', 
#                                   metric='multi:softmax')
xgb_model=xgboost.XGBClassifier(n_estimators=200,max_depth=3,learning_rate=0.15, subsample=0.5)
xgb_model.fit(X_train_scale, np.ravel(y_train, order='C'))

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.15, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=200,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [26]:
y_pred_xgb = xgb_model.predict(X_test_scale)

## LightGBM

In [41]:
# %%time
# # Cross-validation of the classifier
# from sklearn.model_selection import StratifiedKFold
# from sklearn.calibration import CalibrationDisplay
# from lightgbm import LGBMClassifier, log_evaluation
# import datetime

# ONLY_FIRST_FOLD = False

# features = [f for f in  df_new_feat.columns if f != 'customer_ID' and f != 'target']

# def my_booster(random_state=1, n_estimators=1200):
#     return LGBMClassifier(n_estimators=n_estimators,
#                           learning_rate=0.03, reg_lambda=50,
#                           min_child_samples=2400,
#                           num_leaves=95,
#                           colsample_bytree=0.19,
#                           max_bins=511, random_state=random_state)
      
# print(f"{len(features)} features")
# score_list = []
# y_pred_list = []
# kf = StratifiedKFold(n_splits=5)
# for fold, (idx_tr, idx_va) in enumerate(kf.split(df_new_feat, y)):
#     X_tr, X_va, y_tr, y_va, model = None, None, None, None, None
#     start_time = datetime.datetime.now()
#     X_tr = df_new_feat.iloc[idx_tr][features]
#     X_va = df_new_feat.iloc[idx_va][features]
#     y_tr = y[idx_tr]
#     y_va = y[idx_va]
    
#     model = my_booster()
#     with warnings.catch_warnings():
#         warnings.filterwarnings('ignore', category=UserWarning)
#         model.fit(X_tr, y_tr,
#                   eval_set = [(X_va, y_va)], 
#                   eval_metric=[amex_metric],
#                   callbacks=[log_evaluation(100)])
#     X_tr, y_tr = None, None
#     y_va_pred = model.predict_proba(X_va, raw_score=True)
#     score = amex_metric(y_va, y_va_pred)
#     n_trees = model.best_iteration_
#     if n_trees is None: n_trees = model.n_estimators
#     print(f"{Fore.GREEN}{Style.BRIGHT}Fold {fold} | {str(datetime.datetime.now() - start_time)[-12:-7]} |"
#           f" {n_trees:5} trees |"
#           f"                Score = {score:.5f}{Style.RESET_ALL}")
#     score_list.append(score)
    
#     if INFERENCE:
#         y_pred_list.append(model.predict_proba(test[features], raw_score=True))
        
#     if ONLY_FIRST_FOLD: break # we only want the first fold
    
# print(f"{Fore.GREEN}{Style.BRIGHT}OOF Score:                       {np.mean(score_list):.5f}{Style.RESET_ALL}")

## CatBoost

In [19]:
from catboost import CatBoostClassifier
clf = CatBoostClassifier(iterations = 3000, random_state = 42, nan_mode ='Min',task_type ="GPU")
clf.fit(X_train, y_train, eval_set = [(X_test, y_test)], cat_features=cat_cols,  verbose = 100)

Learning rate set to 0.027664
0:	learn: 0.6597552	test: 0.6597037	best: 0.6597037 (0)	total: 55ms	remaining: 2m 45s
100:	learn: 0.2436925	test: 0.2454402	best: 0.2454402 (100)	total: 5.05s	remaining: 2m 24s
200:	learn: 0.2326993	test: 0.2353091	best: 0.2353091 (200)	total: 9.98s	remaining: 2m 18s
300:	learn: 0.2280696	test: 0.2313831	best: 0.2313831 (300)	total: 14.8s	remaining: 2m 12s
400:	learn: 0.2251869	test: 0.2291907	best: 0.2291907 (400)	total: 19.8s	remaining: 2m 8s
500:	learn: 0.2230567	test: 0.2278536	best: 0.2278536 (500)	total: 24.6s	remaining: 2m 2s
600:	learn: 0.2213793	test: 0.2269241	best: 0.2269241 (600)	total: 30.3s	remaining: 2m 1s
700:	learn: 0.2199264	test: 0.2262347	best: 0.2262347 (700)	total: 35.2s	remaining: 1m 55s
800:	learn: 0.2186320	test: 0.2257300	best: 0.2257300 (800)	total: 40.4s	remaining: 1m 50s
900:	learn: 0.2174849	test: 0.2253373	best: 0.2253373 (900)	total: 45.2s	remaining: 1m 45s
1000:	learn: 0.2164174	test: 0.2250088	best: 0.2250088 (1000)	total:

<catboost.core.CatBoostClassifier at 0x7f32ea893290>

In [20]:
y_pred_cat = clf.predict_proba(X_test)[:, 1]

# Evaluate

In [21]:
evaluate(y_test , y_pred_cat)
# evaluate(y_test , y_pred_xgb)
# evaluate(y_test , y_pred_knn)

MAE 0.13877984716484248 

MSE 0.06942114661150037 

RMSE 0.26347893010922213 

RMSLE -1.3337818758326525 

R Squared 0.6371603557361196 

Adjusted R Squared 0.6369162247727798 



In [22]:
#Get AMEX Matric value
X_test_new = X_test[['P_2']].rename(columns={'P_2': 'prediction'})
X_test_new["prediction"] = y_pred_cat #Change this
print(amex_metric(y_test.to_frame() , X_test_new))

0.7851072944027075


# Get Prediction File

In [23]:
df_test_miss

Unnamed: 0_level_0,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,2019-10-12,0.568848,0.121399,0.010780,1.009766,0.006924,0.149414,0.000396,0.003576,0.103760,...,0.253906,0.00507,0.007015,0.005913,0.001250,0.006542,0.401611,0.009163,0.003691,0.003220
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,2019-04-15,0.841309,0.126465,0.016556,1.008789,0.009712,0.112183,0.006191,0.011383,0.065674,...,0.253906,0.00507,0.007015,0.004345,0.000866,0.009117,0.401611,0.002197,0.000247,0.007778
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,2019-10-16,0.697754,0.002724,0.001485,0.810059,0.002621,0.166138,0.004887,0.015945,0.065674,...,0.253906,0.00507,0.007015,1.000977,0.008896,0.895996,0.150146,1.009766,0.457764,0.092041
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,2019-04-22,0.513184,0.324707,0.149536,0.205688,0.002277,0.181152,0.005814,0.498535,0.065674,...,0.253906,0.00507,0.007015,1.007812,0.003754,0.919922,0.255371,1.007812,0.500977,0.182983
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,2019-10-22,0.254395,0.768066,0.563477,0.038025,0.502930,0.168335,0.009483,0.831055,0.065674,...,0.253906,0.00507,0.007015,0.006622,0.001140,0.009529,0.401611,0.009407,0.001557,0.000525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c5d60460dba6dedc41e,2019-04-14,0.646973,0.003872,0.011307,0.816895,0.003811,0.162964,0.008942,0.028900,0.065674,...,0.253906,0.00507,0.007015,0.003016,0.006851,0.009308,0.401611,0.009392,0.003279,0.005295
ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3a4f0ca3de613b0b2ad,2019-10-19,0.471191,0.001856,0.084167,0.082520,0.508789,0.856934,0.000462,0.070496,0.065674,...,0.253906,0.00507,0.007015,0.007683,0.003374,0.009781,0.401611,0.008110,0.005432,0.009979
ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475cb095d2443a68030f1,2019-04-06,0.206421,0.001038,0.019958,1.003906,0.009598,0.321045,0.077759,0.014290,0.065674,...,0.253906,0.00507,0.007015,0.002304,0.001640,0.002371,0.401611,0.008156,0.004848,0.002119
ffffddef1fc3643ea179c93245b68dca0f36941cd83977822e8b356988ca4d07,2019-04-08,0.570801,0.034210,0.049774,0.192871,0.002750,0.230835,0.001869,0.099915,0.065674,...,0.253906,0.00507,0.007015,0.005352,0.000086,0.003914,0.401611,0.003855,0.007481,0.006962


In [24]:
 df_avg_test = (df_test_miss
            .groupby('customer_ID')
              .mean()[features_avg]
              .rename(columns={f: f"{f}_avg" for f in features_avg})
             )

In [25]:
df_test_new_feat = pd.concat([df_test_miss, df_avg_test], axis=1)

In [26]:
df_test_new_feat.drop(drop_cols, inplace=True, axis=1)

In [29]:
# No scaling used for catboost
# df_test_scale = scaler.transform(df_test_new_feat)

In [27]:
# y_pred_submission = xgb_model.predict(df_test_scale)
y_pred_submission = clf.predict_proba(df_test_new_feat)[:, 1]

In [28]:
y_pred_submission

array([0.02705097, 0.00582737, 0.06667161, ..., 0.63559147, 0.31202968,
       0.13989868])

In [29]:
df_subm

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0
...,...,...
924616,ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...,0
924617,ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...,0
924618,ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...,0
924619,ffffddef1fc3643ea179c93245b68dca0f36941cd83977...,0


In [30]:
df_subm["prediction"] = y_pred_submission
df_subm.to_csv('amex_xgb3.csv', index=False)
df_subm

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.027051
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.005827
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.066672
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.324098
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.948890
...,...,...
924616,ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...,0.027891
924617,ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...,0.834254
924618,ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...,0.635591
924619,ffffddef1fc3643ea179c93245b68dca0f36941cd83977...,0.312030
