In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# About the Dataset
The dataset contains aggregated profile features for each customer at each statement date. Features are anonymized and normalized, and fall into the following general categories:

D_* = Delinquency variables<br>
S_* = Spend variables<br>
P_* = Payment variables<br>
B_* = Balance variables<br>
R_* = Risk variables<br>

with the following features being categorical:

['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

My task is to predict, for each customer_ID, the probability of a future payment default (target = 1)

**Note that the negative class has been subsampled for this dataset at 5%, and thus receives a 20x weighting in the scoring metric.**

# 1. Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgbm
from lightgbm import LGBMClassifier
import os
import warnings
import numpy as np
import pandas as pd
import gc 
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", None)


In [None]:
# Load the training statements and keep the last two rows of every customer.
# The index sort uses a stable algorithm (mergesort): the default quicksort gives no
# guarantee about the relative order of rows sharing the same customer_ID, so a later
# "keep the last row per customer" step could otherwise pick either of the two rows.
train_df = (
    pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/train.parquet')
    .groupby('customer_ID')
    .tail(2)
    .set_index('customer_ID', drop=True)
    .sort_index(kind='mergesort')
)

In [None]:
train_target=pd.read_csv("/kaggle/input/amex-default-prediction/train_labels.csv")

In [None]:
train_target.head()

In [None]:
train_df=pd.merge(train_df, train_target, on='customer_ID')

In [None]:
train_df.head()

In [None]:
train_df.shape

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
train_df.describe()

In [None]:
cat=['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

# 2. Data Cleaning

In [None]:
# Missing values
tmp = train_df.isna().sum().mul(100).div(len(train_df)).sort_values(ascending=False)
tmp[:15]

In [None]:
# dropping columns with missing values >70%
missingDF = pd.DataFrame(tmp).reset_index()
drop_cols = missingDF[missingDF[0]>70]["index"].values
print(drop_cols)

#### Dropping columns with more than 70% null values

In [None]:
# Remove the columns flagged as >70% missing.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this cell
# idempotent: re-running it after the columns are already gone no longer raises KeyError.
# `axis` is omitted because it is redundant when `columns=` is given.
train_df = train_df.drop(columns=drop_cols, errors="ignore")

In [None]:
train_df.head()

In [None]:
train_df.shape

In [None]:
# For categorical columns
cols = train_df.columns
num_cols = train_df._get_numeric_data().columns

In [None]:
cols

In [None]:
num_cols

In [None]:
cat

In [None]:
# Every column that is not in the declared categorical list.
numerical_columns = list(set(cols) - set(cat))
# Exclude the statement date and the customer identifier.
# Filtering the label list directly avoids materialising train_df[numerical_columns]
# (a full copy of the frame) only to read its column names.
filtered_numerical_columns = [
    column for column in numerical_columns if column not in {"S_2", "customer_ID"}
]

In [None]:
len(filtered_numerical_columns)

In [None]:
for i in cat:
    print(i + " Attribute is  of Data Type : "+ str(train_df[i].dtypes))

In [None]:
for i in cat:
    train_df[i] = train_df[i].astype("object")
    print(i + " Attribute is  of Data Type : "+ str(train_df[i].dtypes))

In [None]:
train_df[cat].nunique()

In [None]:
# dirtiness in categorical data
for col in cat:
    print('{} has {} values'.format(col,train_df[col].unique()))
    print("\n")


In [None]:
len(cat)

In [None]:
# One count plot per categorical feature, laid out on a 4x3 grid.
plt.figure(figsize=(20,20))

for i,feature in enumerate(cat):
    ax = plt.subplot(4,3,i+1)
    # Pass the column as `x=`: current seaborn releases accept only `data` positionally,
    # so a positional Series is no longer interpreted as the x variable.
    sns.countplot(x=train_df[feature], ax=ax)

In [None]:
train_df["target"].unique()

In [None]:
# Count plot of every categorical feature, split by the target class.
plt.figure(figsize=(20,20))

for i,feature in enumerate(cat):
    ax = plt.subplot(4,3,i+1)
    # Keyword `x=` is required: current seaborn releases accept only `data` positionally.
    sns.countplot(x=train_df[feature], hue=train_df['target'], ax=ax)

In [None]:
# Class balance of the target. The column is passed as `x=` because current seaborn
# releases accept only `data` positionally.
sns.countplot(x=train_df['target'])

Correlation between features

In [None]:
# For numeric columns filling null values
filtered_numerical_columns = train_df.select_dtypes(np.number).columns
train_df[filtered_numerical_columns] = train_df[filtered_numerical_columns].fillna(train_df[filtered_numerical_columns].mean())

In [None]:
train_df.isnull().sum()

In [None]:
for i in filtered_numerical_columns:
    print(i + " Attribute is  of Data Type : "+ str(train_df[i].dtypes))

In [None]:
train_df[filtered_numerical_columns][:5]

### Performing the Feature Encoding
Machine learning models can only work with numerical values. For this reason, it is necessary to transform the categorical values of the relevant features into numerical ones. This process is called feature encoding.

In [None]:
for col in cat:
    print('{} has {} categories'.format(col,train_df[col].nunique()))

In [None]:
train_df['S_2'] = pd.to_datetime(train_df['S_2'], errors='coerce')

In [None]:
# Handling date column

train_df["S_2_day"] =train_df["S_2"].dt.day
train_df["S_2_month"] = train_df["S_2"].dt.month
train_df["S_2_year"] = train_df["S_2"].dt.year

In [None]:
# drop S_2
train_df.drop(columns=["S_2"], axis=1, inplace=True)

In [None]:
train_df['customer_ID'].head()

In [None]:
# Keep a single row per customer (the last one in the frame) and drop the identifier,
# since customer_ID is not used as a model feature.
# drop_duplicates is used instead of groupby().nth(-1): from pandas 2.0 nth() acts as a
# filter and leaves customer_ID as an ordinary column, so reset_index(drop=True) would no
# longer remove it and the string identifier would end up in the feature matrix.
train_df = (
    train_df.drop_duplicates(subset="customer_ID", keep="last")
    .drop(columns="customer_ID")
    .reset_index(drop=True)
)

In [None]:
train_df.head()

### Label Encoding ---> Because each column has only a small number of categories
`LabelEncoder` maps each distinct label to an integer. It can be used to normalize labels and to transform non-numerical labels (as long as they are hashable and comparable) into numerical ones. Below, a label encoder is fitted on each categorical column.


In [None]:
 from sklearn.preprocessing import LabelEncoder

In [None]:
le=LabelEncoder()

In [None]:
# Encode every categorical column with its own LabelEncoder and keep the fitted encoders.
# Refitting one shared encoder per column would discard each column's mapping, making it
# impossible to apply the same encoding to any other data later on.
label_encoders = {}
for col in cat:
    column_encoder = LabelEncoder()
    train_df[col] = column_encoder.fit_transform(train_df[col])
    label_encoders[col] = column_encoder

In [None]:
train_df.head()

### Selecting important features
SelectKBest: Feature selection is a technique where we choose those features in our data that contribute most to the target variable. In other words, we choose the best predictors for the target variable. The classes in the `sklearn.feature_selection` module can be used for this; `SelectKBest` keeps the `k` highest-scoring features.

chi2: A chi-square (χ2) statistic is a test that measures how a model compares to actual observed data. The chi-square statistic compares the size of any discrepancies between the expected results and the actual results, given the size of the sample and the number of variables in the relationship. Note that the selection below scores features with `mutual_info_classif` rather than `chi2`.

In [None]:
from sklearn.feature_selection import SelectKBest

In [None]:
from sklearn.feature_selection import chi2,f_regression,mutual_info_classif

In [None]:
ind_col=[col for col in train_df.columns if col!='target']
dep_col='target'

### As the customer ID is not a useful feature, we will not use it for model building

In [None]:
X=train_df[ind_col]
y=train_df[dep_col]

In [None]:
X.head()

In [None]:
X.shape

In [None]:
from functools import partial

# Score features by mutual information with the target and keep the 100 best.
# mutual_info_classif is randomised; binding random_state makes the scores, and therefore
# the selected feature set, reproducible across runs.
imp_features = SelectKBest(
    score_func=partial(mutual_info_classif, random_state=0),
    k=100,
)

In [None]:
imp_features=imp_features.fit(X,y)

In [None]:
imp_features

In [None]:
imp_features.scores_

In [None]:
datascore=pd.DataFrame(imp_features.scores_,columns=['Score'])

In [None]:
datascore

In [None]:
X.columns

In [None]:
dfcols=pd.DataFrame(X.columns)

In [None]:
dfcols


In [None]:
features_rank=pd.concat([dfcols,datascore],axis=1)
features_rank

In [None]:
features_rank.columns=['features','score']

In [None]:
features_rank

In [None]:
features_rank.nlargest(100,'score')

In [None]:
selected=features_rank.nlargest(100,'score')['features'].values

In [None]:
selected

In [None]:
X_new=X[selected]

In [None]:
cols = X_new.columns.tolist()

In [None]:
X_new[cols]

In [None]:
print(X_new.shape,y.shape)

In [None]:
X_new.head(20)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X_new,y,random_state=0,test_size=0.3)

In [None]:
X_train.shape

In [None]:
y_train.value_counts() 

# XGBoost Classifier - For our Model
XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. It implements Machine Learning algorithms under the Gradient Boosting framework. It provides a parallel tree boosting to solve many data science problems in a fast and accurate way.

# Since we are using XGBoost (a tree-based model whose splits do not depend on feature scale), feature scaling is not required

In [None]:
from xgboost import XGBClassifier

In [None]:
# Hyper-parameter search space for the XGBoost classifier.
# The key must be 'learning_rate' (underscore): 'learning-rate' is not an XGBoost parameter,
# so the learning rate was never actually tuned. The original values `0, 0.5` look like a
# typo for `0.05`; a learning rate of 0 cannot learn anything.
params={'learning_rate':[0.05,0.20,0.25],
        'max_depth':[5,8,10],
        'min_child_weight':[1,3,5,7],
        'gamma':[0.0,0.1,0.2,0.4],
        'colsample_bytree':[0.3,0.4,0.7]}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
classifier=XGBClassifier()

In [None]:
# Randomised search over `params`: 5 sampled candidates, 5-fold CV, scored by ROC AUC.
# random_state fixes which candidates are sampled so the search result is reproducible.
random_search = RandomizedSearchCV(
    classifier,
    param_distributions=params,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=0,
)

In [None]:
random_search.fit(X_train,y_train)

In [None]:
random_search.best_estimator_ 

In [None]:
random_search.best_params_

In [None]:
# Build the final model from the hyper-parameters chosen by the randomised search instead
# of a hard-coded copy of an earlier estimator repr, which ignored the search result and
# carried version-specific constructor arguments. random_state keeps training reproducible.
classifier = XGBClassifier(random_state=0, **random_search.best_params_)

In [None]:
classifier.fit(X_train,y_train)

In [None]:
# https://www.kaggle.com/code/inversion/amex-competition-metric-python
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the competition score: the mean of two rank-based components.

    Args:
        y_true: frame with a ``target`` column (rows with target 0 are weighted 20, others 1).
        y_pred: frame with a ``prediction`` column; it is concatenated column-wise with
            ``y_true``, so both frames must share the same index.

    Returns:
        ``0.5 * (G + D)`` where ``G`` is the normalized weighted Gini and ``D`` is the share
        of all positives found among the top-ranked rows holding 4% of the total weight.
    """
    negative_weight = 20

    def _ranked_with_weights(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, rank by descending score and attach the row weights.
        ranked = pd.concat([truth, scores], axis='columns').sort_values(
            'prediction', ascending=False
        )
        ranked['weight'] = ranked['target'].eq(0) * (negative_weight - 1) + 1
        return ranked

    def _top_four_percent_captured(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _ranked_with_weights(truth, scores)
        weight_budget = int(0.04 * ranked['weight'].sum())
        inside_budget = ranked['weight'].cumsum() <= weight_budget
        is_positive = ranked['target'] == 1
        return (is_positive & inside_budget).sum() / is_positive.sum()

    def _weighted_gini(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _ranked_with_weights(truth, scores)
        weight = ranked['weight']
        weighted_positive = ranked['target'] * weight
        random_curve = (weight / weight.sum()).cumsum()
        lorentz_curve = weighted_positive.cumsum() / weighted_positive.sum()
        return ((lorentz_curve - random_curve) * weight).sum()

    # Normalise the Gini by the value obtained when the labels themselves are the scores.
    perfect_scores = y_true.rename(columns={'target': 'prediction'})
    gini = _weighted_gini(y_true, y_pred) / _weighted_gini(y_true, perfect_scores)
    captured = _top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini + captured)


# Let's evaluate our model's accuracy

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn import metrics

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
y_pred_prob = classifier.predict_proba(X_test)[:,1]

In [None]:
# Wrap labels and predictions in single-column frames for the metric functions.
# y_test still carries the shuffled row labels from train_test_split, while the prediction
# frames get a fresh 0..n-1 index. amex_metric joins its inputs with
# pd.concat(axis='columns'), which aligns on the index, so the labels are re-indexed to
# 0..n-1 as well; otherwise rows would be paired incorrectly or padded with NaN.
y_test = pd.DataFrame(y_test, columns=["target"]).reset_index(drop=True)
y_pred = pd.DataFrame(y_pred, columns=["prediction"])
y_pred_prob = pd.DataFrame(y_pred_prob, columns=["prediction"])

In [None]:
print('MAE:',metrics.mean_absolute_error(y_test,y_pred))
print('MSE:',metrics.mean_squared_error(y_test,y_pred))
print("RMSE:",np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
precision=tp/(tp+fp)
recall=tp/(tp+fn)
print("Precision : ",precision)
print("Recall : ",recall)
f1score=(2*precision*recall)/(precision+recall)
print("F1 score: ",f1score)

#  As the dataset is **imbalanced**, accuracy cannot be used as the metric
### The F1 score is our model metric. We got a very good F1 score using XGBoost: 79.9%<br>

# Official Metric 

In [None]:
# # computing metric score
amex_metric(y_test, y_pred_prob)

# Test Results

In [None]:
# Load the test statements and keep only the LAST row of every customer.
# With tail(2), the later de-duplication on customer_ID keeps the first of the two rows,
# i.e. the second-to-last statement, whereas the training frame keeps the last one.
# NOTE(review): assumes rows are ordered by statement date within each customer — TODO confirm.
test_df = (
    pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/test.parquet')
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
)
test_df.head()

In [None]:
test_df.shape

In [None]:
# `selected` can contain the date parts engineered on the training frame
# (S_2_day / S_2_month / S_2_year). The raw test frame does not have them, so they are
# derived here first; otherwise the column lookup below raises KeyError.
test_statement_date = pd.to_datetime(test_df["S_2"], errors="coerce")
test_features = test_df.assign(
    S_2_day=test_statement_date.dt.day,
    S_2_month=test_statement_date.dt.month,
    S_2_year=test_statement_date.dt.year,
)
# NOTE(review): the training frame was also mean-imputed (numeric columns) and
# label-encoded (categorical columns); the test frame receives neither transform here.
# TODO: apply the same transforms before predicting.
final_test = test_features[selected]
final_test.head()

In [None]:
final_test.shape

In [None]:
final_test.head()

In [None]:
sel_test=[col for col in final_test.columns if col!='customer_ID']
len(sel_test)

In [None]:
df_final=final_test.copy()

In [None]:
df_final.shape

In [None]:
df_final=df_final.reset_index()

In [None]:
df_final.head()

In [None]:
df_final.drop_duplicates("customer_ID",inplace=True)

In [None]:
df_final.shape

In [None]:
test_data=df_final[sel_test]
test_data.shape

In [None]:
# Score the test customers with the model trained above. `xgb_classifier` is only created
# in a later cell (after reloading the saved model), so using it here fails on a fresh
# Run All. The task asks for the probability of default, so the positive-class probability
# is stored rather than the hard 0/1 label returned by predict().
df_final["prediction"] = classifier.predict_proba(test_data)[:, 1]

In [None]:
df_final.drop(selected,inplace=True,axis=1)
df_final.head()

In [None]:
df_final=df_final.reset_index()

In [None]:
df_final.drop("index",inplace=True,axis=1)

In [None]:
df_final.head()

In [None]:
df_final.shape

In [None]:
df_final.head()

In [None]:
df_final.index.nunique()

In [None]:
df_final.shape

In [None]:
df_final.to_csv("Submission_v4.csv",index=False)

# Saving The Model

In [None]:

import joblib
joblib.dump(classifier, "xgboost_classifier_v2.h5")

In [None]:
# load the model
import joblib
xgb_classifier = joblib.load("./xgboost_classifier_v2.h5")

In [None]:
y_pred1=xgb_classifier.predict(X_test)
metrics.r2_score(y_test,y_pred1)