# About

- This is my code for `Credit Card Lead Prediction` at Analytics Vidhya JOB-A-THON May 2021.

- My final approach is listed under `Iteration 3` in the `Experimenting and Improving` section.

# Table of Contents

- [1. Imports](#1)


- [2. Import Data](#2)


- [3. Baselines - fill Credit_Product with 'No', convert to numerical data, use default RF classifier, XGB classifier, XGBRF classifier, LightGBM](#3)
   
   
- [4. Experimenting and Improving](#4)

<a name='1'></a>
# 1. Imports

In [25]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import sklearn
import category_encoders as ce
import warnings

from matplotlib import pyplot as plt

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import KNNImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from lightgbm import LGBMClassifier 

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

plt.rcParams['figure.dpi'] = 200
plt.rcParams['axes.spines.top'] = False
plt.rcParams['axes.spines.right'] = False
sns.set_palette('Set2')
%matplotlib inline

warnings.filterwarnings('ignore')

# Single seed for the notebook: NumPy's global RNG is seeded from the same
# value that the tuned estimators receive as `random_state`, so the two
# cannot drift apart.
random_seed = 42
np.random.seed(random_seed)

<a name='2'></a>
# 2. Import Data

In [26]:
# Training set used by the Section 3 baselines.
train = pd.read_csv('./data/train_s3TEQDk.csv')
# Training set with a `kfold` column, used throughout Section 4.
train_folds = pd.read_csv('./data/train_kfolds.csv') # Written by the "Creating Folds for Cross Validation" cell in Section 4
# Rows to score; every model's predict_proba is run on this frame.
test = pd.read_csv('./data/test_mSzZ8RL.csv')
# Submission template: its 'Is_Lead' column is overwritten with predictions before each to_csv.
sample_sub = pd.read_csv('./data/sample_submission_eyYijxG.csv')

<a name='3'></a>
# 3. Baselines - fill Credit_Product with 'No', convert to numerical data, use default RF classifier, XGB classifier, XGBRF classifier, LightGBM

## 3.1. Filling Missing Values

In [14]:
# Baseline treatment of missing Credit_Product: assume 'No'.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas and no
# longer updates the parent frame once copy-on-write is enabled.
train['Credit_Product'] = train['Credit_Product'].fillna('No')
test['Credit_Product'] = test['Credit_Product'].fillna('No')

## 3.2. Label Encoding

In [17]:
le = LabelEncoder()

In [18]:
# String-valued columns that the baseline models need as integer codes.
cat_cols = ['Gender', 'Region_Code', 'Occupation', 'Channel_Code', 'Credit_Product', 'Is_Active']

# The single encoder is refit for every column: learn the mapping on train,
# then apply that same mapping to both train and test.
for col in cat_cols:
    le.fit(train[col].values.ravel())
    train[col] = le.transform(train[col].values.ravel())
    test[col] = le.transform(test[col].values.ravel())

## 3.3. Modelling 

### Random Forest

In [67]:
rfclf = RandomForestClassifier()

In [None]:
rfclf.fit(train.iloc[:, 1:-1], train['Is_Lead'].values)

In [None]:
preds = rfclf.predict_proba(test.iloc[:, 1:])[:, 1]

sample_sub['Is_Lead'] = preds
sample_sub.to_csv('./output/baseline_rf.csv', index=False)

### XGB, XGBRF Classifiers

In [29]:
clf1 = XGBClassifier(use_label_encoder=False)
clf2 = XGBRFClassifier(use_label_encoder=False)

In [31]:
clf1.fit(train.iloc[:, 1:-1], train['Is_Lead'].values)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [33]:
preds = clf1.predict_proba(test.iloc[:, 1:])[:, 1]
sample_sub['Is_Lead'] = preds
sample_sub.to_csv('./output/baseline_xgb.csv', index=False)



In [32]:
clf2.fit(train.iloc[:, 1:-1], train['Is_Lead'].values)



XGBRFClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain',
                interaction_constraints='', max_delta_step=0, max_depth=6,
                min_child_weight=1, missing=nan, monotone_constraints='()',
                n_estimators=100, n_jobs=8, num_parallel_tree=100,
                objective='binary:logistic', random_state=0, reg_alpha=0,
                scale_pos_weight=1, tree_method='exact',
                use_label_encoder=False, validate_parameters=1, verbosity=None)

In [35]:
preds = clf2.predict_proba(test.iloc[:, 1:])[:, 1]
sample_sub['Is_Lead'] = preds
sample_sub.to_csv('./output/baseline_xgbrf.csv', index=False)



### LightGBM

In [23]:
lgclf = LGBMClassifier()

In [24]:
lgclf.fit(train.iloc[:, 1:-1], train['Is_Lead'].values)

LGBMClassifier()

In [25]:
preds = lgclf.predict_proba(test.iloc[:, 1:])[:, 1]
sample_sub['Is_Lead'] = preds
sample_sub.to_csv('./output/baseline_lgbm.csv', index=False)

# Conclusion

LightGBM and XGBoost worked best, so I decided to use them for further modelling.

<a name='4'></a>
# 4. Experimenting and Improving

## Creating Folds for Cross Validation

In [3]:
# Uncomment and run(Needs to be run only once)
# X = train.copy()

# X['kfold'] = -1
# X = X.sample(frac=1).reset_index(drop=True)
# y = X['Is_Lead'].copy()  # taken AFTER the shuffle so the labels stay row-aligned with X for stratification

# kf = StratifiedKFold(n_splits=5)

# for fold, (t_, v_) in enumerate(kf.split(X = X, y = y)):
#     X.loc[v_, 'kfold'] = fold
    
# X.to_csv('./data/train_kfolds.csv', index=False)

## Iteration 1 - Impute 'Credit_Product' using KNN Imputer, ordinal encode 'Occupation' column, label encode rest, use base LGBM, XGB

In [64]:
le = LabelEncoder()

In [65]:
for col in ['Gender', 'Channel_Code', 'Is_Active', 'Region_Code']:
    train_folds[col] = le.fit_transform(train_folds[col].values.ravel())
    test[col] = le.transform(test[col].values.ravel())

In [8]:
oe = OrdinalEncoder()

In [9]:
train_folds['Occupation'] = oe.fit_transform(train_folds[['Occupation']])
test['Occupation'] = oe.transform(test[['Occupation']])

In [10]:
# Map the Yes/No flag to 1/0. Only these two labels are replaced, so missing
# values are left as they are for the KNN imputer that follows.
credit_col = {'Yes': 1, 'No': 0}

# Assign the result back rather than calling replace(inplace=True) on a column
# selection: the chained in-place form is deprecated in pandas and does not
# update the parent frame under copy-on-write.
train_folds['Credit_Product'] = train_folds['Credit_Product'].replace(credit_col)
test['Credit_Product'] = test['Credit_Product'].replace(credit_col)

In [12]:
imputer = KNNImputer(n_neighbors=1)

In [13]:
train_folds.iloc[:, 1:-2] = imputer.fit_transform(train_folds.iloc[:, 1:-2])
test.iloc[:, 1:] = imputer.transform(test.iloc[:, 1:])

### Base LightGBM

In [31]:
lgclf = LGBMClassifier()

In [33]:
lgclf.fit(train_folds.iloc[:, 1:-2], train_folds['Is_Lead'].values)

LGBMClassifier()

In [35]:
preds = lgclf.predict_proba(test.iloc[:, 1:])[:, 1]
sample_sub['Is_Lead'] = preds
sample_sub.to_csv('./output/base_lgbm.csv', index=False)

### Base XGB

In [14]:
xgb = XGBClassifier(use_label_encoder = False)

In [15]:
xgb.fit(train_folds.iloc[:, 1:-2], train_folds['Is_Lead'].values)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [16]:
preds = xgb.predict_proba(test.iloc[:, 1:])[:, 1]
sample_sub['Is_Lead'] = preds
sample_sub.to_csv('./output/base_xgb.csv', index=False)



## Iteration 2 - Target encode all categorical variables, knn impute 'Credit_Product' and use base LGBM

In [61]:
te = ce.TargetEncoder(cols=cat_cols)

In [62]:
# Fit the target encoder on the training rows and reuse it for test.
# The target is read from train_folds itself: `X` exists only if the
# commented-out fold-creation cell was run in the same session, so referring to
# it raises NameError in a fresh kernel, and train_folds is by construction
# row-aligned with the features being encoded.
# NOTE(review): cat_cols as defined in Section 3.2 includes 'Credit_Product',
# which is target-encoded again after imputation further down; confirm whether
# it should be excluded from this first pass.
train_folds[cat_cols] = te.fit_transform(train_folds[cat_cols], y=train_folds['Is_Lead'])
test[cat_cols] = te.transform(test[cat_cols])

  elif pd.api.types.is_categorical(cols):


In [69]:
# Map the Yes/No flag to 1/0 so the column is numeric for the KNN imputer
# that follows; anything other than these two labels is left untouched.
credit_col = {'Yes': 1, 'No': 0}

# Assign the result back rather than calling replace(inplace=True) on a column
# selection: the chained in-place form is deprecated in pandas and does not
# update the parent frame under copy-on-write.
train_folds['Credit_Product'] = train_folds['Credit_Product'].replace(credit_col)
test['Credit_Product'] = test['Credit_Product'].replace(credit_col)

In [70]:
imputer = KNNImputer(n_neighbors=1)

In [71]:
train_folds.iloc[:, 1:-2] = imputer.fit_transform(train_folds.iloc[:, 1:-2])
test.iloc[:, 1:] = imputer.transform(test.iloc[:, 1:])

In [72]:
# Turn the imputed 1/0 flag back into Yes/No labels so the column can be
# target-encoded as a category in the next step.
credit_col_rev = {1: 'Yes', 0: 'No'}

# Assign the result back rather than calling replace(inplace=True) on a column
# selection: the chained in-place form is deprecated in pandas and does not
# update the parent frame under copy-on-write.
train_folds['Credit_Product'] = train_folds['Credit_Product'].replace(credit_col_rev)
test['Credit_Product'] = test['Credit_Product'].replace(credit_col_rev)

In [73]:
te1 = ce.TargetEncoder(cols=['Credit_Product'])

In [74]:
# Target-encode the imputed Credit_Product column: fit on train, reuse on test.
# The target is read from train_folds itself: `X` exists only if the
# commented-out fold-creation cell was run in the same session, so referring to
# it raises NameError in a fresh kernel.
train_folds['Credit_Product'] = te1.fit_transform(train_folds['Credit_Product'], y=train_folds['Is_Lead'])
test['Credit_Product'] = te1.transform(test['Credit_Product'])

  elif pd.api.types.is_categorical(cols):


In [76]:
lgclf = LGBMClassifier()

In [77]:
lgclf.fit(train_folds.iloc[:, 1:-2], train_folds['Is_Lead'].values)

LGBMClassifier()

In [78]:
preds = lgclf.predict_proba(test.iloc[:, 1:])[:, 1]
sample_sub['Is_Lead'] = preds
sample_sub.to_csv('./output/base_lgbm_target_encoding_all.csv', index=False)

## Iteration 3 - Label encode all categorical variables, don't impute null values, use LGBM with tuning, XGB with tuning

In [27]:
le = LabelEncoder()

cat_cols = ['Gender', 'Region_Code', 'Occupation', 'Channel_Code', 'Is_Active', 'Credit_Product']

# Label-encode every categorical column except Credit_Product, which gets an
# explicit Yes/No -> 1/0 mapping in the following cell.
# The encoder is refit per column on train and the same mapping is applied to test.
for col in cat_cols:
    if col == 'Credit_Product':
        continue
    le.fit(train_folds[col].values.ravel())
    train_folds[col] = le.transform(train_folds[col].values.ravel())
    test[col] = le.transform(test[col].values.ravel())

In [28]:
# Map the Yes/No flag to 1/0. Only these two labels are replaced, so missing
# values are kept as they are (this iteration does not impute).
credit_col = {'Yes': 1, 'No': 0}

# Assign the result back rather than calling replace(inplace=True) on a column
# selection: the chained in-place form is deprecated in pandas and does not
# update the parent frame under copy-on-write.
train_folds['Credit_Product'] = train_folds['Credit_Product'].replace(credit_col)
test['Credit_Product'] = test['Credit_Product'].replace(credit_col)

In [29]:
# Validation fold -> the folds to train on (every fold except the held-out one).
FOLD_MAPPING = {
    held_out: [fold for fold in range(5) if fold != held_out]
    for held_out in range(5)
}

In [30]:
def train_kfolds(model, data, FOLD_MAPPING, metric):
    """Cross-validate `model` on the pre-assigned folds in `data` and print the scores.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``. It is refit
        on every fold, so on return it is fitted on the last fold's training
        rows only.
    data
        DataFrame with ``ID``, ``kfold`` and ``Is_Lead`` columns. Every other
        column is used as a feature.
    FOLD_MAPPING
        Dict mapping each validation fold id to the list of fold ids to train on.
    metric
        Callable ``metric(y_true, y_score)`` returning a number. It is applied
        to column 1 of ``predict_proba`` for the validation rows.

    Returns
    -------
    None
        The per-fold values and their mean are printed.
    """
    non_feature_cols = ['ID', 'kfold', 'Is_Lead']
    scores = []

    # Iterate the mapping itself so fold ids need not be exactly 0..n-1.
    for fold, train_fold_ids in FOLD_MAPPING.items():
        train_part = data[data['kfold'].isin(train_fold_ids)].reset_index(drop=True)
        val_part = data[data['kfold'] == fold].reset_index(drop=True)

        model.fit(train_part.drop(non_feature_cols, axis=1), train_part['Is_Lead'].values)
        val_preds = model.predict_proba(val_part.drop(non_feature_cols, axis=1))[:, 1]

        # Use the caller-supplied metric; it was previously accepted but ignored.
        score = metric(val_part['Is_Lead'].values, val_preds)
        print(f'Fold :{fold}, Metric Value: {score}')
        scores.append(score)

    print(f'Average Metric Value: {np.mean(scores)}')
    

### LGBM

In [62]:
# Tuned LightGBM settings. The regularisation, tree-count, depth, child-weight
# and learning-rate values are the best_params_ recorded in the
# "HyperParameter Tuning" section; objective, num_leaves, seed and n_jobs are
# set by hand.
lgbm_params = {
    'objective': 'binary',
    'learning_rate': 0.02,
    'n_estimators': 600,
    'max_depth': 5,
    'num_leaves': 30,
    'min_child_weight': 1,
    'reg_alpha': 1,
    'reg_lambda': 1,
    'random_state': random_seed,
    'n_jobs': -1,
}
lgbclf = LGBMClassifier(**lgbm_params)

In [8]:
train_kfolds(lgbclf, train_folds, FOLD_MAPPING, roc_auc_score)

Fold :0, Metric Value: 0.8776186871673771
Fold :1, Metric Value: 0.8731639054326303
Fold :2, Metric Value: 0.8721196039742922
Fold :3, Metric Value: 0.8715981327187601
Fold :4, Metric Value: 0.8754079573408753
Average Metric Value: 0.873981657326787


In [9]:
lgbclf.fit(train_folds.drop(['ID', 'kfold', 'Is_Lead'], axis=1), train_folds['Is_Lead'].values)

LGBMClassifier(learning_rate=0.02, max_depth=5, min_child_weight=1,
               n_estimators=600, num_leaves=30, objective='binary',
               random_state=42, reg_alpha=1, reg_lambda=1)

In [10]:
preds = lgbclf.predict_proba(test.drop('ID', axis=1))[:, 1]
sample_sub['Is_Lead'] = preds
sample_sub.to_csv('./output/tuned_lgbm_all_label_encode_no_impute.csv', index=False)

### XGB

In [61]:
# Tuned XGBoost settings. The subsample, tree-count, child-weight, depth,
# learning-rate, gamma and colsample_bytree values are the best_params_
# recorded in the "HyperParameter Tuning" section; the rest are set by hand.
xgb_params = {
    'learning_rate': 0.1,
    'n_estimators': 300,
    'max_depth': 5,
    'min_child_weight': 10,
    'gamma': 5,
    'subsample': 1.0,
    'colsample_bytree': 0.6,
    'eval_metric': 'logloss',
    'use_label_encoder': False,
    'random_state': random_seed,
    'n_jobs': -1,
}
xgb = XGBClassifier(**xgb_params)

In [12]:
train_kfolds(xgb, train_folds, FOLD_MAPPING, roc_auc_score)

Fold :0, Metric Value: 0.8780586023582548
Fold :1, Metric Value: 0.8730653788965108
Fold :2, Metric Value: 0.8726760776060061
Fold :3, Metric Value: 0.8718272355297895
Fold :4, Metric Value: 0.8750544286829585
Average Metric Value: 0.8741363446147039


In [13]:
xgb.fit(train_folds.drop(['ID', 'kfold', 'Is_Lead'], axis=1), train_folds['Is_Lead'].values)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, eval_metric='logloss',
              gamma=5, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=5, min_child_weight=10, missing=nan,
              monotone_constraints='()', n_estimators=300, n_jobs=-1,
              num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1.0, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [14]:
preds = xgb.predict_proba(test.drop('ID', axis=1))[:, 1]
sample_sub['Is_Lead'] = preds
sample_sub.to_csv('./output/tuned_xgb_all_label_encode_no_impute.csv', index=False)

### Blending Predictions

In [15]:
# Score the test rows (without the ID column) with both tuned models and keep
# column 1 of each predict_proba output side by side for blending.
test_features = test.drop('ID', axis=1)
xgb_preds, lgb_preds = (
    fitted.predict_proba(test_features)[:, 1] for fitted in (xgb, lgbclf)
)
combined_preds = pd.DataFrame({'xgb': xgb_preds, 'lgb': lgb_preds})

In [16]:
combined_preds['avg'] = (combined_preds['xgb'] + combined_preds['lgb']) / 2
combined_preds.head()

Unnamed: 0,xgb,lgb,avg
0,0.046443,0.044297,0.04537
1,0.85388,0.877629,0.865754
2,0.065698,0.078616,0.072157
3,0.025327,0.021346,0.023337
4,0.023241,0.02232,0.02278


In [17]:
sample_sub['Is_Lead'] = combined_preds['avg']
sample_sub.to_csv('./output/combined_xgb_lgb_noimpute_tuned.csv', index=False)

### HyperParameter Tuning

In [28]:
# XGB hyperparameter tuning

# skf = StratifiedKFold(n_splits=5, shuffle = False) 

# params = {
#         'learning_rate': [0.01, 0.1, 0.2],
#         'min_child_weight': [1, 5, 10],
#         'gamma': [0.5, 1, 1.5, 2, 5],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'max_depth': [3, 4, 5],
#         'n_estimators': [100, 300, 500]
#         }



# random_search = RandomizedSearchCV(xgb,
#                                    param_distributions=params,
#                                    n_iter=10,
#                                    scoring='roc_auc',
#                                    n_jobs=-1,
#                                    cv=skf.split(train_folds.drop(['ID', 'kfold', 'Is_Lead'], axis=1), train_folds['Is_Lead'].values),
#                                    verbose=3,
#                                    random_state=random_seed)





# random_search.fit(train_folds.drop(['ID', 'kfold', 'Is_Lead'], axis=1), train_folds['Is_Lead'].values)


# random_search.best_params_

# {'subsample': 1.0,
#  'n_estimators': 300,
#  'min_child_weight': 10,
#  'max_depth': 5,
#  'learning_rate': 0.1,
#  'gamma': 5,
#  'colsample_bytree': 0.6}


In [29]:
# LGBM Tuning
# param_test = {'learning_rate' : [0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1, 0.2, 0.3, 0.4],
#               'n_estimators' : [100, 200, 300, 400, 500, 600, 800, 1000, 1500, 2000],
#               'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
#               'max_depth': [-1, 1, 2, 3, 4, 5, 6, 7],
#               'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
#               'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

# skf = StratifiedKFold(n_splits=5, shuffle = False)


# random_search = RandomizedSearchCV(
#                         estimator = lgbclf, param_distributions = param_test, 
#                         n_iter = 50,
#                         scoring = 'roc_auc',
#                         cv = skf.split(train_folds.drop(['ID', 'kfold', 'Is_Lead'], axis=1),train_folds['Is_Lead'].values),
#                         refit = True,
#                         random_state = random_seed,
#                         verbose = True,
#                         n_jobs=-1)

# random_search.fit(train_folds.drop(['ID', 'kfold', 'Is_Lead'], axis=1), train_folds['Is_Lead'].values)

# random_search.best_params_


# {'reg_lambda': 1,
#  'reg_alpha': 1,
#  'n_estimators': 600,
#  'min_child_weight': 1,
#  'max_depth': 5,
#  'learning_rate': 0.02}