# <font color="Green">Credit Card Leads Prediction 💳💳💳💳 </font> 
### A JOB-A-THON conducted by Analytics Vidhya
##### The problem statement is to predict credit card leads using variables such as Age, Occupation, Avg_Account_Balance, etc.
##### It is a binary classification problem.
##### We are going to look at <font color ="Blue">EDA</font>,<font color ="Blue"> Data processing</font> and<font color ="Blue"> Data Modeling</font>, and at selecting the <font color ="Red"> <b>Optimal Threshold</b></font> for the optimal split.
![](https://news.mit.edu/sites/default/files/styles/news_article__image_gallery/public/images/201809/MIT-Fraud-Detection-PRESS_0.jpg?itok=n9A9HHwh)

# Reading data and Descriptive statistics

In [None]:
import pandas as pd
import seaborn as sns
import re
sns.set()
import numpy as np
import matplotlib.pyplot as plt 
import matplotlib.ticker as ticker
plt.rc("font", size=14)
import warnings
warnings.simplefilter(action='ignore')
from sklearn.metrics import classification_report
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, roc_auc_score, precision_score
from sklearn.metrics import roc_curve
from statsmodels.tools import add_constant
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc, log_loss
import statsmodels.api as sn
import statsmodels.discrete.discrete_model as sm

In [None]:
# Load the competition files in one pass: the labelled training data, the
# unlabelled test data and the sample submission template.
train, test, submit = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (
        r'../input/jobathon-may-2021-credit-card-lead-prediction/train.csv',
        r'../input/jobathon-may-2021-credit-card-lead-prediction/test.csv',
        r'../input/jobathon-may-2021-credit-card-lead-prediction/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Column dtypes and non-null counts; used to spot columns with missing values.
train.info()

* It can be observed that only 'Credit_Product' has missing values.

In [None]:
# Descriptive statistics of the training data.
train.describe()

# Visualizations

In [None]:
# 'Credit_Product' is the only column with missing values, so it is inspected
# first: number of rows per 'Credit_Product' level, split by the target.
sns.set_style('ticks')

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
sns.countplot(
    x='Credit_Product',
    hue='Is_Lead',
    data=train,
    palette='CMRmap',
    ax=ax,
)
ax.set_title('Credit_Product - Is_Lead Plot', size=25, loc='Left', y=1.04)

sns.despine()
plt.show()

In [None]:
# Count plots of every categorical feature, split by the target 'Is_Lead'.
# Missing values are replaced by the label 'Missing' for plotting only (`train`
# itself is not modified), so that the missing 'Credit_Product' entries get
# their own bar.
cat_features = ['Gender','Region_Code','Occupation','Channel_Code','Credit_Product','Is_Active']

plt.figure(figsize=(16, 14))
sns.set(font_scale= 1.2)
sns.set_style('ticks')

# Fill once, outside the loop: the filled frame is identical for every feature,
# so recomputing it per subplot only wastes time and memory.
plot_data = train.fillna('Missing')

# 3 x 2 grid, one panel per categorical feature (subplot indices start at 1).
for position, feature in enumerate(cat_features, start=1):
    plt.subplot(3, 2, position)
    sns.countplot(data=plot_data, x=feature, hue='Is_Lead', palette='summer')

sns.despine()

* It is surprising to observe that the rows with a missing 'Credit_Product' actually have more leads.
* Thus we shall impute the missing values as 'Missing' instead of 'Yes' or 'No'

In [None]:
# Region codes have the form 'RG<number>'. Order them by their numeric part so
# that the bars appear in numeric order rather than first-seen order.
# The original labels are sorted directly (instead of being rebuilt from the
# parsed integers), so every entry of `order` is guaranteed to match a value
# that actually occurs in the column.
seq = list(train['Region_Code'].unique())
seq_list = sorted(seq, key=lambda code: int(code[2:]))

fig, ax = plt.subplots(1, 1, figsize=(20, 15))
# Passing only `y=` already produces horizontal bars, so the contradictory
# orient='v' argument is not passed.
sns.countplot(data=train, y='Region_Code', hue='Is_Lead', ax=ax, palette='CMRmap', order=seq_list)
ax.set_title('Region_Code - Is_Lead Plot', size=25, loc='Left', y=1.04)

sns.despine()
plt.show()

* Not much information is gained from this variable.

In [None]:
# Age can be divided into 15-year bands; compare the lead counts per band.
# The banding is applied to a copy so that `train` keeps the raw ages.
plt.figure(figsize=(16, 7))
temp = train.assign(Age=pd.cut(train['Age'], bins=[20, 35, 50, 65, 80, 95]))

sns.countplot(x='Age', hue='Is_Lead', data=temp, palette='autumn')

plt.show()

* We shall thus convert the age to a categorical variable later.

In [None]:
# Plot the numerical variables, coloured by the target, to look at their
# distributions.
numerical = ['Age', 'Vintage', 'Avg_Account_Balance']
sns.pairplot(
    data=train,
    x_vars=numerical,
    hue='Is_Lead',
    palette='Set1',
)

In [None]:
# Same pair plot after a natural-log transform of the numerical variables.
# The transform is applied to a copy; `train` itself stays untouched.
temp = train.assign(**{column: np.log(train[column]) for column in numerical})
sns.pairplot(
    data=temp,
    x_vars=numerical,
    hue='Is_Lead',
    palette='Set1',
)

* We shall proceed with taking the log transform of 'Avg_Account_Balance' to obtain an approximately normal distribution.

# Data Preprocessing

In [None]:
# Stack train on top of test so that both receive exactly the same
# preprocessing. The original row indices are kept as they are.
df = pd.concat(objs=(train, test), axis='index')

In [None]:
# Column dtypes and non-null counts of the combined train + test frame.
df.info()

In [None]:
# Clean and transform the combined frame in a single step:
#   * Credit_Product      - missing values become their own 'Missing' category.
#   * Is_Lead             - missing values are filled with 0 (presumably the
#                           test rows, which carry no label - TODO confirm).
#   * Avg_Account_Balance - replaced by its natural logarithm.
#   * Age                 - cut into the bands 20-35, 35-50, 50-65, 65-80 and
#                           80-95, stored with object dtype.
# NOTE(review): pd.cut returns NaN for ages outside (20, 95] - confirm that all
# ages in the data fall inside that range.
df = df.assign(
    Credit_Product=df['Credit_Product'].fillna('Missing'),
    Is_Lead=df['Is_Lead'].fillna(0),
    Avg_Account_Balance=np.log(df['Avg_Account_Balance']),
    Age=pd.cut(df['Age'], bins=[20, 35, 50, 65, 80, 95]).astype('O'),
)

In [None]:
# Collapse the region codes into coarse groups: keep the numeric part of the
# code and integer-divide it by 10 (per the original author's note this yields
# the four categories 25, 26, 27 and 28).
# The pattern is a raw string so that `\d` is not parsed as an (invalid) string
# escape, and expand=False makes str.extract return a Series instead of a
# one-column DataFrame, which is the natural value for a single-column
# assignment.
df["Region_Code"] = df["Region_Code"].str.extract(r'(\d+)', expand=False).astype(int) // 10
# Stored as object dtype so the groups are treated as categories, not numbers.
df["Region_Code"] = df["Region_Code"].astype('O')

In [None]:
# One-hot encoding (creating dummy features). drop_first=True drops one level
# per feature so that the dummy columns of a feature are not perfectly
# collinear.
# The dummy dtype is pinned to uint8: pandas >= 2.0 emits boolean dummies by
# default, and a frame that mixes bool and numeric columns is converted to an
# object array, which the statsmodels Logit fit on these features rejects.
df = pd.get_dummies(
    df,
    columns=[
        'Age',
        'Gender',
        'Region_Code',
        'Occupation',
        'Channel_Code',
        'Credit_Product',
        'Is_Active',
    ],
    drop_first=True,
    dtype=np.uint8,
)

In [None]:
# 'ID' is not an informative feature, so the column is removed.
df = df.drop(columns=['ID'])

In [None]:
# Undo the earlier concatenation by row position: the first len(train) rows
# are the training data, everything after them is the test data.
new_train, new_test = df.iloc[:len(train)], df.iloc[len(train):]

In [None]:
# Sanity check: shapes of the processed train and test frames.
new_train.shape,new_test.shape

In [None]:
# Separate the target from the features, then hold out 25% of the rows
# (75:25 split, fixed seed for reproducibility).
y = new_train['Is_Lead']
X = new_train.drop(columns='Is_Lead')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2519,
)

In [None]:
# Fit a statsmodels logistic regression on the not-yet-scaled training
# features to inspect the p-values of the individual features.
# An explicit intercept column is added first via add_constant.
x_cons = sn.add_constant(X_train)
logit = sm.Logit(endog=y_train, exog=x_cons).fit()
logit.summary()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standard scaling: the scaler is fitted on the training split only and the
# same fitted transformation is then applied to the hold-out split and to the
# competition test set (after removing its 'Is_Lead' column).
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
new_test = scaler.transform(new_test.drop(columns='Is_Lead'))

# Model Fitting

### Logistic Regression

In [None]:
from sklearn.metrics import precision_recall_curve

# Fit a logistic regression with default settings and score the hold-out split.
clf_lr = LogisticRegression()
clf_lr.fit(X_train, y_train)
y_prob = clf_lr.predict_proba(X_test)

# The second column of predict_proba is the probability of class 1.
precision, recall, thresholds = precision_recall_curve(y_test, y_prob[:, 1])

# Precision and recall as functions of the decision threshold. Their last
# element is dropped before plotting against `thresholds`.
plt.title("Precision-Recall vs Threshold Chart")
for values, line_style, name in ((precision, "b--", "Precision"), (recall, "r--", "Recall")):
    plt.plot(thresholds, values[:-1], line_style, label=name)
plt.ylabel("Precision, Recall")
plt.xlabel("Threshold")
plt.legend(loc="lower left")
plt.ylim([0,1])

In [None]:
from numpy import argmax

### Finding threshold using roc curve

In [None]:
# ROC analysis of the logistic regression on the hold-out split.
# The second column of y_prob is the predicted probability of class 1.
positive_scores = y_prob[:, 1]

# Area under the ROC curve.
auc_score1 = roc_auc_score(y_test, positive_scores)
print('Roc_Auc=',auc_score1)

# A single roc_curve call serves both the plot and the threshold search
# (the curve was previously computed twice with identical inputs).
fpr, tpr, thresholds = roc_curve(y_test, positive_scores, pos_label=1)

# Best threshold: the ROC point that maximises tpr - fpr (Youden's J).
J = tpr - fpr
ix = np.argmax(J)
best_thresh = thresholds[ix]

# Plot the model's ROC curve together with the chance-level diagonal
# (tpr == fpr), drawn directly instead of via roc_curve on constant scores.
plt.plot(fpr, tpr, linestyle='--', color='orange', label='Logistic Regression')
plt.plot([0, 1], [0, 1], linestyle='--', color='blue', label='Chance (TPR = FPR)')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')

plt.legend(loc='best')
# Model-specific file name: every model used to save to the same 'ROC' file,
# so each figure silently overwrote the previous one.
plt.savefig('ROC_logistic_regression', dpi=400)
plt.show()

print('The Best Threshold for Logistic model is :%f ' % (best_thresh))

### Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with fixed hyper-parameters and a fixed seed.
rf_clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=9,
    max_features=8,
    random_state=2519,
)

In [None]:
# Fit the random forest and score the hold-out split.
cvrf_clf = rf_clf.fit(X_train, y_train)
y_rf = cvrf_clf.predict_proba(X_test)

# The second column of predict_proba is the probability of class 1.
precision, recall, thresholds = precision_recall_curve(y_test, y_rf[:, 1])

# Precision and recall as functions of the decision threshold. Their last
# element is dropped before plotting against `thresholds`.
plt.title("Precision-Recall vs Threshold Chart")
for values, line_style, name in ((precision, "b--", "Precision"), (recall, "r--", "Recall")):
    plt.plot(thresholds, values[:-1], line_style, label=name)
plt.ylabel("Precision, Recall")
plt.xlabel("Threshold")
plt.legend(loc="lower left")
plt.ylim([0,1])

In [None]:
# ROC analysis of the random forest on the hold-out split.
# The second column of y_rf is the predicted probability of class 1.
positive_scores = y_rf[:, 1]

# Area under the ROC curve.
auc_score1 = roc_auc_score(y_test, positive_scores)
print('Roc_Auc=',auc_score1)

# A single roc_curve call serves both the plot and the threshold search
# (the curve was previously computed twice with identical inputs).
fpr, tpr, thresholds = roc_curve(y_test, positive_scores, pos_label=1)

# Best threshold: the ROC point that maximises tpr - fpr (Youden's J).
J = tpr - fpr
ix = np.argmax(J)
best_thresh = thresholds[ix]

# Plot the model's ROC curve together with the chance-level diagonal
# (tpr == fpr), drawn directly instead of via roc_curve on constant scores.
plt.plot(fpr, tpr, linestyle='--', color='orange', label='Random Forest')
plt.plot([0, 1], [0, 1], linestyle='--', color='blue', label='Chance (TPR = FPR)')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')

plt.legend(loc='best')
# Model-specific file name: every model used to save to the same 'ROC' file,
# so each figure silently overwrote the previous one.
plt.savefig('ROC_random_forest', dpi=400)
plt.show()

print('The Best Threshold for Random Forest is :%f ' % (best_thresh))

### XGBoost

In [None]:
import xgboost as xgb

In [None]:
# Booster parameters handed to xgb.train: binary logistic objective,
# evaluated with AUC, with a fixed seed.
params = dict(
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.9,
    objective='binary:logistic',
    eval_metric='auc',
    learning_rate=0.001,
    random_state=2519,
)

def XGBmodel(X_train,X_test,y_train,y_test,params):
    """Train an XGBoost booster with early stopping.

    Args:
        X_train: Features used for training.
        X_test: Features of the evaluation set watched for early stopping.
        y_train: Labels belonging to ``X_train``.
        y_test: Labels belonging to ``X_test``.
        params: Parameter dictionary forwarded to ``xgb.train``.

    Returns:
        The booster returned by ``xgb.train``.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dwatch = xgb.DMatrix(X_test, label=y_test)
    # At most 5000 boosting rounds; training stops once the metric on the
    # watched 'test' set has not improved for 13 consecutive rounds.
    return xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=5000,
        evals=[(dwatch, 'test')],
        early_stopping_rounds=13,
    )

# Train the booster; the hold-out split (X_test, y_test) doubles as the
# evaluation set watched for early stopping.
model = XGBmodel(X_train,X_test,y_train,y_test,params)

In [None]:
from sklearn.metrics import precision_recall_curve

# Score the hold-out split using only the trees up to the best early-stopping
# round. `ntree_limit` / `best_ntree_limit` are deprecated (and removed in
# current XGBoost releases); `iteration_range` is their replacement. Its end
# index is exclusive, hence the + 1. Requires XGBoost >= 1.4.
y_xg = model.predict(
    xgb.DMatrix(X_test),
    iteration_range=(0, model.best_iteration + 1),
)
# The booster's predictions are passed to the curve directly as the
# positive-class scores (no column selection is needed here).
precision, recall, thresholds = precision_recall_curve(y_test, y_xg)

# Precision and recall as functions of the decision threshold. Their last
# element is dropped before plotting against `thresholds`.
plt.title("Precision-Recall vs Threshold Chart")
plt.plot(thresholds, precision[: -1], "b--", label="Precision")
plt.plot(thresholds, recall[: -1], "r--", label="Recall")
plt.ylabel("Precision, Recall")
plt.xlabel("Threshold")
plt.legend(loc="lower left")
plt.ylim([0,1])

In [None]:
# ROC analysis of the XGBoost model on the hold-out split.
# y_xg already holds the positive-class scores.

# Area under the ROC curve.
auc_score1 = roc_auc_score(y_test, y_xg)
print('Roc_Auc=',auc_score1)

# A single roc_curve call serves both the plot and the threshold search
# (the curve was previously computed twice with identical inputs).
fpr, tpr, thresholds = roc_curve(y_test, y_xg, pos_label=1)

# Best threshold: the ROC point that maximises tpr - fpr (Youden's J).
J = tpr - fpr
ix = np.argmax(J)
best_thresh = thresholds[ix]

# Plot the model's ROC curve together with the chance-level diagonal
# (tpr == fpr), drawn directly instead of via roc_curve on constant scores.
plt.plot(fpr, tpr, linestyle='--', color='orange', label='XGBoost')
plt.plot([0, 1], [0, 1], linestyle='--', color='blue', label='Chance (TPR = FPR)')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')

plt.legend(loc='best')
# Model-specific file name: every model used to save to the same 'ROC' file,
# so each figure silently overwrote the previous one.
plt.savefig('ROC_xgboost', dpi=400)
plt.show()

print('The Best Threshold for XGBoost is :%f ' % (best_thresh))

In [None]:
# Confusion matrix of the XGBoost model on the hold-out split.
# Threshold of XGB model is 0.489467 (hard-coded from the ROC analysis of an
# earlier run; it is not refreshed automatically).
# `iteration_range` replaces the deprecated `ntree_limit` / `best_ntree_limit`
# pair; its end index is exclusive, hence the + 1. Requires XGBoost >= 1.4.
xg_scores = model.predict(
    xgb.DMatrix(X_test),
    iteration_range=(0, model.best_iteration + 1),
)
xg_pr = (xg_scores >= 0.489467)
cm = confusion_matrix(y_test, xg_pr)
# Rows are the actual classes, columns the predicted classes.
conf_matrix = pd.DataFrame(data=cm, index=['Actual:0','Actual:1'], columns=['Predicted:0','Predicted:1'])
plt.figure(figsize = (8,5))
# fmt='d' prints the cell counts as plain integers.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu")

In [None]:
# We can observe a decent classification by our XGB model.
# Recall is more important in these problems as the company can afford false
# positives but cannot bear skipping leads.
# The recall is computed from the current predictions instead of from counts
# copied by hand out of an earlier confusion-matrix plot, so it cannot go
# stale when the data, the model or the threshold changes.
print('Recall =', recall_score(y_test, xg_pr))

# Submission

#### We can observe that the highest ROC AUC is obtained for XGBoost, although Random Forest achieves almost the same score.
#### Nevertheless, we shall use the XGBoost prediction as it has a comparatively higher ROC AUC score.

In [None]:
# Submitting the Random Forest predictions

In [None]:
# Label the competition test set with the random forest: a row is predicted
# as a lead (1) when its class-1 probability reaches the Random Forest
# threshold of 0.240617, otherwise 0.
prediction1 = (cvrf_clf.predict_proba(new_test)[:, 1] >= 0.240617).astype(int)
prediction1.shape

In [None]:
# Put the random forest labels into the submission frame and preview it.
submit['Is_Lead']=prediction1
submit.head()

In [None]:
# Write the random forest submission to disk without the index column.
submit.to_csv('submit_trail.csv', index=False)

In [None]:
# Label the competition test set with the XGBoost model.
# Threshold of XGB is 0.489467 (hard-coded from the ROC analysis).
# `iteration_range` replaces the deprecated `ntree_limit` / `best_ntree_limit`
# pair so that only the trees up to the best early-stopping round are used;
# its end index is exclusive, hence the + 1. Requires XGBoost >= 1.4.
xg_test_scores = model.predict(
    xgb.DMatrix(new_test),
    iteration_range=(0, model.best_iteration + 1),
)
prediction2 = np.where(xg_test_scores >= 0.489467, 1, 0)
prediction2.shape

In [None]:
# Overwrite the submission labels with the XGBoost predictions and preview.
submit['Is_Lead']=prediction2
submit.head()

In [None]:
# Write the final (XGBoost) submission to disk without the index column.
submit.to_csv('submit_final.csv', index=False)

# Conclusion
#### We have obtained a good roc_auc_score for the test data.
#### The threshold chosen for each model has helped to achieve a decent split, and we have successfully achieved the objective.
## Future Improvements:
#### The models' hyperparameters can be tuned further, but because the training data is large, it takes a long time for the parameters to be tuned.
<img src='https://cdn0.iconfinder.com/data/icons/security-system-3/50/6-512.png'>

## Thanks for being a good reader!!