In [199]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix, accuracy_score, classification_report, roc_auc_score

In [200]:
Data_train = pd.read_csv(r'C:\Users\steph\Downloads\train_s3TEQDk.csv')
Data_test = pd.read_csv(r'C:\Users\steph\Downloads\test_mSzZ8RL.csv')

In [201]:
Data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245725 entries, 0 to 245724
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ID                   245725 non-null  object
 1   Gender               245725 non-null  object
 2   Age                  245725 non-null  int64 
 3   Region_Code          245725 non-null  object
 4   Occupation           245725 non-null  object
 5   Channel_Code         245725 non-null  object
 6   Vintage              245725 non-null  int64 
 7   Credit_Product       216400 non-null  object
 8   Avg_Account_Balance  245725 non-null  int64 
 9   Is_Active            245725 non-null  object
 10  Is_Lead              245725 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 20.6+ MB


In [202]:
Data_train.head(2)

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0


## Feature Engineering

### Binning the numerical values to categorical of both Train and Test Data

In [203]:
#Binning Age
bins_age = [1,25,40,60,75,200]
labels_age = ['0','1','2','3','4']
Data_train['Age_Binned']=pd.cut(Data_train["Age"],bins=bins_age,labels=labels_age,precision=0)
Data_test['Age_Binned']=pd.cut(Data_test["Age"],bins=bins_age,labels=labels_age,precision=0)
    #Data_train["Age_Binned"].value_counts()
#Binning Average account balance
bins_bal = [1,1000000,2500000,5000000,7500000,1000000000]
labels_bal = ['0','1','2','3','4']
Data_train['Avg_Account_Balance_Binned']=pd.cut(Data_train["Avg_Account_Balance"],bins=bins_bal,labels=labels_bal,precision=0)
Data_test['Avg_Account_Balance_Binned']=pd.cut(Data_test["Avg_Account_Balance"],bins=bins_bal,labels=labels_bal,precision=0)
    #Data_train["Avg_Account_Balance_Binned"].value_counts()
#Binning Vintage in years
bins_vin = [1,60,120,300]
labels_vin = ['0','1','2']
Data_train['Vintage_Binned']=pd.cut(Data_train["Vintage"],bins=bins_vin,labels=labels_vin,precision=0)
Data_test['Vintage_Binned']=pd.cut(Data_test["Vintage"],bins=bins_vin,labels=labels_vin,precision=0)
    #Data_train["Vintage_Binned"].value_counts()

### Combining two Features

In [204]:
#Train Dataset
Data_train["AGE_GENDER"] = Data_train["Gender"]+Data_train['Age_Binned'].astype(str)
Data_train["AGE_CHANNEL"] = Data_train["Channel_Code"]+Data_train['Age_Binned'].astype(str)
Data_train["OCC_AGE"] = Data_train["Occupation"]+Data_train['Age_Binned'].astype(str)
Data_train["OCC_BAL"] = Data_train["Occupation"]+Data_train['Avg_Account_Balance_Binned'].astype(str)
Data_train["CREDIT_AGE"] = Data_train["Credit_Product"]+Data_train['Age_Binned'].astype(str)
Data_train["CREDIT_BAL"] = Data_train["Credit_Product"]+Data_train['Avg_Account_Balance_Binned'].astype(str)
Data_train["CREDIT_VINT"] = Data_train["Credit_Product"]+Data_train['Vintage_Binned'].astype(str)

#Test Dataset
Data_test["AGE_GENDER"] = Data_test["Gender"]+Data_test['Age_Binned'].astype(str)
Data_test["AGE_CHANNEL"] = Data_test["Channel_Code"]+Data_test['Age_Binned'].astype(str)
Data_test["OCC_AGE"] = Data_test["Occupation"]+Data_test['Age_Binned'].astype(str)
Data_test["OCC_BAL"] = Data_test["Occupation"]+Data_test['Avg_Account_Balance_Binned'].astype(str)
Data_test["CREDIT_AGE"] = Data_test["Credit_Product"]+Data_test['Age_Binned'].astype(str)
Data_test["CREDIT_BAL"] = Data_test["Credit_Product"]+Data_test['Avg_Account_Balance_Binned'].astype(str)
Data_test["CREDIT_VINT"] = Data_test["Credit_Product"]+Data_test['Vintage_Binned'].astype(str)

### Performing chi square test for relationship of the categorical variables and the target

In [205]:
# Categorical columns are ['Gender','Region_Code','Occupation','Channel_Code','Credit_Product','Is_Active'] 
# target variable is Is_lead
Data_Chi2 = Data_train[['Gender','Region_Code','Occupation','Channel_Code','Credit_Product','Is_Active',
                        'Age_Binned','Avg_Account_Balance_Binned','Vintage_Binned',
                        'AGE_GENDER','AGE_CHANNEL','OCC_AGE','OCC_BAL','CREDIT_AGE',
                        'CREDIT_BAL','CREDIT_VINT','Is_Lead']]

# Ordinal Encoding for the columns
Data_Chi2 = Data_Chi2.apply(LabelEncoder().fit_transform)

# Checking the P values
Xtrain,Xtest,ytrain,ytest = train_test_split(Data_Chi2[['Gender','Region_Code','Occupation','Channel_Code',
                                                        'Credit_Product','Is_Active','Age_Binned',
                                                        'Avg_Account_Balance_Binned',
                                                        'Vintage_Binned','AGE_GENDER','AGE_CHANNEL','OCC_AGE',
                                                       'OCC_BAL','CREDIT_AGE','CREDIT_BAL',
                                                        'CREDIT_VINT']],Data_Chi2['Is_Lead'],
                                             test_size=0.3,random_state=10)
f_p_values = chi2(Xtrain,ytrain)
p_values = pd.Series(f_p_values[1])
p_values.index = Xtrain.columns
p_values.sort_values()

Region_Code                    0.000000e+00
Channel_Code                   0.000000e+00
Credit_Product                 0.000000e+00
Age_Binned                     0.000000e+00
Vintage_Binned                 0.000000e+00
AGE_GENDER                     0.000000e+00
AGE_CHANNEL                    0.000000e+00
CREDIT_AGE                     0.000000e+00
CREDIT_BAL                     0.000000e+00
CREDIT_VINT                    0.000000e+00
OCC_AGE                       3.885964e-191
Is_Active                     9.944392e-169
Avg_Account_Balance_Binned    3.949737e-116
Gender                         3.369807e-90
OCC_BAL                        4.696663e-28
Occupation                     5.102494e-03
dtype: float64

As per the Chi Square, except Occupation all the features have good correlation with the target and the most correlated features are
1) OCC_AGE
2) Is_Active
3) Avg_Account_Balance_binned
4) Gender

### Data Preparation for Model

#### Encoding the train Data

In [206]:
Feature_Columns = ['Is_Active','Avg_Account_Balance_Binned','Gender',
                         'OCC_AGE',"Is_Lead",'OCC_BAL','Credit_Product']
Data_train = Data_train[Feature_Columns]
Data_test = Data_test[['Is_Active','Avg_Account_Balance_Binned','Gender',
                         'OCC_AGE','OCC_BAL','Credit_Product','ID']]

# Retaining the Columns for final output
Data_test_colums = Data_test['ID']
Data_test.drop(['ID'],axis=1,inplace=True)

In [207]:
Data_train.head()

Unnamed: 0,Is_Active,Avg_Account_Balance_Binned,Gender,OCC_AGE,Is_Lead,OCC_BAL,Credit_Product
0,No,1,Female,Other3,0,Other1,No
1,No,0,Female,Salaried1,0,Salaried0,No
2,Yes,1,Female,Self_Employed2,0,Self_Employed1,No
3,No,0,Male,Salaried1,0,Salaried0,No
4,No,0,Female,Salaried1,0,Salaried0,No


In [208]:
y = Data_train["Is_Lead"]
Data_train.drop("Is_Lead",axis=1,inplace=True)
X = pd.get_dummies(Data_train,prefix=Data_train.columns)
X_test = pd.get_dummies(Data_train,prefix=Data_test.columns)

In [209]:
Models = (LGBMClassifier(),DecisionTreeClassifier(),XGBClassifier())
for i in Models:
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size = 0.3,random_state = None)

    model = i
    model.fit(Xtrain,Ytrain)

    ypred = model.predict(Xtest)
    print("Accuracy for %s%% model is: %s%%" % (i ,100*accuracy_score(ypred,Ytest)))
    print("ROC_AUC_Score for %s%% model is: %s%%" % (i,100*roc_auc_score(ypred,Ytest)))

Accuracy for LGBMClassifier()% model is: 85.84741854092624%
ROC_AUC_Score for LGBMClassifier()% model is: 84.8154580969916%
Accuracy for DecisionTreeClassifier()% model is: 85.79180118831222%
ROC_AUC_Score for DecisionTreeClassifier()% model is: 84.77431232085758%
Accuracy for XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)% model is: 85.75246208524376%
ROC_AUC_Score for XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample

### Training the Model

In [210]:
#Best Classifier

Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size = 0.3,random_state = None)
model = LGBMClassifier()
model.fit(Xtrain,Ytrain)
ypred = model.predict(Xtest)
print("Accuracy for this model is: %s%%" % (100*accuracy_score(ypred,Ytest)))
print("ROC_AUC_Score for this model is: %s%%" % (100*roc_auc_score(ypred,Ytest)))

Accuracy for this model is: 85.95051412138149%
ROC_AUC_Score for this model is: 85.21179708204355%


In [211]:
Result = pd.DataFrame(model.predict_proba(X_test))
Rresult_Final = pd.concat([Data_test_colums,Result],axis=1)
Rresult_Final.rename(columns={0:'ID'},inplace=True)
Rresult_Final.to_csv(r"C:\Users\steph\Downloads\Credit_solution_8.csv")