In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### problem description:
Credit Card Lead Prediction
Happy Customer Bank is a mid-sized private bank that deals in all kinds of banking products, like Savings accounts, Current accounts, investment products, credit products, among other offerings.



The bank also cross-sells products to its existing customers and to do so they use different kinds of communication like tele-calling, e-mails, recommendations on net banking, mobile banking, etc. 



In this case, the Happy Customer Bank wants to cross sell its credit cards to its existing customers. The bank has identified a set of customers that are eligible for taking these credit cards.



Now, the bank is looking for your help in identifying customers that could show higher intent towards a recommended credit card, given:

* Customer details (gender, age, region etc.)
* Details of his/her relationship with the bank (Channel_Code, Vintage, Avg_Account_Balance, etc.)

##### In our dataset the columns represent the following:

* ID-Unique Identifier for a row

* Gender-Gender of the Customer

* Age-Age of the Customer (in Years)

* Region_Code-Code of the Region for the customers

* Occupation-Occupation Type for the customer

* Channel_Code-Acquisition Channel Code for the Customer  (Encoded)

* Vintage-Vintage for the Customer (In Months)

* Credit_Product-If the Customer has any active credit product (Home loan,Personal loan, Credit Card etc.)

* Avg_Account_Balance-Average Account Balance for the Customer in last 12 Months

* Is_Active-If the Customer is Active in last 3 Months

* Is_Lead (Target)-If the Customer is interested in the Credit Card

* 0 : Customer is not interested

* 1 : Customer is interested



In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import *
from lightgbm import LGBMClassifier
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Load the training set, the test set and the sample submission file in one step.
train, test, sub = (
    pd.read_csv('/kaggle/input/prediction/train.csv'),
    pd.read_csv('/kaggle/input/prediction/test.csv'),
    pd.read_csv('/kaggle/input/prediction/sample_submission.csv'),
)

In [None]:
# Preview the first rows of the training frame (head() defaults to five rows).
train.head()

In [None]:
# (number of rows, number of columns) of the training frame.
train.shape

In [None]:
# Summary statistics for every column; include="all" covers non-numeric columns as well.
train.describe(include="all")

In [None]:
# Print column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Dtype of each training column (the encoding cells further down convert the string columns).
train.dtypes

In [None]:
# Number of distinct values per column.
train.nunique()

## Exploratory Data Analysis

In [None]:
# Histogram of customer age. The original figure had no title or x-axis label,
# so it could not be read on its own when skimming the notebook.
age_ax = train.Age.plot.hist(title="Distribution of customer age")
# Trailing semicolon suppresses the Text(...) repr that set_xlabel returns.
age_ax.set_xlabel("Age (years)");

In [None]:
# Pairwise relationship plots across the columns of the training frame.
# NOTE(review): this runs on the whole frame and may be slow on a large dataset — confirm acceptable runtime.
sns.pairplot(data=train)

In [None]:
# Build an automated profiling report for the training frame and render it inline.
# NOTE(review): `pandas_profiling` has been renamed to `ydata-profiling` upstream —
# confirm which package the runtime image actually provides.
from pandas_profiling import ProfileReport
profile = ProfileReport(train, title="Pandas Profiling Report", explorative=True)
# Last expression of the cell: displays the report.
profile

In [None]:
# Frequency of each Is_Active value in the training frame.
train.Is_Active.value_counts()

In [None]:
# Replace every missing value in both frames with the placeholder string "NA".
# Rebinding the names (instead of inplace=True) yields the same frames.
train = train.fillna(value="NA")
test = test.fillna(value="NA")

In [None]:
# Encode Gender as an integer flag (Female -> 1, Male -> 0) on both frames.
# Series.map yields NaN for any value that is not a key of the mapper.
gender_mapper = {"Female": 1, "Male": 0}
for frame in (train, test):
    frame['Gender'] = frame['Gender'].map(gender_mapper)

In [None]:
# Encode Occupation as integer codes on both frames.
# Series.map yields NaN for any value that is not a key of the mapper.
occupation_mapper = {
    "Self_Employed": 1,
    "Salaried": 0,
    "Other": 2,
    "Entrepreneur": 3,
}
for frame in (train, test):
    frame['Occupation'] = frame['Occupation'].map(occupation_mapper)

In [None]:
# Drop the first two characters of Region_Code and keep the remainder as an integer.
for frame in (train, test):
    region_suffix = frame['Region_Code'].str.slice(start=2)
    frame['Region_Code'] = region_suffix.astype(int)

In [None]:
# Drop the first character of Channel_Code and keep the remainder as an integer.
for frame in (train, test):
    channel_suffix = frame['Channel_Code'].str.slice(start=1)
    frame['Channel_Code'] = channel_suffix.astype(int)

In [None]:
# Encode Credit_Product as integer codes on both frames. The "NA" key matches the
# placeholder string written by the earlier fillna("NA") step.
# Series.map yields NaN for any value that is not a key of the mapper.
cp_mapper = {"No": 1, "NA": 0, "Yes": 2}
for frame in (train, test):
    frame['Credit_Product'] = frame['Credit_Product'].map(cp_mapper)

In [None]:
# Encode Is_Active as an integer flag (Yes -> 1, No -> 0) on both frames.
# Series.map yields NaN for any value that is not a key of the mapper.
is_active_mapper = {"Yes": 1, "No": 0}
for frame in (train, test):
    frame['Is_Active'] = frame['Is_Active'].map(is_active_mapper)

In [None]:
# Binary feature: 1 when Age is strictly between 40 and 65, otherwise 0.
# NOTE(review): age_bin is not listed in selected_columns, so the model never
# sees this feature — confirm whether that is intended.
for frame in (train, test):
    in_age_band = (frame.Age > 40) & (frame.Age < 65)
    frame['age_bin'] = np.where(in_age_band, 1, 0)

In [None]:
# Feature columns fed to the model, one per line for easier diffing.
selected_columns = [
    'Gender',
    'Age',
    'Region_Code',
    'Occupation',
    'Channel_Code',
    'Vintage',
    'Credit_Product',
    'Avg_Account_Balance',
    'Is_Active',
]
# Last expression of the cell: shows how many features were selected.
len(selected_columns)

In [None]:
# Feature matrix (selected columns only) and target vector (Is_Lead) from the training frame.
X = train.loc[:, selected_columns]
y = train['Is_Lead']

In [None]:
# Hold out 30% of the rows for validation; stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

In [None]:
# LightGBM gradient-boosted tree classifier with a fixed seed for reproducibility.
# Hyper-parameters are listed one per line so individual changes are easy to review.
model = LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=30,
    max_depth=7,
    learning_rate=0.1,
    n_estimators=150,
    subsample_for_bin=200000,
    objective=None,
    class_weight=None,
    min_split_gain=0.0,
    min_child_weight=0.001,
    min_child_samples=20,
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1.0,
    reg_alpha=0.0,
    reg_lambda=0.0,
    random_state=42,
    n_jobs=-1,
    # The `silent` constructor argument was deprecated and later removed from the
    # LightGBM scikit-learn API; `verbose=-1` is the supported way to mute training logs.
    verbose=-1,
    importance_type='split',
)

In [None]:
# Train on the 70% training part of the hold-out split.
model.fit(X=X_train, y=y_train)

In [None]:
# Hard class predictions for the held-out rows (displayed only, not stored).
model.predict(X_test.loc[:, selected_columns])

In [None]:
# ROC AUC on the held-out rows, scored on column 1 of predict_proba.
roc_auc_score(
    y_true=y_test,
    y_score=model.predict_proba(X_test)[:, 1],
)

In [None]:
# 10-fold cross-validated ROC AUC on the full training data; prints the
# per-fold scores followed by their mean.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='roc_auc',
    cv=10,
)
print(scores, scores.mean())

In [None]:
# Refit on the complete training data before scoring the test set.
model.fit(X=X, y=y)

In [None]:
# Score the test set: keep column 1 of predict_proba as the lead probability.
preds = model.predict_proba(test.loc[:, selected_columns])[:, 1]

In [None]:
# Submission frame: one row per test ID with its predicted Is_Lead probability.
result = pd.DataFrame({
    'ID': test.ID,
    'Is_Lead': preds,
})

In [None]:
# Sanity-check the submission frame: summary statistics for both columns.
result.describe(include="all")

In [None]:
# Write the submission file to the working directory, without the index column.
result.to_csv(path_or_buf="finalresult.csv", index=False)

## Gradient boosting is a powerful ensemble machine learning algorithm.
* It’s popular for structured predictive modeling problems, such as classification and regression
## After evaluating the model with the roc_auc_score evaluation metric
* The LightGBM classifier achieved a ROC AUC of 0.8724473949682701 (area under the ROC curve, which is not the same as classification accuracy)

### ROC AUC: 0.8724 (≈87.24%)