# Data Science Challenge

In [None]:
# If you'd like to install packages that aren't installed by default, uncomment the last two lines of this cell and replace <package list> with a list of your packages.
# This will ensure your notebook has all the dependencies and works everywhere

#import sys
#!{sys.executable} -m pip install <package list>

In [None]:
#Libraries
import pandas as pd
pd.set_option("display.max_columns", 101)
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Data Description


Column | Description
:---|:---
`client_id` | Unique ID of the client called [unique key]
`age_bracket` | Age bracket of the contacted client (in years)
`job` | job type of the contacted client
`marital` | marital status of the contacted client
`education` | highest level of education done by the client
`has_housing_loan` | Whether the client has a house loan (binary: yes,no)
`has_personal_loan` | Whether the client has a personal loan (binary: yes,no)
`prev_call_duration` | last contact duration (value = 0 if the client has not been contacted ever)
`days_since_last_call` | number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
`num_contacts_prev` | number of contacts performed before this campaign and for this client (numeric)
`poutcome` | outcome of the previous marketing campaign (categorical: "failure","nonexistent","success")
`contact_date` | date at which contact was made with the client (YYYY-MM-DD)
`cpi` | standing consumer price index before the call (monthly indicator)
`subs_deposit` | has the client subscribed to a term deposit? (binary: 1,0) [dependent variable]

## Data Wrangling & Visualization

In [None]:
# Read the training set, parsing contact_date as datetimes, then preview it
data_train = pd.read_csv(
    "train.csv",
    parse_dates=["contact_date"],
)
data_train.head()


In [None]:
# Read the test set with the same date parsing, report its shape, and preview it
data_test = pd.read_csv(
    "test.csv",
    parse_dates=["contact_date"],
)
print(data_test.shape)
data_test.head()

In [None]:
# List the column labels of the training frame
data_train.columns

In [None]:
# Summary statistics of the training frame, reporting the 5th, 50th, 95th,
# 97.5th and 99th percentiles instead of the default quartiles
data_train.describe(
    percentiles=[0.05, 0.5, 0.95, 0.975, 0.99],
)

In [None]:
# Count missing values per column of the training frame
data_train.isnull().sum()

In [None]:
# Same per-column missing-value count via isna()
# (pandas documents isnull() as an alias of isna(), so this repeats the check)
data_train.isna().sum()

## Visualization, Modeling, Machine Learning

 Can you help Lending Bank to predict whether a client would subscribe to the term deposit and explain how different features affect that? Please explain your findings effectively to technical and non-technical audiences using comments and visualizations, if appropriate.
- **Build an optimized model that effectively solves the business problem.**
- **The model would be evaluated on the basis of F1 score.**
- **Read the test.csv file and prepare features for testing.**

#### Verifying data in each column 

In [None]:
# Absolute number of rows per target class
data_train["subs_deposit"].value_counts()

In [None]:
# Fraction of rows per target class
data_train["subs_deposit"].value_counts(normalize=True)

#### 60% of clients in the dataset haven't subscribed to a term deposit

In [None]:
# Frequency of each age_bracket value in the training set
data_train["age_bracket"].value_counts()

In [None]:
# Frequency of each age_bracket value in the test set
data_test["age_bracket"].value_counts()

In [None]:
# Frequency of each job value in the training set
data_train["job"].value_counts()

In [None]:
# Frequency of each job value in the test set
data_test["job"].value_counts()

In [None]:
# Frequency of each marital value in the training set
data_train["marital"].value_counts()

In [None]:
# Frequency of each marital value in the test set
data_test["marital"].value_counts()

In [None]:
# Frequency of each education value in the training set
data_train["education"].value_counts()

In [None]:
# Frequency of each education value in the test set
data_test["education"].value_counts()

In [None]:
# Frequency of each has_housing_loan value in the training set
data_train["has_housing_loan"].value_counts()

In [None]:
# Frequency of each has_housing_loan value in the test set
data_test["has_housing_loan"].value_counts()

In [None]:
# Frequency of each has_personal_loan value in the training set
data_train["has_personal_loan"].value_counts()

In [None]:
# Frequency of each has_personal_loan value in the test set
data_test["has_personal_loan"].value_counts()

In [None]:
# Frequency of each prev_call_duration value in the training set
data_train["prev_call_duration"].value_counts()

In [None]:
# Frequency of each days_since_last_call value in the training set
data_train["days_since_last_call"].value_counts()

In [None]:
# Training rows carrying the 999 value in days_since_last_call
data_train.loc[data_train["days_since_last_call"] == 999]

In [None]:
# Test rows carrying the 999 value in days_since_last_call
data_test.loc[data_test["days_since_last_call"] == 999]

In [None]:
# Frequency of each num_contacts_prev value in the training set
data_train["num_contacts_prev"].value_counts()

In [None]:
# Frequency of each poutcome value in the training set
data_train["poutcome"].value_counts()

In [None]:
# Frequency of each cpi value in the training set
data_train["cpi"].value_counts()

In [None]:
# Bar chart of the target class counts.
# The column is passed as `x=` explicitly: current seaborn releases take `data`
# as the first parameter and expect the plotted vector by keyword.
sns.countplot(x=data_train.subs_deposit)
plt.show()

#### The majority of clients in the dataset haven't subscribed to a term deposit

In [None]:
# Subscription outcome counts within each age bracket
sns.countplot(x='age_bracket', hue='subs_deposit', data=data_train)

#### Clients in the 60+ and 18-24 age brackets are more likely to have term deposits than others. The dataset contains more clients in the 41-60 and 25-60 age groups.

In [None]:
# Subscription outcome counts per job, drawn on an explicit 10x9 figure
ax_dim = (10, 9)
fig, ax = plt.subplots(figsize=ax_dim)
sns.countplot(x='job', hue='subs_deposit', data=data_train, ax=ax)

Clients whose job is not specified are more likely to have term deposits compared to others.


In [None]:

# Subscription outcome counts per marital status
sns.countplot(x='marital', hue='subs_deposit', data=data_train)

Marital status doesn't seem to have any impact on whether a client has a term deposit.

In [None]:
# Subscription outcome counts per education level, drawn on an explicit 10x9 figure
ax_dim = (10, 9)
fig, ax = plt.subplots(figsize=ax_dim)
sns.countplot(x='education', hue='subs_deposit', data=data_train, ax=ax)

The majority of the clients have an education level of secondary school or higher.

In [None]:
# Subscription outcome counts by housing-loan status
sns.countplot(x='has_housing_loan', hue='subs_deposit', data=data_train)

In [None]:
# Subscription outcome counts by personal-loan status
sns.countplot(x='has_personal_loan', hue='subs_deposit', data=data_train)

In [None]:
# Subscription outcome counts by previous-campaign outcome
sns.countplot(x='poutcome', hue='subs_deposit', data=data_train)

In [None]:
#Plot numerical features

import matplotlib.pyplot as plt

# The bundled seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in
# newer matplotlib releases and the old names were later removed, so use
# whichever whitegrid name this installation provides (and skip styling if neither exists).
for whitegrid_style in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if whitegrid_style in plt.style.available:
        plt.style.use(whitegrid_style)
        break

# One 20-bin histogram per column of the training frame that DataFrame.hist can plot
data_train.hist(bins=20, figsize=(14,10), color='#E14906')
plt.show()

* client_id has no relevance to determine the target variable here.
* cpi is between 90 and 100 for most of the records except a few outliers with more than 900
* prev_call_duration is mostly within 1000 seconds, except for some outliers


In [None]:
# Column dtypes and non-null counts of the raw training frame
data_train.info()

In [None]:
def process_feature_data(df):
    '''
    Engineer model features on a dataframe and return it.

    The frame is modified in place and the same object is returned, so pass
    a copy when the original must be preserved.

    Steps:
      * add ``prev_contact_flag``: 0 where ``days_since_last_call`` equals
        999, otherwise 1
      * replace 999 in ``days_since_last_call`` with 0
      * add ``contact_month`` and ``contact_day`` derived from ``contact_date``
      * drop ``client_id`` and ``contact_date``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``client_id``, ``days_since_last_call`` and
        ``contact_date`` (datetime-like).

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the columns added and removed as described.
    '''
    # Derive the flag BEFORE the 999 values are overwritten below.
    df['prev_contact_flag'] = np.where(df['days_since_last_call'] == 999, 0, 1)

    # Assign the result back to the column. Calling replace(..., inplace=True)
    # on a column selection only updates a temporary object under pandas
    # Copy-on-Write, which would leave the 999 values in the frame.
    df['days_since_last_call'] = df['days_since_last_call'].replace({999: 0})

    # Calendar features from the contact date. The year is not derived at all:
    # it was previously created and then dropped again straight away.
    contact_date = pd.to_datetime(df['contact_date'])
    df['contact_month'] = contact_date.dt.month
    df['contact_day'] = contact_date.dt.day

    # Remove the identifier and the raw date now that features are extracted.
    df.drop(['client_id', 'contact_date'], axis=1, inplace=True)

    return df


In [None]:
# Engineer features on a deep copy so data_train itself is not modified
train_df = process_feature_data(data_train.copy(deep=True))
train_df.info()

In [None]:
# One-hot encode the categorical columns of the training frame
train_df = pd.get_dummies(data=train_df)

In [None]:
# Bar chart of each column's correlation with subs_deposit, sorted ascending.
# The last sorted entry is dropped (expected to be the target's correlation with itself).
plt.figure(figsize=(16, 8))
(
    train_df.corr()['subs_deposit']
    .sort_values()
    .iloc[:-1]
    .plot(kind='bar')
)
plt.show()

In [None]:
# Column dtypes and non-null counts after one-hot encoding
train_df.info()

In [None]:
# Separate the target vector from the feature matrix and remember the
# feature column labels for later use
y = train_df['subs_deposit']
X = train_df.drop(columns=['subs_deposit'])
features = X.columns
X.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report,confusion_matrix, f1_score, accuracy_score

# Feature scaling is currently disabled (kept commented out below):
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

# Hold out 20% of the rows for evaluation, with a fixed seed for the shuffle
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate classifiers, each fitted on the training split and scored on the
# hold-out split with accuracy, confusion matrix and F1.
models = []

lr = LogisticRegression(penalty='l2', C=1, solver='lbfgs')
# max_features='auto' has been removed from scikit-learn's tree estimators;
# 'sqrt' is the value 'auto' stood for on classifiers.
# random_state is fixed on the randomized models so scores are reproducible.
dtree = DecisionTreeClassifier(criterion='gini', min_samples_split=8,
                               min_samples_leaf=4, max_features='sqrt',
                               random_state=42)
rfc = RandomForestClassifier(n_estimators=400, random_state=42)
gbc = GradientBoostingClassifier(n_estimators=400, max_depth=5, random_state=42)

models.extend([lr, dtree, rfc, gbc])

for model in models:
    print(model)
    model.fit(x_train, y_train)
    ypred = model.predict(x_test)
    print("accuracy_score: %0.2f" %(accuracy_score(y_test, ypred)))
    print()
    print('Confusion Matrix')
    print(confusion_matrix(y_test, ypred))
    print()
    print("f1_score: %0.2f" %(f1_score(y_test, ypred)))
    print()
    print('*******************************************')

The GradientBoostingClassifier model gave the best F1 score and accuracy score.



> #### Task:
- **Visualize the top 20 features and their feature importance.**



In [None]:
# Pair every training column with its importance from the fitted
# gradient-boosting model, rank descending, and index the table by feature name
importances = gbc.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': features, 'importance': importances})
    .sort_values(by='importance', ascending=False)
    .set_index('feature', drop=True)
)
feature_importances.head(20)

In [None]:
# The task asks for the top 20 features (the previous slice showed only 10).
# feature_importances is sorted by importance in descending order when it is
# built, so the first 20 rows are the 20 most important features.
top_features_ax = feature_importances.head(20).plot.bar(figsize=(20,10))
top_features_ax.set_title('Top 20 features by GradientBoostingClassifier importance')
top_features_ax.set_xlabel('feature')
top_features_ax.set_ylabel('importance')
plt.show()

##### Plot Predicted Vs Actual Data Distribution

In [None]:
def plot_predicated_actual_distribution(model, x, y):
    """
    Plot the class counts of the actual labels next to the model's predictions.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    x : feature matrix passed to ``model.predict``
    y : actual labels for the rows of ``x``

    Draws two count plots side by side (actual on the left, predicted on the
    right) and prints the first five entries of ``y`` and of the predictions.
    Returns nothing.
    """
    y_pred = model.predict(x)
    figure, axes = plt.subplots(ncols=2)
    figure.set_size_inches(10, 4)
    # Pass the vectors as `x=`: current seaborn takes `data` as the first
    # parameter, so a positional vector is no longer read as the x variable.
    sns.countplot(x=y, ax=axes[0])
    axes[0].set_title('Actual Data Distribution')
    sns.countplot(x=y_pred, ax=axes[1])
    axes[1].set_title('Predicted Data Distribution')

    print('Y')
    print(y[0:5])
    print('Y_PRED')
    print(y_pred[0:5])

In [None]:
# Hold-out predictions of the gradient-boosting model (previously taken from
# the random forest despite the name).
y_predict_gbm = gbc.predict(x_test)

# Compare against the true hold-out labels; passing predictions as the
# "actual" argument made the left-hand plot show model output, not ground truth.
plot_predicated_actual_distribution(gbc, x_test, y_test)

> #### Task:
- **Submit the predictions on the test dataset using your optimized model** <br/>
    For each record in the Test set (test.csv), you must predict the 'subs_deposit' variable (1/0). The 1/0 would depend on the **best F1 score**.
    You should submit a CSV file with test entries plus a header row. Your submission will show an error if you have extra columns beyond 'client_id' and 'subs_deposit' or extra rows.
The file (`submission.csv`) should have exactly 2 columns:
    - **client_id**
    - **subs_deposit** (contains 1/0)

In [None]:
# Process features from the test set on a deep copy, leaving data_test
# (including client_id) untouched for the submission step
test_df = process_feature_data(data_test.copy(deep=True))

In [None]:
# Column dtypes and non-null counts of the processed test frame
test_df.info()

In [None]:
# One-hot encode the categorical columns of the test frame
test_df = pd.get_dummies(data=test_df)

In [None]:
# Column dtypes and non-null counts of the test frame after one-hot encoding
test_df.info()

In [None]:
#Predict test data using GradientBoostingClassifier
# get_dummies only creates columns for categories that occur in test.csv, so
# the encoded test frame can miss, add, or reorder columns relative to training.
# Align it to the training feature columns: missing dummies become 0 and
# columns the model never saw are discarded.
test_features = test_df.reindex(columns=features, fill_value=0)
yT = gbc.predict(test_features)

In [None]:
# Attach the predictions to a deep copy of the raw test rows
data_sub = data_test.copy(deep=True).assign(subs_deposit=yT)
data_sub.head()

In [None]:
# Fraction of test rows predicted in each class
data_sub["subs_deposit"].value_counts(normalize=True)

In [None]:
# Number of test rows predicted in each class
data_sub["subs_deposit"].value_counts()

In [None]:
# Last rows of the two submission columns
data_sub.loc[:, ['client_id', 'subs_deposit']].tail()

In [None]:
# First rows of the two submission columns
data_sub.loc[:, ['client_id', 'subs_deposit']].head()

In [None]:
# Write the submission file: only client_id and subs_deposit, without the index
data_sub.loc[:, ['client_id', 'subs_deposit']].to_csv(
    'submission.csv',
    index=False,
)