In [None]:
pip install -r requirements.txt

# The Case
A Portuguese banking institution wants to optimize its next marketing campaign for a term deposit subscription. They have a list of current clients. They would like to know, based on various features, which clients are the likeliest to buy a term deposit. 

## The Task
Train a model on the training data to predict which customers are likely to buy a term deposit. Test this model on the data in `bank_test.csv`.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt

from sklearn.preprocessing import StandardScaler

In [None]:
# Load the campaign data; the path is resolved relative to the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer="bank.csv")

In [None]:
# Preview the first five rows to sanity-check the load.
dataset.head(n=5)

1. Age: Well, age (Numeric)
2. Job: Job (categorical: str)
3. Marital: Marital status (Values: [single, married, divorced]: str)
4. Education: Education level (Values: [primary, secondary, tertiary] :str)
5. Default: Whether or not a client has defaulted on a loan before (Values: [yes, no]: str)
6. Balance: Current account balance (Numeric)
7. Housing: Whether or not a client has a home loan (Values: [yes, no]: str)
8. Loan: Whether or not a client has a personal loan (Values: [yes, no]: str)
9. Contact: The device, if any, on which the customer was previously contacted (Values: [cellular, telephone, unknown]: str)
10. Month: Last month of contact (Values: months: str)
11. Day: Day of last contact (Numeric)
12. Campaign: The number of contacts reaching the customer during the current campaign (including the last contact) (Numeric)
13. Pdays: The number of days since the previous campaign, if reached (-1 if it was never reached before) 
(Numeric)
14. Previous: The number of contacts that reached the customer before this campaign. (Numeric)
15. Poutcome: Status of previous campaign. (Values: [success, failure, other, unknown] :str)

# Exploratory Data Analysis and Data Cleaning

In [None]:
#Helper Functions adapted from various kaggle notebooks
def plot_bar(variable):
    """Draw a bar chart of value frequencies for one column and print the counts.

    Reads the column from the notebook-level ``dataset`` frame.

    Parameters
    ----------
    variable : str
        Name of the column in ``dataset`` to summarise.
    """
    counts = dataset[variable].value_counts()
    categories = counts.index
    bar_colors = ['#00008b', '#00e5ee', '#cd1076', '#008080', '#cd5555', 'red', 'blue']

    plt.figure(figsize=(15, 3))
    plt.bar(categories, counts, color=bar_colors)
    # Label every bar with its own category value.
    plt.xticks(categories, categories.values)
    plt.ylabel("Frequency")
    plt.title(variable)

    plt.show()
    # Echo the raw counts below the figure for exact numbers.
    print(f"{variable}: \n {counts}")

def plot_hist(variable):
    """Draw a 40-bin histogram of one column of the notebook-level ``dataset``.

    Parameters
    ----------
    variable : str
        Name of the column in ``dataset`` to plot; also used as the x-axis
        label and in the figure title.
    """
    plt.figure(figsize=(9,6))
    plt.hist(dataset[variable], bins=40, color='red')
    plt.xlabel(variable)
    # Capitalised to match the y-axis label used by the other plot helpers.
    plt.ylabel("Frequency")
    plt.title("{} distribution with hist".format(variable))
    plt.show()
    
def plot_corr_num(variable):
    """Area plot of how often each value of a column occurs per ``deposit`` outcome.

    Cross-tabulates ``dataset[variable]`` against ``dataset.deposit`` and plots
    the resulting counts as an area chart.

    Parameters
    ----------
    variable : str
        Name of the column in the notebook-level ``dataset`` to compare
        against the ``deposit`` column.
    """
    counts_by_outcome = pd.crosstab(dataset[variable], dataset.deposit)
    counts_by_outcome.plot(kind="area", figsize=(15, 7), color=['red', 'blue'])
    plt.title(variable + ' Distribution')
    plt.xlabel(variable)
    plt.ylabel('Frequency')
    plt.show()
    
def plot_corr_cat(variable):
    """Bar plot of how often each value of a column occurs per ``deposit`` outcome.

    Cross-tabulates ``dataset[variable]`` against ``dataset.deposit`` and plots
    the resulting counts as a bar chart.

    Parameters
    ----------
    variable : str
        Name of the column in the notebook-level ``dataset`` to compare
        against the ``deposit`` column.
    """
    counts_by_outcome = pd.crosstab(dataset[variable], dataset.deposit)
    counts_by_outcome.plot(kind="bar", figsize=(15, 7), color=['blue', 'red'])
    plt.title(variable + ' Distribution')
    plt.xlabel(variable)
    plt.ylabel('Frequency')
    plt.show()
    
def one_hot(dataset):
    """Append one-hot columns for every object-dtype column and drop the raw inputs.

    Dummy columns are generated with ``pd.get_dummies`` for all object-dtype
    columns and concatenated to the frame. A fixed list of source columns is
    then removed. Object columns that are not in that list (``deposit``, if
    present) are kept next to their dummy columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to encode. It must contain every column named in
        ``dropped_columns`` below. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the dummy columns added and the listed columns removed.

    Raises
    ------
    KeyError
        If any column in ``dropped_columns`` is missing from the frame.

    Notes
    -----
    Prints ``DataFrame.info()`` of the result as a side effect.
    """
    categorical_columns = dataset.select_dtypes(include=[object]).columns
    encoded = pd.concat([dataset, pd.get_dummies(dataset[categorical_columns])], axis=1)
    # The raw categorical columns are replaced by their dummies; 'day' is also
    # discarded here even though it receives no dummy columns of its own.
    dropped_columns = ['job', 'marital', 'education', 'default', 'housing',
                       'loan', 'contact', 'month', 'day', 'poutcome']
    encoded = encoded.drop(dropped_columns, axis=1)
    encoded.info()
    return encoded

## EDA on Categorical data

## EDA on numerical data

## Analysing the output (response) variable

## Building the one-hot encodings

In [None]:
# Replace the raw frame with its one-hot encoded version, then preview it.
# NOTE: this cell is not re-runnable on its own output, because one_hot drops
# columns that a second pass would try to drop again; reload bank.csv first.
dataset = one_hot(dataset)
dataset.head(n=5)

### Clean some variables

In [None]:
#Example to clean binary variables

def deposit_clean(deposit_val):
    """Map the textual deposit label to a binary flag.

    Parameters
    ----------
    deposit_val : object
        Raw label; ``'yes'`` and ``'no'`` are the recognised values.

    Returns
    -------
    int or None
        ``1`` for ``'yes'``, ``0`` for ``'no'``, and ``None`` for anything else.
    """
    for label, flag in (('yes', 1), ('no', 0)):
        if deposit_val == label:
            return flag
    # Unrecognised labels fall through without a numeric encoding.
    return None
# Encode the target as 0/1, then discard the text column together with the
# dummy columns that one-hot encoding generated for it.
dataset['depositNew'] = dataset['deposit'].apply(deposit_clean)
dataset = dataset.drop(columns=['deposit', 'deposit_no', 'deposit_yes'])

In [None]:
#Feature engineering for numerical variable

def pdays_change(pdays_val):
    """Collapse ``pdays`` into a binary flag.

    Parameters
    ----------
    pdays_val : number
        Raw ``pdays`` value.

    Returns
    -------
    int or None
        ``0`` when the value is ``-1``, ``1`` when it is zero or positive,
        and ``None`` for any other value.
    """
    if pdays_val == -1:
        return 0
    return 1 if pdays_val >= 0 else None
# Keep only the binary flag derived from pdays and discard the raw column.
dataset['pdays_new'] = dataset['pdays'].apply(pdays_change)
dataset = dataset.drop(columns=['pdays'])

## Scaling the numeric data (use `StandardScaler`)

# The Algorithms (the part we have been waiting for)

What you will want to do here is to play with each algorithm (I will introduce you to a basic implementation), and tweak parameters to improve accuracy. Use the docs, tweak at random, do it however you will. We will then compare all the algorithms side by side. Lastly, we will run the algorithms on a test dataset. 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): X and Y are not assigned in any cell visible above this one.
# Presumably they are the (scaled) feature matrix and the depositNew target;
# confirm they are defined before this cell runs on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=100,
)

# Per-algorithm scores, collected for the comparison charts.
accuracies = dict()
f1scores = dict()

## Decision Tree Classifier

## Stochastic Gradient Descent Classifier

## Logistic Regression

## Comparing accuracies

In [None]:
#Helper function for viz
def score_compare(score_dict, score_name):
    """Draw a bar chart comparing one score across algorithms.

    Parameters
    ----------
    score_dict : dict
        Maps an algorithm name to its score; keys go on the x-axis and
        values on the y-axis.
    score_name : str
        Label for the y-axis.
    """
    algorithm_names = list(score_dict.keys())
    score_values = list(score_dict.values())
    bar_palette = ["red", "green", "blue", "cyan", "yellow", 'black']

    sns.set_style("whitegrid")
    plt.figure(figsize=(16, 5))
    # Tick marks every 3 units from 0 up to (but not including) 100.
    plt.yticks(np.arange(0, 100, 3))
    plt.ylabel(score_name)
    plt.xlabel("\n\n Algorithms")
    sns.barplot(x=algorithm_names, y=score_values, palette=bar_palette)
    plt.show()

# Test Data results

Run your predictions on test data and see which model performs the best on completely unseen data.

In [None]:
# Load the held-out test set; the path is resolved relative to the notebook's working directory.
test_df = pd.read_csv(filepath_or_buffer='bank_test.csv')