## Bank Marketing

## Import Essential libraries

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

## Reading the data

In [4]:
#read the data
#na_values="unknown" makes pandas parse the literal string "unknown" as a missing value (NaN) in every column
df = pd.read_csv("bank-full.csv", na_values= "unknown")

# Description:
1.	17 columns & 45248 rows
2.	Columns:
1 - Age (numeric)
2 - Job : type of Job
3 - Marital : marital status 
4 – Education
5 - Default: has credit in default? (categorical: 'no','yes','unknown')
6 - Housing: has housing loan? (categorical: 'no','yes','unknown')
7 - Loan: has personal loan? (categorical: 'no','yes','unknown')
8 - contact: contact communication 
9 - month: last contact month of year 
10 - day: last contact day of the month (numeric)
11 - duration:  last contact duration, in seconds (numeric)

Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
12 - campaign: number of contacts performed during this campaign 
13 - pdays: number of days that passed by after the client was last contacted
14 - previous: number of contacts performed before this campaign and for this client (numeric)
15 - poutcome: outcome of the previous marketing campaign 
16 - y: has the client subscribed to a term deposit? (binary: 'yes','no')
17- balance is the current balance of each customer


In [6]:
#check type/no of rows and columns/missing values in each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45248 entries, 0 to 45247
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45248 non-null  int64 
 1   job        44959 non-null  object
 2   marital    45248 non-null  object
 3   education  43390 non-null  object
 4   default    45248 non-null  object
 5   balance    45248 non-null  int64 
 6   housing    45248 non-null  object
 7   loan       45248 non-null  object
 8   contact    32228 non-null  object
 9   day        45248 non-null  int64 
 10  month      45248 non-null  object
 11  duration   45248 non-null  int64 
 12  campaign   45248 non-null  int64 
 13  pdays      45248 non-null  int64 
 14  previous   45248 non-null  int64 
 15  poutcome   8276 non-null   object
 16  y          45248 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [7]:
#check no of rows and columns
df.shape

(45248, 17)

In [8]:
#first 5 rows
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no


## Check Duplicates

In [9]:
#check duplicates
df.duplicated().sum()

37

In [10]:
#remove fully duplicated rows (the first occurrence of each is kept)
df = df.drop_duplicates()

In [11]:
#check duplicates after drop
df.duplicated().sum()

0

## Check Missing Values

In [None]:
 #check which columns has na values
df.isna().any()

In [None]:
# check no of na values in each column
df.isna().sum()
#There are nulls in job/education/ contact and poutcome columns

In [None]:
#share of missing values per column, expressed as a percentage
df.isna().mean() * 100

In [None]:
#target distribution among the rows where job is missing
df.loc[df["job"].isna(), "y"].value_counts()

In [None]:
#target distribution among the rows where education is missing
df.loc[df["education"].isna(), "y"].value_counts()

In [None]:
#target distribution among the rows where contact is missing
df.loc[df["contact"].isna(), "y"].value_counts()

In [None]:
#target distribution among the rows where poutcome is missing
df.loc[df["poutcome"].isna(), "y"].value_counts()

In [None]:
#drop poutcome: more than half of its values are missing
df = df.drop(columns="poutcome")

In [None]:
#drop the rows with a missing job or education (these columns have few missing values)
df = df.dropna(subset=["job", "education"])

In [None]:
#check values of contact columns to use in the next cell
df["contact"].value_counts()

In [None]:
#fill missing contact values with the most frequent category (the mode)
#The result is assigned back to the column: calling fillna(inplace=True) on df["contact"]
#operates on an intermediate Series and is not guaranteed to update df
#(pandas warns about this chained assignment and ignores it under Copy-on-Write).
df["contact"] = df["contact"].fillna(df["contact"].mode()[0])

In [None]:
#check values of conotact column
df["contact"].value_counts()

In [None]:
#check columns type and nulls 
df.info()

## Univariate Analysis

In [None]:
#print columns of the data to use below
df.columns

In [None]:
#The distribution of the Age
#The figure size is set before drawing; setting it afterwards only affects later plots
sns.set(rc={'figure.figsize':(8,8.27)})
#histplot with a KDE overlay on a density scale replaces the deprecated distplot
sns.histplot(df["age"], kde=True, stat="density")

In [None]:
#count of each target class
#x is passed by keyword: recent seaborn versions read the first positional argument as `data`
sns.countplot(data=df, x="y", palette="Set2")

In [None]:
df["age"].value_counts().head(10)

In [None]:
#Number of clients per job category (frequency in the data, not campaign acceptance)
df["job"].value_counts()

In [None]:
#Pie chart of the category shares for every categorical (object) column
#The figure size is set once before plotting so the first pie uses it too
sns.set(rc={'figure.figsize':(11,8.27)})
cat = list(df.select_dtypes(include = "object").columns)
for column in cat:
    df.groupby(column).size().plot(kind='pie',label = column,autopct='%1.0f%%',shadow=True)
    plt.show()

In [None]:
#Pie chart of the target classes
#The figure size is set before plotting; after plt.show() it would only affect later figures
sns.set(rc={'figure.figsize':(11,8.27)})
df.groupby("y").size().plot(kind='pie',label = "y",autopct='%1.0f%%',shadow=True)
plt.show()

In [None]:
#Number of clients with / without credit in default
df["default"].value_counts()

In [None]:
#Clients with / without credit in default, shown as a bar chart
#x is passed by keyword: recent seaborn versions read the first positional argument as `data`
sns.countplot(data=df, x="default", palette="Set2")

In [None]:
#Number of clients per last-contact month (this counts contacts, not accepted offers)
df["month"].value_counts()

In [None]:
#Number of clients per last-contact month, bars ordered from most to least frequent
#x is passed by keyword: recent seaborn versions read the first positional argument as `data`
sns.countplot(data=df, x="month", order=df["month"].value_counts().index, palette="Set2")

In [None]:
#Most people don't accept marketing campaign
df["y"].value_counts()

In [None]:
#Most people don't accept marketing campaign by visualization
#x is passed by keyword: recent seaborn versions read the first positional argument as `data`
sns.countplot(data=df, x="y", palette="Set2")

In [None]:
#Most people have housing loans
df["housing"].value_counts()

In [None]:
#Most people have housing loans by visualization
#x is passed by keyword: recent seaborn versions read the first positional argument as `data`
sns.countplot(data=df, x="housing", palette="Set2")

In [None]:
#Most people are married
df["marital"].value_counts()

In [None]:
#Most people are married by visualization
#x is passed by keyword: recent seaborn versions read the first positional argument as `data`
sns.countplot(data=df, x="marital", palette="Set2")

In [None]:
#Most people have no personal loan by visualization
#x is passed by keyword: recent seaborn versions read the first positional argument as `data`
sns.countplot(data=df, x="loan", palette="Set2")

In [None]:
#Distribution of campaign (no of contacts during this Campaign)
#The figure size is set before drawing; setting it afterwards only affects later plots
sns.set(rc={'figure.figsize':(11,8.27)})
#histplot with a KDE overlay on a density scale replaces the deprecated distplot
sns.histplot(df["campaign"], kde=True, stat="density")

In [None]:
#Distribution of duration (last contact duration in seconds)
#The figure size is set before drawing; setting it afterwards only affects later plots
sns.set(rc={'figure.figsize':(11,8.27)})
#histplot with a KDE overlay on a density scale replaces the deprecated distplot
sns.histplot(df["duration"], kde=True, stat="density")

In [None]:
#Distribution of previous (no of contacts before this campaign)
#The figure size is set before drawing; setting it afterwards only affects later plots
sns.set(rc={'figure.figsize':(14,8.27)})
#histplot with a KDE overlay on a density scale replaces the deprecated distplot
sns.histplot(df["previous"], kde=True, stat="density")

In [None]:
#Number of clients per last-contact day (this counts contacts, not accepted offers)
#The figure size is set before drawing; setting it afterwards only affects later plots
sns.set(rc={'figure.figsize':(14,8.27)})
sns.countplot(data=df, y = "day")

In [None]:
# Rows where pdays is -1: a day count should not be negative.
# NOTE(review): -1 is presumably a sentinel for "not contacted before" (the sample rows
# with pdays == -1 also have previous == 0) — confirm against the dataset documentation.
df[df["pdays"] == -1]

In [None]:
#fraction of rows whose pdays equals -1
len(df.loc[df["pdays"] == -1]) / len(df)

In [None]:

#remove the pdays column
df = df.drop(columns=["pdays"])

In [None]:
#call duration converted from seconds to minutes
df["duration_in_mins"] = df["duration"].div(60)

In [None]:
#target distribution for calls longer than 15 minutes
df.loc[df["duration_in_mins"] > 15.00, "y"].value_counts()

In [None]:
#target distribution for calls shorter than 15 minutes
df.loc[df["duration_in_mins"] < 15.00, "y"].value_counts()

In [None]:
#target distribution for clients with a negative balance
df.loc[df["balance"] < 0, "y"].value_counts()

In [None]:
#target distribution for clients with a balance above 1000
df.loc[df["balance"] > 1000, "y"].value_counts()

## Bivariate Analysis

In [None]:
#Correlation of the numeric columns
#df still contains text columns here; corr() raises on them in pandas >= 2,
#so only the numeric columns are kept before computing the correlation matrix
sns.heatmap(df.select_dtypes(include="number").corr(), annot= True, robust=True)

In [None]:
sns.lineplot(x="age",data =df, y="job" )

In [None]:
sns.lineplot(x="age",data =df, y="balance",estimator='mean')

In [None]:
sns.lineplot(x="age",data =df, y="duration_in_mins")

In [None]:
df.pivot_table(columns="y", index = "education",values = "balance",aggfunc="sum")

In [None]:
df.pivot_table(columns="housing", index = "education",values = "balance",aggfunc="sum")

In [None]:
df.pivot_table(columns="y", index = "job",values = "balance",aggfunc="sum")

In [None]:
# Descriptive Statistics of the data
df.describe(include = "all")

In [None]:
import plotly.express as px

In [None]:
#box plot of every non-object column
Num = list(df.select_dtypes(exclude= "object").columns)
for i in Num:
    print(i)
    #`labels` expects a dict (column name -> display text), so the bare string was dropped;
    #show() returns None, so it is called directly instead of being printed
    px.box(data_frame= df, x= i).show()

In [None]:
#sns.boxplot(data = df, x= "age")

In [None]:
#df[df["balance"] != 0]["balance"].value_counts().head(10)

In [None]:
    #number of clients older than 80 (these rows are dropped in the next cell)
    len(df.loc[df["age"] > 80])

In [None]:
#remove the rows of clients older than 80
df = df.drop(index=df.index[df["age"] > 80])

In [None]:
df["age"].max()

 ## Preprocessing

In [None]:
#drop balance and the raw duration in seconds (duration_in_mins is kept)
df = df.drop(columns=["balance", "duration"])

In [None]:
df = pd.get_dummies(data=df,drop_first=True)

In [None]:
df.columns

In [None]:
#give the one-hot encoded target column its original name back
df = df.rename(columns={'y_yes': 'y'})

In [None]:
#separate the target (y) from the feature matrix (x)
y = df["y"]
x = df.drop(columns=["y"])

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

#Split first, so the held-out test set keeps the real class distribution and is never
#touched by the sampler; stratify keeps the yes/no ratio the same in both parts
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2, random_state= 15,shuffle=True,stratify=y)

#Balance only the training part by randomly under-sampling the majority class
sm = RandomUnderSampler(random_state=42)
x_res, y_res = sm.fit_resample(x_train, y_train)
x_train, y_train = x_res, y_res

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split of the full x, y with a fixed seed.
# NOTE(review): this reassigns x_train/x_test/y_train/y_test that the under-sampling cell
# just above already produced, so that split is discarded before any model is trained —
# confirm which of the two splits is intended.
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2, random_state= 15,shuffle=True)

In [None]:
from sklearn.preprocessing import RobustScaler
#fit the scaler on the training rows only, then apply the same scaling to both splits
scaler = RobustScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,f1_score

#train a logistic regression and predict the held-out rows
model = LogisticRegression()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

#report, in order: confusion matrix, accuracy, per-class report, F1 score
for metric in (confusion_matrix, accuracy_score, classification_report, f1_score):
    print(metric(y_test, y_predict))


In [None]:
from sklearn.neighbors import KNeighborsClassifier
#f1_score is imported here too: the cell uses it and must not rely on an earlier cell's import
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score

#train a k-nearest-neighbours classifier and predict the held-out rows
model = KNeighborsClassifier()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

print(confusion_matrix(y_test,y_predict))
print(accuracy_score(y_test,y_predict))
print(classification_report(y_test,y_predict))
print(f1_score(y_test,y_predict))

In [None]:
from sklearn.tree import DecisionTreeClassifier
#f1_score is imported here too: the cell uses it and must not rely on an earlier cell's import
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score

#train a decision tree; random_state is fixed so a re-run reproduces the same tree and scores
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

print(confusion_matrix(y_test,y_predict))
print(accuracy_score(y_test,y_predict))
print(classification_report(y_test,y_predict))
print(f1_score(y_test,y_predict))

In [None]:
from sklearn.svm import SVC
#f1_score is imported here too: the cell uses it and must not rely on an earlier cell's import
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score

#train a support vector classifier and predict the held-out rows
model = SVC()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

print(confusion_matrix(y_test,y_predict))
print(accuracy_score(y_test,y_predict))
print(classification_report(y_test,y_predict))
print(f1_score(y_test,y_predict))

In [None]:
from sklearn.ensemble import RandomForestClassifier
#f1_score is imported here too: the cell uses it and must not rely on an earlier cell's import
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score

#train a random forest; random_state is fixed so a re-run reproduces the same forest and scores
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

print(confusion_matrix(y_test,y_predict))
print(accuracy_score(y_test,y_predict))
print(classification_report(y_test,y_predict))
print(f1_score(y_test,y_predict))

In [None]:
#x_train is a NumPy array at this point (it was replaced by the output of scaler.transform),
#and arrays have no select_dtypes; wrapping it in a DataFrame works for both arrays and frames.
#NOTE(review): for an array the result is positional integer labels, not feature names.
numeric_columns = pd.DataFrame(x_train).select_dtypes(exclude = "object").columns

In [None]:
#x_train is a NumPy array at this point (it was replaced by the output of scaler.transform),
#and arrays have no select_dtypes; wrapping it in a DataFrame works for both arrays and frames.
#NOTE(review): the features were already one-hot encoded with get_dummies, so this selection
#is expected to be empty — confirm whether the pipeline should be fitted on the raw columns instead.
Categorical_columns = pd.DataFrame(x_train).select_dtypes(include = "object").columns

In [None]:
from sklearn.preprocessing import RobustScaler #for scaling
from sklearn.impute import SimpleImputer #for handling missing values
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler

#numeric columns: scale only (no centering)
numeric_features = Pipeline(steps = [("scaling",RobustScaler(with_centering=False))])
#categorical columns: fill missing values with the most frequent category, one-hot encode, then scale.
#handle_unknown="ignore" encodes a category that was not seen during fit as all zeros instead of
#raising, so a prediction request with a new category value does not crash the pipeline.
Categorical_features = Pipeline(steps =[
    ("handlingmissing", SimpleImputer(strategy= "most_frequent")),
    ("Encoding",OneHotEncoder(handle_unknown="ignore")),
    ("scaling",RobustScaler(with_centering=False)),
])


In [None]:
from sklearn.compose import ColumnTransformer
#send the numeric and the categorical columns through their own sub-pipelines
processing = ColumnTransformer(
    [
        ("numeric", numeric_features, numeric_columns),
        ("cat", Categorical_features, Categorical_columns),
    ]
)

In [None]:
processing

In [None]:
from sklearn.ensemble import RandomForestClassifier
#full pipeline: preprocessing followed by a random forest.
#random_state is fixed so the model that gets saved can be reproduced by re-running the notebook
Final_pip = Pipeline(steps = [("preprocessing", processing),("modelling",RandomForestClassifier(random_state=42))])

In [None]:
Final_pip

In [None]:
Final_pip.fit(x_train,y_train)

In [None]:
Final_pip.predict(x_test)

In [None]:
import joblib
# Persist the fitted pipeline to disk. joblib.dump returns the list of file names it wrote,
# so model_save holds that list here (not the pipeline) until the next cell reloads the file.
model_save = joblib.dump(Final_pip,"model.plk")

In [None]:
import joblib
model_save = joblib.load("model.plk")

In [None]:
model_save

In [None]:
import streamlit

In [None]:
df.columns

In [None]:
%%writefile app_1.py
import numpy as np
import pandas as pd
import streamlit as st
import joblib

# Load the object stored in "model.plk" once at import time; predict_deposit calls its predict().
# NOTE(review): joblib.load unpickles the file, so only load a model file you produced yourself.
regression  = joblib.load("model.plk")

def welcome():
    """Return the static greeting text."""
    greeting = "Welcome All"
    return greeting

def predict_deposit (age, job, marital, education, default, housing, loan, contact, day, month, duration_in_mins, campaign, pdays, previous):
    """Predict the target for one client with the loaded model.

    Each argument becomes one column of a single-row DataFrame that is passed
    to ``regression.predict``. The prediction is printed and returned as-is.

    NOTE(review): the training notebook drops ``pdays`` before fitting — confirm
    that the saved model expects the columns built here.
    """
    row = {
        'age': age,
        'job': job,
        'marital': marital,
        'education': education,
        'default': default,
        'housing': housing,
        'loan': loan,
        'contact': contact,
        'day': day,
        'month': month,
        'duration_in_mins': duration_in_mins,
        'campaign': campaign,
        'pdays': pdays,
        'previous': previous,
    }
    # one-element lists give a frame with exactly one row, columns in the order above
    features = pd.DataFrame({name: [value] for name, value in row.items()})
    prediction = regression.predict(features)
    print(prediction)
    return prediction

def main():
    """Render the Streamlit form and show the model prediction on demand.

    Collects the client features through input widgets and, when the
    "predict" button is pressed, passes them to ``predict_deposit`` and
    displays the returned prediction.
    """
    st.title("Bank Marketing")
    # Two typos broke the banner markup and were fixed:
    # "padding;10px" -> "padding:10px", and the misspelled "stylr" attribute -> "style".
    html_temp = """
    <div style ="background-color:tomato;padding:10px">
    <h2 style = "color:white;text-align:center;">streamlit Bank Marketing App</h2>
    </div>
    """
    st.markdown(html_temp,unsafe_allow_html =True)

    # Numeric features use number_input so the model receives numbers rather than the
    # free-text / placeholder strings ("Type Here") that text_input returns.
    age = st.number_input("age", min_value=0, step=1)
    job = st.text_input("job","Type Here")
    marital = st.text_input("marital","Type Here")
    education = st.text_input("education","Type Here")
    default = st.text_input("default","Type Here")
    housing = st.text_input("housing","Type Here")
    loan = st.text_input("loan","Type Here")
    contact = st.text_input("contact","Type Here")
    # assumes day is the day of the month — TODO confirm the valid range
    day = st.number_input("day", min_value=1, max_value=31, step=1)
    month = st.text_input("month","Type Here")
    duration_in_mins = st.number_input("duration_in_mins", min_value=0.0, step=0.5)
    campaign = st.number_input("campaign", min_value=0, step=1)
    # -1 occurs in the data for pdays, so no lower bound is enforced here
    pdays = st.number_input("pdays", value=-1, step=1)
    previous = st.number_input("previous", min_value=0, step=1)

    # The result banner is shown only after a prediction was actually requested;
    # previously an empty "The output is " banner was rendered on every page load.
    if st.button("predict"):
        result = predict_deposit(age, job, marital, education, default, housing, loan, contact, day, month, duration_in_mins, campaign, pdays, previous)
        st.success("The output is {}".format(result))


if __name__=='__main__':
    main()

In [None]:
! streamlit run app_1.py

In [None]:
df.head()