# Explore here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, SplineTransformer
from sklearn.metrics import accuracy_score

# Bank-marketing campaign data; the remote CSV is semicolon-delimited.
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/logistic-regression-project-tutorial/main/bank-marketing-campaign-data.csv"
df = pd.read_csv(DATA_URL, sep=";")

In [9]:
# Display the working DataFrame.
# NOTE(review): the saved output already contains the engineered columns
# (*_outliers, log_duration, cat_euribor3m, ...), so this cell was executed
# after the feature-engineering cells further down, not right after loading.
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,cons.price.idx_upper_outliers,cons.conf.idx_lower_outliers,cons.conf.idx_upper_outliers,euribor3m_lower_outliers,euribor3m_upper_outliers,log_duration,sqrt_duration,few_campaigns,many_campaigns,cat_euribor3m
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,0,0,0,0,0,5.564524,16.155525,1,1,"(1.344, 4.857]"
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,0,0,0,0,0,5.003953,12.206597,1,1,"(1.344, 4.857]"
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,0,0,0,0,0,5.420539,15.033330,1,1,"(1.344, 4.857]"
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,0,0,0,0,0,5.017286,12.288246,1,1,"(1.344, 4.857]"
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,0,0,0,0,0,5.726851,17.521444,1,1,"(1.344, 4.857]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,0,0,0,0,0,5.811144,18.275694,1,1,"(-inf, 1.344]"
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,0,0,0,0,0,5.948038,19.570411,1,1,"(-inf, 1.344]"
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,0,0,0,0,0,5.241752,13.747763,1,1,"(-inf, 1.344]"
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,0,0,0,0,0,6.091312,21.023820,1,1,"(-inf, 1.344]"


In [3]:
# List the dtype of every column (int64 / float64 / object).
df.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [3]:
# Names of the numeric columns selected for the analysis.
# NOTE(review): 'previous' is int64 in df.dtypes but is not listed here --
# confirm the omission is intentional.
numerical_col = [
    "age",
    "duration",
    "campaign",
    "pdays",
    "emp.var.rate",
    "cons.price.idx",
    "cons.conf.idx",
    "euribor3m",
    "nr.employed",
]


Univariate Data Analysis

In [None]:
# Univariate EDA for every numeric column. For each column this cell:
#   * prints summary statistics and the number of missing values,
#   * adds two 0/1 indicator columns to df, "<col>_lower_outliers" and
#     "<col>_upper_outliers", using fences 3 IQRs beyond the quartiles,
#   * prints the coefficient of variation (std / mean, in percent),
#   * draws a histogram (with KDE) stacked on top of a horizontal boxplot.
numerical_col = ["age","duration","campaign", "pdays","emp.var.rate","cons.price.idx","cons.conf.idx","euribor3m","nr.employed"]
for col_name in numerical_col:
    values = df[col_name]

    print(f"EDA for the columns {col_name}")
    print(values.describe())
    print(f"Are there missing values for {col_name}?")
    print(values.isna().sum())

    q25 = values.quantile(0.25)
    q75 = values.quantile(0.75)
    IQR = q75 - q25
    # Lower fence: more than 3 IQRs below the first quartile.
    df[col_name + "_lower_outliers"] = np.where(values < q25 - 3 * IQR, 1, 0)
    # Upper fence: more than 3 IQRs ABOVE the third quartile. The previous
    # condition (`< q75 - 3*IQR`) tested a lower bound, so the "upper" flag
    # never marked high values.
    df[col_name + "_upper_outliers"] = np.where(values > q75 + 3 * IQR, 1, 0)
    print(f"The coefficent of variations is {100*values.std()/values.mean()} %")

    # Figure and axes: histogram on top (3/4 of the height), boxplot below.
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=False, gridspec_kw={'height_ratios': [3, 1]})

    sns.histplot(values, kde=True, ax=ax1)
    ax1.set_title('Histogram and Boxplot')
    ax1.set_ylabel('Frequency')

    # Pass the data as `x=` explicitly: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn versions, and the axis
    # labels below expect the values to run along the x axis.
    sns.boxplot(x=values, ax=ax2)
    ax2.set_xlabel('Data')
    ax2.set_ylabel('Boxplot')

    # Render now so each figure appears directly under its printed statistics.
    plt.show()


Transforming the data for visibility



In [None]:
# Feature engineering on the numeric columns. Written so the cell can be
# re-run safely: the two columns that are overwritten in place are guarded.

# Duration transforms; the 0.001 offset keeps log() finite for a duration of 0.
df['log_duration'] = np.log(df['duration'] + 0.001)
df['sqrt_duration'] = np.sqrt(df['duration'] + 0.001)

# Campaign indicators: 1 when the number of contacts is below the threshold.
# NOTE(review): 'many_campaigns' is 1 for campaign < 28, i.e. it flags rows
# with FEWER than 28 contacts -- confirm the name and threshold match the intent.
df['few_campaigns'] = np.where(df["campaign"] < 12, 1, 0)
df['many_campaigns'] = np.where(df["campaign"] < 28, 1, 0)

# pdays is replaced in place by a 0/1 flag (1 when pdays < 999).
# NOTE(review): 999 looks like a "no previous contact" sentinel -- confirm
# against the data dictionary.
# Guard: once the column is binary, `< 999` would be true for every row and a
# second run would silently set the whole column to 1.
if not df['pdays'].isin([0, 1]).all():
    df['pdays'] = np.where(df['pdays'] < 999, 1, 0)

# emp.var.rate is replaced in place by three interval bins:
# (-4, -3], (-3, 0], (0, 2]. Only bin while the column is still numeric;
# binning the already-binned column again on a re-run is not meaningful.
if pd.api.types.is_numeric_dtype(df['emp.var.rate']):
    df['emp.var.rate'] = pd.cut(df['emp.var.rate'], [-4, -3, 0, 2])

# Quartile-based categories for euribor3m (the raw column is kept as well).
quantiles = df["euribor3m"].quantile([0.25, 0.5, 0.75])
df["cat_euribor3m"] = pd.cut(
    df["euribor3m"],
    [-float("inf"), quantiles[0.25], quantiles[0.50], quantiles[0.75], float("inf")],
)

Numerical Features

In [None]:
# Columns that make up the numeric part of the design matrix:
# the raw numeric columns, the lower/upper outlier indicators, and the
# engineered duration / campaign features.
# NOTE(review): 'emp.var.rate' is replaced by interval bins in the
# transformation cell, so it may no longer be numeric here -- confirm.
# NOTE(review): no 'nr.employed_*_outliers' columns are listed -- confirm
# the omission is intentional.
_raw_numeric = [
    "age",
    "duration",
    "campaign",
    "pdays",
    "emp.var.rate",
    "cons.price.idx",
    "cons.conf.idx",
    "euribor3m",
    "nr.employed",
]
# Outlier indicators exist in this list for every raw column except the last.
_outlier_flags = [
    f"{raw_name}_{side}_outliers"
    for raw_name in _raw_numeric[:-1]
    for side in ("lower", "upper")
]
_engineered = ["log_duration", "sqrt_duration", "few_campaigns", "many_campaigns"]

numerical_features = _raw_numeric + _outlier_flags + _engineered

Categorical Features

In [None]:
# Columns handled as categorical features; each one is converted in place
# to a pandas Categorical.
categorical_col = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "day_of_week",
    "poutcome",
    "cat_euribor3m",
]
for cat_name in categorical_col:
    df[cat_name] = pd.Categorical(df[cat_name])

In [None]:

# Distribution of the log-transformed duration: histogram with KDE on top
# (3/4 of the figure height) and a horizontal boxplot underneath.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=False, gridspec_kw={'height_ratios': [3, 1]})

sns.histplot(df['log_duration'], kde=True, ax=ax1)
ax1.set_title('Histogram and Boxplot')
ax1.set_ylabel('Frequency')

# Pass the data as `x=` explicitly: the meaning of the first positional
# argument of sns.boxplot differs between seaborn versions, and the axis
# labels below expect the values to run along the x axis.
sns.boxplot(x=df['log_duration'], ax=ax2)
ax2.set_xlabel('Data')
ax2.set_ylabel('Boxplot')

# Render the figure and suppress the trailing Text(...) repr in the output.
plt.show()


Design Matrix

In [None]:
# Split the frame into the numeric and the categorical parts of the
# design matrix, selecting columns by the two name lists defined above.
X_num = df.loc[:, numerical_features]
X_cat = df.loc[:, categorical_col]
