In [68]:
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")
np.random.seed(42)

In [23]:
# Load the bank dataset from a CSV in the current working directory.
# The f.* columns listed by bdf.columns are presumably pre-engineered features — TODO confirm.
bdf = pd.read_csv("featured_bank.csv")

In [24]:
# Columns removed before modelling: the row id plus several raw numeric fields
# (age, duration and euribor3m are presumably superseded by the f.* versions — TODO confirm).
dropped_columns = [
    's.no',
    'age',
    'duration',
    'emp.var.rate',
    'cons.conf.idx',
    'euribor3m',
    'nr.employed',
]

# Columns treated as categorical features and one-hot encoded later on.
categorical_columns = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'poutcome',
    'f.euribor', 'f.age', 'f.duration', 'f.pattern',
]

In [25]:
# Column names of the raw frame, before any columns are dropped
bdf.columns

Index(['s.no', 'age', 'job', 'marital', 'education', 'default', 'housing',
       'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign',
       'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y', 'f.euribor', 'f.age',
       'f.duration', 'f.pattern'],
      dtype='object')

In [26]:
# Inspect dtypes and non-null counts of the raw frame
bdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   s.no            41188 non-null  int64  
 1   age             41188 non-null  int64  
 2   job             41188 non-null  object 
 3   marital         41188 non-null  object 
 4   education       41188 non-null  object 
 5   default         41188 non-null  object 
 6   housing         41188 non-null  object 
 7   loan            41188 non-null  object 
 8   contact         41188 non-null  object 
 9   month           41188 non-null  object 
 10  day_of_week     41188 non-null  object 
 11  duration        41188 non-null  int64  
 12  campaign        41188 non-null  int64  
 13  pdays           41188 non-null  int64  
 14  previous        41188 non-null  int64  
 15  poutcome        41188 non-null  object 
 16  emp.var.rate    41188 non-null  float64
 17  cons.price.idx  41188 non-null 

In [27]:
# Keep an untouched snapshot of the full frame so the original columns
# remain available after columns are dropped from bdf.
bdf_copy = bdf.copy()

In [28]:
# Drop the unwanted columns by rebuilding bdf from the untouched snapshot.
# Unlike an in-place drop, this cell can be re-run without raising KeyError,
# because the columns are always still present in bdf_copy.
bdf = bdf_copy.drop(columns=dropped_columns)

In [29]:
# Sanity check: 26 original columns minus the 7 dropped ones should leave 19
bdf.shape

(41188, 19)

In [30]:
# Check which columns remain after the drop, with their dtypes and non-null counts
bdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   job             41188 non-null  object 
 1   marital         41188 non-null  object 
 2   education       41188 non-null  object 
 3   default         41188 non-null  object 
 4   housing         41188 non-null  object 
 5   loan            41188 non-null  object 
 6   contact         41188 non-null  object 
 7   month           41188 non-null  object 
 8   day_of_week     41188 non-null  object 
 9   campaign        41188 non-null  int64  
 10  pdays           41188 non-null  int64  
 11  previous        41188 non-null  int64  
 12  poutcome        41188 non-null  object 
 13  cons.price.idx  41188 non-null  float64
 14  y               41188 non-null  object 
 15  f.euribor       41188 non-null  object 
 16  f.age           41188 non-null  object 
 17  f.duration      41188 non-null 

In [31]:
# Build a separate frame holding only the categorical features, cast to the
# pandas 'category' dtype; bdf itself is left untouched.
bdf_cat = bdf.loc[:, categorical_columns].astype("category")

In [32]:
# bdf itself is NOT converted: the cast above returned a new frame (bdf_cat),
# so this still reports the categorical features as object dtype
bdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   job             41188 non-null  object 
 1   marital         41188 non-null  object 
 2   education       41188 non-null  object 
 3   default         41188 non-null  object 
 4   housing         41188 non-null  object 
 5   loan            41188 non-null  object 
 6   contact         41188 non-null  object 
 7   month           41188 non-null  object 
 8   day_of_week     41188 non-null  object 
 9   campaign        41188 non-null  int64  
 10  pdays           41188 non-null  int64  
 11  previous        41188 non-null  int64  
 12  poutcome        41188 non-null  object 
 13  cons.price.idx  41188 non-null  float64
 14  y               41188 non-null  object 
 15  f.euribor       41188 non-null  object 
 16  f.age           41188 non-null  object 
 17  f.duration      41188 non-null 

In [33]:
# The converted frame: every selected feature should now report 'category' dtype
bdf_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   job          41188 non-null  category
 1   marital      41188 non-null  category
 2   education    41188 non-null  category
 3   default      41188 non-null  category
 4   housing      41188 non-null  category
 5   loan         41188 non-null  category
 6   contact      41188 non-null  category
 7   month        41188 non-null  category
 8   day_of_week  41188 non-null  category
 9   poutcome     41188 non-null  category
 10  f.euribor    41188 non-null  category
 11  f.age        41188 non-null  category
 12  f.duration   41188 non-null  category
 13  f.pattern    41188 non-null  category
dtypes: category(14)
memory usage: 568.1 KB


In [35]:
# Everything that is not a categorical feature: the numeric predictors plus the target 'y'
bdf_noncat = bdf.drop(columns=categorical_columns)

In [36]:
# Columns left once the categorical features are removed
bdf_noncat.columns

Index(['campaign', 'pdays', 'previous', 'cons.price.idx', 'y'], dtype='object')

In [37]:
# One-hot encode every categorical feature (one indicator column per level).
# dtype is pinned so the indicators are 0/1 integers on every pandas version:
# pandas >= 2.0 would otherwise default to bool (True/False).
# NOTE(review): all levels are kept (drop_first is left at its default of False),
# so the indicators of each feature sum to 1 — consider drop_first=True if an
# unregularised linear model is fitted on these columns.
bdf_cat_one_hot = pd.get_dummies(bdf_cat, dtype="uint8")

In [39]:
# Number of rows and indicator columns produced by the encoding
bdf_cat_one_hot.shape

(41188, 117)

In [62]:
# Stitch the non-categorical columns and the one-hot indicators back together
# side by side (rows are aligned on the shared index).
bdf_master = pd.concat([bdf_noncat, bdf_cat_one_hot], axis="columns")

In [63]:
# Preview the first rows of the combined frame
bdf_master.head()

Unnamed: 0,campaign,pdays,previous,cons.price.idx,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,...,f.pattern_PE1N,f.pattern_PE1O,f.pattern_PE4A,f.pattern_PE4B,f.pattern_PE4C,f.pattern_PE4D,f.pattern_PE4E,f.pattern_PE4F,f.pattern_PE4G,f.pattern_no
0,1,999,0,93.994,no,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,1,999,0,93.994,no,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,999,0,93.994,no,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,999,0,93.994,no,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,999,0,93.994,no,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [64]:
# Pull the target out of the feature frame. pop() both returns the 'y' column
# and removes it from bdf_master in place, so re-running this cell without
# rebuilding bdf_master raises KeyError.
y = bdf_master.pop('y')

In [65]:
# Shape of bdf_master once the target column has been popped (features only)
bdf_master.shape

(41188, 121)

In [66]:
# The target was already removed from bdf_master by the pop() call above, so the
# frame now holds features only. X is another name for the same object, not a copy.
X = bdf_master

In [69]:
# Hold out 25% of the rows for testing; random_state is fixed so the split is repeatable.
# NOTE(review): no stratify=y is passed, so the yes/no ratio in each part is not
# guaranteed — it is only checked by eye in the value_counts() cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=101)

In [70]:
# Rows and feature count of the training part
X_train.shape

(30891, 121)

In [71]:
# Number of held-out test labels
y_test.shape

(10297,)

In [73]:
# Class balance of the training labels
y_train.value_counts()

no     27416
yes     3475
Name: y, dtype: int64

In [74]:
# Class balance of the test labels
y_test.value_counts()

no     9132
yes    1165
Name: y, dtype: int64

In [75]:
# the proportion of yes and no looks the same in train and test (about 11% yes in each)