In [1]:
# Importing Required Python Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas Future/Deprecation warnings.
warnings.simplefilter('ignore')
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None
# Optional: lift the row limit as well.
#pd.set_option('display.max_rows',None)

In [2]:
# Load 'bank_additional_full.csv' (semicolon-delimited); the literal string
# 'unknown' is parsed as a missing value (NaN).
bank_df = pd.read_csv(
    'bank_additional_full.csv',
    delimiter=';',
    na_values=['unknown'],
)

In [3]:
# Print a summary of bank_df: row range, per-column non-null counts and dtypes.
bank_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             40858 non-null  object 
 2   marital         41108 non-null  object 
 3   education       39457 non-null  object 
 4   default         32591 non-null  object 
 5   housing         40198 non-null  object 
 6   loan            40198 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

## Processing the Object/Categorical Dataset

In [33]:
# Keep only the object-dtype (string/categorical) columns of bank_df.
cat_df = bank_df.select_dtypes(include=['object'])

In [34]:
# Print a summary of cat_df: per-column non-null counts and dtypes.
cat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   job          40858 non-null  object
 1   marital      41108 non-null  object
 2   education    39457 non-null  object
 3   default      32591 non-null  object
 4   housing      40198 non-null  object
 5   loan         40198 non-null  object
 6   contact      41188 non-null  object
 7   month        41188 non-null  object
 8   day_of_week  41188 non-null  object
 9   poutcome     41188 non-null  object
 10  y            41188 non-null  object
dtypes: object(11)
memory usage: 3.5+ MB


In [36]:
# Remove the response variable y so that cat_df holds only categorical predictors.
# errors='ignore' keeps this cell re-runnable: once y has been dropped, running
# the cell again is a no-op instead of raising KeyError.
cat_df = cat_df.drop(columns='y', errors='ignore')

In [62]:
# Keep the categorical columns that contain at least one missing value.
has_missing = cat_df.isna().any()
cat_df_null = cat_df.loc[:, has_missing]

In [63]:
# Inspecting the above dataframe (columns with missing values): non-null counts and dtypes.
cat_df_null.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   job        40858 non-null  object
 1   marital    41108 non-null  object
 2   education  39457 non-null  object
 3   default    32591 non-null  object
 4   housing    40198 non-null  object
 5   loan       40198 non-null  object
dtypes: object(6)
memory usage: 1.9+ MB


In [83]:
# Keep the categorical columns that are fully populated (no missing value in any row).
fully_populated = cat_df.notna().all()
cat_df_notnull = cat_df.loc[:, fully_populated]

In [84]:
# Inspecting the above dataframe (columns without missing values): non-null counts and dtypes.
cat_df_notnull.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   contact      41188 non-null  object
 1   month        41188 non-null  object
 2   day_of_week  41188 non-null  object
 3   poutcome     41188 non-null  object
dtypes: object(4)
memory usage: 1.3+ MB


#### Creating dummy variables for the categorical columns without missing values

In [85]:
# One-hot encode the fully-populated categorical columns, dropping the first
# level of each column so that level acts as the baseline.
# get_dummies encodes object/category columns by default and prefixes every
# dummy with its source column name, so astype('category') and prefix= are
# not needed. Leaving them out also makes this cell safe to re-run: a second
# execution finds no object columns left and returns the 0/1 frame unchanged
# instead of one-hot encoding the dummies again.
cat_df_notnull = pd.get_dummies(cat_df_notnull, drop_first=True)

In [86]:
# Inspecting the dummy-encoded dataframe: one column per retained category level.
cat_df_notnull.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   contact_telephone     41188 non-null  uint8
 1   month_aug             41188 non-null  uint8
 2   month_dec             41188 non-null  uint8
 3   month_jul             41188 non-null  uint8
 4   month_jun             41188 non-null  uint8
 5   month_mar             41188 non-null  uint8
 6   month_may             41188 non-null  uint8
 7   month_nov             41188 non-null  uint8
 8   month_oct             41188 non-null  uint8
 9   month_sep             41188 non-null  uint8
 10  day_of_week_mon       41188 non-null  uint8
 11  day_of_week_thu       41188 non-null  uint8
 12  day_of_week_tue       41188 non-null  uint8
 13  day_of_week_wed       41188 non-null  uint8
 14  poutcome_nonexistent  41188 non-null  uint8
 15  poutcome_success      41188 non-null  uint8
dtypes: u

#### Creating dummy variables for the categorical columns with missing values

In [64]:
# Summary of the object/categorical columns that have missing values
# (non-null counts and dtypes) before they are dummy-encoded.
cat_df_null.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   job        40858 non-null  object
 1   marital    41108 non-null  object
 2   education  39457 non-null  object
 3   default    32591 non-null  object
 4   housing    40198 non-null  object
 5   loan       40198 non-null  object
dtypes: object(6)
memory usage: 1.9+ MB


In [68]:
# One-hot encode the categorical columns that contain missing values.
# drop_first is deliberately not used: get_dummies creates no column for NaN
# (dummy_na defaults to False), so a missing entry is 0 in every dummy of its
# source column and already serves as the baseline level.
# get_dummies encodes object/category columns by default and prefixes every
# dummy with its source column name, so astype('category') and prefix= are
# not needed. Leaving them out also makes this cell safe to re-run: a second
# execution finds no object columns left and returns the 0/1 frame unchanged
# instead of one-hot encoding the dummies again.
cat_df_null = pd.get_dummies(cat_df_null)

In [69]:
# Inspecting the resulting dummy-encoded dataset: column names, non-null counts and dtypes.
cat_df_null.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 27 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   job_admin.                     41188 non-null  uint8
 1   job_blue-collar                41188 non-null  uint8
 2   job_entrepreneur               41188 non-null  uint8
 3   job_housemaid                  41188 non-null  uint8
 4   job_management                 41188 non-null  uint8
 5   job_retired                    41188 non-null  uint8
 6   job_self-employed              41188 non-null  uint8
 7   job_services                   41188 non-null  uint8
 8   job_student                    41188 non-null  uint8
 9   job_technician                 41188 non-null  uint8
 10  job_unemployed                 41188 non-null  uint8
 11  marital_divorced               41188 non-null  uint8
 12  marital_married                41188 non-null  uint8
 13  marital_single  

In [75]:
# Standardizing the column names of the dummy-encoded dataframe:
# trim surrounding whitespace and turn every literal '.' into '_'.
# regex=False is passed explicitly because the default of `regex` differs
# between pandas versions, and in regex mode '.' matches ANY character.
cat_df_null.columns = (
    cat_df_null.columns
    .str.strip()
    .str.replace('.', '_', regex=False)
)

In [79]:
# Correcting a single column name: 'job_admin_' (left over from the source
# label 'job_admin.') becomes 'job_admin'.
# Reassign instead of using inplace=True so the cell produces a new frame
# rather than mutating one in place; rename ignores a missing key by default,
# so re-running the cell is harmless.
cat_df_null = cat_df_null.rename(columns={'job_admin_': 'job_admin'})

In [80]:
# Checking the column names after standardization and the job_admin rename.
cat_df_null.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 27 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   job_admin                      41188 non-null  uint8
 1   job_blue-collar                41188 non-null  uint8
 2   job_entrepreneur               41188 non-null  uint8
 3   job_housemaid                  41188 non-null  uint8
 4   job_management                 41188 non-null  uint8
 5   job_retired                    41188 non-null  uint8
 6   job_self-employed              41188 non-null  uint8
 7   job_services                   41188 non-null  uint8
 8   job_student                    41188 non-null  uint8
 9   job_technician                 41188 non-null  uint8
 10  job_unemployed                 41188 non-null  uint8
 11  marital_divorced               41188 non-null  uint8
 12  marital_married                41188 non-null  uint8
 13  marital_single  

### Concatenating the two categorical dataframes above into a single one

In [87]:
# Place the dummy columns of cat_df_notnull and cat_df_null side by side
# (column-wise) to form the final categorical dataframe cat_df_final.
cat_df_final = pd.concat(
    objs=[cat_df_notnull, cat_df_null],
    axis='columns',
)

In [88]:
# Checking the combined categorical dataframe: column names, non-null counts and dtypes.
cat_df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 43 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   contact_telephone              41188 non-null  uint8
 1   month_aug                      41188 non-null  uint8
 2   month_dec                      41188 non-null  uint8
 3   month_jul                      41188 non-null  uint8
 4   month_jun                      41188 non-null  uint8
 5   month_mar                      41188 non-null  uint8
 6   month_may                      41188 non-null  uint8
 7   month_nov                      41188 non-null  uint8
 8   month_oct                      41188 non-null  uint8
 9   month_sep                      41188 non-null  uint8
 10  day_of_week_mon                41188 non-null  uint8
 11  day_of_week_thu                41188 non-null  uint8
 12  day_of_week_tue                41188 non-null  uint8
 13  day_of_week_wed 

### Combining the numerical and categorical columns to create the final dataset

In [89]:
# Load the numerical-columns dataset from 'Num_df.csv'.
# NOTE(review): this file is not created anywhere in the visible cells —
# presumably written by an earlier step; confirm it exists before running.
num_df = pd.read_csv(filepath_or_buffer='Num_df.csv')

In [90]:
# Checking the numerical dataset: column names, non-null counts and dtypes.
num_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   duration        41188 non-null  float64
 2   campaign        41188 non-null  int64  
 3   pdays           41188 non-null  int64  
 4   previous        41188 non-null  int64  
 5   emp_var_rate    41188 non-null  float64
 6   cons_price_idx  41188 non-null  float64
 7   cons_conf_idx   41188 non-null  float64
 8   euribor3m       41188 non-null  float64
 9   nr_employed     41188 non-null  float64
 10  y               41188 non-null  int64  
dtypes: float64(6), int64(5)
memory usage: 3.5 MB


In [91]:
# Concatenating the numerical and categorical datasets column-wise to produce the final dataset.
# A column-wise concat aligns rows by index label, so a mismatch between the
# frame reloaded from CSV and the in-memory categorical frame would silently
# introduce NaN rows; fail loudly instead.
assert num_df.index.equals(cat_df_final.index), (
    'num_df and cat_df_final must share the same row index before concatenation'
)
final_df = pd.concat([num_df, cat_df_final], axis=1)

In [92]:
# Checking the final dataset: column names, non-null counts and dtypes.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 54 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            41188 non-null  int64  
 1   duration                       41188 non-null  float64
 2   campaign                       41188 non-null  int64  
 3   pdays                          41188 non-null  int64  
 4   previous                       41188 non-null  int64  
 5   emp_var_rate                   41188 non-null  float64
 6   cons_price_idx                 41188 non-null  float64
 7   cons_conf_idx                  41188 non-null  float64
 8   euribor3m                      41188 non-null  float64
 9   nr_employed                    41188 non-null  float64
 10  y                              41188 non-null  int64  
 11  contact_telephone              41188 non-null  uint8  
 12  month_aug                      41188 non-null 

In [94]:
# Write the final dataset to 'Final_df.csv', leaving out the row index.
final_df.to_csv(
    path_or_buf='Final_df.csv',
    index=False,
)

In [95]:
# Write the final categorical dataframe to 'Cat_df.csv', leaving out the row index.
cat_df_final.to_csv(
    path_or_buf='Cat_df.csv',
    index=False,
)