# **(1) SETUP**

**Importing the Library**

In [31]:
# importing the pandas library 
import pandas as pd 
# importing matplotlib library
import matplotlib.pyplot as plt 
# importing numpy library
import numpy as np 
# importing seaborn library
import seaborn as sns 
# importing preprocessing module from sklearn library
from sklearn import preprocessing 
%matplotlib inline

---
# **(2) Import Test Dataset**

| Enter the file name and path in `pd.read_csv()` to load the test data

---



In [32]:
# Load the test dataset from CSV into the dataframe 'df'.
# NOTE: '.csv' is a placeholder - put the real file name/path here before running.
df = pd.read_csv(filepath_or_buffer='.csv')

# **(3) Separating numerical and categorical data from the input features**

In [33]:
# Split the input features by dtype.
# Every non-numeric column is treated as categorical data ...
categorical_data = df.select_dtypes(exclude='number')
# ... and every numeric column as numerical data
numerical_data = df.select_dtypes(include='number')

# **(4) DATA PRE-PROCESSING**

## **Converting categorical data to numeric data using encoding**
---

### **Label Encoding**

---






**FOR ORDINAL CATEGORICAL VARIABLES**


> For our dataset, we have only one ordinal categorical variable, i.e. 'education', with the categories primary, secondary and tertiary, which are converted to 0, 1 and 2 respectively



In [None]:
# Ordinal encoding of the 'education' column.
#
# The codes are pinned explicitly instead of re-fitting a LabelEncoder on the test file:
# LabelEncoder numbers the classes it sees in sorted order, so a test file that happens to
# miss one level (e.g. no 'primary' rows) would silently shift every other code.
# The mapping below is exactly what LabelEncoder produces when all four levels are present.
# NOTE(review): assumes the training data contained these four levels - confirm against the
# training notebook.
EDUCATION_CODES = {'primary': 0, 'secondary': 1, 'tertiary': 2, 'unknown': 3}

# View data before conversion
print(categorical_data.head())

# Validate against the raw column so an unknown level fails loudly instead of being mis-coded
unexpected_levels = set(df['education'].unique()) - set(EDUCATION_CODES)
if unexpected_levels:
    raise ValueError(
        "Unexpected 'education' categories: " + ', '.join(sorted(map(str, unexpected_levels)))
    )

print('\n')
# Replacing the original values with the encoded values.
# Reading from the untouched 'df' keeps this cell idempotent when it is re-run.
categorical_data['education'] = df['education'].map(EDUCATION_CODES)
# View the dataset
print(categorical_data.head())

**FOR BINARY CATEGORICAL VARIABLES**



> For 'default', 'housing' and 'loan' features, we convert yes to 1 and no to 0



In [None]:
# Label encoding of the binary categorical variables.

# Encoder used for two-class columns whose labels are not yes/no
label_encoder = preprocessing.LabelEncoder()
# Fixed codes for yes/no features, so the result does not depend on which of the two values
# happen to occur in the test file (a column containing only 'no' is still coded as 0)
YES_NO_CODES = {'no': 0, 'yes': 1}

# View data before conversion
print(categorical_data.head())

# Report the number of unique values per column and encode the binary ones as 0/1
for column in categorical_data.columns:
    n_classes = categorical_data[column].nunique()
    print(column, ': ', n_classes)

    # 'education' is an ordinal feature handled separately; never re-encode it here,
    # even if the test file happens to contain exactly two education levels
    if column == 'education':
        continue

    levels = set(categorical_data[column].dropna().unique())
    if levels and levels <= set(YES_NO_CODES):
        # Plain column assignment replaces the column, so it gets an integer dtype.
        # (Writing through .loc[:, column] keeps the old object dtype on pandas >= 2.0.)
        categorical_data[column] = categorical_data[column].map(YES_NO_CODES)
    elif n_classes == 2:
        # Any other feature with exactly 2 classes: codes follow the sorted class labels
        categorical_data[column] = label_encoder.fit_transform(categorical_data[column])

# View data after conversion
print(categorical_data.head())

### **ONE-HOT/DUMMY ENCODING FOR NOMINAL CATEGORICAL VARIABLES**

For 'job', 'marital', 'contact' and 'poutcome' variables, we apply one-hot encoding technique



In [None]:
# One-hot (dummy) encoding of the nominal categorical variables.

# View data before conversion
print(categorical_data.head())

# The dummy frames are collected first and joined in a single concat afterwards, so the
# frame is never restructured while its own columns are being looped over
nominal_columns = []
dummy_frames = []
for column in categorical_data.columns:
    n_classes = categorical_data[column].nunique()
    print(column, ': ', n_classes)

    # Skip columns that are already encoded: 'education' (label encoded) and any column that
    # is already numeric, e.g. a binary feature that only takes one value in this file
    already_encoded = column == 'education' or pd.api.types.is_numeric_dtype(categorical_data[column])
    # Nominal variables are the remaining features that do not have exactly 2 classes
    if n_classes != 2 and not already_encoded:
        # dtype=int keeps the dummies as 0/1 integers (pandas >= 2.0 returns booleans by default).
        # The prefix avoids duplicate names like 'unknown', which exists in several features
        dummies = pd.get_dummies(categorical_data[column], dtype=int).add_prefix(column + ': ')
        # Avoiding the dummy trap by removing the last extra column generated
        # (positional slicing also copes with a column that produced no dummies at all)
        dummies = dummies.iloc[:, :-1]
        nominal_columns.append(column)
        dummy_frames.append(dummies)

# Replace the original nominal columns by their dummy columns: untouched columns keep their
# order and the new columns are appended in the order the features were processed
categorical_data = pd.concat(
    [categorical_data.drop(columns=nominal_columns)] + dummy_frames,
    axis=1,
)

# View data after conversion
print(categorical_data.head())

## **Standardization of numerical data**

---



In [None]:
# Standardizing all numerical values
# importing the StandardScaler module from sklearn.preprocessing package
from sklearn.preprocessing import StandardScaler

# Creating the scaler (named 'scaler' so the Python builtin 'object' is not shadowed)
# NOTE(review): the scaler is fitted on the test data itself, so the scaling statistics come
# from this file - confirm whether the scaler fitted during training should be loaded instead.
scaler = StandardScaler()

# Applying fit_transform() to standardize the numerical data and save it in 'scale'
scale = scaler.fit_transform(numerical_data)

# Standardized data is stored in a dataframe with the original column names AND the original
# row index, so the later column-wise concat with the categorical data lines up row by row
scaled_numerical_data = pd.DataFrame(
    scale,
    columns=numerical_data.columns,
    index=numerical_data.index,
)

# Printing the first standardized rows
print(scaled_numerical_data.head())

## **Combining Processed Dataset and implementing Feature Selection to use with ML model**

In [38]:
# Features with low feature importance values (50% of the features) that are not used by the model
LOW_IMPORTANCE_FEATURES = [
    'job: student', 'poutcome: failure', 'job: unemployed', 'job: entrepreneur',
    'default', 'poutcome: other', 'job: self-employed', 'job: housemaid',
]

# Combining processed categorical data and numerical data to get the final processed dataset.
# reset_index() returns a new frame, so its result has to be assigned to take effect.
processed_x = pd.concat([scaled_numerical_data, categorical_data], axis=1).reset_index(drop=True)

# Dropping the low-importance features, plus "duration" which is not used for training the model.
# errors='ignore': a dummy column is simply absent when its category does not occur in the
# test file, and a column that is discarded anyway should not abort the run.
processed_x = processed_x.drop(columns=LOW_IMPORTANCE_FEATURES + ['duration'], errors='ignore')

# Checking the shape of the final input features which will be used by the ML model
print(processed_x.shape)


(4000, 20)


# **(5) MODEL PREDICTION**

In [None]:
# Importing pickle package to load ML model
import pickle

# Column order required by the model (i.e. the order used when the model was trained)
MODEL_FEATURES = [
    'age', 'balance', 'day', 'campaign', 'pdays', 'previous', 'education',
    'housing', 'loan', 'job: admin.', 'job: blue-collar', 'job: management',
    'job: retired', 'job: services', 'job: technician', 'marital: divorced',
    'marital: married', 'contact: cellular', 'contact: telephone',
    'poutcome: success',
]

# Loading the trained model by reading the binary file.
# The 'with' block closes the file handle once the model is loaded.
# NOTE: unpickling can execute arbitrary code - only load model files from a trusted source.
with open('Model_final.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Rearranging the columns in the format required for the model
processed_x = processed_x[MODEL_FEATURES]
# Predicting output
test = loaded_model.predict(processed_x)
# Converting the output values into dataframe and giving it column name 'y'
y = pd.DataFrame(test, columns=['y'])

# Converting the output into categorical format i.e 1->'yes' and 0->'no'
y['y'] = y['y'].map({1: 'yes', 0: 'no'})
# Merging input features and predicted output into single dataframe
final_dataset = pd.concat([df, y], axis=1)

# View the predicted values in the dataset
print(final_dataset.head())
