# Advanced Classification Part 1 - Exercises with Results

## Exercise 1

#### Task 1 
##### Load libraries that are used in this module.

#### Result:

In [3]:
import os
import pickle
import matplotlib.pyplot as plt                     
import numpy as np                                    
import pandas as pd
from textwrap import wrap
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from pathlib import Path

#### Task 2

##### Define the directory settings.


#### Result:

In [4]:
# Set 'main_dir' to the location of the project folder: it sits two levels
# above the directory the notebook is run from.
home_dir = Path(".").resolve()
main_dir = home_dir.parent.parent
print(main_dir)
# Join with pathlib instead of hand-built "/" concatenation: the separator is
# then correct on every OS and no doubled slash appears when 'main_dir' is the
# filesystem root. 'data_dir' is kept as a plain string, as before.
data_dir = str(main_dir / "data")
print(data_dir)

/Users/aashika/Desktop/skillsoft-advanced-classification-2022
/Users/aashika/Desktop/skillsoft-advanced-classification-2022/data


#### Task 3
##### Load the dataset `bank_marketing.csv` and save it as `bank`.
##### View the first few rows and column names of `bank`.

#### Result:

In [5]:
# Build the CSV location inside the data folder, then read it into `bank`.
bank_csv_path = str(data_dir) + "/bank_marketing.csv"
bank = pd.read_csv(bank_csv_path)

In [6]:
# Preview the first five rows of the dataset.
bank.head(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999.0,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999.0,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999.0,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999.0,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no


In [7]:
# Show the column labels of `bank`.
bank.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx',
       'cons_conf_idx', 'euribor3m', 'nr_employed', 'y'],
      dtype='object')

#### Task 4 
##### Check the data type of each variable.
##### Let's convert `y` into a categorical target variable.
##### Replace the column `y` in the dataframe, by setting it to 1 if `y` is 'yes', otherwise set `y` to 0.
#### Result:

In [8]:
# Report the data type stored in each column.
column_dtypes = bank.dtypes
print(column_dtypes)

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays             float64
previous            int64
poutcome           object
emp_var_rate      float64
cons_price_idx    float64
cons_conf_idx     float64
euribor3m         float64
nr_employed         int64
y                  object
dtype: object


In [9]:
# Convert y to 0/1 values: 1 where `y` is 'yes', 0 otherwise.
# Matching on both 'yes' and 1 keeps this cell idempotent: re-running it after
# `y` has already been encoded leaves the labels unchanged, whereas comparing
# only against 'yes' would silently turn every label into 0.
bank['y'] = np.where(bank['y'].isin(['yes', 1]), 1, 0)

In [10]:
# Count the missing values in every column.
na_counts = bank.isna().sum()
print(na_counts)

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             1
previous          0
poutcome          0
emp_var_rate      0
cons_price_idx    0
cons_conf_idx     0
euribor3m         0
nr_employed       0
y                 0
dtype: int64


#### Task 5

##### Look for NAs in `bank` and drop the columns with >50% NAs.
##### Check for NAs again afterwards to make sure the data looks good.
##### Keep the numerical columns `age`,`campaign`,`emp_var_rate`,`cons_price_idx`,`cons_conf_idx`,`euribor3m`,`nr_employed`,`y`.


#### Result:


In [11]:
# Drop 'pdays'.
# Rebind `bank` rather than mutating it in place, and tolerate the column
# already being absent, so re-running this cell does not raise a KeyError.
bank = bank.drop(columns=['pdays'], errors='ignore')

In [12]:
# Confirm the (rows, columns) dimensions of `bank`.
bank_dims = bank.shape
print(bank_dims)

(41188, 20)


In [13]:
# Keep only the numerical predictors together with the target `y`.
numeric_columns = [
    "age",
    "campaign",
    "emp_var_rate",
    "cons_price_idx",
    "cons_conf_idx",
    "euribor3m",
    "nr_employed",
    "y",
]
bank = bank[numeric_columns]

#### Task 6
##### In order to reduce the time taken to train the models,
##### replace `bank` with a dataframe containing 2000 rows randomly sampled from `bank` using the following code: 

```bank = bank.sample(n=2000, random_state=1)```

#### Result:

In [14]:
# Shrink `bank` to a random subset of 2000 rows, drawn with a fixed seed.
sample_settings = {"n": 2000, "random_state": 1}
bank = bank.sample(**sample_settings)

## Exercise 2

#### Task 1
##### Select the predictors by dropping variable `y` and save the result to a dataframe `X_ex`.
##### Save the target variable `y` column to `y_ex` variable.
##### Set seed as 1.
##### Split the data into train and test sets and save respective variables to `X_train_ex`, `X_test_ex`, `y_train_ex`, `y_test_ex`.

#### Result:

In [15]:
# Predictors: every column of `bank` except the target.
X_ex = bank.drop(columns=["y"])
# Target: the `y` column as a NumPy array.
y_ex = np.array(bank["y"])

# Seed NumPy's global random generator with 1 before splitting.
np.random.seed(1)

# Hold out 30% of the rows as the test set; the remainder is the training set.
split_parts = train_test_split(X_ex, y_ex, test_size=0.3)
X_train_ex, X_test_ex, y_train_ex, y_test_ex = split_parts

#### Task 2
##### Create a random forest classifier and save it to `forest_ex` variable.
##### Set random state to 1, number of estimators to 100 and `gini` as the criterion in the model.
##### Fit the classifier to our training data.

#### Result:

In [16]:
# Random forest with 100 trees, the 'gini' split criterion and a fixed seed.
forest_params = dict(n_estimators=100, criterion="gini", random_state=1)
forest_ex = RandomForestClassifier(**forest_params)

In [17]:
# Train the forest on the training split.
forest_ex.fit(X=X_train_ex, y=y_train_ex)

RandomForestClassifier(random_state=1)

#### Task 3
##### Predict on the test data and print the first 5 predictions on the test data.

#### Result:

In [18]:
# Generate class predictions for the held-out test rows.
y_predict_forest_ex = forest_ex.predict(X_test_ex)

# Show the first five predicted labels.
first_predictions = y_predict_forest_ex[:5]
print(first_predictions)

[1 1 0 1 0]
