# Homework 3 - Classification

In [1]:
import os
import zipfile

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression




In [2]:
# get data 
data_url = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'
! wget $data_url

this_dir = os.getcwd()
data_path = os.path.join(this_dir, os.path.basename(data_url))

--2024-10-17 00:24:55--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bank+marketing.zip.3’

bank+marketing.zip.     [ <=>                ] 999.85K  5.33MB/s    in 0.2s    

2024-10-17 00:24:55 (5.33 MB/s) - ‘bank+marketing.zip.3’ saved [1023843]



In [3]:
with zipfile.ZipFile(data_path, 'r') as zip_ref:
    with zip_ref.open('bank.zip') as bank_zip_file:
        with zipfile.ZipFile(bank_zip_file) as inner_zip:
            with inner_zip.open('bank-full.csv') as csv_file:
                with open('bank-full.csv', 'wb') as output:
                    output.write(csv_file.read())

In [4]:

use_columns = [
    'age',
    'job',
    'marital',
    'education',
    'balance',
    'housing',
    'contact',
    'day',
    'month',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'poutcome',
    'y'
]
df = pd.read_csv('bank-full.csv', sep=';', usecols=use_columns) 

In [5]:
df.shape

(45211, 15)

In [6]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   balance    45211 non-null  int64 
 5   housing    45211 non-null  object
 6   contact    45211 non-null  object
 7   day        45211 non-null  int64 
 8   month      45211 non-null  object
 9   duration   45211 non-null  int64 
 10  campaign   45211 non-null  int64 
 11  pdays      45211 non-null  int64 
 12  previous   45211 non-null  int64 
 13  poutcome   45211 non-null  object
 14  y          45211 non-null  object
dtypes: int64(7), object(8)
memory usage: 5.2+ MB


In [8]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [9]:
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
print(f"num cols: {numerical_columns}")
category_columns = df.select_dtypes(include=['object']).columns
print(f"category cols: {category_columns}")

num cols: Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'], dtype='object')
category cols: Index(['job', 'marital', 'education', 'housing', 'contact', 'month',
       'poutcome', 'y'],
      dtype='object')


### Question 1

What is the most frequent observation (mode) for the column `education`?

- `unknown`
- `primary`
- `secondary`
- `tertiary`

In [10]:
df.education.value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [11]:
df.education.mode()[0]

'secondary'

***Q1 Answer:*** `secondary` 

### Question 2

Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

- `age` and `balance`
- `day` and `campaign`
- `day` and `pdays`
- `pdays` and `previous`


### Target encoding

* Now we want to encode the `y` variable.
* Let's replace the values `yes`/`no` with `1`/`0`.

### Split the data

* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to `42`.
* Make sure that the target value `y` is not in your dataframe.

In [12]:
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
correlation = df[numerical_columns].corr().unstack().sort_values(ascending=False).drop_duplicates()

In [13]:
correlation[correlation != 1.0].head(1)

previous  pdays    0.45482
dtype: float64

***Q2 Answer:*** `pdays` and `previous`

***Target encoding***: `yes`/`1`, `no`/ `0`

In [14]:
df['y'] = df['y'].map({'yes': 1, 'no': 0})

In [15]:
# split the data
SEED = 42
TRAIN_SIZE = 0.6
VAL_SIZE = 0.2
TEST_SIZE = 0.2


In [16]:
# split full train and test data
df_train_full, df_test = train_test_split(df, test_size=TEST_SIZE, shuffle=True, random_state=SEED)   

In [17]:
## split train and valid data
df_train, df_val = train_test_split(df_train_full, test_size=VAL_SIZE/(TRAIN_SIZE+VAL_SIZE), shuffle=True, random_state=SEED)   

In [18]:
df_train.shape[0] + df_test.shape[0] + df_val.shape[0] == df.shape[0]

True

In [19]:
df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [20]:
df_train.head(2)

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,32,technician,single,tertiary,1100,yes,cellular,11,aug,67,1,-1,0,unknown,0
1,38,entrepreneur,married,secondary,0,yes,cellular,17,nov,258,1,-1,0,unknown,0


In [21]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [22]:
# # remove y from category columns
category_columns = category_columns.drop('y')

In [23]:
mutual_info_score = df_train[category_columns].apply(lambda x: mutual_info_score(x, df_train.y), axis=0)
mutual_info_score.sort_values(ascending=False).round(2).head(1)

poutcome    0.03
dtype: float64

***Q3 Answer:*** `poutcome`

### Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.6
- 0.7
- 0.8
- 0.9

In [24]:
df_train = df_train.drop('y', axis=1)
df_val = df_val.drop('y', axis=1)
df_test = df_test.drop('y', axis=1)
df_train.head(2)

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome
0,32,technician,single,tertiary,1100,yes,cellular,11,aug,67,1,-1,0,unknown
1,38,entrepreneur,married,secondary,0,yes,cellular,17,nov,258,1,-1,0,unknown


In [25]:
dv = DictVectorizer(sparse=False)
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

In [26]:
# fit regression model

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=SEED)
model.fit(X_train, y_train)

In [27]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

y_pred = model.predict(X_val)

In [28]:

accuracy = np.round(accuracy_score(y_val, y_pred),2)
print(f"accuracy: {accuracy}")

accuracy: 0.9


***Q4 Answer:*** `0.9`

### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 

Which of following feature has the smallest difference?

- `age`
- `balance`
- `marital`
- `previous`

> **Note**: The difference doesn't have to be positive.

In [29]:
features =  df_train.columns.to_list()
features 


['age',
 'job',
 'marital',
 'education',
 'balance',
 'housing',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

In [30]:
accuracy_all_features = accuracy

In [31]:
scores_acc = {}

for feature in features:
    feats = features.copy()
    feats.remove(feature)
    
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[feats].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=SEED)
    model.fit(X_train, y_train)
    
    val_dict = df_val[feats].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    y_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    diff = accuracy_all_features - score
    diff_abs = np.abs(diff)

    scores_acc[feature] = {
        'score': score,
        'diff': diff,
        'diff_abs': diff_abs
    }

    print(f"{feature}, score: {score}, diff: {diff}")

age, score: 0.9011280690112807, diff: -0.001128069011280708
job, score: 0.9012386640123866, diff: -0.0012386640123865922
marital, score: 0.9012386640123866, diff: -0.0012386640123865922
education, score: 0.9007962840079629, diff: -0.000796284007962833
balance, score: 0.9011280690112807, diff: -0.001128069011280708
housing, score: 0.9002433090024331, diff: -0.00024330900243307862
contact, score: 0.900353904003539, diff: -0.0003539040035389629
day, score: 0.9011280690112807, diff: -0.001128069011280708
month, score: 0.8998009289980093, diff: 0.0001990710019906805
duration, score: 0.8895155938951559, diff: 0.010484406104844135
campaign, score: 0.9001327140013271, diff: -0.00013271400132708333
pdays, score: 0.9010174740101747, diff: -0.0010174740101747126
previous, score: 0.9009068790090687, diff: -0.0009068790090687173
poutcome, score: 0.8939393939393939, diff: 0.0060606060606061


In [32]:
# get min value from the dict 
min_diff_feature = min(scores_acc, key=lambda k: scores_acc[k]['diff_abs'])
min_diff_feature

'campaign'

***Q5 Answer:*** `campaign`

### Question 6

* Now let's train a regularized logistic regression.
* Let's try the following values of the parameter `C`: `[0.01, 0.1, 1, 10, 100]`.
* Train models using all the features as in Q4.
* Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these `C` leads to the best accuracy on the validation set?

- 0.01
- 0.1
- 1
- 10
- 100

> **Note**: If there are multiple options, select the smallest `C`.

In [33]:
c_values  = [0.01, 0.1, 1, 10, 100]
scores = {}
for c in c_values:
    dv = DictVectorizer(sparse=False)
    train_dict = df_train.to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=SEED)
    model.fit(X_train, y_train)

    val_dict = df_val.to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    scores[c] = round(score, 3)
    
    print(f"c: {c}, score: {score}")

c: 0.01, score: 0.8978102189781022
c: 0.1, score: 0.9007962840079629
c: 1, score: 0.9012386640123866
c: 10, score: 0.9009068790090687
c: 100, score: 0.900353904003539


In [34]:
print(f'Smallest `c` is {min(scores, key=scores.get)}.')

Smallest `c` is 0.01.
