In [1]:
import numpy
import pandas
import joblib
import json
import seaborn

import sklearn

%matplotlib inline

# Instructions

- Read **the test data** from the CSV file and properly set the index
- Load the trained model

In [2]:
# Load the test features and use the `id` column as the row index.
data_test = pandas.read_csv(
    './data/n_high_salary_features.test.csv',
    index_col='id',
)
data_test

Unnamed: 0_level_0,marital-status,education-num
id,Unnamed: 1_level_1,Unnamed: 2_level_1
27867,0.0,1.358944
27868,2.0,-0.559105
27869,2.0,0.975334
27870,4.0,-0.175495
27871,2.0,0.591724
...,...,...
34862,4.0,-0.175495
34863,4.0,0.975334
34864,2.0,0.975334
34865,2.0,-0.559105


In [3]:
# Load the training features (with the `label` column), indexed by `id`.
data_train = pandas.read_csv(
    './data/n_high_salary_features.train.csv',
    index_col='id',
)
data_train

Unnamed: 0_level_0,marital-status,education-num,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8616,2.0,0.975334,1.0
21982,2.0,2.126163,1.0
11191,4.0,-0.559105,0.0
22229,0.0,-0.559105,0.0
20732,2.0,0.975334,1.0
...,...,...,...
21575,4.0,0.591724,0.0
5390,2.0,-3.627983,0.0
860,2.0,-0.175495,1.0
15795,4.0,-0.175495,0.0


In [4]:
# Restore the previously trained estimator from disk.
# NOTE(review): joblib files are pickle-based; only load artifacts you trust.
model = joblib.load(filename='model/model.joblib')
model

# Instructions
- Calculate and add the `prediction` column to the dataframe of test data
- Write the dataframe to `./results/predictions.test.csv`

In [5]:
# Predict from the feature columns only.
# Dropping an already-present `prediction` column (if any) keeps this cell
# re-runnable: without it, a second run would pass the previous predictions
# to the model as an extra feature column.
features_test = data_test.drop(columns=['prediction'], errors='ignore')
data_test['prediction'] = model.predict(features_test)
data_test

Unnamed: 0_level_0,marital-status,education-num,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
27867,0.0,1.358944,1.0
27868,2.0,-0.559105,0.0
27869,2.0,0.975334,1.0
27870,4.0,-0.175495,0.0
27871,2.0,0.591724,1.0
...,...,...,...
34862,4.0,-0.175495,0.0
34863,4.0,0.975334,0.0
34864,2.0,0.975334,1.0
34865,2.0,-0.559105,0.0


In [6]:
# Predict from the feature columns only.
# `label` must exist (a missing label still raises, as before); a stale
# `prediction` column from an earlier run of this cell is dropped if present,
# so re-running does not feed old predictions back in as a feature.
features_train = (
    data_train
    .drop(columns=['label'])
    .drop(columns=['prediction'], errors='ignore')
)
data_train['prediction'] = model.predict(features_train)
data_train

Unnamed: 0_level_0,marital-status,education-num,label,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8616,2.0,0.975334,1.0,1.0
21982,2.0,2.126163,1.0,1.0
11191,4.0,-0.559105,0.0,0.0
22229,0.0,-0.559105,0.0,0.0
20732,2.0,0.975334,1.0,1.0
...,...,...,...,...
21575,4.0,0.591724,0.0,0.0
5390,2.0,-3.627983,0.0,0.0
860,2.0,-0.175495,1.0,1.0
15795,4.0,-0.175495,0.0,0.0


In [7]:
# Persist both frames (including the new `prediction` column) as CSV files.
for frame, destination in (
    (data_train, './results/predictions.train.csv'),
    (data_test, './results/predictions.test.csv'),
):
    frame.to_csv(destination)

# Instructions
- Use `sklearn.metrics.confusion_matrix` to calculate the confusion matrix

In [8]:
# Confusion matrix of the training predictions against the true labels.
# scikit-learn puts true classes on the rows and predicted classes on the
# columns, i.e. [[TN, FP], [FN, TP]] for the binary 0/1 case.
confusion_maxtri_from_skelearn = sklearn.metrics.confusion_matrix(
    data_train['label'],
    data_train['prediction'],
)
confusion_maxtri_from_skelearn

array([[10217,  1918],
       [ 2961,  5804]])

# Instructions

- Use `sklearn.metrics.classification_report` to print the classification report
- Now, **manually fill** the following metrics for the positive class (`y=1`) and put them in a Python dictionary
- The expected output should look like:

```
scores = {
     'true_positive': 109,
     'true_negative': 103,
     'false_negative': 19,
     'false_positive': 19,
     'accuracy': 0.848,
     'precision': 0.851562,
     'recall': 0.851562,
     'support': 128,
     'f1-score': 0.851562
}
```
- Use `json.dump` to save the output to `./results/scores.test.json`


In [9]:
# Per-class precision / recall / F1 / support for the training predictions,
# returned as a nested dict and rendered as a table.
# `digits` only formats the text report; it is passed through unchanged here.
report_train = sklearn.metrics.classification_report(
    y_true=data_train['label'],
    y_pred=data_train['prediction'],
    digits=4,
    output_dict=True,
)
pandas.DataFrame(report_train)


Unnamed: 0,0.0,1.0,accuracy,macro avg,weighted avg
precision,0.775307,0.751619,0.766555,0.763463,0.765373
recall,0.841945,0.662179,0.766555,0.752062,0.766555
f1-score,0.807253,0.70407,0.766555,0.755662,0.76398
support,12135.0,8765.0,0.766555,20900.0,20900.0


In [10]:
# Metrics for the positive class (y=1), copied from the confusion matrix and
# classification report computed above on the labelled training data.
# The test CSV has no `label` column, so these are the only scores this
# notebook can produce.
# Confusion-matrix layout is [[TN, FP], [FN, TP]] = [[10217, 1918], [2961, 5804]].
scores = {
    'true_positive': 5804,
    'true_negative': 10217,
    'false_negative': 2961,
    'false_positive': 1918,
    'accuracy': 0.766555,    # (TP + TN) / all samples
    'precision': 0.751619,   # TP / (TP + FP)
    'recall': 0.662179,      # TP / (TP + FN)
    'support': 8765,         # number of true y=1 samples, TP + FN
    'f1-score': 0.704070,    # harmonic mean of precision and recall
}

In [11]:
# Save the metrics dictionary as pretty-printed JSON.
with open('./results/scores.test.json', 'w') as scores_file:
    json.dump(scores, fp=scores_file, indent=4)