In [60]:
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

# NOTE: this silences every warning for the rest of the notebook.
warnings.filterwarnings('ignore')
%matplotlib inline

## Preparing the Data

### Training Data

In [78]:
# Read the tab-separated training file (no header row) and name its four columns.
data = pd.read_csv(
    '../data/datingData/training.txt',
    sep="\t",
    header=None,
    names=['flyer_miles', 'video_game_time', 'litres_iceCream', 'type'],
)

In [79]:
# Preview the first five rows of the training frame.
data.head(n=5)

Unnamed: 0,flyer_miles,video_game_time,litres_iceCream,type
0,40920,8.326976,0.953952,largeDoses
1,14488,7.153469,1.673904,smallDoses
2,26052,1.441871,0.805124,didntLike
3,75136,13.147394,0.428964,didntLike
4,38344,1.669788,0.134296,didntLike


In [80]:
# Summarise column dtypes and non-null counts of the training frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   flyer_miles      600 non-null    int64  
 1   video_game_time  600 non-null    float64
 2   litres_iceCream  600 non-null    float64
 3   type             600 non-null    object 
dtypes: float64(2), int64(1), object(1)
memory usage: 18.9+ KB


In [81]:
# List the distinct class labels found in the 'type' column.
pd.unique(data['type'])

array(['largeDoses', 'smallDoses', 'didntLike'], dtype=object)

In [82]:
# Split the training frame into features (first three columns) and the label column.
print("shape of original data : ", data.shape)
feature_columns = data.columns[:3]
x_train, y_train = data[feature_columns], data['type']
print("shape of x_train       : ", x_train.shape)
print("shape of y_train       : ", y_train.shape)

shape of original data :  (600, 4)
shape of x_train       :  (600, 3)
shape of y_train       :  (600,)


### Converting labels into Integers

In [83]:
# Encode the categorical class labels as integers.
#
# A single vectorised `map` replaces the three boolean-mask assignments. Those
# assigned into a Series taken straight from `data['type']` (chained assignment:
# pandas emits SettingWithCopyWarning and the write may leak into `data`) and
# left the labels with dtype `object` instead of an integer dtype.
LABEL_TO_INT = {'largeDoses': 0, 'smallDoses': 1, 'didntLike': 2}

# Derive from the raw column (not from `y_train` itself) so re-running is safe.
y_train = data['type'].map(LABEL_TO_INT)

# `map` yields NaN for any label missing from the mapping; fail loudly here
# rather than letting a NaN label reach the classifier.
assert y_train.notna().all(), "unexpected class label in training data"
y_train = y_train.astype('int')

In [84]:
# Check whether the classes are balanced by counting the rows per encoded label.
# `Counter` comes from the notebook's top import cell, so this cell no longer
# carries its own import.
print(Counter(y_train))

Counter({2: 220, 0: 200, 1: 180})


### Test Data

In [85]:
# Read the tab-separated test file (no header row), name its four columns,
# and preview the first rows.
data_test = pd.read_csv(
    '../data/datingData/test.txt',
    sep="\t",
    header=None,
    names=['flyer_miles', 'video_game_time', 'litres_iceCream', 'type'],
)
data_test.head()

Unnamed: 0,flyer_miles,video_game_time,litres_iceCream,type
0,9916,2.695935,1.512111,smallDoses
1,38889,3.356646,0.32423,didntLike
2,39075,14.677836,0.793183,largeDoses
3,48071,1.551934,0.130902,didntLike
4,7275,2.464739,0.223502,smallDoses


In [86]:
# Split the test frame into features (first three columns) and the label column.
# The feature columns are taken from `data_test` itself; the earlier version
# indexed with `data.columns` (the *training* frame), which only worked because
# both frames happen to share the same column names.
print("shape of test data    : ", data_test.shape)
x_test = data_test[data_test.columns[0:3]]
y_test = data_test['type']
print("shape of x_test       : ", x_test.shape)
print("shape of y_test       : ", y_test.shape)

shape of test data    :  (400, 4)
shape of x_test       :  (400, 3)
shape of y_test       :  (400,)


In [87]:
# Encode the categorical test labels as integers, using the same label -> int
# mapping as the training labels.
#
# A single vectorised `map` replaces the three boolean-mask assignments. Those
# assigned into a Series taken straight from `data_test['type']` (chained
# assignment: pandas emits SettingWithCopyWarning and the write may leak into
# `data_test`) and left the labels with dtype `object`.
LABEL_TO_INT = {'largeDoses': 0, 'smallDoses': 1, 'didntLike': 2}

# Derive from the raw column (not from `y_test` itself) so re-running is safe.
y_test = data_test['type'].map(LABEL_TO_INT)

# `map` yields NaN for any label missing from the mapping; fail loudly here.
assert y_test.notna().all(), "unexpected class label in test data"
y_test = y_test.astype('int')

# Class balance of the test set.
print(Counter(y_test))

Counter({1: 151, 0: 127, 2: 122})


## Classification using Multi-Class (Multinomial) Logistic Regression

In [99]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

In [89]:
# Convert the training features and labels to NumPy arrays.
# `np.asarray` is used instead of `.to_numpy()` so the cell can be re-run:
# on a second run the names already hold ndarrays, which have no `.to_numpy()`.
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

# Show the first three rows/labels; the recorded output of this cell contains
# exactly this preview, but the print statements were missing from the code.
print(x_train[:3])
print(y_train[:3])



[[4.092000e+04 8.326976e+00 9.539520e-01]
 [1.448800e+04 7.153469e+00 1.673904e+00]
 [2.605200e+04 1.441871e+00 8.051240e-01]]
[0 1 2]


In [106]:
# Sanity-check array shapes, then make sure the labels are integer-typed and
# show the element types of both arrays.
for array in (x_train, y_train):
    print(array.shape)
y_train = y_train.astype('int')
print(type(y_train[0]))
print(type(x_train[0, 0]))

(600, 3)
(600,)
<class 'numpy.int32'>
<class 'numpy.float64'>


In [118]:
# Baseline: multinomial logistic regression with the lbfgs solver.
model = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs')
# NOTE(review): warnings are silenced globally in this notebook and the features
# are on very different scales (flyer_miles ~1e4 vs. ~1 for the others), so a
# possible lbfgs ConvergenceWarning would be hidden -- worth confirming.

# 10-fold cross-validation over contiguous, unshuffled folds.
# `random_state` was dropped: it has no effect unless `shuffle=True`, and
# current scikit-learn raises a ValueError for that combination. Leaving
# shuffling off keeps the folds identical to those behind the recorded scores.
cv_kFold = KFold(n_splits = 10)

# One accuracy score per fold.
scores = cross_val_score(model, x_train, y_train, cv = cv_kFold)
print(scores)
print("Range of Scores: ", np.min(scores).round(2), np.max(scores).round(2))
print("Mean: ", scores.mean()*100.0)

[0.63333333 0.61666667 0.66666667 0.56666667 0.66666667 0.81666667
 0.65       0.9        0.58333333 0.73333333]
Range of Scores:  0.57 0.9
Mean:  68.33333333333333


- **Here we can observe that our scores range from 57% to 90%, which means that our model does not deliver consistently good accuracy across the different folds of the data**


- **To reduce this range and get more consistent performance, we will perform some hyper-parameter tuning**

In [119]:
# Same multinomial logistic regression, now fitted with the newton-cg solver.
model = LogisticRegression(multi_class = 'multinomial', solver = 'newton-cg')

# 10-fold cross-validation over contiguous, unshuffled folds.
# `random_state` was dropped: it has no effect unless `shuffle=True`, and
# current scikit-learn raises a ValueError for that combination. Leaving
# shuffling off keeps the folds identical to those behind the recorded scores.
cv_kFold = KFold(n_splits = 10)

# One accuracy score per fold.
scores = cross_val_score(model, x_train, y_train, cv = cv_kFold)
print(scores)
print("Range of Scores: ", np.min(scores).round(2), np.max(scores).round(2))
print("Mean: ", scores.mean()*100.0)

[0.93333333 0.88333333 0.88333333 0.88333333 0.9        0.86666667
 0.95       0.95       0.95       0.91666667]
Range of Scores:  0.87 0.95
Mean:  91.16666666666666


- **Now we can observe that our scores range from 87% to 95%, which is a much smaller range than before. We also have a better mean accuracy**


- **Hence, we can safely conclude that the model with the updated hyperparameters is better and more consistent across different sections of the data**


- **Therefore, we will now fit the model with the same parameters on the whole training data**

In [120]:
# Train the selected model on the complete training set.
model.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [123]:
# Accuracy on the held-out test set.
# The inputs are normalised explicitly instead of relying on hidden kernel state:
#  * features go in as a plain ndarray, matching what the model was fitted on;
#  * labels are cast to int, because object-dtype integer labels can be rejected
#    by scikit-learn's metrics as an "unknown" target type.
result = model.score(np.asarray(x_test), np.asarray(y_test).astype('int'))
print("Accuracy on test dataset: " , result * 100)

Accuracy on test dataset:  92.75


In [126]:
# Confusion matrix on the test set (rows: true class, columns: predicted class).
# Features are passed as an ndarray, matching what the model was fitted on.
y_pred = model.predict(np.asarray(x_test))

# Labels are cast to int because object-dtype integer labels can be rejected by
# scikit-learn's metrics as an "unknown" target type. `metrics` comes from the
# notebook's top import cell.
print(metrics.confusion_matrix(np.asarray(y_test).astype('int'), y_pred))



[[110   1  16]
 [  3 148   0]
 [  6   3 113]]


In [127]:
# Per-class precision / recall / F1 on the test set.
# Labels are cast to int because object-dtype integer labels can be rejected by
# scikit-learn's metrics as an "unknown" target type.
print(metrics.classification_report(np.asarray(y_test).astype('int'), y_pred, digits = 2))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89       127
           1       0.97      0.98      0.98       151
           2       0.88      0.93      0.90       122

    accuracy                           0.93       400
   macro avg       0.92      0.92      0.92       400
weighted avg       0.93      0.93      0.93       400

