In [1]:
import pandas as pd
import numpy as np
import nltk

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import confusion_matrix

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn import svm
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV

df_complete = pd.read_csv("final_final_file.csv", 
                 sep = '\t')

In [2]:
df_complete.head()

Unnamed: 0,text,sentiment,bin_sentiment
0,something that i have been noticing recently i...,Catastrophizing,1
1,i went on vacation and just got back yesterday...,Catastrophizing,1
2,i've had crippling depression since i started ...,Jumping to conclusion,1
3,i feel like confusion is a huge part to depres...,Not distored,0
4,i skipped both of my classes yesterday. i had ...,Not distored,0


In [3]:
df_complete['bin_sentiment'].value_counts()

1    1187
0     744
Name: bin_sentiment, dtype: int64

In [4]:
#https://www.kdnuggets.com/2020/05/dataset-splitting-best-practices-python.html

X, y = df_complete['text'], df_complete['bin_sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size = 0.7, 
                                                    random_state = 42,
                                                    stratify = y)

## LOG + BOW 

In [5]:
# training
count_vectorizer = CountVectorizer(analyzer = "word", 
                                   tokenizer = nltk.word_tokenize,
                                   preprocessor = None, 
                                   stop_words = 'english', 
                                   min_df = 3)

train_data_features = count_vectorizer.fit_transform(X_train)

In [6]:
log_reg_BOW = LogisticRegression().fit(train_data_features, y_train)

test_data_features = count_vectorizer.transform(X_test)
y_pred = log_reg_BOW.predict(test_data_features)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
print(classification_report(y_test,y_pred, digits = 4))

#print('Confusion Matrix : \n' + str(confusion_matrix(y_test,y_pred)))

cm = confusion_matrix(y_test, y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm.diagonal())

              precision    recall  f1-score   support

           0     0.6277    0.5291    0.5742       223
           1     0.7321    0.8039    0.7664       357

    accuracy                         0.6983       580
   macro avg     0.6799    0.6665    0.6703       580
weighted avg     0.6920    0.6983    0.6925       580

[0.52914798 0.80392157]


# Grid: LOG + BOW

In [46]:
penalty = ['none', 'l1', 'l2', 'elasticnet']
C_first = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
C_second = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08]
solver = ['liblinear', 'saga', 'newton-cg', 'lbfgs']

param_grid = dict(penalty = penalty,
                  C = C_second,
                  solver = solver)

#{'C': 0.0001, 'penalty': 'l1', 'solver': 'saga'}
grid = GridSearchCV(estimator = log_reg_BOW,
                    param_grid = param_grid,
                    scoring ='f1', #'recall'; ‘accuracy’; ‘precision’, 'f1'
                    verbose = 1,
                    n_jobs = -1)
grid_result = grid.fit(train_data_features, y_train)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:   16.3s finished


First eun: Best Score:  0.7636214412147668
Best Params:  {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}

Second run: Best Score:  0.7686640959363799
Best Params:  {'C': 0.04, 'penalty': 'l2', 'solver': 'liblinear'}

In [8]:
log_reg_BOW = LogisticRegression(C = 0.04, penalty ='l2', 
                                   solver = 'liblinear').fit(train_data_features, y_train)

data_features = count_vectorizer.transform(X_test)
y_pred = log_reg_BOW.predict(data_features)

In [12]:
accuracy_score(y_test, y_pred)

0.7068965517241379

In [14]:
print(classification_report(y_test,y_pred, digits = 4))

print('Confusion Matrix : \n' + str(confusion_matrix(y_test,y_pred)))

cm = confusion_matrix(y_test, y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm.diagonal())

              precision    recall  f1-score   support

           0     0.6667    0.4753    0.5550       223
           1     0.7221    0.8515    0.7815       357

    accuracy                         0.7069       580
   macro avg     0.6944    0.6634    0.6682       580
weighted avg     0.7008    0.7069    0.6944       580

Confusion Matrix : 
[[106 117]
 [ 53 304]]
[0.47533632 0.85154062]


## Classification log_reg_TF-IDF

In [12]:
tf_vect = TfidfVectorizer(min_df = 3,
                          tokenizer = nltk.word_tokenize,
                          preprocessor = None, 
                          stop_words = 'english')

TF_IDF_train_data_features = tf_vect.fit_transform(X_train)

logreg_TF_IDF = LogisticRegression().fit(TF_IDF_train_data_features, y_train)

In [13]:
data_features_TF_IDF = tf_vect.transform(X_test)
y_pred = logreg_TF_IDF.predict(data_features_TF_IDF)

In [14]:
print(classification_report(y_test,y_pred, digits = 4))

#print('Confusion Matrix : \n' + str(confusion_matrix(y_test,y_pred)))

cm = confusion_matrix(y_test, y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm.diagonal())

              precision    recall  f1-score   support

           0     0.6853    0.4395    0.5355       223
           1     0.7140    0.8739    0.7859       357

    accuracy                         0.7069       580
   macro avg     0.6996    0.6567    0.6607       580
weighted avg     0.7029    0.7069    0.6896       580

[0.43946188 0.87394958]


## log_reg_TF-IDF Hyper parameter tuning

In [15]:
penalty = ['none', 'l1', 'l2', 'elasticnet']
C_first = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
C_second = [1, 2, 3, 4, 5, 6, 7, 8, 9]
solver = ['liblinear', 'saga', 'newton-cg', 'lbfgs']

param_grid = dict(penalty = penalty,
                  C = C_second,
                  solver = solver)

grid = GridSearchCV(estimator = logreg_TF_IDF,
                    param_grid = param_grid,
                    scoring = 'f1', #'recall'; ‘accuracy’; ‘precision’
                    verbose = 1,
                    n_jobs = -1)
grid_result = grid.fit(TF_IDF_train_data_features, y_train)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    5.9s


Best Score:  0.7734083935075498
Best Params:  {'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}


[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:   20.9s finished


First attempt:
Best Score:  0.7734083935075498
Best Params:  {'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}

Second attempt:
Best Score:  0.7734083935075498
Best Params:  {'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}

In [16]:
logreg_TF_IDF = LogisticRegression(C = 1, penalty ='l2', 
                                   solver = 'newton-cg').fit(TF_IDF_train_data_features, y_train)

data_features_TF_IDF = tf_vect.transform(X_test)
y_pred = logreg_TF_IDF.predict(data_features_TF_IDF)

In [17]:
print(classification_report(y_test,y_pred, digits = 4))

#print('Confusion Matrix : \n' + str(confusion_matrix(y_test,y_pred)))

cm = confusion_matrix(y_test, y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm.diagonal())

              precision    recall  f1-score   support

           0     0.6853    0.4395    0.5355       223
           1     0.7140    0.8739    0.7859       357

    accuracy                         0.7069       580
   macro avg     0.6996    0.6567    0.6607       580
weighted avg     0.7029    0.7069    0.6896       580

[0.43946188 0.87394958]


## Method 2 - Support Vector Machine

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

### SVM + BoW

In [40]:
svc_bow = SVC().fit(train_data_features, y_train)

test_data_features = count_vectorizer.transform(X_test)
y_pred = svc_bow.predict(test_data_features)

print(classification_report(y_test,y_pred, digits = 4))

#print('Confusion Matrix : \n' + str(confusion_matrix(y_test,y_pred)))

cm = confusion_matrix(y_test, y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm.diagonal())

              precision    recall  f1-score   support

           0     0.6892    0.4574    0.5499       223
           1     0.7199    0.8711    0.7883       357

    accuracy                         0.7121       580
   macro avg     0.7045    0.6643    0.6691       580
weighted avg     0.7081    0.7121    0.6967       580

[0.4573991  0.87114846]


#### Grid:SVM + BoW

In [42]:
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
C_second = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
gamma = [0.0001, 0.001, 0.01, 0.00001, 0.000001, 0.00000001]
gamma_second = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009]
kernel = ['linear', 'poly', 'rbf', 'sigmoid'] #‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’

param_grid = dict(C = C,
                  gamma = gamma,
                  kernel = kernel)

grid = GridSearchCV(estimator = svc_bow,
                    param_grid = param_grid,
                    scoring = 'f1', #'recall'; ‘accuracy’; ‘precision’
                    verbose = 1,
                    n_jobs = -1)
grid_result = grid.fit(train_data_features, y_train)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  2.9min finished


Best Score:  0.7623286121848878
Best Params:  {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}


First run: Best Score:  0.7658457534195632
Best Params:  {'C': 100, 'gamma': 0.0003, 'kernel': 'sigmoid'}

Second run:

In [None]:
svc_bow = SVC().fit(train_data_features, y_train)

test_data_features = count_vectorizer.transform(X_test)
y_pred = svc_bow.predict(test_data_features)

In [None]:
print(classification_report(y_test,y_pred, digits = 4))

#print('Confusion Matrix : \n' + str(confusion_matrix(y_test,y_pred)))

cm = confusion_matrix(y_test, y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm.diagonal())

### SVM + tf-idf

In [18]:
svc_tfidf = SVC().fit(TF_IDF_train_data_features, y_train)
data_features_TF_IDF = tf_vect.transform(X_test)
y_pred = svc_tfidf.predict(data_features_TF_IDF)

In [19]:
print(classification_report(y_test,y_pred, digits = 4))

#print('Confusion Matrix : \n' + str(confusion_matrix(y_test,y_pred)))

cm = confusion_matrix(y_test, y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm.diagonal())

              precision    recall  f1-score   support

           0     0.6861    0.4215    0.5222       223
           1     0.7088    0.8796    0.7850       357

    accuracy                         0.7034       580
   macro avg     0.6975    0.6505    0.6536       580
weighted avg     0.7001    0.7034    0.6840       580

[0.42152466 0.87955182]


#### Grid:SVM + tf-idf

In [20]:
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
C_second = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
gamma = [0.0001, 0.001, 0.01, 0.00001, 0.000001, 0.00000001]
gamma_second = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009]
kernel = ['linear', 'poly', 'rbf', 'sigmoid'] #‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’

param_grid = dict(C = C,
                  gamma = gamma_second,
                  kernel = kernel)

grid = GridSearchCV(estimator = svc_tfidf,
                    param_grid = param_grid,
                    scoring = 'f1', #'recall'; ‘accuracy’; ‘precision’
                    verbose = 1,
                    n_jobs = -1)
grid_result = grid.fit(TF_IDF_train_data_features, y_train)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

First attempt (C+gamma):
Best Score:  0.7621745438286351
Best Params:  {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}

Second attempt (C_second + gamma):
Best Score:  0.7748284594685082
Best Params:  {'C': 5000, 'gamma': 0.0001, 'kernel': 'sigmoid'}

Third attempt(C_second + gamma_second):
Best Score:  0.7748284594685082
Best Params:  {'C': 1000, 'gamma': 0.0005, 'kernel': 'sigmoid'}

4th attempt (C+gamma_second):
Best Score:  0.7748284594685082
Best Params:  {'C': 1000, 'gamma': 0.0005, 'kernel': 'sigmoid'}

In [21]:
svc_tfidf = SVC(C = 1000, 
                gamma = 0.0005, kernel = 'sigmoid').fit(TF_IDF_train_data_features, y_train)
data_features_TF_IDF = tf_vect.transform(X_test)
y_pred = svc_tfidf.predict(data_features_TF_IDF)

In [23]:
print(classification_report(y_test,y_pred, digits = 4))

#print('Confusion Matrix : \n' + str(confusion_matrix(y_test,y_pred)))

cm = confusion_matrix(y_test, y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm.diagonal())

              precision    recall  f1-score   support

           0     0.7023    0.4126    0.5198       223
           1     0.7082    0.8908    0.7891       357

    accuracy                         0.7069       580
   macro avg     0.7053    0.6517    0.6544       580
weighted avg     0.7060    0.7069    0.6855       580

[0.41255605 0.8907563 ]


## doc2vec

In [24]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

In [25]:
train_data, test_data = train_test_split(df_complete)

train_tagged = train_data.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), 
                             tags=[r['bin_sentiment']]), axis = 1)

test_tagged = test_data.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), 
                             tags=[r['bin_sentiment']]), axis = 1)

trainsent = train_tagged.values

doc2vec_model = Doc2Vec(trainsent, 
                        vector_size = 300, 
                        window = 5, #The maximum distance between the current and predicted word within a sentence. 
                        workers = 4,
                        epochs = 50,
                        dm = 1)

testsent = test_tagged.values
y_train, X_train = zip(*[(doc.tags[0], doc2vec_model.infer_vector(doc.words, steps=20)) for doc in trainsent])
y_test, X_test = zip(*[(doc.tags[0], doc2vec_model.infer_vector(doc.words, steps=20)) for doc in testsent])

In [26]:
logreg_doc2vec = LogisticRegression().fit(X_train, y_train)
y_pred = logreg_doc2vec.predict(X_test)

In [27]:
print(classification_report(y_test,y_pred, digits = 4))

#print('Confusion Matrix : \n' + str(confusion_matrix(y_test,y_pred)))

cm = confusion_matrix(y_test, y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm.diagonal())

              precision    recall  f1-score   support

           0     0.6014    0.4564    0.5190       195
           1     0.6836    0.7951    0.7352       288

    accuracy                         0.6584       483
   macro avg     0.6425    0.6258    0.6271       483
weighted avg     0.6504    0.6584    0.6479       483

[0.45641026 0.79513889]


## Grid: log + doc2vec

In [26]:
penalty = ['none', 'l1', 'l2', 'elasticnet']
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
C_second = [10, 20, 30, 40, 50, 60, 70, 80, 90]
C_third = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
solver = ['liblinear', 'saga', 'newton-cg', 'lbfgs']

param_grid = dict(penalty = penalty,
                  C = C_third,
                  solver = solver)

grid = GridSearchCV(estimator = logreg_doc2vec,
                    param_grid = param_grid,
                    scoring ='f1', #'recall'; ‘accuracy’; ‘precision’
                    verbose = 1,
                    n_jobs = -1)
grid_result = grid.fit(X_train, y_train)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 450 tasks      | elapsed:  3.1min


Best Score:  0.9141558036022648
Best Params:  {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}


[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  5.7min finished


First iteration
Best Score:  0.9141558036022648
Best Params:  {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}

Second iteration
Best Score:  0.9141558036022648
Best Params:  {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}

Third iteration
Best Score:  0.9141558036022648
Best Params:  {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}

In [28]:
logreg_doc2vec = LogisticRegression(C = 10, penalty = 'l2', 
                                    solver = 'liblinear').fit(X_train, y_train)

In [29]:
y_pred = logreg_doc2vec.predict(X_test)

In [30]:
print(classification_report(y_test,y_pred, digits = 4))

#print('Confusion Matrix : \n' + str(confusion_matrix(y_test,y_pred)))

cm = confusion_matrix(y_test, y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm.diagonal())

              precision    recall  f1-score   support

           0     0.5844    0.4615    0.5158       195
           1     0.6809    0.7778    0.7261       288

    accuracy                         0.6501       483
   macro avg     0.6326    0.6197    0.6209       483
weighted avg     0.6419    0.6501    0.6412       483

[0.46153846 0.77777778]


# SVM + doc2vec

In [31]:
SVM_doc2vec = SVC().fit(X_train, y_train)
y_pred = SVM_doc2vec.predict(X_test)

In [32]:
print(classification_report(y_test,y_pred, digits = 4))

#print('Confusion Matrix : \n' + str(confusion_matrix(y_test,y_pred)))

cm = confusion_matrix(y_test, y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm.diagonal())

              precision    recall  f1-score   support

           0     0.6519    0.4513    0.5333       195
           1     0.6925    0.8368    0.7579       288

    accuracy                         0.6812       483
   macro avg     0.6722    0.6440    0.6456       483
weighted avg     0.6761    0.6812    0.6672       483

[0.45128205 0.83680556]


# Grid: SVM + doc2vec

In [38]:
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
C_second = [1, 2, 3, 5, 6, 7, 8, 9]
gamma = [0.0001, 0.001, 0.01, 0.00001, 0.000001]
gamma_second = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]
kernel = ['linear', 'poly', 'rbf', 'sigmoid']

param_grid = dict(C = C,
                  gamma = gamma_second,
                  kernel = kernel)

grid = GridSearchCV(estimator = SVM_doc2vec,
                    param_grid = param_grid,
                    scoring = 'f1',
                    verbose = 1,
                    n_jobs = -1)
grid_result = grid.fit(X_train, y_train)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  6.5min finished


Best Score:  0.9155718555831166
Best Params:  {'C': 1, 'gamma': 0.0001, 'kernel': 'linear'}


Results:
First iteration (C + gamma)
Best Score:  0.9155718555831166
Best Params:  {'C': 1, 'gamma': 0.0001, 'kernel': 'linear'}

Second iteration (C_second + gamma)
Best Score:  0.9155718555831166
Best Params:  {'C': 1, 'gamma': 0.0001, 'kernel': 'linear'}

Third iteration (C_second + gamma_second)
Best Score:  0.9155718555831166
Best Params:  {'C': 1, 'gamma': 0.0001, 'kernel': 'linear'}

In [33]:
SVM_doc2vec = SVC(C = 1, gamma = 0.0001, kernel = "linear").fit(X_train, y_train)
y_pred = SVM_doc2vec.predict(X_test)

In [34]:
print(classification_report(y_test,y_pred, digits = 4))

#print('Confusion Matrix : \n' + str(confusion_matrix(y_test,y_pred)))

cm = confusion_matrix(y_test, y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm.diagonal())

              precision    recall  f1-score   support

           0     0.5828    0.4513    0.5087       195
           1     0.6777    0.7812    0.7258       288

    accuracy                         0.6480       483
   macro avg     0.6302    0.6163    0.6172       483
weighted avg     0.6394    0.6480    0.6381       483

[0.45128205 0.78125   ]
