# Testing baseline model with `CountVectorizer()`

## Load data

In [1]:
import pandas as pd
df = pd.read_csv('./data/train.csv')
X = df['text']
Y = df['suicide']

## Pre-processing

### WordVectorization with `CountVectorizer()`

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X_set = vectorizer.transform(X)

In [3]:
print("Shape of X:", X_set.shape)
print("Shape of Y:", Y.shape)

Shape of X: (185659, 146321)
Shape of Y: (185659,)


## Building the baseline model

For testing purposes, we will use the Logistic Regression classifier `LogisticRegression()` with its default settings as our baseline model.

In [4]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=10000)

## Evaluating the Baseline Model

### 5-fold Stratified Cross-validation

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=60)
accuracy_score_list, recall_score_list, precision_score_list, f1_score_list = [], [], [], []

for time, (train_index, test_index) in enumerate(skfolds.split(X_set, Y)):
    X_train, X_test = X_set[train_index], X_set[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)

    # Evaluate model
    AccuracyScore = accuracy_score(Y_test, y_pred)
    RecallScore = recall_score(Y_test, y_pred)
    PrecisionScore = precision_score(Y_test, y_pred)
    F1Score = f1_score(Y_test, y_pred)

    # Add to lists
    accuracy_score_list.append(AccuracyScore)
    recall_score_list.append(RecallScore)
    precision_score_list.append(PrecisionScore)
    f1_score_list.append(F1Score)

    # Print the matrix
    print('Time: ', time + 1)
    print('\taccuracy_score: {:.4f}'.format(AccuracyScore))
    print('\trecall_score: {:.4f}'.format(RecallScore))
    print('\tprecision_score: {:.4f}'.format(PrecisionScore))
    print('\tf1_score: {:.4f}'.format(F1Score))

Time:  1
	accuracy_score: 0.9364
	recall_score: 0.9251
	precision_score: 0.9465
	f1_score: 0.9357
Time:  2
	accuracy_score: 0.9340
	recall_score: 0.9221
	precision_score: 0.9445
	f1_score: 0.9332
Time:  3
	accuracy_score: 0.9358
	recall_score: 0.9234
	precision_score: 0.9470
	f1_score: 0.9350
Time:  4
	accuracy_score: 0.9346
	recall_score: 0.9246
	precision_score: 0.9435
	f1_score: 0.9339
Time:  5
	accuracy_score: 0.9331
	recall_score: 0.9201
	precision_score: 0.9448
	f1_score: 0.9323


In [6]:
import numpy as np

# Print the average value of each matrix
print("Accuracy: {:.2%}".format(np.average(accuracy_score_list)))
print("Recall: {:.2%}".format(np.average(recall_score_list)))
print("Precision: {:.2%}".format(np.average(precision_score_list)))
print("F1_score: {:.2%}".format(np.average(f1_score_list)))

Accuracy: 93.48%
Recall: 92.31%
Precision: 94.52%
F1_score: 93.40%


## Save the model

In [7]:
import pickle

clf.fit(X_set, Y)

print(clf.n_iter_)

with open('./model/LR_tfidf.pickle', 'wb') as f:
    pickle.dump(clf, f)

[175]


using code below to load the fitted model:

``` python
with open('./model/LR_tfidf.pickle', 'rb') as f:
    clf = pickle.load(f)
```

In [8]:
vocabulary = vectorizer.vocabulary_

print(clf.coef_[:, vocabulary['happy']])
print(clf.coef_[:, vocabulary['sad']])

[-0.39526121]
[-1.40760916]


In [9]:
n_feats_to_show = 10

# Flip the index so that values are keys and keys are values:
keys = vectorizer.vocabulary_.values()
values = vectorizer.vocabulary_.keys()
vocab_inverted = dict(zip(keys, values))

for c, weights_c in enumerate(clf.coef_):
    print(f'\nWeights for class {c}:\n')
    strongest_idxs = np.argsort(weights_c)[-n_feats_to_show:]

    for idx in strongest_idxs:
        print(f'{vocab_inverted[idx]} with weight {weights_c[idx]}')


Weights for class 0:

alive with weight 6.09891424630061
end with weight 6.826371198324343
mei with weight 6.966087497314663
life with weight 7.137644291173986
killing with weight 7.177212623579987
die with weight 7.484886073292844
pills with weight 8.251502189073777
kill with weight 10.459847896200468
suicidal with weight 13.777891466483391
suicide with weight 16.435349080808024
