# Testing baseline model with `CountVectorizer()`

## Load data

In [1]:
import pandas as pd
df = pd.read_csv('./data/train.csv')
X = df['text']
Y = df['suicide']

## Pre-processing

### WordVectorization with `CountVectorizer()`

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(X)
X_set = vectorizer.transform(X)

In [3]:
print("Shape of X:", X_set.shape)
print("Shape of Y:", Y.shape)

Shape of X: (185659, 146321)
Shape of Y: (185659,)


## Building the baseline model

For testing purposes, we will use the Logistic Regression classifier `LogisticRegression()` with its default settings as our baseline model.

In [4]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=10000)

## Evaluating the Baseline Model

### 5-fold Stratified Cross-validation

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=60)
accuracy_score_list, recall_score_list, precision_score_list, f1_score_list = [], [], [], []

for time, (train_index, test_index) in enumerate(skfolds.split(X_set, Y)):
    X_train, X_test = X_set[train_index], X_set[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)

    # Evaluate model
    AccuracyScore = accuracy_score(Y_test, y_pred)
    RecallScore = recall_score(Y_test, y_pred)
    PrecisionScore = precision_score(Y_test, y_pred)
    F1Score = f1_score(Y_test, y_pred)

    # Add to lists
    accuracy_score_list.append(AccuracyScore)
    recall_score_list.append(RecallScore)
    precision_score_list.append(PrecisionScore)
    f1_score_list.append(F1Score)

    # Print the matrix
    print('Time: ', time + 1)
    print('\taccuracy_score: {:.4f}'.format(AccuracyScore))
    print('\trecall_score: {:.4f}'.format(RecallScore))
    print('\tprecision_score: {:.4f}'.format(PrecisionScore))
    print('\tf1_score: {:.4f}'.format(F1Score))

Time:  1
	accuracy_score: 0.9350
	recall_score: 0.9152
	precision_score: 0.9531
	f1_score: 0.9338
Time:  2
	accuracy_score: 0.9316
	recall_score: 0.9120
	precision_score: 0.9492
	f1_score: 0.9303
Time:  3
	accuracy_score: 0.9322
	recall_score: 0.9121
	precision_score: 0.9505
	f1_score: 0.9309
Time:  4
	accuracy_score: 0.9333
	recall_score: 0.9133
	precision_score: 0.9515
	f1_score: 0.9320
Time:  5
	accuracy_score: 0.9327
	recall_score: 0.9144
	precision_score: 0.9492
	f1_score: 0.9315


In [6]:
import numpy as np

# Print the average value of each matrix
print("Accuracy: {:.2%}".format(np.average(accuracy_score_list)))
print("Recall: {:.2%}".format(np.average(recall_score_list)))
print("Precision: {:.2%}".format(np.average(precision_score_list)))
print("F1_score: {:.2%}".format(np.average(f1_score_list)))

Accuracy: 93.30%
Recall: 91.34%
Precision: 95.07%
F1_score: 93.17%


## Save the model

In [7]:
import pickle

clf.fit(X_set, Y)

print(clf.n_iter_)

with open('./model/LR_count.pickle', 'wb') as f:
    pickle.dump(clf, f)

[2253]


using code below to load the fitted model:

``` python
with open('./model/LR_count.pickle', 'rb') as f:
    clf = pickle.load(f)
```

In [8]:
vocabulary = vectorizer.vocabulary_

print(clf.coef_[:, vocabulary['happy']])
print(clf.coef_[:, vocabulary['sad']])

[-0.14041675]
[-0.19636139]


In [9]:
n_feats_to_show = 10

# Flip the index so that values are keys and keys are values:
keys = vectorizer.vocabulary_.values()
values = vectorizer.vocabulary_.keys()
vocab_inverted = dict(zip(keys, values))

for c, weights_c in enumerate(clf.coef_):
    print(f'\nWeights for class {c}:\n')
    strongest_idxs = np.argsort(weights_c)[-n_feats_to_show:]

    for idx in strongest_idxs:
        print(f'{vocab_inverted[idx]} with weight {weights_c[idx]}')


Weights for class 0:

suicidei with weight 3.3283044555491035
sw with weight 3.3344068213180083
upi with weight 3.354254375064318
lifei with weight 3.7136275275338586
myselfi with weight 3.7195201693589732
iti with weight 3.787535432196934
anymorei with weight 3.9350069893944677
diei with weight 4.134013213528239
helpi with weight 4.426595743027976
mei with weight 4.700628407770826
