<h1>Importing Libraries</h1>

In [1]:
"""
Code borrowed from: https://github.com/AminuIsrael/Predicting-Suicide-Ideation
Article link: https://towardsdatascience.com/building-a-suicidal-tweet-classifier-using-nlp-ff6ccd77e971
"""

import pickle
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
nltk.download('stopwords')

import torch

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU so
# that anything placed on `device` still works on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rezwa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<h1>Data Preprocessing</h1>

In [2]:
"""
The following function:
- removes any form of HTML markup
- keeps emoticon characters 
- removes non-word characters
- converts text to lowercase
"""
def preprocess_tweet(text):
    """Normalise a raw tweet into lowercase word text followed by its emoticons.

    Steps:
    - strip anything that looks like an HTML tag (``<...>``)
    - collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` from the
      original-case text
    - lowercase the text and collapse every run of non-word characters into
      a single space
    - append the collected emoticons (with the ``-`` "nose" removed),
      separated from the words and from each other by single spaces

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. When no emoticon is found this is just the
        lowercased, space-collapsed text.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Emoticons must be captured before the \W+ substitution wipes them out.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'\W+', ' ', text.lower())
    if not emoticons:
        return cleaned
    # Guarantee exactly one space between the last word and the first
    # emoticon; plain concatenation would glue them together ("day:)")
    # whenever the text ends in a word character.
    return cleaned.rstrip() + ' ' + ' '.join(emoticons).replace('-', '')

In [3]:
# Register `progress_apply` on pandas objects so the cleaning pass below
# reports its progress.
tqdm.pandas()

# Read the labelled tweets directly from the Amity repository.
df = pd.read_csv(
    'https://raw.githubusercontent.com/natasharavinand/Amity/main/data/model_inputs/all_data.csv'
)

# Replace every raw tweet with its cleaned form (see preprocess_tweet).
df['tweet'] = df['tweet'].progress_apply(preprocess_tweet)

100%|██████████| 20446/20446 [00:00<00:00, 69694.41it/s]


In [4]:
"""
The tokenizer_porter() function does two things - tokenization and word stemming.
The PorterStemmer class performs word stemming, i.e. it reduces words to their stems, e.g. 'running' and 'runs' both become 'run' (some forms, such as 'runner', are left unchanged).
The split() method tokenizes the text into words by splitting on whitespace.
"""

from nltk.stem.porter import PorterStemmer
# One shared stemmer instance, reused by every call to tokenizer_porter().
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split `text` on whitespace and return the Porter stem of each piece.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list
        One stem per whitespace-separated token, in the original order.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [5]:
# Load NLTK's English stop-word list (words like "at", "when", "the").
# Nothing is removed here: other cells filter tokens against `stop`.
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [6]:
# Sanity check for the user-defined tokenizer_porter() function: stem a
# sample sentence, then keep only the stems that are not stop words.
list(filter(lambda stem: stem not in stop,
            tokenizer_porter('a runner likes running and runs a lot')))

['runner', 'like', 'run', 'run', 'lot']

In [7]:
def tokenizer(text):
    """Clean `text` and return its stemmed tokens with stop words removed.

    The cleaning mirrors preprocess_tweet(): HTML-like tags are stripped,
    emoticons are collected, the text is lowercased with non-word runs
    collapsed to single spaces, and the emoticons (minus any ``-`` nose) are
    appended as separate tokens. The result is stemmed by tokenizer_porter()
    and filtered against the `stop` list.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list
        Stemmed tokens that are not in `stop`.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Use the same emoticon pattern as preprocess_tweet() and search the
    # original-case text: a pattern without '\)' silently drops ':)', and
    # searching lowercased text can never match the 'D' / 'P' alternatives.
    # NOTE(review): a vectorizer that lowercases its input before calling
    # this tokenizer will still hide ':D' / ':P' - confirm whether the
    # vectorizer should be built with lowercase=False.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub(r'\W+', ' ', text.lower())
    if emoticons:
        # Keep the first emoticon from being glued onto the last word.
        text = text.rstrip() + ' ' + ' '.join(emoticons).replace('-', '')
    # NOTE(review): stop words are checked after stemming, so stop words
    # whose stem differs from the word itself (presumably e.g. 'this' ->
    # 'thi') are not filtered - confirm whether that is intended.
    tokenized = [w for w in tokenizer_porter(text) if w not in stop]
    return tokenized

<h1>Using the Hashing Vectorizer</h1>

In [8]:
# Feature hashing: converting each tweet's tokens into a fixed-length sparse vector (a hashed bag-of-words, not a learned word embedding).
from sklearn.feature_extraction.text import HashingVectorizer
# The vectorizer delegates tokenization to the user-defined tokenizer()
# and hashes the resulting tokens into 2**21 feature columns.
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)

<h1>Building the Model</h1>

In [9]:
from sklearn.linear_model import SGDClassifier
# scikit-learn 1.1 renamed the logistic-regression loss from 'log' to
# 'log_loss', and 1.3 removed the old spelling. Pick whichever name the
# installed version understands so this cell runs on old and new releases.
import sklearn
_sk_major, _sk_minor = (
    int(part) for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups()
)
LOG_LOSS = 'log_loss' if (_sk_major, _sk_minor) >= (1, 1) else 'log'

# Logistic-regression classifier trained with stochastic gradient descent;
# the fixed random_state keeps runs reproducible.
clf = SGDClassifier(loss=LOG_LOSS, random_state=1)

In [10]:
# Target: the `at_risk` column. Features: the cleaned tweets as a plain list.
y = df['at_risk']
X = df['tweet'].tolist()

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets for evaluation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.20
)

In [12]:
# Turn both text splits into hashed feature matrices.
# NOTE: this rebinds X_train / X_test from raw text to matrices, so re-run
# the train/test split cell before running this cell a second time.
X_train, X_test = (vect.transform(part) for part in (X_train, X_test))

In [13]:
# The set of class labels is passed explicitly to this first
# partial_fit call on the training split.
classes = np.asarray((0, 1))
clf.partial_fit(X=X_train, y=y_train, classes=classes)

SGDClassifier(loss='log', random_state=1)

In [14]:
# Mean accuracy on the held-out split, shown to three decimal places.
print(f'Accuracy: {clf.score(X_test, y_test):.3f}')

Accuracy: 0.850


In [15]:
# Update the model with the held-out data as well.
# NOTE: after this cell the classifier has seen X_test, so scoring on
# X_test again would no longer be an unbiased estimate.
clf = clf.partial_fit(X=X_test, y=y_test)

<h1>Testing and Making Predictions</h1>

In [16]:
# Classify one hand-written example and report the predicted class together
# with the highest class probability the model assigns.
label = {0:'neutral', 1:'at-risk'}
example = ["I'll kill myself am tired of living depressed and alone"]
# Use a dedicated name for the vectorised example: rebinding `X` here would
# replace the tweet list that the train/test split cell reads.
X_example = vect.transform(example)
predicted = clf.predict(X_example)[0]
confidence = np.max(clf.predict_proba(X_example)) * 100
print('Prediction: %s\nProbability: %.2f%%' % (label[predicted], confidence))

Prediction: at-risk
Probability: 99.82%


In [17]:
# Classify a second hand-written example and report the predicted class
# together with the highest class probability the model assigns.
label = {0:'neutral', 1:'at-risk'}
example = ["It's such a hot day, I'd like to have ice cream and visit the park"]
# Use a dedicated name for the vectorised example: rebinding `X` here would
# replace the tweet list that the train/test split cell reads.
X_example = vect.transform(example)
predicted = clf.predict(X_example)[0]
confidence = np.max(clf.predict_proba(X_example)) * 100
print('Prediction: %s\nProbability: %.2f%%' % (label[predicted], confidence))

Prediction: neutral
Probability: 87.48%
