In [1]:
import pickle
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
# Fetch the NLTK stop-word corpus used further down for stop-word removal.
# When the corpus is already present locally this only reports that it is
# up to date (see the cell output below).
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/luofan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

I then wrote a function to clean the text data: it removes any HTML markup, keeps emoticon characters, removes non-word characters, and converts the text to lowercase.

In [3]:
def preprocess_tweet(text):
    """Clean a raw tweet so it can be tokenized.

    HTML-like tags are stripped, emoticons such as ``:)``, ``;-(``, ``=D`` and
    ``:P`` are collected, the text is lowercased with every run of non-word
    characters collapsed to a single space, and the collected emoticons (with
    any ``-`` "nose" removed) are appended at the end.

    Args:
        text: Raw tweet as a string.

    Returns:
        The lowercased, cleaned text followed by the space-separated
        emoticons. When no emoticon is found, the cleaned text is returned
        unchanged.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Search before lowercasing so the uppercase 'D' / 'P' mouths can match.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    lowercase_text = re.sub(r'[\W]+', ' ', text.lower())
    if not emoticons:
        return lowercase_text
    # Without a separator the first emoticon is glued onto the last word
    # whenever the cleaned text ends in a word character ("ok" + ":)").
    if not lowercase_text.endswith(' '):
        lowercase_text += ' '
    return lowercase_text + ' '.join(emoticons).replace('-', '')

After that, I applied the preprocess_tweet function to the tweet dataset to clean the data.

In [None]:
# Register tqdm's pandas integration; it must run before `progress_apply`
# is used below.
tqdm.pandas()
# Load the tweet dataset from the working directory.
df = pd.read_csv('suicide_datascientist.csv')
# Clean every tweet, overwriting the raw text in the 'tweet' column.
df['tweet'] = df['tweet'].progress_apply(preprocess_tweet)

Then I converted the text to tokens by using the .split() method and used word stemming to reduce each word to its root form.

In [None]:
from nltk.stem.porter import PorterStemmer

# A single stemmer instance shared by every call to tokenizer_porter.
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each piece.

    Args:
        text: String to tokenize.

    Returns:
        A list with one stemmed token per whitespace-separated word, in the
        original order.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

Then I imported the stopwords library to remove stop words in the text.

In [None]:
from nltk.corpus import stopwords

# English stop words, held in a set: the list is only ever used for
# `word in stop` membership tests, which run once per token, so constant-time
# lookup avoids a linear scan of the whole list for every token.
stop = set(stopwords.words('english'))

Testing the function on a single text.

In [None]:
# Stem the sample sentence, then keep only the stems that are not in the
# stop-word collection; the resulting list is displayed as the cell output.
[w for w in tokenizer_porter('a runner likes running and runs a lot') if w not in stop]

# Vectorizer


For this project, I used the Hashing Vectorizer because it is data-independent: it doesn’t store a vocabulary dictionary in memory, so it uses very little memory and scales to large datasets. I then created a tokenizer function for the Hashing Vectorizer.

In [None]:
def tokenizer(text):
    """Turn a piece of text into stemmed, stop-word-free tokens.

    Intended as the ``tokenizer`` callable of the hashing vectorizer.

    Args:
        text: String to tokenize.

    Returns:
        A list of Porter-stemmed tokens with English stop words removed;
        emoticons recognised by ``preprocess_tweet`` are kept as tokens.
    """
    # Reuse the shared cleaning step instead of a second, diverging copy of
    # it: the previous inline pattern had no `\)` alternative (so ':)' was
    # silently dropped) and was matched against already-lowercased text (so
    # the uppercase 'D' / 'P' alternatives could never match).
    cleaned = preprocess_tweet(text)
    # Filter stop words BEFORE stemming. The stop-word collection holds
    # unstemmed forms, so filtering stems lets words such as "this" -> "thi"
    # or "was" -> "wa" slip through.
    kept_words = ' '.join(w for w in cleaned.split() if w not in stop)
    return tokenizer_porter(kept_words)

Then I created the Hashing Vectorizer object.

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer

# Stateless vectorizer: tokens are hashed into 2**21 feature columns, so no
# vocabulary has to be fitted or kept in memory.
#
# lowercase=False: the custom tokenizer lowercases the text itself, so the
#   vectorizer's built-in lowercasing pass (which runs before the tokenizer)
#   is redundant; disabling it also hands the tokenizer the original casing,
#   which case-sensitive emoticon patterns such as ':D' need.
# token_pattern=None: the default pattern is unused once a tokenizer is
#   supplied; passing None avoids scikit-learn's "token_pattern will not be
#   used" warning.
vect = HashingVectorizer(decode_error='ignore', n_features=2**21,
                         preprocessor=None, tokenizer=tokenizer,
                         lowercase=False, token_pattern=None)

# Model

For the Model, I used the stochastic gradient descent classifier algorithm.

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

# Logistic regression trained with stochastic gradient descent.
# The loss was spelled 'log' in older scikit-learn; that name was deprecated
# in 1.1 and removed in 1.3, where only 'log_loss' is accepted.
# (On scikit-learn < 1.1 use loss='log' instead.)
clf = SGDClassifier(loss='log_loss', random_state=1)

# Features are the cleaned tweet strings, targets the 'label' column.
X = df["tweet"].to_list()
y = df['label']

# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=0)

Then I transformed the text data to vectors with the Hashing Vectorizer we created earlier:

In [None]:
# Replace the raw tweet strings with their hashed feature matrices.
# NOTE: both names are overwritten in place, so this cell is not safe to run
# twice without first re-running the train/test split cell.
X_train = vect.transform(X_train)
X_test = vect.transform(X_test)

In [None]:
# Label values passed to partial_fit as the full set of classes.
# Assumes the 'label' column contains only 0 and 1 — TODO confirm.
classes = np.array([0, 1])
# Incrementally train the classifier on the training split.
clf.partial_fit(X_train, y_train,classes=classes)

In [None]:
# Report the classifier's score on the held-out test split, to 3 decimals.
print('Accuracy: %.3f' % clf.score(X_test, y_test))

I had an accuracy of 91%, which is fair enough. After that, I updated the model with the held-out test data, so that it has been trained on every labelled tweet.

In [None]:
# Continue training on the test split so the model has seen all labelled data.
# NOTE: once this cell has run, re-running the accuracy cell scores the model
# on samples it was trained on, so that number is no longer a held-out
# estimate.
clf = clf.partial_fit(X_test, y_test)

# Testing and Making Predictions

I passed the text “I’ll kill myself am tired of living depressed and alone” to the model to get a prediction:

In [None]:
# Readable names for the classifier's two integer classes.
label = {0: 'negative', 1: 'positive'}
example = ["I'll kill myself am tired of living depressed and alone"]
X = vect.transform(example)

# Predicted class for the single example, and the highest class probability
# expressed as a percentage.
predicted_class = clf.predict(X)[0]
top_probability = np.max(clf.predict_proba(X)) * 100
print('Prediction: %s\nProbability: %.2f%%' % (label[predicted_class], top_probability))

And when I used the following text “It’s such a hot day, I’d like to have ice cream and visit the park”, I got the following prediction:

In [None]:
# Readable names for the classifier's two integer classes.
label = {0: 'negative', 1: 'positive'}
example = ["It's such a hot day, I'd like to have ice cream and visit the park"]
X = vect.transform(example)

# Predicted class for the single example, and the highest class probability
# expressed as a percentage.
predicted_class = clf.predict(X)[0]
top_probability = np.max(clf.predict_proba(X)) * 100
print('Prediction: %s\nProbability: %.2f%%' % (label[predicted_class], top_probability))

The model was able to predict accurately for both cases. And that's how you build a simple suicidal tweet classifier.