In [84]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn import feature_extraction, linear_model, model_selection, metrics
from sklearn import ensemble
from scipy import sparse
import sys
sys.path.append("..")
from src.make_model_lstm import hate_speech_model

In [85]:
#watch overfitting


In [86]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [87]:
df = pd.read_csv('../data/labeled_data.csv')

In [88]:
tweets = df['tweet'].apply(lambda x: x.lstrip('!'))

In [89]:
y = np.argmax(df[['hate_speech', 'offensive_language', 'neither']].values, axis = 1)

In [90]:
stop_words = set(stopwords.words('english')) 

In [91]:
token = nltk.tokenize.casual.TweetTokenizer()

In [92]:
lemmatizer = WordNetLemmatizer()

In [93]:
cleaned = []
wordDict = {}
i = 1
for tweet in tweets:
    tokenized = token.tokenize(tweet)
    newSent = []
    for word in tokenized:
        if word not in stop_words:
            newWord = lemmatizer.lemmatize(word)
            if newWord not in wordDict:
                wordDict[newWord] = i
                i += 1
            newSent.append(newWord)
            
    cleaned.append(newSent)

In [94]:
print(cleaned[0])
print(len(cleaned))

['RT', '@mayasolovely', ':', 'As', 'woman', 'complain', 'cleaning', 'house', '.', '&', 'man', 'always', 'take', 'trash', '...']
24783


In [95]:
maxWords = max(map(len, cleaned))

In [96]:
def create_seq(sent, vocab, maxWords):
    n = len(sent)
    numZeros = maxWords - n
    result = [0]*numZeros
    
    for word in sent:
        if word in vocab:
            result.append(vocab[word] + 2)
        else:
            result.append(1)
            
    return result

In [97]:
sequences = np.array(list(map(lambda x: create_seq(x, wordDict, maxWords), cleaned)))

In [98]:
sequences[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [99]:
y_cat = np.zeros((len(y), 3))
for i, l in enumerate(y):
    y_cat[i, l] = 1

In [100]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(sequences, y_cat, test_size = .1)

In [101]:
print(max(wordDict.values()))

41243


In [154]:
model = hate_speech_model()
model.build_model()
model.fit(X_train, y_train, epochs = 100, class_weight = {0: 1, 1: .05, 2: .15})
preds = model.predict(X_test)

>> Compiled...
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train on 21188 samples, validate on 1116 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100


In [155]:
print(metrics.roc_auc_score(y_test, preds))

0.8694688863729785


In [156]:
y_true = np.argmax(y_test, axis = 1)
y_pred = np.argmax(preds, axis = 1)

In [157]:
print(metrics.classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.14      0.74      0.24       132
           1       0.95      0.70      0.80      1959
           2       0.71      0.61      0.66       388

    accuracy                           0.69      2479
   macro avg       0.60      0.68      0.57      2479
weighted avg       0.87      0.69      0.75      2479



In [158]:
print(metrics.confusion_matrix(y_true, y_pred))

[[  98   22   12]
 [ 504 1371   84]
 [  97   55  236]]
