In [1]:
import csv
from itertools import groupby

import nltk
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [9]:
# Preprocess the cleaned tweets: tokenise each text, drop stop words and
# one-character tokens, collapse immediate token repetitions, and write the
# surviving (label, text) rows to the file the training cell reads.
stop_words = set(stopwords.words('english'))
extended_stops = {"ya", "yo", "yu", "da", "em", "im", "theres", "dat", "dats", "aint", "thats", "doe", "ur"}
# Negations bypass every filter below, so they are always kept.
negatives = {"no", "none", "not"}

# One detokenizer is enough; it does not need to be rebuilt for every row.
detokenizer = TreebankWordDetokenizer()

# Forward slashes work on Windows as well as POSIX, and match the path the
# training cell uses to read 'data/TwitterPreprocessed.csv' back in.
with open('data/CleanTwitter.csv', 'r', newline='') as clean, \
        open('data/TwitterPreprocessed.csv', 'w', newline='') as prep:
    read = csv.reader(clean, delimiter=',')
    write = csv.writer(prep)

    for i in read:
        # i[0] is copied through unchanged, i[1] is the text that gets filtered.
        word_tokens = nltk.word_tokenize(i[1])

        filtered_sentence = [
            w for w in word_tokens
            if w in negatives
            or (len(w) > 1 and w not in stop_words and w not in extended_stops)
        ]

        # Collapse runs of identical consecutive tokens ("so so so" -> "so").
        filtered_sentence = [token for token, _ in groupby(filtered_sentence)]
        filtered_sentence = detokenizer.detokenize(filtered_sentence)

        # Rows whose text was filtered away completely are dropped.
        if filtered_sentence:
            write.writerow((i[0], filtered_sentence))

In [10]:
# Tokenise the second column of every corpus row, apply the same stop-word
# filter used for the tweets, and store one token list per row in
# allTokens.csv.
stop_words = set(stopwords.words('english'))
extended_stops = {"ya", "yo", "yu", "da", "em", "im", "theres", "dat", "dats", "aint", "thats", "doe", "ur"}
# Negations bypass every filter below, so they are always kept.
negatives = {"no", "none", "not"}

# newline='' is what the csv module asks for on files it reads, so that line
# breaks embedded in quoted fields are parsed correctly.
with open('data\\corpus.csv', 'r', newline='') as corp:
    df = [
        [
            w for w in nltk.word_tokenize(i[1])
            if w in negatives
            or (len(w) > 1 and w not in stop_words and w not in extended_stops)
        ]
        for i in csv.reader(corp)
    ]

# The input file is already closed here; each token list becomes one CSV row.
with open('data\\allTokens.csv', 'w', newline='') as tok:
    csv.writer(tok).writerows(df)

In [7]:
import gensim
from gensim.models import Word2Vec
from gensim.test.utils import datapath

In [8]:
# Load the token lists (one CSV row per sentence) and train 300-dimensional
# Word2Vec embeddings on them. Words seen fewer than 5 times get no vector.
with open('data\\allTokens.csv', 'r') as wordsList:
    words = [row for row in csv.reader(wordsList)]

vector_model = Word2Vec(
    sentences=words,
    size=300,
    window=5,
    min_count=5,
    workers=4,
    sg=0,
)

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import *

import numpy as np

In [3]:
# Build the train/test split and the padded integer sequences fed to the
# models. The first 80% of each class (in file order) goes to training and
# the remainder to the test set, so class proportions are preserved.
maxlen = 128  # every sequence is padded / truncated to this many token ids

# Read the file once; every row must be exactly (label, text).
with open('data/TwitterPreprocessed.csv', 'r', newline='') as dataList:
    rows = [(label, text) for label, text in csv.reader(dataList)]

# Count only the labels the split below actually assigns, so that the 80%
# quota of each class is computed from the right total.
hate = sum(1 for label, _ in rows if label == '1')
noHate = sum(1 for label, _ in rows if label == '0')

train_quota = {'1': round(hate * 0.8), '0': round(noHate * 0.8)}
seen = {'1': 0, '0': 0}

all_data = []
x_train_text, x_test_text = [], []
train_labels, test_labels = [], []

for label, text in rows:
    # Every text contributes to the vocabulary, whatever its label.
    all_data.append(text)

    if label not in seen:
        continue  # rows with any other label are not part of either split

    seen[label] += 1
    if seen[label] <= train_quota[label]:
        x_train_text.append(text)
        train_labels.append(int(label))
    else:
        x_test_text.append(text)
        test_labels.append(int(label))

# Convert once at the end instead of re-allocating with np.append per row.
y_train = np.array(train_labels, dtype='int64')
y_test = np.array(test_labels, dtype='int64')

# NOTE(review): the vocabulary is fitted on all rows, test texts included.
tokenizer = Tokenizer(num_words=12000, oov_token="<UNK>")
tokenizer.fit_on_texts(all_data)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(x_train_text)
x_train = pad_sequences(train_sequences, padding='post', truncating='post', maxlen=maxlen)

test_sequences = tokenizer.texts_to_sequences(x_test_text)
x_test = pad_sequences(test_sequences, padding='post', truncating='post', maxlen=maxlen)

In [34]:
import pickle

In [38]:
# Persist the tokenizer vocabulary (word -> integer index). The context
# manager closes the file even if pickling fails, instead of leaking the
# handle returned by a bare open().
with open("pickles\\word_index.pickle", 'wb') as word_index_file:
    pickle.dump(word_index, word_index_file)

In [4]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding, Activation, Dropout, Bidirectional, Input, Concatenate, Reshape
from tensorflow.keras.layers import LSTM, Conv1D
from tensorflow.keras.layers import GlobalMaxPooling1D, MaxPooling1D, GlobalAveragePooling1D
from keras import regularizers

import tensorflow as tf
import keras

Using TensorFlow backend.


In [5]:
def initWeights():
    """Build the weight matrix for the embedding layer from the word vectors.

    Reads the notebook globals ``max_features``, ``dim_count``,
    ``word_index`` and ``vector_model``.

    Returns:
        A ``(max_features, dim_count)`` float array. Row ``index`` holds the
        vector of the word that ``word_index`` maps to ``index``. Rows for
        words that have no vector, and any row no word maps to, stay zero.

    Raises:
        ValueError: if a vector's length differs from ``dim_count``.
        IndexError: if an index in ``word_index`` is >= ``max_features``.
    """
    embedding_weights = np.zeros((max_features, dim_count))

    # A gensim Word2Vec model keeps its vectors under `.wv`; fall back to the
    # object itself so a bare KeyedVectors or a plain mapping also works.
    vectors = getattr(vector_model, "wv", vector_model)

    for word, index in word_index.items():
        try:
            embedding_weights[index, :] = vectors[word]
        except KeyError:
            # No vector for this word: its row is already all zeros.
            continue

    return embedding_weights

In [47]:
# BiLSTM -> LSTM classifier on top of frozen Word2Vec embeddings.
# Regularizers and metrics come from tf.keras, the same package as the
# layers and Model, rather than from the standalone `keras` package.
dim_count = 300  # must equal the vector size the Word2Vec model was trained with

max_features = len(word_index) + 1
embedding_weights = initWeights()
vocab_len = len(embedding_weights)

# The input length is tied to `maxlen`, the length x_train / x_test were padded to.
embedding_input = Input(shape=(maxlen,), name='embedding_input')

embedding = Embedding(max_features, dim_count, trainable=False, weights=[embedding_weights])(embedding_input)
bilstm = Bidirectional(
    LSTM(
        128,
        dropout=0.4,
        recurrent_dropout=0.4,
        return_sequences=True,
        activity_regularizer=tf.keras.regularizers.l1(0.01),
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    )
)(embedding)
lstm = LSTM(128, dropout=0.4, recurrent_dropout=0.4, return_sequences=True)(bilstm)

# Summarise the sequence with both its max and its mean over time.
maxPool = GlobalMaxPooling1D()(lstm)
averagePool = GlobalAveragePooling1D()(lstm)
concat = Concatenate()([maxPool, averagePool])
dropout = Dropout(0.2)(concat)
dense = Dense(128, activation='relu')(dropout)
dense = Dense(64, activation='relu')(dense)
out = Dense(1, activation='sigmoid')(dense)

model = Model(inputs=embedding_input, outputs=out)

model.compile(
    loss='binary_crossentropy',
    optimizer='Adam',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
)
# NOTE(review): the test split doubles as validation data here, so the
# evaluate() figures below are not from data unseen during training runs.
model.fit(x_train, y_train, batch_size=32, epochs=20, validation_data=(x_test, y_test), shuffle=True)
model.evaluate(x_test, y_test, batch_size=32, verbose=2)

  """
  # This is added back by InteractiveShellApp.init_path()


Train on 7141 samples, validate on 1785 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
1785/1785 - 6s - loss: 0.3056 - accuracy: 0.9053 - precision_2: 0.9045 - recall_2: 0.8780


[0.305625716419447, 0.90532213, 0.90446866, 0.8779811]

In [49]:
# Write whatever model is currently bound to `model` to an .h5 file.
# NOTE(review): the path uses Windows separators and presumably needs the
# NN_models directory to exist already — confirm.
model.save("NN_models\\BiLSTM-LSTM_Vinf.h5")

In [10]:
# LSTM -> Conv1D -> Conv1D classifier on top of frozen Word2Vec embeddings.
# Regularizers and metrics come from tf.keras, the same package as the
# layers and Sequential, rather than from the standalone `keras` package.
dim_count = 300  # must equal the vector size the Word2Vec model was trained with

max_features = len(word_index) + 1
embedding_weights = initWeights()
vocab_len = len(embedding_weights)

model = Sequential()

model.add(Embedding(max_features, dim_count, trainable=False, weights=[embedding_weights]))
model.add(LSTM(
    128,
    dropout=0.2,
    recurrent_dropout=0.2,
    return_sequences=True,
    activity_regularizer=tf.keras.regularizers.l1(0.001),
    kernel_regularizer=tf.keras.regularizers.l2(0.001),
))
# Two width-3 convolutions that halve the channel count each time (64, then 32).
model.add(Conv1D(
    64,
    3,
    kernel_initializer='he_normal',
    activation='relu',
    padding='valid',
    activity_regularizer=tf.keras.regularizers.l1(0.001),
    kernel_regularizer=tf.keras.regularizers.l2(0.001),
))
model.add(Conv1D(
    32,
    3,
    kernel_initializer='he_normal',
    activation='relu',
    padding='valid',
    activity_regularizer=tf.keras.regularizers.l1(0.001),
    kernel_regularizer=tf.keras.regularizers.l2(0.001),
))
model.add(Dropout(0.4))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='Adam',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
)
# NOTE(review): the test split doubles as validation data here, so the
# evaluate() figures below are not from data unseen during training runs.
model.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_test, y_test), shuffle=True)
model.evaluate(x_test, y_test, batch_size=32, verbose=2)

  """
  # This is added back by InteractiveShellApp.init_path()


Train on 7141 samples, validate on 1785 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
1785/1785 - 3s - loss: 0.3061 - accuracy: 0.9188 - precision_2: 0.9337 - recall_2: 0.8968


[0.30614780904699107, 0.9187675, 0.93371445, 0.89681536]