## Imports all in one place

In [98]:
import csv
import nltk
import warnings
import numpy as np
from numpy.random import seed
from gensim.models import Word2Vec
from keras.models import Sequential
from keras.models import model_from_json
from keras.layers import Dense, Conv2D, Flatten
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

### Ignoring warnings

In [99]:
# Silence every warning for the rest of the kernel session.
# NOTE(review): no category/module filter is given, so this also hides
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

### FUNCTION: Load LSTM Model

In [100]:
def get_lstm_model():
    """Load the saved LSTM model from disk and compile it.

    The architecture is read from ``./models/lstm/lstm.json`` and the weights
    from ``./models/lstm/lstm.h5``. The model is compiled with the ``adam``
    optimizer and ``binary_crossentropy`` loss.

    Returns:
        The compiled model.
    """
    # A context manager guarantees the handle is closed even if read() raises.
    with open('./models/lstm/lstm.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    model = model_from_json(loaded_model_json)
    model.load_weights('./models/lstm/lstm.h5')
    model.compile(optimizer = 'adam', loss = 'binary_crossentropy')
    return model

### FUNCTION: Load CNN Model

In [101]:
def get_cnn_model():
    """Load the saved CNN model from disk and compile it.

    The architecture is read from ``./models/cnn/cnn.json`` and the weights
    from ``./models/cnn/cnn.h5``. The model is compiled with the ``adam``
    optimizer and ``binary_crossentropy`` loss.

    Returns:
        The compiled model.
    """
    # A context manager guarantees the handle is closed even if read() raises.
    with open('./models/cnn/cnn.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    model = model_from_json(loaded_model_json)
    model.load_weights('./models/cnn/cnn.h5')
    model.compile(optimizer = 'adam', loss = 'binary_crossentropy')
    return model

### FUNCTION: Load Word2Vec Model

In [102]:
def get_word2vec_model():
    """Load and return the Word2Vec model saved at ./models/word2vec/word2vec.model."""
    saved_model_path = './models/word2vec/word2vec.model'
    return Word2Vec.load(saved_model_path)

### FUNCTION: Train Word2Vec Model

In [103]:
def train_word2vec_model(path):
    """Train a skip-gram Word2Vec model on a CSV corpus and save it.

    The second column (index 1) of every row is tokenized with
    ``nltk.word_tokenize`` and used as one training sentence. The trained
    model is written to ``./models/word2vec/word2vec.model``.

    Args:
        path: Path to the comma-separated corpus file.

    Returns:
        The trained ``Word2Vec`` model.
    """
    # newline='' lets the csv module handle line endings inside quoted fields;
    # the context manager closes the file even if tokenization fails.
    with open(path, encoding = 'utf-8', errors = 'ignore', newline = '') as training_file:
        csv_reader = csv.reader(training_file, delimiter = ',')
        # csv.reader yields [] for blank lines; skip them instead of failing on row[1].
        wordVecDataset = [nltk.word_tokenize(row[1]) for row in csv_reader if row]
    # NOTE(review): `size` is the gensim 3.x keyword for the embedding dimension;
    # gensim 4 renamed it to `vector_size` -- confirm the installed version.
    model = Word2Vec(wordVecDataset, min_count = 1, size = 50, workers = 3, window = 3, sg = 1)
    model.save('./models/word2vec/word2vec.model')
    return model

### FUNCTION: Train Bi-LSTM Model

In [104]:
def train_lstm_model(x_train, y_train, x_test, y_test, epochs):
    """Build, fit and persist the bidirectional LSTM classifier.

    The network is a ``Bidirectional(LSTM(128))`` layer followed by a 2-unit
    softmax ``Dense`` layer, compiled with ``adam`` / ``binary_crossentropy``.
    After fitting, the architecture is saved to ``./models/lstm/lstm.json`` and
    the weights to ``./models/lstm/lstm.h5``.

    Args:
        x_train, y_train: Training inputs and labels.
        x_test, y_test: Data passed to ``fit`` as ``validation_data``.
        epochs: Number of training epochs.

    Returns:
        The fitted model.
    """
    recurrent_layer = Bidirectional(LSTM(128, input_shape = (128, 1)))
    network = Sequential()
    network.add(recurrent_layer)
    network.add(Dense(2, activation = 'softmax'))
    network.compile(optimizer = 'adam', loss = 'binary_crossentropy')

    network.fit(
        x_train,
        y_train,
        validation_data = (x_test, y_test),
        batch_size = 200,
        epochs = epochs,
        shuffle = True,
    )

    # Persist the architecture (JSON) and the weights (HDF5) separately.
    with open('./models/lstm/lstm.json', 'w') as architecture_file:
        architecture_file.write(network.to_json())
    network.save_weights('./models/lstm/lstm.h5')
    return network

### FUNCTION: Dataset for Bi-LSTM Model

In [105]:
def get_lstm_data(path):
    """Build the Bi-LSTM input/label arrays from a CSV file.

    For every non-empty row, column 1 is tokenized and each token is reduced
    to one scalar: ``round(abs(sum(vector)) * 10, 4)`` of its Word2Vec vector.
    Each tweet becomes a sequence of 128 one-element lists (zero-padded, and
    truncated if it has more than 128 tokens). Column 2 is the integer label:
    ``0`` maps to ``[0, 1]``, anything else to ``[1, 0]``.

    Args:
        path: Path to the comma-separated dataset file.

    Returns:
        Tuple ``(X, Y)`` of numpy arrays built from the per-tweet sequences
        and the one-hot labels.
    """
    sequence_length = 128  # matches input_shape = (128, 1) used by train_lstm_model
    wordvecModel = get_word2vec_model()
    # Index through `.wv`: subscripting the model object itself is deprecated in gensim.
    # NOTE(review): a token missing from the vocabulary presumably raises KeyError -- confirm.
    vectors = wordvecModel.wv
    X, Y = [], []
    # newline='' is what the csv module expects; `with` closes the file on any error.
    with open(path, encoding = 'utf-8', errors = 'ignore', newline = '') as file:
        csv_reader = csv.reader(file, delimiter = ',')
        for row in csv_reader:
            if not row:
                # Blank line in the CSV: nothing to encode.
                continue
            # Truncate long tweets so every sequence has exactly `sequence_length` steps;
            # without this the padding count goes negative and X becomes ragged.
            tokens = nltk.word_tokenize(row[1])[:sequence_length]
            embeddings = [[round(abs(sum(vectors[token])) * 10, 4)] for token in tokens]
            embeddings += [[0]] * (sequence_length - len(embeddings))
            X.append(embeddings)
            Y.append([0, 1] if int(row[2]) == 0 else [1, 0])
    X = np.array(X)
    Y = np.array(Y)
    return X, Y

### FUNCTION: Train CNN Model

In [106]:
def train_cnn_model(x_train, y_train, x_test, y_test, epochs):
    """Build, fit and persist the CNN classifier.

    The network is a 128-filter ``Conv2D`` layer (kernel size 1, ReLU) over
    inputs of shape ``(1, 1, 128)``, flattened and followed by a 2-unit softmax
    ``Dense`` layer. It is compiled with ``adam`` / ``binary_crossentropy`` and
    tracks accuracy. After fitting, the architecture is saved to
    ``./models/cnn/cnn.json`` and the weights to ``./models/cnn/cnn.h5``.

    Args:
        x_train, y_train: Training inputs and labels.
        x_test, y_test: Data passed to ``fit`` as ``validation_data``.
        epochs: Number of training epochs.

    Returns:
        The fitted model.
    """
    network = Sequential()
    network.add(Conv2D(128, kernel_size = 1, activation = 'relu', input_shape = (1,1,128)))
    network.add(Flatten())
    network.add(Dense(2, activation = 'softmax'))
    network.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

    network.fit(
        x_train,
        y_train,
        validation_data = (x_test, y_test),
        batch_size = 200,
        epochs = epochs,
        shuffle = True,
    )

    # Persist the architecture (JSON) and the weights (HDF5) separately.
    with open('./models/cnn/cnn.json', 'w') as architecture_file:
        architecture_file.write(network.to_json())
    network.save_weights('./models/cnn/cnn.h5')
    return network

### FUNCTION: Dataset for CNN Model

In [107]:
def get_cnn_data(path):
    """Build the CNN input/label arrays from a CSV file.

    For every non-empty row, column 1 is tokenized and each token is reduced
    to one scalar: ``round(abs(sum(vector)) * 10, 4)`` of its Word2Vec vector.
    Each tweet becomes a list of 128 scalars (zero-padded, and truncated if it
    has more than 128 tokens) wrapped as ``[[values]]``. Column 2 is the
    integer label: ``0`` maps to ``[0, 1]``, anything else to ``[1, 0]``.

    Args:
        path: Path to the comma-separated dataset file.

    Returns:
        Tuple ``(X, Y)`` of numpy arrays built from the per-tweet samples
        and the one-hot labels.
    """
    feature_count = 128  # matches input_shape = (1,1,128) used by train_cnn_model
    wordvecModel = get_word2vec_model()
    # Index through `.wv`: subscripting the model object itself is deprecated in gensim.
    # NOTE(review): a token missing from the vocabulary presumably raises KeyError -- confirm.
    vectors = wordvecModel.wv
    X, Y = [], []
    # newline='' is what the csv module expects; `with` closes the file on any error.
    with open(path, encoding = 'utf-8', errors = 'ignore', newline = '') as file:
        csv_reader = csv.reader(file, delimiter = ',')
        for row in csv_reader:
            if not row:
                # Blank line in the CSV: nothing to encode.
                continue
            # Truncate long tweets so every sample has exactly `feature_count` values;
            # without this the padding count goes negative and X becomes ragged.
            tokens = nltk.word_tokenize(row[1])[:feature_count]
            embeddings = [round(abs(sum(vectors[token])) * 10, 4) for token in tokens]
            embeddings += [0] * (feature_count - len(embeddings))
            X.append([[embeddings]])
            Y.append([0, 1] if int(row[2]) == 0 else [1, 0])
    X = np.array(X)
    Y = np.array(Y)
    return X, Y

### FUNCTION: Test model

In [108]:
def test_model(model, x_test, y_test):
    pred_output = model.predict(x_test)
    matrix = [[0, 0], [0, 0]]
    actual_yes, actual_no, predicted_yes = 0, 0, 0
    for i in range(len(y_test)):
        if y_test[i][0] == 1:
            actual_yes += 1
        elif y_test[i][0] == 0:
            actual_no += 1
        if pred_output[i][0] > 0.5:
            predicted_yes += 1
        x, y = 0, 0
        if y_test[i][0] > 0.5:
            x = 1
        else:
            x = 0
        if pred_output[i][0] > 0.5:
            y = 1
        else:
            y = 0
        matrix[x][y] += 1
    TP = matrix[1][1]
    TN = matrix[0][0]
    FP = matrix[0][1]
    FN = matrix[1][0]
    total = len(y_test)
    accuracy = (TP + TN) / total
    misclassfication = (FP + FN) / total
    recall = TP / actual_yes
    specificity = TN / actual_no
    precision = TP / predicted_yes
    f_score = 2 * ((recall * precision) / (recall + precision))
    print("Confusion Matrix:", matrix)
    print("Accuracy: ", accuracy)
    print("Misclassfication Rate: ", misclassfication)
    print("True Positive Rate (Recall): ", recall)
    print("True Negative Rate (Specificity): ", specificity)
    print("Precision: ", precision)
    print("F Score: ", f_score)

### Train Word2Vec Model

In [109]:
# Train the embeddings on the complete corpus. The function also saves the
# model to ./models/word2vec/word2vec.model, which the dataset builders load.
# The returned model is the cell's last expression, so its repr is displayed.
train_word2vec_model('./corpora/complete.csv')

<gensim.models.word2vec.Word2Vec at 0x1fce087ac50>

### Train CNN Model

In [110]:
# Build CNN inputs and one-hot labels from the training and testing CSVs.
x_train, y_train = get_cnn_data('./corpora/training.csv')
x_test, y_test = get_cnn_data('./corpora/testing.csv')
# Fit for 150 epochs; the test split is also passed to fit() as validation data.
# NOTE(review): the saved output of this cell shows "Epoch N/10", which does not
# match epochs = 150 -- the output looks stale; re-run to refresh it.
cnnModel = train_cnn_model(x_train, y_train, x_test, y_test, 150)
# Print the confusion matrix and derived metrics for the test split.
test_model(cnnModel, x_test, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Confusion Matrix: [[1, 0], [0, 1]]
Accuracy:  1.0
Misclassfication Rate:  0.0
True Positive Rate (Recall):  1.0
True Negative Rate (Specificity):  1.0
Precision:  1.0
F Score:  1.0


### Train Bi-LSTM Model

In [111]:
# Build Bi-LSTM inputs and one-hot labels from the training and testing CSVs.
# These names overwrite the arrays produced by the CNN cell.
x_train, y_train = get_lstm_data('./corpora/training.csv')
x_test, y_test = get_lstm_data('./corpora/testing.csv')
# Fit for 150 epochs; the test split is also passed to fit() as validation data.
# NOTE(review): the saved output of this cell shows "Epoch N/10", which does not
# match epochs = 150 -- the output looks stale; re-run to refresh it.
lstmModel = train_lstm_model(x_train, y_train, x_test, y_test, 150)
# Print the confusion matrix and derived metrics for the test split.
test_model(lstmModel, x_test, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Confusion Matrix: [[1, 0], [0, 1]]
Accuracy:  1.0
Misclassfication Rate:  0.0
True Positive Rate (Recall):  1.0
True Negative Rate (Specificity):  1.0
Precision:  1.0
F Score:  1.0
