# Developing the DOHA Model Convolutional Neural Network 

#### Please make sure you have followed the instructions from https://github.com/Starstorm/Dissertation before attempting to run any cells

## Install Modules

In [1]:
!pip install pandas nltk numpy sklearn keras gensim seaborn matplotlib



## Import Modules

In [2]:
import pandas as pd
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import load_model
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors
from keras.layers import Embedding

Using TensorFlow backend.


## If pre-trained model is present, load pre-trained model

In [3]:
# Load the previously trained network from disk; the "Train New Model" cells
# further down are only needed when this checkpoint is not used.
PRETRAINED_MODEL_PATH = "3_3_2019_acc_9751.hdf5"
model = load_model(PRETRAINED_MODEL_PATH)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


## Load DataFrame, set global variables

In [4]:
# Read the data set used throughout the notebook.
df = pd.read_csv("clean_3_clearance_data.csv")

# Confusion-matrix counters. `calc_stats` rebinds these module-level names on every call.
TP = 0
FP = 0
TN = 0
FN = 0

# Preview the first rows. Jupyter only renders the *last* expression of a cell,
# so the preview has to come after the assignments to be displayed.
df.head(5)

## Split into training and testing data, create tokenizer

In [5]:
# Upper bound on the vocabulary size handed to the Keras tokenizer.
NUM_WORDS = 20000

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['FULL_TEXT'],
    df['Decision'],
    test_size=0.2,
    random_state=42,
)

# The tokenizer is fitted on the training texts only.
tokenizer = Tokenizer(
    num_words=NUM_WORDS,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
    lower=True,
)
tokenizer.fit_on_texts(X_train)

## Convert output to binary values

In [6]:
# Map each distinct decision label to an integer id, numbered in order of first
# appearance in df['Decision'].
# NOTE(review): the ids therefore depend on the row order of the data, while calc_stats
# reads column 0 of the predictions as "Clearance denied" -- confirm that label comes first.
choices = df['Decision'].unique()
dic = {label: position for position, label in enumerate(choices)}

# Integer-encoded targets for the training and held-out rows.
Y_train_binary = Y_train.apply(lambda decision: dic[decision])
Y_test_binary = Y_test.apply(lambda decision: dic[decision])

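## (OPTIONAL) Define and apply the `check_gender` function, create two new DataFrames, and evaluate the model on each (the evaluation cell needs a loaded or trained `model` and the `calc_stats` function defined further down, so run those cells first)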

In [None]:
def check_gender(text):
    """Guess the gender of a document's subject from its pronoun counts.

    The text is split with ``word_tokenize`` and the exact tokens
    he/He/him/Him and she/She/her/Her are counted.

    Returns:
        1  when male pronouns dominate,
        0  when female pronouns dominate,
        -1 when neither pronoun group occurs at least 3 times, or when both occur
           and neither outnumbers the other by a factor of more than 1.5.
    """
    tokens = word_tokenize(text)
    n_male = sum(tokens.count(form) for form in ("he", "He", "him", "Him"))
    n_female = sum(tokens.count(form) for form in ("she", "She", "her", "Her"))

    # While imperfect, research showed that these rules accurately determined gender and minimized throw-away cases
    if n_male < 3 and n_female < 3:
        return -1

    # The ratio test is only evaluated when both counts are non-zero, so it cannot divide by zero.
    only_one_gender = n_male == 0 or n_female == 0
    if not only_one_gender and max(n_male / n_female, n_female / n_male) <= 1.5:
        return -1

    # Past the guards above the two counts can never be equal.
    return 1 if n_male > n_female else 0

# Tag every document with the check_gender result and store it as a new column of df.
df['is_male'] = df['FULL_TEXT'].apply(check_gender)

# Keep one frame per gender; rows tagged with any other value are left out of both.
gender_flag = df['is_male']
df_male = df[gender_flag == 1]
df_female = df[gender_flag == 0]

In [None]:
# Evaluate the current `model` separately on the male and the female subset.
#
# Prerequisites: `model`, `tokenizer`, `df_male`/`df_female` and `calc_stats` must exist.
# `calc_stats` is defined in the "Calculate Statistics from Test Data" cell further down,
# so that cell has to be run before this one.
#
# NOTE(review): each subset is re-split here, so the evaluated rows are not the rows held
# out by the original train/test split -- presumably some of them were seen during
# training. Confirm before reporting these numbers.

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


for subset_name, df_current in [("male", df_male), ("female", df_female)]:
    # Only the held-out part of the split is scored; the training part is not used.
    X_train_current, X_test_current, Y_train_current, Y_test_current = train_test_split(
        df_current['FULL_TEXT'], df_current['Decision'], test_size=0.2, random_state=42)

    # Pad to the sequence length the network expects instead of a hard-coded 5514.
    # Loop-local names are used so the notebook-level sequences_train / sequences_valid /
    # word_index / dic variables are not overwritten by this optional cell.
    sequences_current = tokenizer.texts_to_sequences(X_test_current)
    X_test_current = pad_sequences(sequences_current, maxlen=model.input_shape[1])
    y_pred_current = model.predict(X_test_current)

    # calc_stats resets and refills the module-level TP/FP/TN/FN counters.
    calc_stats(Y_test_current, y_pred_current)

    print("Subset: {}".format(subset_name))
    print("True Positive: {}".format(TP))
    print("False Positive: {}".format(FP))
    print("True Negative: {}".format(TN))
    print("False Negative: {}".format(FN))

    # Ratios fall back to NaN instead of raising when a subset has no samples of a class.
    accuracy = _safe_ratio(TP + TN, TP + FP + TN + FN)
    recall = _safe_ratio(TP, TP + FN)
    precision = _safe_ratio(TP, TP + FP)
    f1_score = _safe_ratio(2 * (recall * precision), recall + precision)

    print("Accuracy: {}".format(accuracy))
    print("Recall: {}".format(recall))
    print("Precision: {}".format(precision))
    print("F1 Score: {}".format(f1_score))

### The following code was originally from https://www.kaggle.com/marijakekic/cnn-in-keras-with-pretrained-word2vec-weights and has been modified to fit this use case

#### Format and Shape Data

In [7]:
# Vocabulary learned by the tokenizer (word -> integer index).
word_index = tokenizer.word_index

# Encode the training texts and the held-out texts as sequences of word indices.
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_valid = tokenizer.texts_to_sequences(X_test)

print('Found %s unique tokens.' % len(word_index))

Found 26778 unique tokens.


In [8]:
# Pad the training sequences, then pad/truncate the validation sequences to the same width.
# Note: X_train is rebound here from the raw texts to the padded integer array.
X_train = pad_sequences(sequences_train)
padded_length = X_train.shape[1]
X_val = pad_sequences(sequences_valid, maxlen=padded_length)

# One-hot encode the integer class ids.
y_train = to_categorical(np.asarray(Y_train_binary))
y_val = to_categorical(np.asarray(Y_test_binary))

print('Shape of X train and X validation tensor:', X_train.shape,X_val.shape)
print('Shape of label train and validation tensor:', y_train.shape,y_val.shape)

Shape of X train and X validation tensor: (4004, 5514) (1002, 5514)
Shape of label train and validation tensor: (4004, 2) (1002, 2)


## Train New Model (only run these cells if you did NOT load the pretrained model)

#### Load Word2Vec model for word embeddings

In [None]:
# Change the filepath below to where your Word2Vec model is stored
word_vectors = KeyedVectors.load_word2vec_format(r'D:\GoogleNews-vectors-negative300.bin', binary=True)

EMBEDDING_DIM = 300
vocabulary_size = min(len(word_index) + 1, NUM_WORDS)

# Row `token_id` of the matrix holds the vector of the word with that tokenizer index.
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for token, token_id in word_index.items():
    # Words whose index falls outside the vocabulary cap get no row.
    if token_id < NUM_WORDS:
        try:
            embedding_matrix[token_id] = word_vectors[token]
        except KeyError:
            # Word missing from the pre-trained vectors: draw a random vector instead.
            embedding_matrix[token_id] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

# The pre-trained vectors are no longer needed once the matrix is filled.
del word_vectors

# Embedding layer initialised from the matrix and left trainable.
embedding_layer = Embedding(
    vocabulary_size,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=True,
)

#### Define Convolutional Neural Network Model

In [None]:
# Hyper-parameters of the network.
sequence_length = X_train.shape[1]   # width of the padded input sequences
filter_sizes = [3,4,5]               # heights (in tokens) of the convolution windows
num_filters = 100                    # filters per window size
drop = 0.5                           # dropout rate before the output layer

# Token ids -> embeddings, with a trailing channel axis added for Conv2D.
inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)
reshape = Reshape((sequence_length,EMBEDDING_DIM,1))(embedding)

# One convolution branch per window size; each kernel spans the full embedding width.
conv_outputs = [
    Conv2D(num_filters, (size, EMBEDDING_DIM), activation='relu',
           kernel_regularizer=regularizers.l2(0.01))(reshape)
    for size in filter_sizes
]

# Pool each branch with a window of sequence_length - size + 1 rows.
pooled_outputs = [
    MaxPooling2D((sequence_length - size + 1, 1), strides=(1,1))(conv)
    for size, conv in zip(filter_sizes, conv_outputs)
]

# Join the branches and flatten them into one feature vector per document.
# (The former extra Reshape((3*num_filters,)) layer was never connected to the output
# and hard-coded the number of branches, so it is gone.)
merged_tensor = concatenate(pooled_outputs, axis=1)
flatten = Flatten()(merged_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=2, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)

# Build the model that includes every layer between `inputs` and `output`.
model = Model(inputs, output)

#### Print Summary of Model

In [None]:
# Print a description of the network currently bound to `model`.
model.summary()

#### Compile Model

In [None]:
# Optimizer, plus a callback that watches the validation loss (default EarlyStopping settings).
adam = Adam(lr=1e-3)
callbacks = [EarlyStopping(monitor='val_loss')]

# Two-class softmax output -> categorical cross-entropy; accuracy is reported as 'acc'.
model.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['acc'],
)

#### Train model

In [None]:
# Train for at most 30 epochs, validating on the held-out set after each epoch;
# the callbacks configured at compile time may end training earlier.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=30,
    batch_size=64,
    callbacks=callbacks,
    verbose=1,
)

#### Save model

In [None]:
# Write the trained network to disk; change the file name to keep several versions.
model.save("your_new_model.hdf5")

## Whether you just trained a model or you used the pre-trained model, continue here

### Make predictions from model

In [9]:
# Encode and pad the held-out texts to the training width, then score them with the model.
# The padded array gets its own name so that X_test keeps holding the raw texts: rebinding
# X_test made a second run of this cell feed integer arrays back into the tokenizer.
sequences_test = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(sequences_test, maxlen=X_train.shape[1])
y_pred = model.predict(X_test_padded)

### Calculate Statistics from Test Data

In [10]:
def calc_stats(Y_test, y_pred):
    """Tally the confusion matrix of ``y_pred`` against ``Y_test`` into the TP/FP/TN/FN globals.

    "Clearance granted" is treated as the positive class. A prediction row counts as
    "denied" when its first element is >= 0.5 and as "granted" when it is < 0.5.

    Args:
        Y_test: object with a ``tolist()`` method yielding the true labels, each either
            "Clearance denied" or "Clearance granted" (e.g. a pandas Series).
        y_pred: iterable of per-sample score rows, aligned by position with ``Y_test``.

    Side effects:
        Resets the module-level counters TP, FP, TN and FN to zero and refills them.

    Raises:
        AssertionError: if a label is neither of the two expected strings.
    """
    global TP, FP, TN, FN
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    Y_test_list = Y_test.tolist()

    for idx, elem in enumerate(y_pred):
        # Always read the label by position: a Series that survived a shuffle/split keeps
        # its original index labels, so Y_test[idx] could fail or return another row.
        label = Y_test_list[idx]
        if label == "Clearance denied":
            answer = False
        elif label == "Clearance granted":
            answer = True
        else:
            # Raised explicitly (not via `assert`) so the check survives `python -O`.
            raise AssertionError(
                "Unexpected decision label at position {}: {!r}".format(idx, label))

        if elem[0] >= 0.5:
            # Model predicts "denied".
            if answer:
                FN += 1
            else:
                TN += 1
        elif elem[0] < 0.5:
            # Model predicts "granted".
            if answer:
                TP += 1
            else:
                FP += 1
                
# Fill the module-level TP/FP/TN/FN counters from the held-out labels and predictions.
calc_stats(Y_test, y_pred)

### Display All Statistics

In [11]:
# Raw confusion-matrix counts produced by calc_stats.
for count_name, count_value in (("True Positive", TP), ("False Positive", FP),
                                ("True Negative", TN), ("False Negative", FN)):
    print("{}: {}".format(count_name, count_value))

# Standard classification metrics derived from the counts.
total_samples = TP + FP + TN + FN
accuracy = (TP + TN) / total_samples
recall = TP / (TP + FN)
precision = TP / (TP + FP)
f1_score = 2 * (recall * precision) / (recall + precision)

for metric_name, metric_value in (("Accuracy", accuracy), ("Recall", recall),
                                  ("Precision", precision), ("F1 Score", f1_score)):
    print("{}: {}".format(metric_name, metric_value))

# Informedness = recall - false positive rate; markedness = precision - false omission rate.
false_positive_rate = FP / (FP + TN)
false_omission_rate = FN / (TN + FN)
informedness = recall - false_positive_rate
markedness = precision - false_omission_rate

print("Informedness: {}".format(informedness))
print("Markedness: {}".format(markedness))

True Positive: 315
False Positive: 11
True Negative: 662
False Negative: 14
Accuracy: 0.9750499001996008
Recall: 0.9574468085106383
Precision: 0.9662576687116564
F1 Score: 0.9618320610687022
Informedness: 0.941102083399197
Markedness: 0.9455476095400588


### Prepare other ML models

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, ignoring English stop words and terms seen in
# fewer than 5 documents.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

# Keep the result as the sparse matrix returned by fit_transform. Densifying it with
# .toarray() would need roughly 5006 x 73380 x 8 bytes (about 2.9 GB), and the
# scikit-learn estimators used afterwards accept sparse input directly.
# NOTE(review): the vectorizer is fitted on all rows before cross-validation, so the
# vocabulary/IDF weights see the validation folds -- consider a Pipeline if that matters.
features = tfidf.fit_transform(df['FULL_TEXT'])
labels = df['Decision']
features.shape

(5006, 73380)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Baseline classifiers compared on the TF-IDF features.
models = [
    RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0)]
CV = 5  # number of cross-validation folds

# One (model_name, fold_idx, accuracy) record per classifier and fold.
# The loop variables are deliberately NOT called `model` / `accuracy`: those names hold
# the Keras network and its test accuracy elsewhere in the notebook and must not be
# overwritten by running this cell.
entries = []
for clf in models:
    model_name = clf.__class__.__name__
    fold_accuracies = cross_val_score(clf, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, fold_accuracy in enumerate(fold_accuracies):
        entries.append((model_name, fold_idx, fold_accuracy))

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

### Produce Visualization of the Cross-Validation Accuracy of the Other ML Models

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Both layers plot per-fold accuracy grouped by classifier, read from cv_df.
plot_spec = dict(x='model_name', y='accuracy', data=cv_df)

# Box plot of the fold accuracies, with the individual folds drawn on top as points.
sns.boxplot(**plot_spec)
sns.stripplot(size=8, jitter=True, edgecolor="gray", linewidth=2, **plot_spec)
plt.show()