## Importing Relevant Libraries

In [1]:
import json
import keras
import pandas as pd
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
import numpy as np
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, Dropout, Activation, Embedding, Flatten
from tensorflow.keras.preprocessing.text import one_hot
from datetime import datetime
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix

import json
from keras.models import model_from_json

# Import libraries
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import nltk 
# nltk.download()
import string
import re
%matplotlib inline
pd.set_option('display.max_colwidth', 100)

In [2]:
# Load the training CSV and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='nlp-getting-started/train.csv')
data.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


## Data Cleaning

In [3]:
# Keep only the rows that have a location; the copy leaves `data` untouched.
data_after_null_removal = data.copy().dropna(subset=['location'])
data_after_null_removal.describe(include='all')

Unnamed: 0,id,keyword,location,text,target
count,5080.0,5080,5080,5080,5080.0
unique,,221,3341,5028,
top,,collision,USA,#Bestnaijamade: 16yr old PKK suicide bomber who detonated bomb in ... http://t.co/KSAwlYuX02 bes...,
freq,,36,104,6,
mean,5407.112598,,,,0.432283
std,3116.359041,,,,0.495442
min,48.0,,,,0.0
25%,2728.75,,,,0.0
50%,5360.5,,,,0.0
75%,8086.0,,,,1.0


In [4]:
# Location + texts: append each row's location to its tweet text, separated by a space.
data_after_adding_location = data_after_null_removal.assign(
    tweets=lambda frame: frame['text'].str.cat(frame['location'], sep=" ")
)
data_after_adding_location

Unnamed: 0,id,keyword,location,text,target,tweets
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C,1,@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C Birmingham
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT http://t.co/YAo1e0xngw,0,We always try to bring the heavy. #metal #RT http://t.co/YAo1e0xngw Est. September 2012 - Bristol
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. http://t.co/2nndBGwyEi,1,#AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. http://t.co/2nndBGwyEi AFRICA
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0,"Crying out for more! Set me ablaze Philadelphia, PA"
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE http://t.co/qqsmshaJ3N,0,"On plus side LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE http://t.co/qqsmshaJ3N London, UK"
...,...,...,...,...,...,...
7575,10826,wrecked,TN,On the bright side I wrecked http://t.co/uEa0txRHYs,0,On the bright side I wrecked http://t.co/uEa0txRHYs TN
7577,10829,wrecked,#NewcastleuponTyne #UK,@widda16 ... He's gone. You can relax. I thought the wife who wrecked her cake was a goner mind ...,0,@widda16 ... He's gone. You can relax. I thought the wife who wrecked her cake was a goner mind ...
7579,10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty much all been wrecked hahaha shoutout to my family f...,0,Three days off from work and they've pretty much all been wrecked hahaha shoutout to my family f...
7580,10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words that wrecked Disney's stock http://t.co/7enNulLKzM,0,#FX #forex #trading Cramer: Iger's 3 words that wrecked Disney's stock http://t.co/7enNulLKzM Lo...


In [None]:
# Fetch NLTK's 'stopwords' resource; nltk.corpus.stopwords is read later in this cell.
nltk.download('stopwords')

# Punctuation removal
def remove_punct(text):
    """Return `text` with ASCII punctuation characters and digit runs deleted.

    Characters are removed outright (not replaced by a space), so the text on
    either side of a removed character is joined together.
    """
    without_punct = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub('[0-9]+', '', without_punct)

# Strip punctuation and digits from the combined tweet + location text.
data_afer_punctuation_removal = data_after_adding_location.copy()
data_afer_punctuation_removal['tweets'] = (
    data_afer_punctuation_removal['tweets'].apply(remove_punct)
)
data_afer_punctuation_removal.head(10)

# Tokenization

def tokenization(text):
    """Split `text` into word tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        The string to split.

    Returns
    -------
    list of str
        The non-empty pieces of `text`, in order. Leading or trailing
        separators (and an empty input) produce no empty-string tokens.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence, which
    # current Python versions warn about.
    # re.split yields '' at the edges when the text starts or ends with a
    # separator; those carry no information, so they are dropped.
    return [token for token in re.split(r'\W+', text) if token]

# Lower-case every tweet, then split it into a list of tokens.
tockenized_data = data_afer_punctuation_removal.copy()
tockenized_data['tweets'] = tockenized_data['tweets'].apply(
    lambda tweet: tokenization(tweet.lower())
)

# Stopword removal: load NLTK's English stopword list and start from a copy
# of the tokenized frame.
stopword = nltk.corpus.stopwords.words('english')
data_after_stopword_removal = tockenized_data.copy()

def remove_stopwords(text):
    """Return the items of `text` that are not in the module-level `stopword` list.

    `text` is iterated item by item; the order of the kept items is preserved.
    """
    kept = []
    for word in text:
        if word not in stopword:
            kept.append(word)
    return kept

# Drop stopwords from every tokenized tweet.
data_after_stopword_removal['tweets'] = (
    data_after_stopword_removal['tweets'].apply(remove_stopwords)
)

# Return to string: start from a (deep) copy of the stopword-filtered frame.
data_without_tockenization = data_after_stopword_removal.copy(deep=True)
def listToString(s):
    """Join the strings in `s` into one string, separated by single spaces."""
    return " ".join(s)
        
# Collapse each token list back into a single space-separated string.
data_without_tockenization['tweets'] = (
    data_without_tockenization['tweets'].apply(listToString)
)
data_without_tockenization

In [None]:
### Train/test split: hold out 10% of the rows, shuffled with a fixed seed.
training, testing = train_test_split(
    data_without_tockenization,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [None]:
# Work on copies of the two splits.
train, test = training.copy(), testing.copy()
# Independent variable: the cleaned tweet text; dependent variable: the target label.
train_x, train_y = train['tweets'], train['target']
test_x, test_y = test['tweets'], test['target']

## Preprocessing

In [None]:
# Convert the training inputs and labels to plain NumPy arrays.
train_x, train_y = np.array(train_x), np.array(train_y)

# Vocabulary cap handed to the Tokenizer through its num_words argument.
max_words = 30000

# Build a Tokenizer and fit it on the training tweets.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_x)

# word -> integer ID mapping learned by the Tokenizer
dictionary = tokenizer.word_index

# Persist the mapping as JSON.
with open('./Dictionary_Models/locdictionary.json', 'w') as dictionary_file:
    json.dump(dictionary, dictionary_file)

# Map a text to the list of vocabulary IDs of its words.
def convert_text_to_index_array(text):
    """Return the `dictionary` ID of every word in `text`, in order.

    Words are produced by `kpt.text_to_word_sequence`. Every word is looked up
    directly with `dictionary[word]`, so a word missing from `dictionary`
    is an error rather than being skipped.
    """
    indices = []
    for word in kpt.text_to_word_sequence(text):
        indices.append(dictionary[word])
    return indices

# For each tweet, change each token to its ID in the Tokenizer's word_index.
allWordIndices = [convert_text_to_index_array(text) for text in train_x]

############################

# Embedding Representation

############################
# Bring every ID sequence to the same length (padding at the front).
text_length = 30
embedded_text = pad_sequences(allWordIndices, maxlen=text_length, padding='pre')

train_x = np.asarray(embedded_text)
# One-hot encode the target into two columns.
train_y = keras.utils.to_categorical(train_y, 2)

## Model

In [None]:
# Creating a model: embedding -> flatten -> two dense layers -> 2-way softmax output.
embedding_features = 32
model = Sequential()
model.add(Embedding(input_dim=max_words,output_dim=embedding_features, input_length=text_length))
model.add(Flatten())
model.add(Dense(30, activation='relu', kernel_initializer='he_normal'))
model.add(Dense(30, activation='sigmoid'))
# The targets are one-hot over two mutually exclusive classes and the loss is
# categorical cross-entropy, so the output layer must be softmax: it makes the
# two outputs a probability distribution. Independent sigmoids do not sum to 1
# and do not match what this loss assumes.
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line.
model.summary()

# 10% of the training rows are held out for validation; training stops early
# once val_loss stops improving.
history = model.fit(train_x, train_y,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_split=0.1,
    shuffle=True,
    callbacks=[EarlyStopping(monitor='val_loss', mode='min', verbose=1)],
)

# Persist the architecture (JSON) and the weights (HDF5) separately.
model_json = model.to_json()
with open('./NN_Models/embed_keras_loc_model.json', 'w') as json_file:
    json_file.write(model_json)

model.save_weights('./NN_Models/embed_keras_loc_model.h5')

print('saved model!')

In [None]:
# Metrics recorded during training.
print(history.history.keys())
# Loss curves for the training and validation portions.
plt.figure(figsize=(8,8))
for series_name in ('loss', 'val_loss'):
    plt.plot(history.history[series_name])
plt.title('Training and Validation loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
labels = ['fake', 'real']

# Read back the dictionary this notebook saved during preprocessing.
# The path must match the one written there; the previous 'dictionary.json'
# pointed at a file this notebook never creates.
with open('./Dictionary_Models/locdictionary.json', 'r') as dictionary_file:
    dictionary = json.load(dictionary_file)

def convert_text_to_index_array(text):
    """Return the `dictionary` IDs of the words in `text`, skipping unknown words.

    Words are produced by `kpt.text_to_word_sequence`. Words absent from
    `dictionary` (e.g. test-only vocabulary) are dropped instead of raising.
    This definition replaces the stricter one used on the training data.
    """
    return [
        dictionary[word]
        for word in kpt.text_to_word_sequence(text)
        if word in dictionary
    ]

# Rebuild the architecture and weights saved by the training cell. The
# previous 'embeddingModel.json' / 'embeddingModel.h5' names did not match the
# files written above, so evaluation could fail or score a different model.
with open('./NN_Models/embed_keras_loc_model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
model.load_weights('./NN_Models/embed_keras_loc_model.h5')

# Convert every test tweet to its ID sequence.
text_sequence = [convert_text_to_index_array(text1) for text1 in test_x]

# Same length and padding side as the training input.
padded_sequence = pad_sequences(text_sequence, maxlen=text_length, padding='pre')
predictions_before_formatting = model.predict(padded_sequence)

# Predicted class = index of the largest output in each row.
predictions = [np.argmax(num) for num in predictions_before_formatting]

In [None]:
def rounding(results):
    '''Threshold `results` at 0.5: return 0 (fake) below it, otherwise 1 (real).'''
    return 0 if results < 0.5 else 1
    
# Threshold every prediction to a 0/1 label.
predictions_final = list(map(rounding, predictions))

In [None]:
predictions = np.array(predictions)
test_y = np.array(test_y)

# labels=[0, 1] pins the matrix to 2x2 even if one class is absent from both
# test_y and predictions, so the four-way unpack below always succeeds.
# The matrix is computed once; the earlier bare call was discarded.
tn, fp, fn, tp = confusion_matrix(test_y, predictions, labels=[0, 1]).ravel()
precision = tp / (tp + fp)
accuracy = (tp + tn) / (tn + fp +fn +tp)
falsePositiveRate = fp / (fp + tn)
recall = tp/(tp+fn)
f1_score = 2 * ((recall*precision)/(recall+precision))

# Metrics table, one row per metric.
results = pd.DataFrame({
    'Value': pd.Series(
        [accuracy, precision, falsePositiveRate, recall, f1_score],
        index=['Accuracy', 'Precision', 'FPR', 'Recall', 'F1'],
    )
})

# Confusion-matrix table: rows are the actual class, columns the predicted class.
# Deliberately not named `data`: that name holds the raw DataFrame loaded at
# the top of the notebook, and rebinding it breaks re-running earlier cells.
confusion_counts = {'Negative':[tn, fn], 'Positive':[fp, tp]}
df = pd.DataFrame(confusion_counts, index =['Negative', 'Positive'])
df

In [None]:
# Show the metrics table (Accuracy, Precision, FPR, Recall, F1) built in the previous cell.
results

## Summary
- After running the model a few times, it was observed that including the location feature adversely affects the overall performance of the model.
- Hence, it is better not to include it at all.