In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.model_selection as model_selection
from keras.datasets import imdb
from keras.layers import Dense
from keras.layers import Embedding, SimpleRNN
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

Using TensorFlow backend.


In [None]:
# Show the current working directory; the relative os.chdir() paths used below are resolved against it.
os.getcwd()

In [3]:
# Switch into the 2018 training-tweet folder so the CSVs can be read by bare file name.
# NOTE(review): os.chdir changes kernel-wide state and the path is relative, so re-running this
# cell after the directory has already changed will most likely fail — consider full paths instead.
os.chdir('10_Data/20_Extracted Tweets/10_2018 Train')

In [None]:
'''
    Reading the 2018 training tweets for floods and earthquakes.
    These are the only 2018 training events that contain critical tweets.
'''
df1, df2 = (
    pd.read_csv(csv_name)
    for csv_name in ('floods_TREC_2018_train.csv', 'Earthquake_TREC_2018_train.csv')
)

In [None]:
# Move from the training folder to the sibling 2018 test-tweet folder.
# NOTE(review): this relative path only resolves while the working directory is the training
# folder, so the cell depends on execution order — consider full paths instead.
os.chdir('../15_2018 Test')

In [None]:
'''
    Reading the 2018 test tweets for earthquakes and floods; these contain a decent number of critical tweets.
    Attacks are left out because a considerable amount of work has already been done on them.
'''
df3, df4 = (
    pd.read_csv(csv_name)
    for csv_name in ('Earthquake_TREC_2018_test.csv', 'Floods_TREC_2018_test.csv')
)

In [None]:
'''
    Combining the four event frames into one big data frame that keeps only
    the 'Tweet' and 'Priority' columns.
'''
# Concatenate whole rows in a single call so every tweet stays paired with its own priority,
# and use ignore_index=True so the combined frame gets unique 0..n-1 row labels instead of
# repeating each CSV's own labels (duplicate labels make later label-based alignment ambiguous).
df_c = pd.concat(
    [frame[['Tweet', 'Priority']] for frame in (df1, df2, df3, df4)],
    ignore_index=True,
)

In [None]:
'''
Creating a categorical target variable: tweets whose priority is 'Critical' are labelled 1, all others 0
'''
def to_categorical(df_c):
    """Build the binary classification target from the 'Priority' column.

    Parameters
    ----------
    df_c : pandas.DataFrame
        Frame with a 'Priority' column. The frame is only read: no column is
        added to it or removed from it, so an existing 'Target' column survives.

    Returns
    -------
    pandas.Series
        Series named 'Target' with dtype 'category' that shares df_c's index:
        1 where Priority equals 'Critical', 0 for every other value
        (missing values included).
    """
    # Element-wise comparison replaces the Python loop; missing values compare unequal -> 0.
    is_critical = (df_c['Priority'] == 'Critical').values
    return pd.Series(
        np.where(is_critical, 1, 0),
        index=df_c.index,
        name='Target',
    ).astype('category')

# Binary target for the combined tweets: 1 = 'Critical' priority, 0 = anything else.
t = to_categorical(df_c)

In [None]:
'''
    Function that turns the raw tweets into cleaned, lemmatized text. Its output can be passed to another function
    that writes the TF-IDF features to a CSV file, or it can be vectorized directly (no CSV) and used to train a model.
'''
def preProcess(df):
    """Tokenise, stop-word filter and lemmatise every tweet in ``df['Tweet']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Tweet' column. As before, the column is cast to ``str``
        in place on the caller's frame.

    Returns
    -------
    list of str
        One string per tweet, in row order: the tokens that survive stop-word
        removal, lemmatised as verbs and separated by single spaces so that a
        text vectoriser can split them back into individual terms.
    """
    df['Tweet'] = df['Tweet'].astype('str')

    stop_words = set(stopwords.words("english"))
    lem = WordNetLemmatizer()

    processed_tweets = []
    for tweet in df['Tweet']:
        kept_lemmas = [
            lem.lemmatize(word, 'v')
            for word in word_tokenize(tweet)
            # NLTK's English stop-word list is lower-case, so compare the lower-cased
            # token; otherwise capitalised stop words ("The", "And") slip through.
            if word.lower() not in stop_words
        ]
        # Join with a space, not the empty string: ''.join glued all tokens of a tweet
        # into one long pseudo-word, hiding the individual terms from the vectoriser.
        processed_tweets.append(' '.join(kept_lemmas))

    return processed_tweets

In [None]:
'''
    Building the document-term matrix (DTM): the pre-processed tweets are
    vectorized into TF-IDF features.
'''
lemmatized_array_join = preProcess(df_c)
tf = TfidfVectorizer()
text_tf = tf.fit_transform(lemmatized_array_join)

In [None]:
'''
    Converting the DTM to a dense array. REQUIRED TO DIRECTLY TRAIN SVM
'''
# NOTE(review): this fits the vectorizer on the same text a second time instead of reusing
# the matrix computed in the previous cell — confirm whether the refit is intentional.
text_tf= tf.fit_transform(lemmatized_array_join).toarray()


In [None]:
'''
    A simple RNN for the binary critical / not-critical target: an Embedding layer,
    a SimpleRNN layer and a single sigmoid output unit.

    The Embedding layer looks up integer word indices, so the network is fed padded
    sequences of word ids built from the pre-processed tweets rather than the float
    TF-IDF matrix (float weights would be truncated to index 0 almost everywhere).

    EMBEDDING_DIM is the number of dimensions each word is embedded into;
    RNN_UNITS is the size of the SimpleRNN hidden state.
'''
VOCAB_SIZE = 23542   # value kept from the original cell; presumably the corpus vocabulary size — TODO confirm
EMBEDDING_DIM = 4
RNN_UNITS = 32

# Word ids run from 1 to VOCAB_SIZE - 1; id 0 is reserved for padding, so the
# Embedding layer needs exactly VOCAB_SIZE rows.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(lemmatized_array_join)
# pad_sequences pads every tweet to the length of the longest one by default.
token_ids = sequence.pad_sequences(tokenizer.texts_to_sequences(lemmatized_array_join))

model = Sequential()
model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM))
model.add(SimpleRNN(RNN_UNITS))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
# The categorical target is converted to a plain float array for Keras.
# NOTE(review): validation_split takes the LAST 20% of the rows without shuffling, and the
# rows are ordered by source file — consider shuffling before fitting.
history = model.fit(
    token_ids,
    np.asarray(t, dtype='float32'),
    epochs=10,
    batch_size=128,
    validation_split=0.2,
)

In [None]:
'''
    Plotting accuracy VS epoch for training and validation
'''
# Per-epoch metrics recorded by model.fit; loss/val_loss/epochs are reused by the loss plot.
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

# Explicit figure/axes instead of the implicit pyplot state; axis labels make the
# figure readable on its own.
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'bo', label='Training acc')
ax.plot(epochs, val_acc, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend();  # trailing ';' keeps the Legend repr out of the cell output

In [None]:
'''
    Plotting loss VS epoch for training and validation
'''
# Explicit figure/axes instead of the implicit pyplot state; axis labels make the
# figure readable on its own.
fig, ax = plt.subplots()
ax.plot(epochs, loss, 'bo', label='Training loss')
ax.plot(epochs, val_loss, 'b', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend();  # trailing ';' keeps the Legend repr out of the cell output

In [1]:
# Display the target series.
# NOTE(review): the saved output shows this cell was run on a fresh kernel before `t` was
# defined (NameError) — run the cells above first, or remove this inspection cell.
t



NameError: name 't' is not defined