In [10]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import sklearn.model_selection as model_selection

In [11]:
os.chdir('10_Data/20_Extracted Tweets/10_2018 Train')

FileNotFoundError: [WinError 3] The system cannot find the path specified: '10_Data/20_Extracted Tweets/10_2018 Train'

In [15]:
os.getcwd()

'C:\\Users\\Vishaal\\Documents\\GitHub\\TREC_Distributed_Machine_Learning\\TREC\\10_Data\\20_Extracted Tweets\\10_2018 Train'

In [13]:
'''
    Loading Earthquake and flood data from 2018 train. These are the only ones with critical tweets
'''
df1 = pd.read_csv('floods_TREC_2018_train.csv')
df2 = pd.read_csv('Earthquake_TREC_2018_train.csv')


In [16]:
os.chdir('../15_2018 Test')

In [17]:
'''
    Loading our test 2018 tweets from eathquakes and floods. These have a decent amount of critical tweets.
    We did not include attacks as a considerable amount of work has been done on that before.
'''
df3 = pd.read_csv('Earthquake_TREC_2018_test.csv')
df4 = pd.read_csv('Floods_TREC_2018_test.csv')

In [18]:
'''
    Combining all into one big data frame
'''
df_c = pd.DataFrame()
df_c['Tweet'] = pd.concat([df1['Tweet'] , df2['Tweet'], df3['Tweet'], df4['Tweet'] ])
df_c['Priority'] = pd.concat([df1['Priority'] , df2['Priority'], df3['Priority'], df4['Priority']])


In [19]:
'''
    52 Critical tweets out of 7432
'''
(df_c[df_c['Priority']=='Critical']).shape

(52, 2)

In [24]:
'''
Creating a categorical variable to keep label critical tweets as 1 and 0 otherwise
'''
def to_categorical(df_c):
    t = []
    for element in df_c['Priority']:
        if element =='Critical':
            t.append(1)
        else:
            t.append(0)
        
    t = np.array(t)
    df_c['Target'] = t

    df_c['Target'] = df_c['Target'].astype('category')
    t = df_c['Target']
    del df_c['Target']
    return (t)

t = to_categorical(df_c)

In [25]:
'''
    Creating a function to input lemmatized text to possibly another function that outputs the tfidf in a csv format.
    We could also simply use the output from this funtion in an tfidf format (no csv) and train a model.
'''
def preProcess(df):
    df['Tweet'] = df['Tweet'].astype('str')
    
    token_array = []
    for tweet in df['Tweet']:
        token_tweet = word_tokenize(tweet)
        token_array.append(token_tweet)
        
    stop_words=set(stopwords.words("english"))
    filtered_token_array=[]
    for tweet in token_array:
        filtered_tweet = []
        for word in tweet:
                if word not in stop_words:
                    filtered_tweet.append(word)
        filtered_token_array.append(filtered_tweet)
        
    lem = WordNetLemmatizer()
    stem = PorterStemmer()

    lemmatized_array=[]
    for tweet in filtered_token_array:
        lemmatized_tweet = []
        for word in tweet:
            lemmatized_tweet.append(lem.lemmatize(word,'v'))
        lemmatized_array.append(lemmatized_tweet)
    
    lemmatized_array_join = []
    for element in lemmatized_array:
        lemmatized_array_join.append(''.join(element))
        
    return (lemmatized_array_join)

In [21]:
'''
    OPTIONAL - We use this to convert the tfidf into CSV format. 
    
    Output is a combined dataframe - use pd.to_csv to specify save location 
    
    NOTE - corpus is tfidf in non-array format
'''
def tfidf_to_CSV(df, corpus):
    tf=TfidfVectorizer()
    text_tf= tf.fit_transform(corpus)
    text_tf_dense = text_tf.todense()
    
    words = tf.get_feature_names()
    M = text_tf.tolil()
    l_features = []
    for i in range(M.shape[0]):
        l_features.append(np.array(M[i].todense())[0])
        
    df_features = pd.DataFrame(l_features)
    
    df_combined = pd.concat([df,df_features], axis=1)
    
    cols = list(df.columns) + words
    
    df_combined_1 = df_combined.rename(columns={x:y for x,y in zip(df_combined.columns,cols)})
    
    return (df_combined_1)

In [26]:
'''
    DTM to get TF-IDF features
'''
tf=TfidfVectorizer()
lemmatized_array_join = preProcess(df_c)
text_tf= tf.fit_transform(lemmatized_array_join)

In [27]:
'''
    Converting TF-IDF to list of lists. Then we play around with the datatypes to get features as a dense
    list of numpy arrays l_features. We also get the actual word names that are used as features.
    
    Use only if you need the csv output. NOT REQUIRED TO TRAIN MODEL
'''
words = tf.get_feature_names()
M = text_tf.tolil()
l_features = []
for i in range(M.shape[0]):
    l_features.append(np.array(M[i].todense())[0])

In [28]:
'''
    Converting DTM to array. REQUIRED TO DIRECTLY TRAIN SVM
'''
text_tf= tf.fit_transform(lemmatized_array_join).toarray()

In [29]:
'''
    * Train SVM
    * ‘optimal’: eta = 1.0 / (alpha * (t + t0)) where t0 is chosen by a heuristic proposed by Leon Bottou.
    * This model is trained directly using the tfidf in the matrix form.
'''
X_train, X_val, Y_train, Y_val = model_selection.train_test_split(text_tf, t, test_size=0.2, random_state=100)
clf = make_pipeline(StandardScaler(with_mean=False), SGDClassifier(loss = 'squared_loss', alpha = 0.0001, max_iter=10000000, tol=1e-3
                                                   , shuffle = True, learning_rate = 'optimal', penalty='l1'))
clf.fit(X_train, Y_train)

y_pred = clf.predict(X_val)

print('Recall Score = ', metrics.recall_score(Y_val, y_pred))

print('Precision Score = ', metrics.precision_score(Y_val, y_pred))


Recall Score =  0.4444444444444444
Precision Score =  0.0075046904315197


In [30]:
metrics.confusion_matrix(Y_val, y_pred)

array([[947, 529],
       [  5,   4]], dtype=int64)

In [39]:
'''
    Let us now balance the dataset and see what happens. We have 52 critical tweets so lets balance that with
    200 low priority tweets.
'''
df_low = df_c[df_c['Priority'] == 'Low'].sample(1000)
df_crit = df_c[df_c['Priority'] == 'Critical']
df_lc = pd.concat([df_low, df_crit])

In [40]:
'''
    Convert to categorical
'''
t_lc = to_categorical(df_lc)
'''
    Getting a DTM of tf-idf features
'''
tf=TfidfVectorizer()
lemmatized_array_join = preProcess(df_lc)
text_lc= tf.fit_transform(lemmatized_array_join)
'''
    Converting DTM to array. REQUIRED TO DIRECTLY TRAIN SVM
'''
text_lc= tf.fit_transform(lemmatized_array_join).toarray()

In [49]:
X_train, X_val, Y_train, Y_val = model_selection.train_test_split(text_lc, t_lc, test_size=0.2, random_state=100)
clf = make_pipeline(StandardScaler(with_mean=False), SGDClassifier(loss = 'squared_loss', alpha = 0.0001, max_iter=10000000, tol=1e-3
                                                   , shuffle = True, learning_rate = 'optimal', penalty='l1'))
clf.fit(X_train, Y_train)

y_pred = clf.predict(X_val)

print('Recall Score = ', metrics.recall_score(Y_val, y_pred))

print('Precision Score', metrics.precision_score(Y_val, y_pred))

Recall Score =  0.7692307692307693
Precision Score 0.08


In [155]:
metrics.confusion_matrix(Y_val, y_pred)







array([[32,  6],
       [11,  2]], dtype=int64)