In [151]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from scipy.sparse import csr_matrix
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Read Attack.csv into the notebook-level frame `df`, then preview its first rows.
df = pd.read_csv('Attack.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ID,Tweet,Event,Retweet_Count,Follower_Count,Source,User_Created_at,Tweet_Created_at,User_Language,User_Screen_Name,User_Location,Event_Decrption,Categories,Priority
0,11867,11867,665290631930060800,RT @cheerio15: Restaurant in Paris 10th distri...,parisAttacks2015,1,2854,"<a href=""http://twitter.com/download/iphone"" r...",2009-10-01 09:25:28,2015-11-13 22:10:21,en,SweetMissHope,"South East, England",The November 2015 Paris attacks were a series ...,"['ThirdPartyObservation', 'Factoid', 'NewSubEv...",Low
1,11868,11868,665308764333961217,public transport problems in paris due to five...,parisAttacks2015,1,1829,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",2014-09-23 02:52:35,2015-11-13 23:22:24,en-gb,hillsideheather,scotland,The November 2015 Paris attacks were a series ...,"['FirstPartyObservation', 'NewSubEvent']",Medium
2,11869,11869,665308229468073984,RT @AP: BREAKING: French police official confi...,parisAttacks2015,5175,2513,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",2010-09-16 23:54:23,2015-11-13 23:20:17,en,alexandrianeas,"New York, NY",The November 2015 Paris attacks were a series ...,"['ThirdPartyObservation', 'Factoid', 'News', '...",High
3,11870,11870,665365972631420929,At least 3 of the dead attackers at Bataclan t...,parisAttacks2015,0,46,"<a href=""https://about.twitter.com/products/tw...",2014-04-11 23:45:26,2015-11-14 03:09:44,en,arisha9087,,The November 2015 Paris attacks were a series ...,"['ThirdPartyObservation', 'News']",Medium
4,11871,11871,665284063901696002,RT @michaelh992: PT: According to Police Union...,parisAttacks2015,162,320,"<a href=""http://twitter.com/download/iphone"" r...",2009-10-29 15:10:02,2015-11-13 21:44:15,en,edwardkiernan,London,The November 2015 Paris attacks were a series ...,"['ThirdPartyObservation', 'EmergingThreats', '...",Critical


In [4]:
'''
    Cast the Tweet column to str so that every entry can be handed to the tokenizer.
    NOTE(review): missing values presumably turn into the literal text 'nan' here - confirm
    whether such rows should be dropped or blanked instead.
'''
df['Tweet'] = df['Tweet'].astype('str')

In [5]:
'''
    Names of the per-event CSV files (without the '.csv' extension). The steps below are
    first developed on Attack.csv only and generalised to every file in this list later on.
'''
event_type = ['Floods', 'Earthquake', 'Bushfire', 'Bombings', 'Tornado', 'Attack', 'SchoolShooting', 'typhoon' ]

In [6]:
'''
    Split every tweet into word tokens with NLTK's word_tokenize.
    Stop words are still present at this stage.
'''
token_array = [word_tokenize(text) for text in df['Tweet']]

token_array[1]

['public',
 'transport',
 'problems',
 'in',
 'paris',
 'due',
 'to',
 'five',
 'attacks',
 'in',
 'the',
 'city',
 'and',
 'just',
 'outside',
 'it',
 'with',
 'suicide',
 'bomber',
 'and',
 'at',
 'least',
 'two',
 'heavily',
 'armedmen']

In [9]:
'''
    Remove English stop words from every tokenized tweet. Punctuation tokens are not
    removed yet.

    The NLTK stop-word list is all lowercase, so each token is lowercased for the
    membership test only; tokens that are kept retain their original casing.
'''
stop_words = set(stopwords.words("english"))
filtered_token_array = [
    [word for word in tweet if word.lower() not in stop_words]
    for tweet in token_array
]

filtered_token_array[1]

['public',
 'transport',
 'problems',
 'paris',
 'due',
 'five',
 'attacks',
 'city',
 'outside',
 'suicide',
 'bomber',
 'least',
 'two',
 'heavily',
 'armedmen']

In [10]:
'''
    Stemming: collapse the different forms of a word onto a common root. For example
    connection, connected and connecting all reduce to "connect".
'''
ps = PorterStemmer()
stemmed_array = [
    [ps.stem(token) for token in tokens]
    for tokens in filtered_token_array
]

stemmed_array[1]

['public',
 'transport',
 'problem',
 'pari',
 'due',
 'five',
 'attack',
 'citi',
 'outsid',
 'suicid',
 'bomber',
 'least',
 'two',
 'heavili',
 'armedmen']

In [136]:
'''
    Lemmatization. This is like stemming but apparently more effective because it does a
    dictionary lookup. For instance a relation between the words good and better may be
    made by lemmatisation but not by stemming.

    From a cursory look lemmatization is much better: stemming cuts words like
    earthquake down to earthquak.

    Every token is lemmatized as a verb ('v'), then the tokens of each tweet are joined
    back into a single string for the vectorizers.
'''
lem = WordNetLemmatizer()
stem = PorterStemmer()

lemmatized_array = [
    [lem.lemmatize(word, 'v') for word in tweet]
    for tweet in filtered_token_array
]

# Join with a space: the vectorizers split documents on word boundaries, so joining
# with '' would glue all the words of a tweet together into one meaningless token.
lemmatized_array_join = [' '.join(tokens) for tokens in lemmatized_array]

In [137]:
'''
    Document-term matrix (DTM) of raw term counts, i.e. bag-of-words features.
    Unigrams only, lowercased, with scikit-learn's English stop words removed.
'''

cv = CountVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
)
text_counts = cv.fit_transform(lemmatized_array_join)
text_counts_dense = text_counts.todense()

In [138]:
'''
    Document-term matrix (DTM) of TF-IDF weights, built from the joined lemmatized tweets
    with a default TfidfVectorizer. text_tf is the sparse result, text_tf_dense its dense copy.
'''

tf=TfidfVectorizer()
text_tf= tf.fit_transform(lemmatized_array_join)
text_tf_dense = text_tf.todense()

In [139]:
'''
    Collect the vocabulary and the TF-IDF rows in plain Python containers:
      words      - list of the terms used as features, in column order
      M          - the TF-IDF matrix in LIL sparse format
      l_features - list with one dense 1-D numpy array per tweet
'''
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it
# exists and fall back to the old name on older installations.
if hasattr(tf, 'get_feature_names_out'):
    words = tf.get_feature_names_out().tolist()
else:
    words = list(tf.get_feature_names())
M = text_tf.tolil()
# One dense conversion for the whole matrix instead of densifying row by row.
l_features = list(text_tf.toarray())

In [140]:
'''
    Turn the list of per-tweet TF-IDF arrays into a dataframe: one row per tweet, one
    column per feature. The columns get default integer labels at this point.
'''
df_features = pd.DataFrame(l_features)

In [141]:
'''
    Put the original columns and the TF-IDF feature columns side by side (axis=1), so
    every tweet row carries both its metadata and its features.
'''
df_combined = pd.concat([df,df_features], axis=1)

In [142]:
'''
    Full list of column names for the combined dataframe: the original column names
    followed by the feature words.
'''
cols = list(df.columns) + words

In [143]:
'''
    Relabel the columns of the combined dataframe: each current label is paired, in
    order, with the corresponding name in cols.
'''
df_combined = df_combined.rename(columns=dict(zip(df_combined.columns, cols)))

In [149]:
'''
    Wrapping the steps above in functions so they can be applied to all the files...
'''
def preProcess(file):
    '''
        Load file + '.csv' and turn every tweet into one cleaned, space-separated string.

        Steps: cast the Tweet column to str, tokenize with word_tokenize, drop English
        stop words (compared case-insensitively), lemmatize each remaining token as a
        verb and join the tokens of each tweet with single spaces.

        Parameters
        ----------
        file : str
            Path of the CSV file without the '.csv' extension. The file must contain
            a 'Tweet' column.

        Returns
        -------
        list of str
            One cleaned string per row of the CSV, in row order.
    '''
    df = pd.read_csv(file+'.csv')
    df['Tweet'] = df['Tweet'].astype('str')

    stop_words = set(stopwords.words("english"))
    lem = WordNetLemmatizer()

    lemmatized_array_join = []
    for tweet in df['Tweet']:
        # The stop-word list is lowercase, so lowercase the token for the lookup only.
        kept = [word for word in word_tokenize(tweet) if word.lower() not in stop_words]
        lemmas = [lem.lemmatize(word, 'v') for word in kept]
        # Join with a space: the vectorizer splits on word boundaries, so joining with
        # '' would glue all the words of a tweet together into one token.
        lemmatized_array_join.append(' '.join(lemmas))

    return lemmatized_array_join
    
    
def tfidf(lemmatized_array_join, frame=None):
    '''
        Compute TF-IDF features for the given texts and append them to a dataframe.

        Parameters
        ----------
        lemmatized_array_join : list of str
            One preprocessed document per row of the dataframe, in the same order.
        frame : pandas.DataFrame, optional
            Dataframe the documents came from. When omitted, the notebook-level `df`
            is used, which keeps existing one-argument calls working.

        Returns
        -------
        pandas.DataFrame
            The columns of the dataframe followed by one column per vocabulary term,
            each named after its term.

        Raises
        ------
        ValueError
            If the number of documents differs from the number of dataframe rows,
            which means the texts and the dataframe come from different files.
    '''
    base = df if frame is None else frame
    if len(base) != len(lemmatized_array_join):
        raise ValueError(
            'tfidf: got %d documents for a dataframe with %d rows'
            % (len(lemmatized_array_join), len(base))
        )

    tf = TfidfVectorizer()
    text_tf = tf.fit_transform(lemmatized_array_join)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement when it
    # exists and fall back to the old name on older installations.
    if hasattr(tf, 'get_feature_names_out'):
        words = tf.get_feature_names_out().tolist()
    else:
        words = list(tf.get_feature_names())

    # Reuse the dataframe's own index so the feature rows line up with it positionally
    # even when that index is not 0..n-1.
    df_features = pd.DataFrame(text_tf.toarray(), index=base.index, columns=words)

    return pd.concat([base, df_features], axis=1)

In [150]:
event_type = ['Floods', 'Earthquake', 'Bushfire', 'Bombings', 'Tornado', 'Attack', 'SchoolShooting', 'typhoon' ]

# Write one <event>_tfidf_features_2018_train.csv per event file.
for disaster in event_type:
    # tfidf() appends its features to the notebook-level `df`, so `df` must hold this
    # event's own tweets before the call. Storing the result under a different name
    # keeps one event's output from becoming the next event's input.
    df = pd.read_csv(disaster+'.csv')
    df['Tweet'] = df['Tweet'].astype('str')
    feature = preProcess(disaster)
    df_tfidf = tfidf(feature)
    df_tfidf.to_csv(disaster+'_tfidf_features_2018_train.csv')
