In [210]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from scipy.sparse import csr_matrix
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import os
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC 
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [211]:
df = pd.read_csv('Attack_TREC_2018_train.csv')
df.shape

(2066, 16)

In [212]:
'''
    Converting tweet column to str
'''
df['Tweet'] = df['Tweet'].astype('str')

In [124]:
'''
    Generalise process to all files. Maybe later
'''
event_type = ['Floods', 'Earthquake', 'Bushfire', 'Bombings', 'Tornado', 'Attack', 'SchoolShooting', 'typhoon' ]

In [213]:
'''
    Tokenize each tweet into words. Note we haven't yet removed stop words
'''
token_array = []
for tweet in df['Tweet']:
    token_tweet = word_tokenize(tweet)
    token_array.append(token_tweet)
                       
token_array[1]

['public',
 'transport',
 'problems',
 'in',
 'paris',
 'due',
 'to',
 'five',
 'attacks',
 'in',
 'the',
 'city',
 'and',
 'just',
 'outside',
 'it',
 'with',
 'suicide',
 'bomber',
 'and',
 'at',
 'least',
 'two',
 'heavily',
 'armedmen']

In [214]:
'''
    Will remove stop words from tweet. We still have to look into removing punctuation marks.
'''
stop_words=set(stopwords.words("english"))
filtered_token_array=[]
for tweet in token_array:
    filtered_tweet = []
    for word in tweet:
        if word not in stop_words:
            filtered_tweet.append(word)
    filtered_token_array.append(filtered_tweet)
    
filtered_token_array[1]

['public',
 'transport',
 'problems',
 'paris',
 'due',
 'five',
 'attacks',
 'city',
 'outside',
 'suicide',
 'bomber',
 'least',
 'two',
 'heavily',
 'armedmen']

In [215]:
'''
    We will now do stemming. This is the process of removing different forms of the same word and will
    resort to the root word. For example, connection, connected, connecting word reduce to a common 
    word "connect".
'''
ps = PorterStemmer()
stemmed_array=[]
for tweet in filtered_token_array:
    stemmed_tweet = []
    for word in tweet:
        stemmed_tweet.append(ps.stem(word))
    stemmed_array.append(stemmed_tweet)
    
stemmed_array[1]

['public',
 'transport',
 'problem',
 'pari',
 'due',
 'five',
 'attack',
 'citi',
 'outsid',
 'suicid',
 'bomber',
 'least',
 'two',
 'heavili',
 'armedmen']

In [216]:
'''
    We now do lemmatization. This is like stemming but more effective apparently as it does a dictionary lookup. For 
    instance a relation between the words good and better may be made in lemmatisation but not in stemming.
    
    Lemmatization is much better from a cursory look. Words like earthquake are being cut down to earthquak 
    when using stemming. 
'''
lem = WordNetLemmatizer()
stem = PorterStemmer()

lemmatized_array=[]
for tweet in filtered_token_array:
    lemmatized_tweet = []
    for word in tweet:
        lemmatized_tweet.append(lem.lemmatize(word,'v'))
    lemmatized_array.append(lemmatized_tweet)

lemmatized_array_join = []
for element in lemmatized_array:
    lemmatized_array_join.append(''.join(element))

In [217]:
'''
    DTM to get bag of words
'''

cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1))
text_counts= cv.fit_transform(lemmatized_array_join)
text_counts_dense = text_counts.todense()

In [218]:
'''
    DTM to get TF-IDF features
'''

tf=TfidfVectorizer()
text_tf= tf.fit_transform(lemmatized_array_join)

In [219]:
'''
    Converting TF-IDF to list of lists. Then we play around with the datatypes to get features as a dense
    list of numpy arrays l_features. We also get the actual word names that are used as features.
'''
words = tf.get_feature_names()
M = text_tf.tolil()
l_features = []
for i in range(M.shape[0]):
    l_features.append(np.array(M[i].todense())[0])

In [220]:
text_tf= tf.fit_transform(lemmatized_array_join).toarray()

In [221]:
'''
    Converting to dataframe
'''
df_features = pd.DataFrame(l_features)

In [222]:
'''
    Combining both the dataframes - we now have the TF-IDF features and all the other stuff we had before
'''
df_combined = pd.concat([df,df_features], axis=1)

In [223]:
df_combined.shape

(2066, 7202)

In [224]:
'''
    List of columns with original columns and word features
'''
cols = list(df.columns) + words

In [225]:
'''
    Rename columns in original dataframe with new column names
'''
df_combined = df_combined.rename(columns={x:y for x,y in zip(df_combined.columns,cols)})
df_combined.shape

(2066, 7202)

In [176]:
'''
    Creating a function to the above to all the files...
'''
def preProcess(file):
    df = pd.read_csv(file+'_TREC_2018_train.csv')
    df['Tweet'] = df['Tweet'].astype('str')
    
    token_array = []
    for tweet in df['Tweet']:
        token_tweet = word_tokenize(tweet)
        token_array.append(token_tweet)
        
    stop_words=set(stopwords.words("english"))
    filtered_token_array=[]
    for tweet in token_array:
        filtered_tweet = []
        for word in tweet:
                if word not in stop_words:
                    filtered_tweet.append(word)
        filtered_token_array.append(filtered_tweet)
        
    lem = WordNetLemmatizer()
    stem = PorterStemmer()

    lemmatized_array=[]
    for tweet in filtered_token_array:
        lemmatized_tweet = []
        for word in tweet:
            lemmatized_tweet.append(lem.lemmatize(word,'v'))
        lemmatized_array.append(lemmatized_tweet)
    
    lemmatized_array_join = []
    for element in lemmatized_array:
        lemmatized_array_join.append(''.join(element))
        
    return (lemmatized_array_join)
    
    
def tfidf_to_CSV(file, corpus):
    df = pd.read_csv(file+'_TREC_2018_train.csv')
    tf=TfidfVectorizer()
    text_tf= tf.fit_transform(corpus)
    text_tf_dense = text_tf.todense()
    
    words = tf.get_feature_names()
    M = text_tf.tolil()
    l_features = []
    for i in range(M.shape[0]):
        l_features.append(np.array(M[i].todense())[0])
        
    df_features = pd.DataFrame(l_features)
    
    df_combined = pd.concat([df,df_features], axis=1)
    
    cols = list(df.columns) + words
    
    df_combined_1 = df_combined.rename(columns={x:y for x,y in zip(df_combined.columns,cols)})
    
    return (df_combined_1)

In [107]:
'''event_type = ['Floods', 'Earthquake', 'Bushfire', 'Bombings', 'Tornado', 'Attack', 'SchoolShooting', 'typhoon' ]

for disaster in event_type:
    feature = preProcess(disaster)
    df = tfidf(disaster, feature)
    df.to_csv(disaster+'_tfidf_features_2018_train.csv')

    print(df.shape)'''

(2283, 8315)
(8252, 19169)
(677, 2505)
(535, 1937)
(152, 478)
(2066, 7202)
(1118, 4457)
(4757, 13089)
