In [92]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [30]:
# Read the earthquake tweet dataset into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer='Earthquake.csv')
df.head(n=5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ID,Tweet,Event,Retweet_Count,Follower_Count,Source,User_Created_at,Tweet_Created_at,User_Language,User_Screen_Name,User_Location,Event_Decrption,Categories,Priority
0,1934,1934,451308164932440064,#ChileEarthquake Update: Chilean Interior Mini...,chileEarthquake2014,1,115158,"<a href=""http://twitter.com/#!/download/ipad"" ...",2008-11-14 14:14:03,2014-04-02 10:40:26,en,WLTX,"Columbia, SC",The 2014 Iquique earthquake struck off the coa...,"['ThirdPartyObservation', 'MultimediaShare', '...",Medium
1,1935,1935,451293013763817472,Powerful earthquake strikes off the coast of C...,chileEarthquake2014,0,543,"<a href=""http://twitterfeed.com"" rel=""nofollow...",2014-01-21 12:26:37,2014-04-02 09:40:13,es,PiQkete_Online,à¸£Î±Ð¸Ñ‚Ïƒ DÏƒÐ¼iÐ¸gÏƒ Ñ”à¸£Ñ‚Ñ”,The 2014 Iquique earthquake struck off the coa...,['OriginalEvent'],Low
2,1936,1936,451285666350239744,#PrayForChile God is with you.,chileEarthquake2014,0,715,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",2011-08-16 01:50:33,2014-04-02 09:11:02,en,whatismylifex,,The 2014 Iquique earthquake struck off the coa...,['Sentiment'],Low
3,1937,1937,451300464269983744,Five dead after 8.2 magnitude earthquake off c...,chileEarthquake2014,1,3809,"<a href=""http://twitter.com/download/android"" ...",2011-02-11 22:02:54,2014-04-02 10:09:50,en,ShelleyBFox8,New Orleans,The 2014 Iquique earthquake struck off the coa...,"['Factoid', 'News']",Low
4,1938,1938,451296996490346496,"unbelievable img ""@Larryferlazzo RT @stevesilb...",chileEarthquake2014,0,157,"<a href=""http://twitter.com/download/android"" ...",2012-05-31 23:06:54,2014-04-02 09:56:03,en,WyattChrisJ,Australia,The 2014 Iquique earthquake struck off the coa...,"['MultimediaShare', 'News']",Low


In [31]:
# Normalise the Tweet column to plain strings so it can be fed to the
# tokenizer / vectorizers used further down.
# Missing tweets are replaced by '' first: a bare astype(str) would turn NaN
# into the literal text 'nan', which would then be tokenized and counted as
# if it were a real word.
df['Tweet'] = df['Tweet'].fillna('').astype(str)

In [32]:
# Event categories to cover once this process is generalised to all files
# (deferred for now; only the earthquake file is processed).
event_type = [
    'Floods',
    'Earthquake',
    'Bushfire',
    'Bombings',
    'Tornado',
    'Attack',
    'SchoolShooting',
    'typhoon',
]

In [109]:
# Split every tweet into word-level tokens (one token list per tweet).
# Stop words have not been removed at this stage.
token_array = [word_tokenize(text) for text in df['Tweet']]

token_array[1]

['Powerful',
 'earthquake',
 'strikes',
 'off',
 'the',
 'coast',
 'of',
 'Chile',
 ':']

In [110]:
# Drop English stop words from every tokenized tweet. Punctuation tokens are
# not handled here and remain in the output.
# The stop-word list is lowercase, so each token is lowercased for the
# membership test only (otherwise "The", "A", "I", ... slip through); the
# token that is kept retains its original casing.
stop_words = set(stopwords.words("english"))
filtered_token_array = [
    [word for word in tweet if word.lower() not in stop_words]
    for tweet in token_array
]

filtered_token_array[1]

['Powerful', 'earthquake', 'strikes', 'coast', 'Chile', ':']

In [111]:
# Stemming: reduce the different forms of a word to a common root, e.g.
# "connection", "connected" and "connecting" all become "connect".
# Applied to the stop-word-filtered tokens, one output list per tweet.
ps = PorterStemmer()
stemmed_array = [
    [ps.stem(token_word) for token_word in tokens]
    for tokens in filtered_token_array
]

stemmed_array[1]

['power', 'earthquak', 'strike', 'coast', 'chile', ':']

In [112]:
# Lemmatization: similar in spirit to stemming, but based on a dictionary
# lookup, so it can relate words such as "good" and "better" where stemming
# cannot. From a cursory look it gives cleaner results: stemming truncates
# "earthquake" to "earthquak", while lemmatization leaves it intact.
# Every token is lemmatized with the part-of-speech tag 'v'.
lem = WordNetLemmatizer()
stem = PorterStemmer()  # not used in this cell

lemmatized_array = [
    [lem.lemmatize(token_word, 'v') for token_word in tokens]
    for tokens in filtered_token_array
]

lemmatized_array[1]

['Powerful', 'earthquake', 'strike', 'coast', 'Chile', ':']

In [96]:
# Document-term matrix (bag of words): fit a CountVectorizer on the raw tweets
# using unigrams, lowercasing and sklearn's built-in English stop-word list.
# NOTE(review): `token` was referenced here but never defined anywhere in the
# visible notebook (hidden kernel state), so this cell failed on a fresh
# kernel. The alphanumeric-run pattern below is an assumption -- confirm it
# matches the tokenizer used in the original session.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
    tokenizer=token.tokenize,
)
text_counts = cv.fit_transform(df['Tweet'])
# Dense copy kept because other cells read `text_counts_dense`. It stores
# every cell of the matrix explicitly, so prefer the sparse `text_counts`
# wherever possible.
text_counts_dense = text_counts.todense()

In [95]:
# Document-term matrix with TF-IDF features, fitted on the raw tweets.
# TfidfVectorizer is used with its default settings here, i.e. without the
# custom tokenizer / stop-word options passed to the bag-of-words vectorizer.
tf = TfidfVectorizer()
text_tf = tf.fit_transform(df['Tweet'])
# Dense copy kept because other cells read `text_tf_dense`; prefer the sparse
# `text_tf` wherever possible.
text_tf_dense = text_tf.todense()

In [99]:
# (n_tweets, n_terms) of the bag-of-words matrix; read from the sparse matrix
# so this cell does not depend on the dense copy.
text_counts.shape

(8252, 15025)

In [101]:
# (n_tweets, n_terms) of the TF-IDF matrix; read from the sparse matrix so
# this cell does not depend on the dense copy.
text_tf.shape

(8252, 15437)

In [105]:
# Display the dense bag-of-words matrix (the repr is truncated with '...').
text_counts_dense

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [106]:
# Number of non-zero term counts in the bag-of-words matrix, counted on the
# sparse matrix directly instead of scanning the dense copy.
text_counts.count_nonzero()

82053

In [107]:
# Total number of cells in the bag-of-words matrix (rows * columns), derived
# from the matrix itself rather than hard-coded dimensions so it stays correct
# if the data or vocabulary changes. Compare with the non-zero count to see
# how sparse the matrix is.
text_counts.shape[0] * text_counts.shape[1]

123986300