This notebook relates to the preparation of the Messages dataframe

Importing the libraries needed and the dataframe

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import hstack

# Load the raw messages; the path is relative to the notebook's working directory.
messages = pd.read_csv('data/disaster_messages.csv')


# Preview the first rows to check which columns were read.
messages.head()

Unnamed: 0,id,message,original,genre
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct


In [2]:
# (rows, columns) of the raw messages frame
messages.shape

(26248, 4)

The genre feature doesn't seem relevant to our problem, which is predicting the category of the message. However, as we can see below, there are only 3 genres, resulting in 2 features (after get_dummies with the drop_first option enabled), so we'll keep it for now and come back to it if the correlation confirms this assumption.

In [3]:
# One indicator column per genre value; exploration only, `messages` is left unchanged.
genres = messages['genre'].str.get_dummies()
genres.head()

Unnamed: 0,direct,news,social
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


Exploration shows that the `original` column holds the message in its original language, so there is no need to keep it.

In [4]:
# Drop the untranslated text column.
# errors='ignore' keeps this cell re-runnable: `messages` is reassigned here, so on
# a second execution 'original' is already gone and a plain drop would raise KeyError.
messages = messages.drop(columns=['original'], errors='ignore')
messages.head()

Unnamed: 0,id,message,genre
0,2,Weather update - a cold front from Cuba that c...,direct
1,7,Is the Hurricane over or is it not over,direct
2,8,Looking for someone but no name,direct
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
4,12,"says: west side of Haiti, rest of the country ...",direct


At first glance, it seems best to treat each message as a document and build a document-term matrix. We may end up with a matrix that has too many columns, but we'll evaluate this later on. First we'll clean the text: normalize, tokenize, remove stop words, and finally lemmatize.

In [5]:
stop_words = stopwords.words("english")
# Frozen-set copy of the stop words: membership tests inside tokenize() become a
# hash lookup instead of a scan of the whole list for every token.
_stop_word_set = frozenset(stop_words)
lemmatizer = WordNetLemmatizer()

# Part-of-speech tags applied, in this order, to every kept token:
# noun, verb, adjective, adverb.
_LEMMA_POS_ORDER = ('n', 'v', 'a', 'r')


def tokenize(text):
    """Turn a raw message into a list of lemmatized tokens.

    Steps:
      1. Lower-case the text and replace every character that is not an ASCII
         letter (punctuation, digits, accented letters, ...) with a space.
      2. Split the result with nltk's ``word_tokenize``.
      3. Discard tokens found in the English stop-word list.
      4. Lemmatize each remaining token successively as a noun, verb,
         adjective and adverb.

    Stop words are filtered *before* lemmatization, so a lemma that happens to
    equal a stop word is kept.

    Parameters
    ----------
    text : str
        The message to tokenize.

    Returns
    -------
    list of str
        The lemmatized tokens, in their order of appearance.
    """
    # normalize case, remove punctuation and numbers
    text = re.sub(r"[^a-zA-Z]", " ", text.lower())

    tokens = []
    for word in word_tokenize(text):
        if word in _stop_word_set:
            continue
        # Chain the lemmatizers on the same word; this yields the same result as
        # running one full pass over the token list per part of speech.
        for pos in _LEMMA_POS_ORDER:
            word = lemmatizer.lemmatize(word, pos=pos)
        tokens.append(word)

    return tokens

In [6]:
# initialize count vectorizer object; tokenization is delegated to the custom tokenize() function
vect = CountVectorizer(tokenizer=tokenize)

# learn the vocabulary from every message and build the document-term count matrix
X_messages = vect.fit_transform(messages['message'])

In [7]:
# mapping from each term to its column index in the document-term matrix
# (the values are feature indices, not word counts)
vect.vocabulary_

{'weather': 26387,
 'update': 25550,
 'cold': 4541,
 'front': 8930,
 'cuba': 5441,
 'could': 5181,
 'pas': 17956,
 'haiti': 10057,
 'hurricane': 10913,
 'look': 14146,
 'someone': 22621,
 'name': 16209,
 'un': 25164,
 'report': 20390,
 'leogane': 13768,
 'destroy': 6135,
 'hospital': 10735,
 'st': 22929,
 'croix': 5372,
 'function': 9007,
 'need': 16382,
 'supply': 23451,
 'desperately': 6107,
 'say': 21446,
 'west': 26462,
 'side': 22190,
 'rest': 20515,
 'country': 5202,
 'today': 24445,
 'tonight': 24520,
 'information': 11577,
 'national': 16275,
 'palace': 17772,
 'storm': 23094,
 'sacred': 21043,
 'heart': 10339,
 'jesus': 12282,
 'please': 18636,
 'tent': 23976,
 'water': 26309,
 'silo': 22232,
 'thank': 24083,
 'would': 26766,
 'like': 13911,
 'receive': 20016,
 'message': 15188,
 'de': 5745,
 'bouquet': 2955,
 'health': 10330,
 'issue': 12039,
 'worker': 26741,
 'santo': 21352,
 'area': 1355,
 'nothing': 16809,
 'eat': 7092,
 'starve': 22983,
 'thirsty': 24207,
 'petionville':

In [8]:
# (documents, vocabulary size) of the count matrix
X_messages.shape # (26248, 31967) was the shape before removing numbers and adding the adjective and adverb lemmatizers

(26248, 27356)

In [9]:
# Load the frame cleaned in the categories notebook. Its unnamed first column is
# renamed to old_index, the key later matched against the index of `messages`.
categories = (
    pd.read_csv('data/categories_prepared_rows_removed.csv')
    .rename(columns={'Unnamed: 0': 'old_index'})
)
categories.tail()

Unnamed: 0,old_index,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
19925,26240,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19926,26242,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19927,26245,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19928,26246,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
19929,26247,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Build the modelling frame in one expression:
#   1. join each message to its category labels (index of `messages` matched
#      against categories['old_index']),
#   2. drop the columns that were only needed for the join,
#   3. one-hot encode genre, dropping its first level.
df = pd.get_dummies(
    pd.merge(messages, categories, left_index=True, right_on='old_index')
      .drop(columns=['id', 'old_index']),
    columns=['genre'],
    drop_first=True,
)

df.tail(), df.shape

(                                                 message  related  request  \
 19925  The delivery was made in conjunction with the ...        1        0   
 19926  Hpakant, an area rich with coveted jade stones...        1        0   
 19927  Proshika, operating in Cox's Bazar municipalit...        1        0   
 19928  Some 2,000 women protesting against the conduc...        1        0   
 19929  A radical shift in thinking came about as a re...        1        0   
 
        offer  aid_related  medical_help  medical_products  search_and_rescue  \
 19925      0            0             0                 0                  0   
 19926      0            0             0                 0                  0   
 19927      0            0             0                 0                  0   
 19928      0            1             0                 0                  0   
 19929      0            0             0                 0                  0   
 
        security  military  ...  wea

In [11]:
# shape of the categories frame, to compare with the row count of the merged df
categories.shape

(19930, 36)

In [12]:
# get counts of each token (word) in the merged dataframe with the row reduction
# note: this re-fits the same `vect` object, so vect.vocabulary_ now reflects df['message'] only
X = vect.fit_transform(df['message'])
X.shape

(19930, 23369)

In [13]:
# initialize tf-idf transformer object
# (smooth_idf=False turns off the add-one smoothing of document frequencies)
transformer = TfidfTransformer(smooth_idf=False)
# use counts from count vectorizer results to compute tf-idf values
tfidf = transformer.fit_transform(X)

tfidf.shape

(19930, 23369)

In [14]:
# Append the two genre indicator columns, each reshaped to a single column,
# to the right of the tf-idf document-term matrix in one stacking call.
X_train = hstack((
    tfidf,
    np.array(df['genre_news'])[:, None],
    np.array(df['genre_social'])[:, None],
))
X_train.shape

(19930, 23371)

In [15]:
# Target frame: everything in df except the message text and the genre indicators.
y = df.drop(labels=['genre_news', 'genre_social', 'message'], axis='columns')
y.shape

(19930, 35)

In [16]:
# preview the first rows of the target frame
y.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
