# Data cleaning 
The current notebook encapsulated all text preprocessing and feature engineering tasks that are relevant for the current text data. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick 
import matplotlib.dates as mdates
from matplotlib.ticker import PercentFormatter, FuncFormatter
%matplotlib inline

import sqlalchemy

from cycler import cycler

import seaborn as sns
sns.set()

import googletrans
from googletrans import Translator

import regex as re
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk import tokenize # word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag, ne_chunk 
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textacy import preprocessing
import textacy
from langdetect import detect

import spacy
nlp = spacy.load('en_core_web_sm')
import os 
# environment settings
pd.set_option('display.max_column',None)
pd.set_option('display.max_rows',None)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/asyagadzhalova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/asyagadzhalova/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/asyagadzhalova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/asyagadzhalova/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
os.getcwd()

'/Users/asyagadzhalova/Documents/GitHub/disaster_messages_classification/notebooks'

In [5]:
os.chdir('..')

In [6]:
df=pd.read_pickle(os.getcwd()+'/data/data_after_eda.pkl')

In [7]:
#df.isnull().sum()

## Text processing 

### Text preprocessing - text cleaning 
The actions on translation /where the text is not in english/, removal of noise, normalization

In [7]:
df.head()

Unnamed: 0,id,message,genre,trans_ind,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report,message_clean,lang,lemmas,adjs_verbs,nouns,noun_phrases,adj_noun_phrases,entities
0,2,Weather update - a cold front from Cuba that c...,direct,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,weather update a cold front from cuba that co...,en,"[weather, update, cold, front, from, cuba, cou...","[cold, pass]","[weather, update, cuba, haiti]",[weather_update],[cold_front],[cuba/GPE]
1,7,Is the Hurricane over or is it not over,direct,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,is the hurricane over or is it not over,en,"[be, hurricane, over, or, be, over]",[],[hurricane],[],[],[]
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,un reports leogane destroyed only hospital st...,en,"[un, report, leogane, destroy, only, hospital,...","[report, destroy, need]","[un, leogane, hospital, st, croix, functioning...",[croix_functioning],[],[un/ORG]
4,12,"says: west side of Haiti, rest of the country ...",direct,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,says west side of haiti rest of the country to...,en,"[say, west, side, of, haiti, rest, of, country...","[say, west, haiti]","[rest, country, today, tonight]",[country_today],"[west_side, haiti_rest]",[]
5,14,Information about the National Palace-,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,information about the national palace,en,"[information, about, national, palace]",[],"[information, national, palace]",[],[],[national_palace/ORG]


#### Text & noise cleaning 

In [8]:
def text_cleaner(serie):
    '''Function to normalize data, clean special characters, empty string, noise
    '''
    #lower case
    serie = serie.astype(str).str.lower()
    #cleaning
    serie= serie.str.replace('://www.([\w\-\.]+\S+)','') #replace URL
    serie= serie.str.replace('[^\w\s]|\b\w{1,2}\b|\d+','') #remove digit, less than 2 chars
    serie= serie.str.replace('\s{3,}','empty_string') #replace empty string 
    return serie

In [9]:
df['message_clean'] = text_cleaner(df['message'])

  serie= serie.str.replace('://www.([\w\-\.]+\S+)','') #replace URL
  serie= serie.str.replace('[^\w\s]|\b\w{1,2}\b|\d+','') #remove digit, less than 2 chars
  serie= serie.str.replace('\s{3,}','empty_string') #replace empty string


In [10]:
df['message_clean'].isna().sum()

0

In [11]:
#drop the rows with empty string - they do not contain any information in the message
df.index[df['message_clean']=='empty_string'].values

array([ 7534, 12185, 12189, 12222])

In [12]:
df.drop(df.index[df['message_clean']=='empty_string'].values,axis=0, inplace=True)

In [13]:
df.shape

(26176, 40)

In [14]:
df.reset_index(inplace=True)

In [15]:
df.drop('index',axis=1,inplace=True)

#### Language detection and translation of non-english to english

In [16]:
'''
Function to detect the language of a given text
'''
def detect_language(text):
    if len(text)>10:
        lang = detect(text)
        return lang

In [17]:
df['lang'] = df['message_clean'].map(detect_language)

In [20]:
df[df['lang']=='fr']

Unnamed: 0,id,message,genre,trans_ind,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report,message_clean,lang
117,146,Dans la zone de Saint Etienne la route de Jacm...,direct,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,dans la zone de saint etienne la route de jacm...,fr
334,407,ADDRESS CYBER CAFE IS MS NET ADRESS ITS HAITI ...,direct,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,address cyber cafe is ms net adress its haiti ...,fr
459,565,"Bonsoir, on est a bon repos aprs la compagnie ...",direct,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,bonsoir on est a bon repos aprs la compagnie t...,fr
548,670,help at 3rlle du travail au college lamartiner...,direct,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,help at rlle du travail au college lamartinere...,fr
558,682,"Laboule 12 prolongee, section Prosi NO FOOD NO...",direct,1,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,laboule prolongee section prosi no food no st...,fr
575,700,URGENT CRECHE ORPHANAGE KAY TOUT TIMOUN CROIX ...,direct,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,urgent creche orphanage kay tout timoun croix ...,fr
647,796,"we need food, water, toilets and security forc...",direct,1,1,1,0,1,0,1,0,1,0,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,we need food water toilets and security forces...,fr
654,804,elle est vraiment malade et a besoin d'aide. u...,direct,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,elle est vraiment malade et a besoin daide uti...,fr
1339,1584,ok tout le monde qui victime. paix pager gens ...,direct,1,1,1,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,ok tout le monde qui victime paix pager gens y...,fr
1466,1718,"We need medical help at climatec, rue aubran, ...",direct,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,we need medical help at climatec rue aubran op...,fr


The translator is not working. The only option is to drop the rows with language different that english - they are 1.6% of the total data, so we can drop them

In [18]:
df[df['lang']!='en'].shape[0]

427

In [19]:
df['lang'].value_counts()

en    25749
af       57
fr       56
id       49
nl       33
pt       31
ca       28
it       25
da       23
cy       20
es       19
so       16
sq       11
no        9
et        8
tl        5
hr        5
sv        5
pl        4
sl        3
fi        3
ro        2
de        2
tr        2
cs        1
lv        1
sw        1
sk        1
Name: lang, dtype: int64

In [20]:
df.shape

(26176, 41)

In [None]:
424/26176

In [21]:
df.drop(df.index[df['lang']!='en'].values,axis=0, inplace=True)

In [22]:
df.shape

(25749, 41)

In [23]:
def text_cleaner_tokens(serie, words= ['thank you','thanks','thank']):
    '''Remove special char, stop words and stem form a serie, input: serie, stop_words, min 
    words frequence'''
    #define stemmer
    st = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stop= stopwords.words('english') + words
    #stop = words #I will not remove for now the stop words since they convey some meaning 
    stop = [st.stem(x) for x in stop]
    #cleaning
    serie= serie.str.replace('empty_string','') #remove the empty string 
    #define stemmer
    st = PorterStemmer()
    serie= serie.apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split() 
                                           if st.stem(word) not in stop])) 
    return serie

In [24]:
df['tokens']=text_cleaner_tokens(df['message_clean'])

In [43]:
df.head(20)

Unnamed: 0,id,message,genre,trans_ind,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report,message_clean,lang,lemmas,adjs_verbs,nouns,noun_phrases,adj_noun_phrases,entities,tokens
0,2,Weather update - a cold front from Cuba that c...,direct,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,weather update a cold front from cuba that co...,en,"[weather, update, cold, front, from, cuba, cou...","[cold, pass]","[weather, update, cuba, haiti]",[weather_update],[cold_front],[cuba/GPE],weather update cold front cuba could pas haiti
1,7,Is the Hurricane over or is it not over,direct,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,is the hurricane over or is it not over,en,"[be, hurricane, over, or, be, over]",[],[hurricane],[],[],[],hurricane
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,un reports leogane destroyed only hospital st...,en,"[un, report, leogane, destroy, only, hospital,...","[report, destroy, need]","[un, leogane, hospital, st, croix, functioning...",[croix_functioning],[],[un/ORG],un report leogane destroyed hospital st croix ...
4,12,"says: west side of Haiti, rest of the country ...",direct,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,says west side of haiti rest of the country to...,en,"[say, west, side, of, haiti, rest, of, country...","[say, west, haiti]","[rest, country, today, tonight]",[country_today],"[west_side, haiti_rest]",[],say west side haiti rest country today tonight
5,14,Information about the National Palace-,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,information about the national palace,en,"[information, about, national, palace]",[],"[information, national, palace]",[],[],[national_palace/ORG],information national palace
6,15,Storm at sacred heart of jesus,direct,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,storm at sacred heart of jesus,en,"[storm, at, sacred, heart, of, jesus]",[sacred],"[storm, heart, jesus]",[],[sacred_heart],[],storm sacred heart jesus
7,16,"Please, we need tents and water. We are in Sil...",direct,1,1,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,please we need tents and water we are in silo ...,en,"[please, need, tent, and, water, be, in, silo,...","[need, thank]","[tent, water, silo]",[],[],[],please need tent water silo
8,17,"I would like to receive the messages, thank you",direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,i would like to receive the messages thank you,en,"[would, like, receive, message, thank]","[like, receive, thank]",[message],[],[],[],would like receive message
9,18,I am in Croix-des-Bouquets. We have health iss...,direct,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,i am in croixdesbouquets we have health issues...,en,"[be, in, croixdesbouquet, have, health, issue,...",[],"[croixdesbouquet, health, issue, worker, santo...",[health_issue],[],[],croixdesbouquets health issue worker santoan a...
10,20,"There's nothing to eat and water, we starving ...",direct,1,1,1,0,1,1,1,0,0,0,1,1,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,1,1,1,0,0,0,0,0,1,theres nothing to eat and water we starving an...,en,"[s, eat, and, water, starve, and, thirsty]","[s, eat, water, starve, thirsty]",[],[],[],[],nothing eat water starving thirsty


In [25]:
df.to_pickle(os.getcwd()+'/data/data_after_text_processing.pkl')