In [271]:
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from spellchecker import SpellChecker

import re


In [272]:
# Load the training data from train.csv (path relative to the working directory)
# and preview the first rows.
dataset = pd.read_csv('train.csv')
dataset.head()


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [273]:
# Share of missing values per column, in percent: the mean of the boolean
# isna() mask is the fraction of True entries.
keywords_missing = dataset['keyword'].isna().mean() * 100
location_missing = dataset['location'].isna().mean() * 100

print(
    'In our Dataset we have ',
    round(keywords_missing, 2),
    '% of keywords missing, and ',
    round(location_missing, 2),
    '% of location data missing',
)

In our Dataset we have  0.8 % of keywords missing, and  33.27 % of location data missing


In [274]:
# Replace missing keyword / location values with explicit placeholder strings.
# Series.fillna does this directly instead of a per-element Python lambda.
dataset['keyword'] = dataset['keyword'].fillna("nothing")
dataset['location'] = dataset['location'].fillna("unknown")
dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,nothing,unknown,Our Deeds are the Reason of this #earthquake M...,1
1,4,nothing,unknown,Forest fire near La Ronge Sask. Canada,1
2,5,nothing,unknown,All residents asked to 'shelter in place' are ...,1
3,6,nothing,unknown,"13,000 people receive #wildfires evacuation or...",1
4,7,nothing,unknown,Just got sent this photo from Ruby #Alaska as ...,1


In [275]:
def text_cleaner(text):
    """Normalise a raw tweet into lowercase, space-separated ASCII tokens.

    Steps, in order:
      1. Convert to ``str`` and lowercase.
      2. Replace URLs, @mentions, ``!``, ``?`` and runs of non-ASCII
         characters with placeholder tokens.
      3. Turn every remaining whitespace character into a plain space
         *before* the character filter runs, so that words separated only
         by a newline or tab are not glued together.
      4. Drop every character except ASCII letters, ``#``, space and ``-``.
      5. Collapse repeated spaces and strip both ends.

    Note: the filter in step 4 also removes the underscore of the ``E_M`` /
    ``Q_M`` placeholders, so they appear as ``EM`` / ``QM`` in the result.

    Args:
        text: Value to clean; non-string values are converted with ``str()``.

    Returns:
        str: The cleaned text.
    """
    text = str(text).lower()

    # Placeholders are inserted after lowercasing, so they keep their capitals.
    text = re.sub(r'https?://\S+', ' URL ', text)
    text = re.sub(r'@([A-Za-z0-9_]+)', ' Mention ', text)
    text = re.sub(r'!', ' E_M ', text)  # Exclamation mark
    text = re.sub(r'\?', ' Q_M ', text)  # Question mark
    text = re.sub(r'[^\x00-\x7F]+', ' UncnownCaracters ', text)

    # Whitespace must become a space before the filter below, which would
    # otherwise delete newlines/tabs outright and merge the neighbouring words.
    text = re.sub(r'\s', ' ', text)
    text = re.sub(r'[^a-zA-Z# -]', '', text)

    return ' '.join(text.split())

_SPELL_CHECKER = None  # shared SpellChecker instance, created on first use


def correct_spelling(sentence, spell=None):
    """Return ``sentence`` with each whitespace-separated word spell-corrected.

    Args:
        sentence: Text to correct; it is split on whitespace.
        spell: Optional object exposing ``correction(word)``. When omitted, a
            single ``SpellChecker`` is created on first use and reused by
            later calls instead of being rebuilt for every sentence.

    Returns:
        str: The corrected words joined by single spaces.
    """
    global _SPELL_CHECKER
    if spell is None:
        if _SPELL_CHECKER is None:
            _SPELL_CHECKER = SpellChecker()
        spell = _SPELL_CHECKER

    corrected_words = []
    for word in sentence.split():
        # Look the word up once. ``correction`` may return None when it has no
        # candidate; keep the original word then rather than emitting "None".
        suggestion = spell.correction(word)
        corrected_words.append(word if suggestion is None else suggestion)
    return " ".join(corrected_words)

def process_hashtags(arr):
    """Merge each stand-alone ``'#'`` token with the token that follows it.

    The list is rewritten in place and the same list object is returned.
    A ``'#'`` with nothing after it is dropped. A token that has just been
    merged is not examined again, so ``['#', '#', 'x']`` becomes
    ``['##', 'x']``.

    Args:
        arr: List of string tokens.

    Returns:
        list: ``arr`` itself, after merging.
    """
    nothing_left = object()
    merged = []
    remaining = iter(arr)
    for token in remaining:
        if token != '#':
            merged.append(token)
            continue
        # Consume the follower here so the loop never revisits it.
        follower = next(remaining, nothing_left)
        if follower is not nothing_left:
            merged.append('#' + follower)
    arr[:] = merged
    return arr

In [276]:
# Clean every tweet; the raw strings in the 'text' column are overwritten.
dataset['text'] = dataset['text'].apply(text_cleaner)
#dataset['text'] = dataset['text'].apply(lambda x: correct_spelling(x)) #Execute it later!!!
dataset['text']

0       our deeds are the reason of this #earthquake m...
1                   forest fire near la ronge sask canada
2       all residents asked to shelter in place are be...
3       people receive #wildfires evacuation orders in...
4       just got sent this photo from ruby #alaska as ...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609    Mention Mention the out of control wild fires ...
7610                  m utc QM km s of volcano hawaii URL
7611    police investigating after an e-bike collided ...
7612    the latest more homes razed by northern califo...
Name: text, Length: 7613, dtype: object

In [277]:
# Tokenise each cleaned tweet, then let process_hashtags merge every
# stand-alone '#' token with the token after it.
dataset['text'] = (
    dataset['text']
    .apply(word_tokenize)
    .apply(process_hashtags)
)
dataset['text']




0       [our, deeds, are, the, reason, of, this, #eart...
1           [forest, fire, near, la, ronge, sask, canada]
2       [all, residents, asked, to, shelter, in, place...
3       [people, receive, #wildfires, evacuation, orde...
4       [just, got, sent, this, photo, from, ruby, #al...
                              ...                        
7608    [two, giant, cranes, holding, a, bridge, colla...
7609    [Mention, Mention, the, out, of, control, wild...
7610        [m, utc, QM, km, s, of, volcano, hawaii, URL]
7611    [police, investigating, after, an, e-bike, col...
7612    [the, latest, more, homes, razed, by, northern...
Name: text, Length: 7613, dtype: object

In [278]:
# Spot-check: token list of the row at position 5 after tokenisation.
dataset.text.iloc[5]

['#rockyfire',
 'update',
 'california',
 'hwy',
 'closed',
 'in',
 'both',
 'directions',
 'due',
 'to',
 'lake',
 'county',
 'fire',
 '-',
 '#cafire',
 '#wildfires']

In [279]:
# English stop words from NLTK, kept as a set for the membership tests
# done when filtering tokens.
stop_words = set(stopwords.words('english'))


# Display the token lists as they are before stop-word removal.
dataset['text']

0       [our, deeds, are, the, reason, of, this, #eart...
1           [forest, fire, near, la, ronge, sask, canada]
2       [all, residents, asked, to, shelter, in, place...
3       [people, receive, #wildfires, evacuation, orde...
4       [just, got, sent, this, photo, from, ruby, #al...
                              ...                        
7608    [two, giant, cranes, holding, a, bridge, colla...
7609    [Mention, Mention, the, out, of, control, wild...
7610        [m, utc, QM, km, s, of, volcano, hawaii, URL]
7611    [police, investigating, after, an, e-bike, col...
7612    [the, latest, more, homes, razed, by, northern...
Name: text, Length: 7613, dtype: object

In [280]:
# Drop stop words from every token list, then seed the 'cleaned' column
# from the filtered tokens.
dataset['text'] = dataset['text'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)
dataset['cleaned'] = dataset['text']
dataset['cleaned']



0       [deeds, reason, #earthquake, may, allah, forgi...
1           [forest, fire, near, la, ronge, sask, canada]
2       [residents, asked, shelter, place, notified, o...
3       [people, receive, #wildfires, evacuation, orde...
4       [got, sent, photo, ruby, #alaska, smoke, #wild...
                              ...                        
7608    [two, giant, cranes, holding, bridge, collapse...
7609    [Mention, Mention, control, wild, fires, calif...
7610                  [utc, QM, km, volcano, hawaii, URL]
7611    [police, investigating, e-bike, collided, car,...
7612    [latest, homes, razed, northern, california, w...
Name: cleaned, Length: 7613, dtype: object

In [281]:
# Spot-check: row at position 5 after stop-word removal.
dataset.cleaned.iloc[5]

['#rockyfire',
 'update',
 'california',
 'hwy',
 'closed',
 'directions',
 'due',
 'lake',
 'county',
 'fire',
 '-',
 '#cafire',
 '#wildfires']

Lemmatization

In [282]:
# Tag each token list with part-of-speech labels; every token becomes a
# (token, tag) pair.
# NOTE(review): tagging runs on lists that already had stop words removed,
# so the tagger sees less context than the original sentence — confirm that
# this ordering is intended.
dataset['cleaned'] = dataset['cleaned'].apply(pos_tag)
#dataset['cleaned'] = dataset['cleaned'].apply(lambda words: [pos_tag(word_tokenize(word)) for word in words])
dataset['cleaned'].head()

0    [(deeds, NNS), (reason, NN), (#earthquake, VBP...
1    [(forest, JJS), (fire, NN), (near, IN), (la, J...
2    [(residents, NNS), (asked, VBD), (shelter, JJ)...
3    [(people, NNS), (receive, VBP), (#wildfires, N...
4    [(got, VBD), (sent, JJ), (photo, NN), (ruby, N...
Name: cleaned, dtype: object

In [283]:
# Spot-check: (token, tag) pairs of the row at position 5.
dataset.cleaned.iloc[5]

[('#rockyfire', 'NN'),
 ('update', 'NN'),
 ('california', 'NN'),
 ('hwy', 'NN'),
 ('closed', 'VBD'),
 ('directions', 'NNS'),
 ('due', 'JJ'),
 ('lake', 'VBP'),
 ('county', 'NN'),
 ('fire', 'NN'),
 ('-', ':'),
 ('#cafire', 'NN'),
 ('#wildfires', 'NNS')]

In [284]:
def penn_to_wordnet(tag):
    """Map a POS tag to a WordNet part-of-speech constant by its first letter.

    Tags starting with ``N``, ``V``, ``R`` or ``J`` map to ``wordnet.NOUN``,
    ``wordnet.VERB``, ``wordnet.ADV`` and ``wordnet.ADJ`` respectively.
    Any other tag, including an empty one, falls back to ``wordnet.NOUN``.

    Args:
        tag: POS tag string, e.g. ``'NNS'`` or ``'VBD'``.

    Returns:
        The matching ``wordnet`` POS constant.
    """
    pos_by_initial = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
        'J': wordnet.ADJ,
    }
    return pos_by_initial.get(tag[:1], wordnet.NOUN)

In [285]:
# Lemmatise every (token, tag) pair, using the tag to choose the WordNet
# part of speech, then run process_hashtags over the resulting token lists.
lemmatizer = WordNetLemmatizer()
dataset['cleaned'] = dataset['cleaned'].apply(
    lambda tagged: [
        lemmatizer.lemmatize(token, pos=penn_to_wordnet(tag))
        for token, tag in tagged
    ]
)
dataset['cleaned'] = dataset['cleaned'].apply(process_hashtags)
dataset['cleaned']

0       [deed, reason, #earthquake, may, allah, forgiv...
1           [forest, fire, near, la, ronge, sask, canada]
2       [resident, ask, shelter, place, notify, office...
3       [people, receive, #wildfires, evacuation, orde...
4       [get, sent, photo, ruby, #alaska, smoke, #wild...
                              ...                        
7608    [two, giant, crane, hold, bridge, collapse, ne...
7609    [Mention, Mention, control, wild, fire, califo...
7610                  [utc, QM, km, volcano, hawaii, URL]
7611    [police, investigate, e-bike, collide, car, li...
7612    [late, home, raze, northern, california, wildf...
Name: cleaned, Length: 7613, dtype: object

In [286]:
# Spot-check: row at position 5 after lemmatisation.
dataset.cleaned.iloc[5]

['#rockyfire',
 'update',
 'california',
 'hwy',
 'close',
 'direction',
 'due',
 'lake',
 'county',
 'fire',
 '-',
 '#cafire',
 '#wildfires']

In [287]:
# Spot-check: row at position 0 after lemmatisation.
dataset.cleaned.iloc[0]

['deed', 'reason', '#earthquake', 'may', 'allah', 'forgive', 'u']

In [289]:
# Re-read the source CSV to put the raw tweet text back into 'text': that
# column was overwritten by the cleaning / tokenising cells earlier on.
df = pd.read_csv('train.csv')
dataset.text = df.text

# Export the result; to_csv also writes the DataFrame index by default.
# NOTE(review): 'cleaned' holds Python lists, which are written as their
# string representation — confirm how downstream code parses that column.
dataset.to_csv('cleaned_dataset.csv')