In [417]:
ORIGINAL_PATH = "../data/original/"

---
## Read file with datas

In [418]:
import pandas as pd

In [419]:
df = pd.read_csv(ORIGINAL_PATH + r"train.csv")

In [420]:
df.head(15)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
5,8,,,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAf...,1
6,10,,,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1
7,13,,,I'm on top of the hill and I can see a fire in the woods...,1
8,14,,,There's an emergency evacuation happening now in the building across the street,1
9,15,,,I'm afraid that the tornado is coming to our area...,1


---
## Drop None(na) values

Delete rows with empty text values.
These lines will not be useful to us because there is nothing to explore in them.

In [421]:
df['text'].dropna(inplace=True)

---
## To lower case

Convert all words to lowercase to avoid a situation where the words **Mango** and **mango** are considered different.

In [422]:
df['lower_text'] = [entry.lower() for entry in df['text']]

---
## Remove contractions

Change contractions __(for ex: can't've)__ to the full form of the word.
**Ain't** and **am not** have the same meaning, but it will be easier for the program to understand if the first and second text is the same.



In [423]:
import json

In [424]:
with open('../notebooks/en_contractions.json') as file:
    contractions = json.load(file)

In [425]:
def remove_contractions(text: str) -> str:
    """
    Function to replace contractions with their longer forms

    Args:
    string text: text to replace contractions

    Returns:
    string: replaced text
    """
    new_test = []
    for t in text.split():
        if t.lower() in contractions.keys():
            new_test.append(contractions[t.lower()])
        else:
            new_test.append(t)

    return ' '.join(new_test)

    ## TODO: ????
    # assert 'contractions' in globals(), "Json file with contractions not loaded"
    # return contractions[text.lower()] if text.lower() in contractions.keys() else text

In [426]:
df['without_contractions']=df['lower_text'].apply(remove_contractions)

---
## Data cleaning from noise

Removing noise from each line.
Noise is considered to be things that do not carry the useful information that the user wants to convey in the text. (№,#,%, hyperlinks, URL, etc.)

Emojis may also appear in the text. In some situations, they make sense. Script converts them into text for further use.

In [427]:
import re, itertools, emoji

In [428]:
def clean_dataset(text: str) -> str:
    """
    Function to get rif off unwanted patterns
    Args:
    string text: text to clean

    Returns:
    string: replaced text
    """
    # Remove hashtag while keeping hashtag text
    text = re.sub(r'#','', text)
    # Remove HTML special entities (e.g. &amp;)
    text = re.sub(r'\&\w*;', '', text)
    # Remove tickers
    text = re.sub(r'\$\w*', '', text)
    # Remove hyperlinks
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)
    # Remove whitespace (including new line characters)
    text = re.sub(r'\s\s+','', text)
    text = re.sub(r'[ ]{2, }',' ',text)
    # Remove URL, RT, mention(@)
    text=  re.sub(r'http(\S)+', '',text)
    text=  re.sub(r'http ...', '',text)
    text=  re.sub(r'(RT|rt)[ ]*@[ ]*[\S]+','',text)
    text=  re.sub(r'RT[ ]?@','',text)
    text = re.sub(r'@[\S]+','',text)

    # TODO: why?? example id:13
    #Remove words with 4 or fewer letters
    #text = re.sub(r'\b\w{1,4}\b', '', text)


    #&, < and >
    text = re.sub(r'&amp;?', 'and',text)
    text = re.sub(r'&lt;','<',text)
    text = re.sub(r'&gt;','>',text)
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    text= ''.join(c for c in text if c <= '\uFFFF')
    text = text.strip()
    # Remove misspelling words
    text = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))

    # TODO:
    # Remove emoji
    text = emoji.demojize(text)
    text = text.replace(":"," ")
    text = ' '.join(text.split())
    text = re.sub("([^\x00-\x7F])+"," ",text)
    # Remove Mojibake (also extra spaces)
    text = ' '.join(re.sub("[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a]", " ", text).split())
    return text

In [429]:
df['without_noise'] = df['without_contractions'].apply(clean_dataset)

---
## Remove stopwords

Stop words - These are words that do not carry a semantic load, so their usefulness and role for searching is not significant.

In [430]:
from wordcloud import STOPWORDS as stop
import string

In [431]:
# A collection of words and punctuation marks to remove from tweets
punctuation = list(string.punctuation)
stop.update(punctuation)

In [432]:
def remove_stopwords(text: str) -> str:
    """

    """
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop:
            final_text.append(i.strip().replace('.',' ').replace(',',' '))
    return " ".join(final_text)

In [433]:
df['without_stopwords'] = df['without_noise'].apply(remove_stopwords)

---
## language detection??

In [434]:
#TODO: Eng or another

---
## Tokenization

Tokenization is the process of demarcating and possibly classifying sections of a string of input characters.
The resulting tokens are then passed on to some other form of processing.

In our case, we split the text into words.

In [435]:
from nltk.tokenize import word_tokenize

In [436]:
df['tokenized'] = df['without_stopwords'].apply(word_tokenize)

---
## Add parts of speech tags

Part of speech is a category of words that have similar grammatical properties.

- CC: conjunction, coordinating
- CD: numeral, cardinal
- DT: determiner
- EX: existential there
- IN: preposition or conjunction, subordinating
- JJ: adjective or numeral, ordinal
- JJR: adjective, comparative
- JJS: adjective, superlative
- LS: list item marker
- MD: modal auxiliary
- NN: noun, common, singular or mass
- NNP: noun, proper, singular
- NNS: noun, common, plural
- PDT: pre-determiner
- POS: genitive marker
- PRP: pronoun, personal
- PRP$: pronoun, possessive
- RB: adverb
- RBR: adverb, comparative
- RBS: adverb, superlative
- RP: particle
- TO: "to" as preposition or infinitive marker
- UH: interjection
- VB: verb, base form
- VBD: verb, past tense
- VBG: verb, present participle or gerund
- VBN: verb, past participle
- VBP: verb, present tense, not 3rd person singular
- VBZ: verb, present tense, 3rd person singular
- WDT: WH-determiner
- WP: WH-pronoun
- WRB: Wh-adverb

[Details](https://stackoverflow.com/questions/15388831/what-are-all-possible-pos-tags-of-nltk)



In [437]:
from nltk.tag import pos_tag

In [438]:
try:
    df['pos_tags'] = df['tokenized'].apply(pos_tag)
except LookupError:
    import nltk
    nltk.download('averaged_perceptron_tagger')
    df['pos_tags'] = df['tokenized'].apply(pos_tag)


---
## Lemmatization

Lemmatization is the process of reducing a word form to a lemma.
Lemma — its normal (dictionary) form.

In [439]:
from nltk.stem import WordNetLemmatizer

In [440]:
def lemmatization(words: list) -> list:
    new_list = []
    l = WordNetLemmatizer()

    for w in words:
        new_list.append(l.lemmatize(w))

    return new_list

In [441]:
df['lemmatized'] = df['tokenized'].apply(lemmatization)

In [442]:
df.head(15)

Unnamed: 0,id,keyword,location,text,target,lower_text,without_contractions,without_noise,without_stopwords,tokenized,pos_tags,lemmatized
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this earthquake may allah forgive us all,deeds reason earthquake may allah forgive us,"[deeds, reason, earthquake, may, allah, forgive, us]","[(deeds, NNS), (reason, NN), (earthquake, NN), (may, MD), (allah, VB), (forgive, JJ), (us, PRP)]","[deed, reason, earthquake, may, allah, forgive, u]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask canada,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]","[(forest, JJS), (fire, NN), (near, IN), (la, JJ), (ronge, NN), (sask, NN), (canada, NN)]","[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,1,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or...,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or...,all residents asked to shelter in place are being notified by officers no other evacuation or sh...,residents asked shelter place notified officers evacuation shelter place orders expected,"[residents, asked, shelter, place, notified, officers, evacuation, shelter, place, orders, expec...","[(residents, NNS), (asked, VBD), (shelter, JJ), (place, NN), (notified, VBD), (officers, NNS), (...","[resident, asked, shelter, place, notified, officer, evacuation, shelter, place, order, expected]"
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,"13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california",13 00 people receive wildfires evacuation orders in california,13 00 people receive wildfires evacuation orders california,"[13, 00, people, receive, wildfires, evacuation, orders, california]","[(13, CD), (00, CD), (people, NNS), (receive, JJ), (wildfires, NNS), (evacuation, NN), (orders, ...","[13, 00, people, receive, wildfire, evacuation, order, california]"
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby alaska as smoke from wildfires pours into a school,got sent photo ruby alaska smoke wildfires pours school,"[got, sent, photo, ruby, alaska, smoke, wildfires, pours, school]","[(got, VBD), (sent, JJ), (photo, NN), (ruby, NN), (alaska, NN), (smoke, NN), (wildfires, VBZ), (...","[got, sent, photo, ruby, alaska, smoke, wildfire, pours, school]"
5,8,,,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAf...,1,#rockyfire update => california hwy. 20 closed in both directions due to lake county fire - #caf...,#rockyfire update => california hwy. 20 closed in both directions due to lake county fire - #caf...,rockyfire update california hwy 20 closed in both directions due to lake county fire cafire wild...,rockyfire update california hwy 20 closed directions due lake county fire cafire wildfires,"[rockyfire, update, california, hwy, 20, closed, directions, due, lake, county, fire, cafire, wi...","[(rockyfire, NN), (update, NN), (california, NN), (hwy, NN), (20, CD), (closed, JJ), (directions...","[rockyfire, update, california, hwy, 20, closed, direction, due, lake, county, fire, cafire, wil..."
6,10,,,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1,"#flood #disaster heavy rain causes flash flooding of streets in manitou, colorado springs areas","#flood #disaster heavy rain causes flash flooding of streets in manitou, colorado springs areas",flood disaster heavy rain causes flash flooding of streets in manitou colorado springs areas,flood disaster heavy rain causes flash flooding streets manitou colorado springs areas,"[flood, disaster, heavy, rain, causes, flash, flooding, streets, manitou, colorado, springs, areas]","[(flood, NN), (disaster, NN), (heavy, JJ), (rain, NN), (causes, NNS), (flash, VBP), (flooding, V...","[flood, disaster, heavy, rain, cause, flash, flooding, street, manitou, colorado, spring, area]"
7,13,,,I'm on top of the hill and I can see a fire in the woods...,1,i'm on top of the hill and i can see a fire in the woods...,i am on top of the hill and i can see a fire in the woods...,i am on top of the hill and i can see a fire in the woods,top hill see fire woods,"[top, hill, see, fire, woods]","[(top, JJ), (hill, NN), (see, VBP), (fire, NN), (woods, NNS)]","[top, hill, see, fire, wood]"
8,14,,,There's an emergency evacuation happening now in the building across the street,1,there's an emergency evacuation happening now in the building across the street,there is an emergency evacuation happening now in the building across the street,there is an emergency evacuation happening now in the building across the street,emergency evacuation happening now building across street,"[emergency, evacuation, happening, now, building, across, street]","[(emergency, NN), (evacuation, NN), (happening, VBG), (now, RB), (building, VBG), (across, IN), ...","[emergency, evacuation, happening, now, building, across, street]"
9,15,,,I'm afraid that the tornado is coming to our area...,1,i'm afraid that the tornado is coming to our area...,i am afraid that the tornado is coming to our area...,i am afraid that the tornado is coming to our area,afraid tornado coming area,"[afraid, tornado, coming, area]","[(afraid, JJ), (tornado, NN), (coming, VBG), (area, NN)]","[afraid, tornado, coming, area]"


---
## Save clear data

In [443]:
from datetime import datetime

In [444]:
CLEAR_PATH = r"../data/clear/"
NAME = f"clear_{datetime.utcnow().strftime('%Y%m%d_%H%M%S_%f')[:-3]}.csv"

In [445]:
df.to_csv(CLEAR_PATH+NAME, encoding='utf-8', index=False)