In [75]:
ORIGINAL_PATH = "../data/original/"

---
## Read file with datas

In [76]:
import pandas as pd

In [77]:
df = pd.read_csv(ORIGINAL_PATH + r"train.csv")

In [78]:
df.head(15)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


---
## Drop None(na) values

Delete rows with empty text values.
These lines will not be useful to us because there is nothing to explore in them.

In [79]:
df['text'].dropna(inplace=True)

---
## To lower case

Convert all words to lowercase to avoid a situation where the words **Mango** and **mango** are considered different.

In [80]:
df['lower_text'] = [entry.lower() for entry in df['text']]

---
## Remove contractions

Change contractions __(for ex: can't've)__ to the full form of the word.
**Ain't** and **am not** have the same meaning, but it will be easier for the program to understand if the first and second text is the same.



In [81]:
import json

In [82]:
with open('../notebooks/en_contractions.json') as file:
    contractions = json.load(file)

In [83]:
def remove_contractions(text: str) -> str:
    """
    Function to replace contractions with their longer forms

    Args:
    string text: text to replace contractions

    Returns:
    string: replaced text
    """
    new_test = []
    for t in text.split():
        if t.lower() in contractions.keys():
            new_test.append(contractions[t.lower()])
        else:
            new_test.append(t)

    return ' '.join(new_test)

    ## TODO: ????
    # assert 'contractions' in globals(), "Json file with contractions not loaded"
    # return contractions[text.lower()] if text.lower() in contractions.keys() else text

In [84]:
df['without_contractions']=df['lower_text'].apply(remove_contractions)

---
## Data cleaning from noise

Removing noise from each line.
Noise is considered to be things that do not carry the useful information that the user wants to convey in the text. (№,#,%, hyperlinks, URL, etc.)

Emojis may also appear in the text. In some situations, they make sense. Script converts them into text for further use.

In [85]:
import re, itertools, emoji

In [86]:
def clean_dataset(text: str) -> str:
    """
    Function to get rif off unwanted patterns
    Args:
    string text: text to clean

    Returns:
    string: replaced text
    """
    # Remove hashtag while keeping hashtag text
    text = re.sub(r'#','', text)
    # Remove HTML special entities (e.g. &amp;)
    text = re.sub(r'\&\w*;', '', text)
    # Remove tickers
    text = re.sub(r'\$\w*', '', text)
    # Remove hyperlinks
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)
    # Remove whitespace (including new line characters)
    text = re.sub(r'\s\s+','', text)
    text = re.sub(r'[ ]{2, }',' ',text)
    # Remove URL, RT, mention(@)
    text=  re.sub(r'http(\S)+', '',text)
    text=  re.sub(r'http ...', '',text)
    text=  re.sub(r'(RT|rt)[ ]*@[ ]*[\S]+','',text)
    text=  re.sub(r'RT[ ]?@','',text)
    text = re.sub(r'@[\S]+','',text)

    # TODO: why?? example id:13
    #Remove words with 4 or fewer letters
    #text = re.sub(r'\b\w{1,4}\b', '', text)


    #&, < and >
    text = re.sub(r'&amp;?', 'and',text)
    text = re.sub(r'&lt;','<',text)
    text = re.sub(r'&gt;','>',text)
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    text= ''.join(c for c in text if c <= '\uFFFF')
    text = text.strip()
    # Remove misspelling words
    text = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))

    # TODO:
    # Remove emoji
    text = emoji.demojize(text)
    text = text.replace(":"," ")
    text = ' '.join(text.split())
    text = re.sub("([^\x00-\x7F])+"," ",text)
    # Remove Mojibake (also extra spaces)
    text = ' '.join(re.sub("[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a]", " ", text).split())
    return text

In [87]:
df['without_noise'] = df['without_contractions'].apply(clean_dataset)

---
## Remove stopwords

Stop words - These are words that do not carry a semantic load, so their usefulness and role for searching is not significant.

In [88]:
from wordcloud import STOPWORDS as stop
import string

In [89]:
# A collection of words and punctuation marks to remove from tweets
punctuation = list(string.punctuation)
stop.update(punctuation)

In [90]:
def remove_stopwords(text: str) -> str:
    """

    """
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop:
            final_text.append(i.strip().replace('.',' ').replace(',',' '))
    return " ".join(final_text)

In [91]:
df['without_stopwords'] = df['without_noise'].apply(remove_stopwords)

---
## language detection??

In [92]:
#TODO: Eng or another

---
## Tokenization

Tokenization is the process of demarcating and possibly classifying sections of a string of input characters.
The resulting tokens are then passed on to some other form of processing.

In our case, we split the text into words.

In [93]:
from nltk.tokenize import word_tokenize

In [94]:
df['tokenized'] = df['without_stopwords'].apply(word_tokenize)

---
## Add parts of speech tags

Part of speech is a category of words that have similar grammatical properties.

- CC: conjunction, coordinating
- CD: numeral, cardinal
- DT: determiner
- EX: existential there
- IN: preposition or conjunction, subordinating
- JJ: adjective or numeral, ordinal
- JJR: adjective, comparative
- JJS: adjective, superlative
- LS: list item marker
- MD: modal auxiliary
- NN: noun, common, singular or mass
- NNP: noun, proper, singular
- NNS: noun, common, plural
- PDT: pre-determiner
- POS: genitive marker
- PRP: pronoun, personal
- PRP$: pronoun, possessive
- RB: adverb
- RBR: adverb, comparative
- RBS: adverb, superlative
- RP: particle
- TO: "to" as preposition or infinitive marker
- UH: interjection
- VB: verb, base form
- VBD: verb, past tense
- VBG: verb, present participle or gerund
- VBN: verb, past participle
- VBP: verb, present tense, not 3rd person singular
- VBZ: verb, present tense, 3rd person singular
- WDT: WH-determiner
- WP: WH-pronoun
- WRB: Wh-adverb

[Details](https://stackoverflow.com/questions/15388831/what-are-all-possible-pos-tags-of-nltk)



In [95]:
from nltk.tag import pos_tag

In [96]:
try:
    df['pos_tags'] = df['tokenized'].apply(pos_tag)
except LookupError:
    import nltk
    nltk.download('averaged_perceptron_tagger')
    df['pos_tags'] = df['tokenized'].apply(pos_tag)


---
## Lemmatization

Lemmatization is the process of reducing a word form to a lemma.
Lemma — its normal (dictionary) form.

In [97]:
from nltk.stem import WordNetLemmatizer

In [98]:
def lemmatization(words: list) -> list:
    new_list = []
    l = WordNetLemmatizer()

    for w in words:
        new_list.append(l.lemmatize(w))

    return new_list

In [99]:
df['lemmatized'] = df['tokenized'].apply(lemmatization)

In [100]:
df.head(15)

Unnamed: 0,id,keyword,location,text,target,lower_text,without_contractions,without_noise,without_stopwords,tokenized,pos_tags,lemmatized
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this #earthquake m...,our deeds are the reason of this #earthquake m...,our deeds are the reason of this earthquake ma...,deeds reason earthquake may allah forgive us,"[deeds, reason, earthquake, may, allah, forgiv...","[(deeds, NNS), (reason, NN), (earthquake, NN),...","[deed, reason, earthquake, may, allah, forgive..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask canada,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]","[(forest, JJS), (fire, NN), (near, IN), (la, J...","[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to 'shelter in place' are ...,all residents asked to 'shelter in place' are ...,all residents asked to shelter in place are be...,residents asked shelter place notified officer...,"[residents, asked, shelter, place, notified, o...","[(residents, NNS), (asked, VBD), (shelter, JJ)...","[resident, asked, shelter, place, notified, of..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or...","13,000 people receive #wildfires evacuation or...",13 00 people receive wildfires evacuation orde...,13 00 people receive wildfires evacuation orde...,"[13, 00, people, receive, wildfires, evacuatio...","[(13, CD), (00, CD), (people, NNS), (receive, ...","[13, 00, people, receive, wildfire, evacuation..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby #alaska as ...,just got sent this photo from ruby #alaska as ...,just got sent this photo from ruby alaska as s...,got sent photo ruby alaska smoke wildfires pou...,"[got, sent, photo, ruby, alaska, smoke, wildfi...","[(got, VBD), (sent, JJ), (photo, NN), (ruby, N...","[got, sent, photo, ruby, alaska, smoke, wildfi..."
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1,#rockyfire update => california hwy. 20 closed...,#rockyfire update => california hwy. 20 closed...,rockyfire update california hwy 20 closed in b...,rockyfire update california hwy 20 closed dire...,"[rockyfire, update, california, hwy, 20, close...","[(rockyfire, NN), (update, NN), (california, N...","[rockyfire, update, california, hwy, 20, close..."
6,10,,,#flood #disaster Heavy rain causes flash flood...,1,#flood #disaster heavy rain causes flash flood...,#flood #disaster heavy rain causes flash flood...,flood disaster heavy rain causes flash floodin...,flood disaster heavy rain causes flash floodin...,"[flood, disaster, heavy, rain, causes, flash, ...","[(flood, NN), (disaster, NN), (heavy, JJ), (ra...","[flood, disaster, heavy, rain, cause, flash, f..."
7,13,,,I'm on top of the hill and I can see a fire in...,1,i'm on top of the hill and i can see a fire in...,i am on top of the hill and i can see a fire i...,i am on top of the hill and i can see a fire i...,top hill see fire woods,"[top, hill, see, fire, woods]","[(top, JJ), (hill, NN), (see, VBP), (fire, NN)...","[top, hill, see, fire, wood]"
8,14,,,There's an emergency evacuation happening now ...,1,there's an emergency evacuation happening now ...,there is an emergency evacuation happening now...,there is an emergency evacuation happening now...,emergency evacuation happening now building ac...,"[emergency, evacuation, happening, now, buildi...","[(emergency, NN), (evacuation, NN), (happening...","[emergency, evacuation, happening, now, buildi..."
9,15,,,I'm afraid that the tornado is coming to our a...,1,i'm afraid that the tornado is coming to our a...,i am afraid that the tornado is coming to our ...,i am afraid that the tornado is coming to our ...,afraid tornado coming area,"[afraid, tornado, coming, area]","[(afraid, JJ), (tornado, NN), (coming, VBG), (...","[afraid, tornado, coming, area]"


---
## Save clear data

In [101]:
from datetime import datetime

In [102]:
CLEAR_PATH = r"../data/clear/"
NAME = f"clear_{datetime.utcnow().strftime('%Y%m%d_%H%M%S_%f')[:-3]}.csv"

In [103]:
df.to_csv(CLEAR_PATH+NAME, encoding='utf-8', index=False, sep='|')