In [1]:
import pandas as pd
import os

# The CSV paths are resolved relative to the parent of the working directory.
ROOT = os.pardir

# Read both splits in one pass; the generator's loop variable does not leak
# into the notebook namespace.
train_raw_data, test_raw_data = (
    pd.read_csv(os.path.join(ROOT, csv_name))
    for csv_name in ("data/train.csv", "data/test.csv")
)

In [2]:
train_raw_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
test_raw_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## Text Cleaning

### 1. Lower case

In [4]:
# Lower-case the text column of both splits with the vectorised .str accessor.
train_raw_data["text"], test_raw_data["text"] = (
    frame["text"].str.lower() for frame in (train_raw_data, test_raw_data)
)

train_raw_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this #earthquake m...,1
1,4,,,forest fire near la ronge sask. canada,1
2,5,,,all residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,just got sent this photo from ruby #alaska as ...,1


### 2. Remove URLs

In [5]:
import text_cleaning

In [6]:
# Spot-check a fixed handful of rows (by id) before URL removal: filter first, then keep id/text.
train_raw_data.query("id in [5557, 3619, 5961, 3636, 9156]")[["id", "text"]].values

array([[3619,
        'did josephus get it wrong about antiochus epiphanes and the abomination of desolation? read more: http://t.co/fwj9ccyw6k'],
       [3636,
        'emotional desolation the effect of alcoholism/addiction on family - http://t.co/31tgtlz3ya forgiving is hard http://t.co/c7rco2emwf'],
       [5557,
        "100  1' mix new flat double sided linerless bottle caps you choose mix flattened - full re\x89û_ http://t.co/w00kjprfdr http://t.co/mixl1pfrje"],
       [5961,
        'freak accident? sure. looking for someone to blame? maybe. remember that player broke his leg cuz cart was at back of end zone? common sense'],
       [9156,
        'suicide bomber kills 15 in saudi security site mosque http://t.co/yytkp1z5kg via @reuters']],
      dtype=object)

In [7]:
# Matches "<scheme>://" followed by a run of non-whitespace characters.
url_pattern = r"(\w+:\/\/\S+)"

# Apply the same URL cleaner, with the same pattern, to both splits.
train_raw_data["text"], test_raw_data["text"] = (
    frame["text"].apply(text_cleaning.remove_URL, url_pattern=url_pattern)
    for frame in (train_raw_data, test_raw_data)
)

# Re-display the same spot-check rows to confirm the URLs are gone.
train_raw_data.query("id in [5557, 3619, 5961, 3636, 9156]")[["id", "text"]].values

array([[3619,
        'did josephus get it wrong about antiochus epiphanes and the abomination of desolation? read more:  '],
       [3636,
        'emotional desolation the effect of alcoholism/addiction on family -   forgiving is hard  '],
       [5557,
        "100  1' mix new flat double sided linerless bottle caps you choose mix flattened - full re\x89û_    "],
       [5961,
        'freak accident? sure. looking for someone to blame? maybe. remember that player broke his leg cuz cart was at back of end zone? common sense'],
       [9156,
        'suicide bomber kills 15 in saudi security site mosque   via @reuters']],
      dtype=object)

### 3. Handle HTML

In [8]:
# Run the project's HTML decoding helper over the text column of both splits.
train_raw_data["text"] = train_raw_data["text"].apply(text_cleaning.decode_html)
test_raw_data["text"] = test_raw_data["text"].apply(text_cleaning.decode_html)

# Fixed seed so the spot-check shows the same five rows on every
# Restart & Run All instead of a different random draw each time.
train_raw_data["text"].sample(5, random_state=42).values



array(['texas seeks comment on rules for changes to windstorm insurer   via @ijournal',
       'does anyone know why #murfreesboro #walmart was evacuated this evening? @dnj_com',
       'tomorrow kick off your weekend with drinks & entertainment @aliveafter5    ',
       "@duchovbutt @starbuck_scully @madmakny @davidduchovny yeah we survived 9 seasons and 2 movies. let's hope for the good. there's hope ??????",
       '[withering] to death. is an album found when he [undermine]d his backyard because his cat [inundated] the floor [mustering] cat food.'],
      dtype=object)

### 4. Remove Twitter handles

In [9]:
# Process @handles in both splits; replace_with_token=True is forwarded to the helper
# (the displayed rows show "@reuters" becoming "user").
train_raw_data["text"], test_raw_data["text"] = (
    frame["text"].apply(text_cleaning.remove_twitter_handles, replace_with_token=True)
    for frame in (train_raw_data, test_raw_data)
)

# Same spot-check rows as in the URL step.
train_raw_data.query("id in [5557, 3619, 5961, 3636, 9156]")[["id", "text"]].values

array([[3619,
        'did josephus get it wrong about antiochus epiphanes and the abomination of desolation? read more:  '],
       [3636,
        'emotional desolation the effect of alcoholism/addiction on family -   forgiving is hard  '],
       [5557,
        "100  1' mix new flat double sided linerless bottle caps you choose mix flattened - full re\x89û_    "],
       [5961,
        'freak accident? sure. looking for someone to blame? maybe. remember that player broke his leg cuz cart was at back of end zone? common sense'],
       [9156,
        'suicide bomber kills 15 in saudi security site mosque   via user']],
      dtype=object)

### 5. Remove Hashtags

In [10]:
# Apply the hashtag cleaner to the text column of both splits.
train_raw_data["text"], test_raw_data["text"] = (
    frame["text"].apply(text_cleaning.remove_hashtags)
    for frame in (train_raw_data, test_raw_data)
)

train_raw_data.head()[["id", "text"]]

Unnamed: 0,id,text
0,1,our deeds are the reason of this earthquake ma...
1,4,forest fire near la ronge sask. canada
2,5,all residents asked to 'shelter in place' are ...
3,6,"13,000 people receive wildfires evacuation ord..."
4,7,just got sent this photo from ruby alaska as s...


### 6. Expand Contractions

In [11]:
# Apply the contraction expander to the text column of both splits.
train_raw_data["text"], test_raw_data["text"] = (
    frame["text"].apply(text_cleaning.expand_contractions)
    for frame in (train_raw_data, test_raw_data)
)

train_raw_data.head()[["id", "text"]]

Unnamed: 0,id,text
0,1,our deeds are the reason of this earthquake ma...
1,4,forest fire near la ronge sask. canada
2,5,all residents asked to 'shelter in place' are ...
3,6,"13,000 people receive wildfires evacuation ord..."
4,7,just got sent this photo from ruby alaska as s...


### 7. Remove Repeating Characters

In [12]:
# Apply the repeating-character cleaner to the text column of both splits.
train_raw_data["text"], test_raw_data["text"] = (
    frame["text"].apply(text_cleaning.remove_repeating_char)
    for frame in (train_raw_data, test_raw_data)
)

train_raw_data.head()[["id", "text"]]

Unnamed: 0,id,text
0,1,our deeds are the reason of this earthquake ma...
1,4,forest fire near la ronge sask. canada
2,5,all residents asked to 'shelter in place' are ...
3,6,"13,000 people receive wildfires evacuation ord..."
4,7,just got sent this photo from ruby alaska as s...


### 8. Translate Emojis to text

In [13]:
# Apply the emoji decoder to the text column of both splits.
train_raw_data["text"], test_raw_data["text"] = (
    frame["text"].apply(text_cleaning.decode_emojis)
    for frame in (train_raw_data, test_raw_data)
)

train_raw_data.head()[["id", "text"]]

Unnamed: 0,id,text
0,1,our deeds are the reason of this earthquake ma...
1,4,forest fire near la ronge sask. canada
2,5,all residents asked to 'shelter in place' are ...
3,6,"13,000 people receive wildfires evacuation ord..."
4,7,just got sent this photo from ruby alaska as s...


### 9. Remove Unicode

In [14]:
# Apply the unicode cleaner to both splits; the next cell checks the training result.
train_raw_data["text"], test_raw_data["text"] = (
    frame["text"].apply(text_cleaning.remove_unicode)
    for frame in (train_raw_data, test_raw_data)
)

In [15]:
# Sanity check: report every training text that still contains a character
# outside the 7-bit ASCII range. No output means the previous step worked.
for text in train_raw_data["text"]:
    # First offending character, or None when the text is pure ASCII.
    offending_char = next((ch for ch in text if ord(ch) > 127), None)
    if offending_char is not None:
        print(offending_char)
        print("Unicode found in text = ", text)

### 10. Remove Punctuation

In [16]:
# Apply the punctuation cleaner to the text column of both splits.
train_raw_data["text"], test_raw_data["text"] = (
    frame["text"].apply(text_cleaning.remove_punctuations)
    for frame in (train_raw_data, test_raw_data)
)

train_raw_data.head()[["id", "text"]]

Unnamed: 0,id,text
0,1,our deeds are the reason of this earthquake ma...
1,4,forest fire near la ronge sask canada
2,5,all residents asked to shelter in place are be...
3,6,13000 people receive wildfires evacuation orde...
4,7,just got sent this photo from ruby alaska as s...


## Save Cleaned Data

In [17]:
# Write the cleaned training split (id, text, target) without the index column.
# The path is built from ROOT, like the input paths, so the output location
# follows the same base directory instead of a separately hard-coded "../".
train_raw_data[["id", "text", "target"]].to_csv(
    os.path.join(ROOT, "data/cleaned_train.csv"), index=False
)

In [18]:
# Write the cleaned test split (id, text) without the index column.
# The path is built from ROOT, like the input paths, so the output location
# follows the same base directory instead of a separately hard-coded "../".
test_raw_data[["id", "text"]].to_csv(
    os.path.join(ROOT, "data/cleaned_test.csv"), index=False
)