In [14]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

## About Data

Twitter has become an important communication channel in times of emergency.
The ubiquity of smartphones enables people to announce an emergency they’re observing in real time. Because of this, more agencies (e.g. disaster relief organizations and news agencies) are interested in programmatically monitoring Twitter.

But, it’s not always clear whether a person’s words are actually announcing a disaster. Take this example:

<img src="assets/tweet_screenshot.png" alt="Drawing" style="width: 200px;"/>

The author explicitly uses the word “ABLAZE” but means it metaphorically. This is clear to a human right away, especially with the visual aid. But it’s less clear to a machine.

-------
Columns: 

id - a unique identifier for each tweet

text - the text of the tweet

location - the location the tweet was sent from (may be blank)

keyword - a particular keyword from the tweet (may be blank)

target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)


## Data Preparation

### Reading data

In [15]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated raw strings only resolved on Windows.
df_train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

### Investigating the dataset

In [16]:
def background_color(value):
    """Return a CSS background-color declaration for string values.

    Parameters
    ----------
    value : object
        The value to inspect.

    Returns
    -------
    str
        ``'background-color: #a6c0ed'`` when ``value`` is a ``str``,
        otherwise an empty string.
    """
    highlight = 'background-color: #a6c0ed'
    return highlight if isinstance(value, str) else ''

def show_df(df_train):
    """Display a quick exploratory overview of a DataFrame.

    Sections are emitted in this order: shape, head, tail, ``info()``,
    the default ``describe()``, ``describe(include='object')`` and the
    percentage of missing values per column (sorted descending).

    Relies on IPython's ``display`` being available, as it is inside a
    notebook.

    Parameters
    ----------
    df_train : pandas.DataFrame
        The frame to summarise.

    Returns
    -------
    None
        All results are emitted through ``print``/``display``.
    """
    def _section(title, suffix=''):
        # Every section title is centred in a 30-character rule of underscores.
        print(title.center(30, '_') + suffix)

    _section('shape')
    display(df_train.shape)

    _section('head')
    display(df_train.head().style.background_gradient(cmap='Blues'))

    _section('tail')
    display(df_train.tail().style.background_gradient(cmap='Blues'))

    _section('info', '\n')
    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in display() rendered a stray "None" underneath the report.
    df_train.info()

    _section('describe_continuous')
    display(df_train.describe().T.style.background_gradient(cmap='Blues'))

    _section('describe_categorical')
    display(df_train.describe(include='object').T.style.background_gradient(cmap='Blues'))

    _section('null_values_percent')
    display((df_train.isna().sum() / len(df_train) * 100).sort_values(ascending=False))
# Run the overview on the training set loaded above.
show_df(df_train)

____________shape_____________


(7613, 5)

_____________head_____________


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


_____________tail_____________


Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ,1
7611,10872,,,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.,1
7612,10873,,,The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d,1


_____________info_____________

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


None

_____describe_continuous______


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,7613.0,5441.934848,3137.11609,1.0,2734.0,5408.0,8146.0,10873.0
target,7613.0,0.42966,0.49506,0.0,0.0,0.0,1.0,1.0


_____describe_categorical_____


Unnamed: 0,count,unique,top,freq
keyword,7552,221,fatalities,45
location,5080,3341,USA,104
text,7613,7503,11-Year-Old Boy Charged With Manslaughter of Toddler: Report: An 11-year-old boy has been charged with manslaughter over the fatal sh...,10


_____null_values_percent______


location    33.272035
keyword      0.801261
id           0.000000
text         0.000000
target       0.000000
dtype: float64

### Preprocessing

In [17]:
from utils.preprocessor import Preprocessor

In [18]:
# Build the project's Preprocessor with all five flags below set to True.
# As this cell's output shows, construction also makes NLTK check for the
# 'stopwords', 'punkt' and 'wordnet' packages.
# NOTE(review): what each flag controls is defined in utils/preprocessor.py,
# which is not shown here — confirm there.
preprocessor = Preprocessor(
    remove_url=True,
    remove_punct=True,
    remove_stopwords=True,
    tokenize_words=True,
    lemmatize_words=True,
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PrinceEGY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PrinceEGY\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PrinceEGY\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Removing URLs from the dataset

In [19]:
# Tweet 32 ends with a t.co link, so it makes a clear before/after example.
print("Text Before:\n", df_train.loc[32, "text"])
print("Text After:\n", preprocessor._remove_URL(df_train.loc[32, "text"]))

Text Before:
 We always try to bring the heavy. #metal #RT http://t.co/YAo1e0xngw
Text After:
 We always try to bring the heavy. #metal #RT 


#### Removing punctuation from the dataset

In [20]:
import string
# The ASCII punctuation characters defined by the standard library.
# NOTE(review): presumably the set that _remove_punct strips — confirm in utils/preprocessor.py.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [21]:
# Tweet 2 contains quotes and full stops, so the effect is easy to see.
print("Text Before:\n", df_train.loc[2, "text"])
print("Text After:\n", preprocessor._remove_punct(df_train.loc[2, "text"]))

Text Before:
 All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
Text After:
 All residents asked to shelter in place are being notified by officers No other evacuation or shelter in place orders are expected


#### Removing stop words

In [22]:
from nltk.corpus import stopwords
# Preview the first ten entries of NLTK's English stop-word list.
stopwords.words("english")[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [23]:
# Applied to the raw tweet here, so punctuation is still present in the result.
print("Text Before:\n", df_train.loc[2, "text"])
print("Text After:\n", preprocessor._remove_stopwords(df_train.loc[2, "text"]))

Text Before:
 All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
Text After:
 residents asked 'shelter place' notified officers. evacuation shelter place orders expected


### Now let's apply our cleaning methods

In [24]:
# Run the three cleaning steps in sequence (URLs, then punctuation, then
# stop words) and write the result back to the 'text' column.
df_train['text'] = (
    df_train['text']
    .map(preprocessor._remove_URL)
    .map(preprocessor._remove_punct)
    .map(preprocessor._remove_stopwords)
)
df_train['text']

0            deeds reason earthquake may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3       13000 people receive wildfires evacuation orde...
4       got sent photo ruby alaska smoke wildfires pou...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    ariaahrary thetawniest control wild fires cali...
7610                      m194 0104 utc5km volcano hawaii
7611    police investigating ebike collided car little...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

-------------

### Tokenization

In [25]:
# Replace each cleaned tweet with the result of _tokenize_words.
# NOTE(review): 'text' is overwritten in place, so re-running this cell would
# feed its own output back into _tokenize_words — prefer Restart & Run All.
df_train["text"] = df_train["text"].map(preprocessor._tokenize_words)
df_train["text"].head()

### Lemmatizing the tokens

In [None]:
# Replace each entry of 'text' with the result of _lemmatize_words.
# NOTE(review): presumably expects the token lists written by the
# tokenization cell, so that cell must run first — confirm.
df_train["text"] = df_train["text"].map(preprocessor._lemmatize_words)
df_train["text"].head()

0    [deed, reason, earthquake, may, allah, forgive...
1        [forest, fire, near, la, ronge, sask, canada]
2    [resident, asked, shelter, place, notified, of...
3    [13000, people, receive, wildfire, evacuation,...
4    [got, sent, photo, ruby, alaska, smoke, wildfi...
Name: text, dtype: object