In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

## About Data

Twitter has become an important communication channel in times of emergency.
The ubiquitousness of smartphones enables people to announce an emergency they’re observing in real-time. Because of this, more agencies are interested in programatically monitoring Twitter (i.e. disaster relief organizations and news agencies).

But, it’s not always clear whether a person’s words are actually announcing a disaster. Take this example:
![image.png](assets/tweet_screenshot.png)

The author explicitly uses the word “ABLAZE” but means it metaphorically. This is clear to a human right away, especially with the visual aid. But it’s less clear to a machine.

-------
Columns: 

id - a unique identifier for each tweet

text - the text of the tweet

location - the location the tweet was sent from (may be blank)

keyword - a particular keyword from the tweet (may be blank)

target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)


## Data Prepration

### Reading data

In [2]:
df_train = pd.read_csv(r'data\train.csv')
test = pd.read_csv(r'data\test.csv')

### Investigating the dataset

In [3]:
def background_color(value):
    if isinstance(value, str):
        return 'background-color: #a6c0ed'
    return ''

def show_df(df_train):
    print('shape'.center(30,'_'))
    display(df_train)

    print('head'.center(30,'_'))
    display(df_train.head().style.background_gradient(cmap='Blues'))

    print('tail'.center(30,'_'))
    display(df_train.tail().style.background_gradient(cmap='Blues'))

    print('info'.center(30,'_')+'\n')
    display(df_train.info())

    print('describe_continuous'.center(30,'_'))
    display(df_train.describe().T.style.background_gradient(cmap = 'Blues'))

    print('describe_categorical'.center(30,'_'))
    display(df_train.describe(include='object').T.style.background_gradient(cmap='Blues'))

    print('null_values_percent'.center(30,'_'))
    display((df_train.isna().sum() / len(df_train) * 100).sort_values(ascending=False))
show_df(df_train)

____________shape_____________


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


_____________head_____________


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


_____________tail_____________


Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ,1
7611,10872,,,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.,1
7612,10873,,,The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d,1


_____________info_____________

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


None

_____describe_continuous______


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,7613.0,5441.934848,3137.11609,1.0,2734.0,5408.0,8146.0,10873.0
target,7613.0,0.42966,0.49506,0.0,0.0,0.0,1.0,1.0


_____describe_categorical_____


Unnamed: 0,count,unique,top,freq
keyword,7552,221,fatalities,45
location,5080,3341,USA,104
text,7613,7503,11-Year-Old Boy Charged With Manslaughter of Toddler: Report: An 11-year-old boy has been charged with manslaughter over the fatal sh...,10


_____null_values_percent______


location    33.272035
keyword      0.801261
id           0.000000
text         0.000000
target       0.000000
dtype: float64

### Preprocessing

#### Removing URL's from the dataset

In [4]:
import re
import string

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)


In [5]:
print("Text Before:\n", df_train.text[32])
print("Text After:\n",remove_URL(df_train.text[32]))

Text Before:
 We always try to bring the heavy. #metal #RT http://t.co/YAo1e0xngw
Text After:
 We always try to bring the heavy. #metal #RT 


#### Removing punctuations from the dataset

In [6]:
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
print("Text Before:\n", df_train.text[2])
print("Text After:\n",remove_punct(df_train.text[2]))

Text Before:
 All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
Text After:
 All residents asked to shelter in place are being notified by officers No other evacuation or shelter in place orders are expected


#### Removing stop words

In [8]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [9]:
# Stop words example
stopwords.words("english")[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [10]:
def remove_stopwords(text):
    stop = set(stopwords.words("english"))
    
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

In [11]:
print("Text Before:\n", df_train.text[2])
print("Text After:\n",remove_stopwords(df_train.text[2]))

Text Before:
 All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
Text After:
 residents asked 'shelter place' notified officers. evacuation shelter place orders expected


#### Now let's apply our cleaning methods

In [12]:
df_train["text"] = df_train.text.map(remove_URL) # map(lambda x: remove_URL(x))
df_train["text"] = df_train.text.map(remove_punct)
df_train["text"] = df_train.text.map(remove_stopwords)
df_train["text"]

0            deeds reason earthquake may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3       13000 people receive wildfires evacuation orde...
4       got sent photo ruby alaska smoke wildfires pou...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    ariaahrary thetawniest control wild fires cali...
7610                      m194 0104 utc5km volcano hawaii
7611    police investigating ebike collided car little...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

-------------

### Tokenization

In [13]:
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [14]:
df_train["text"] = df_train.text.map(nltk.tokenize.word_tokenize)
df_train.text.head()

0    [deeds, reason, earthquake, may, allah, forgiv...
1        [forest, fire, near, la, ronge, sask, canada]
2    [residents, asked, shelter, place, notified, o...
3    [13000, people, receive, wildfires, evacuation...
4    [got, sent, photo, ruby, alaska, smoke, wildfi...
Name: text, dtype: object

### Lemmatizing the tokens

In [15]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [16]:
def Lemmatize(sentence_tokens):
    lemmatizer = WordNetLemmatizer()
    result_sentence = []
    for token in sentence_tokens:
        result_sentence.append(lemmatizer.lemmatize(token))
    return result_sentence

In [17]:
df_train["text"] = df_train.text.map(Lemmatize)
df_train.text.head()

0    [deed, reason, earthquake, may, allah, forgive...
1        [forest, fire, near, la, ronge, sask, canada]
2    [resident, asked, shelter, place, notified, of...
3    [13000, people, receive, wildfire, evacuation,...
4    [got, sent, photo, ruby, alaska, smoke, wildfi...
Name: text, dtype: object

## Calculate Probabilty with N-grams

In [18]:
from nltk.util import ngrams

#### Calculate tokens frequencies

In [42]:
# Flatten the token as putting them all in single list
def flatten_tokens(df):
    tokens = [token for item in df for token in item]      
    return tokens

In [43]:
tokens = flatten_tokens(df_train.text)
tokens_fd = nltk.FreqDist(tokens)
tokens_fd.most_common(10)

[('fire', 350),
 ('like', 347),
 ('im', 299),
 ('amp', 298),
 ('get', 255),
 ('u', 246),
 ('new', 224),
 ('via', 220),
 ('one', 205),
 ('people', 199)]

#### Get Bigrams and calculate their frequencies

In [21]:
bigrams = df_train.text.apply(lambda x:list(ngrams(x, 2)))
bigrams[:5]

0    [(deed, reason), (reason, earthquake), (earthq...
1    [(forest, fire), (fire, near), (near, la), (la...
2    [(resident, asked), (asked, shelter), (shelter...
3    [(13000, people), (people, receive), (receive,...
4    [(got, sent), (sent, photo), (photo, ruby), (r...
Name: text, dtype: object

In [40]:
# Flatten the bigrams as putting them all in single list
def flatten_bigrams(df):
    bigrams = [bigram for item in df for bigram in item]
    return bigrams

In [41]:
len(flatten_bigrams(bigrams))

64158

In [23]:
fd_bi = nltk.FreqDist(get_all_bigrams(bigrams))
fd_bi.most_common(10)

[(('suicide', 'bomber'), 60),
 (('burning', 'building'), 58),
 (('look', 'like'), 49),
 (('body', 'bag'), 48),
 (('gon', 'na'), 43),
 (('youtube', 'video'), 43),
 (('liked', 'youtube'), 42),
 (('northern', 'california'), 41),
 (('cross', 'body'), 39),
 (('oil', 'spill'), 39)]

### Estimating the probabiltiy of a word sequence
P(x1, x2, ..., xn) = P(x1)P(x2|x1)...P(xn|x1,...xn-1)

In [24]:
def prob_calc(sentence):
    probs = [tokens_fd.freq(sentence[0][0])] # prob of first token
    print(f"P({sentence[0][0]}) = {probs[0]:.3}")
    for bigram in sentence:
        prob = fd_bi.freq(bigram) / tokens_fd.freq(bigram[0])
        probs.append(prob)
        print(f"P({bigram[0]} | {bigram[1]}) = {prob:.3}")
    print("="*40)
    
    res = probs[0] 
    sen = " ".join([word[0] for word in sentence])
    print(f"P({sen}) = ({probs[0]:.4})",end="")
    for prob in probs[1:]:
        res *= prob
        print(f" * ({prob:.4})", end= "")
    print(f" = {res:.2}")
    

In [25]:
for bigram in bigrams[0:10]:
    prob_calc(bigram)
    print("\n")

P(deed) = 2.79e-05
P(deed | reason) = 0.559
P(reason | earthquake) = 0.0361
P(earthquake | may) = 0.0211
P(may | allah) = 0.0381
P(allah | forgive) = 0.124
P(forgive | u) = 0.559
P(deed reason earthquake may allah forgive) = (2.787e-05) * (0.5593) * (0.03609) * (0.02111) * (0.03814) * (0.1243) * (0.5593) = 3.1e-11


P(forest) = 0.00092
P(forest | fire) = 0.475
P(fire | near) = 0.0128
P(near | la) = 0.0207
P(la | ronge) = 0.04
P(ronge | sask) = 1.12
P(sask | canada) = 1.12
P(forest fire near la ronge sask) = (0.0009196) * (0.4746) * (0.01278) * (0.02072) * (0.03995) * (1.119) * (1.119) = 5.8e-09


P(resident) = 0.000111
P(resident | asked) = 0.14
P(asked | shelter) = 0.124
P(shelter | place) = 0.373
P(place | notified) = 0.0361
P(notified | officer) = 1.12
P(officer | evacuation) = 0.0302
P(evacuation | shelter) = 0.0215
P(shelter | place) = 0.373
P(place | order) = 0.0361
P(order | expected) = 0.032
P(resident asked shelter place notified officer evacuation shelter place order) = (0.00