In [1]:
from nltk.util import ngrams
from utils.preprocessor import Preprocessor
import pandas as pd
import nltk
import warnings
warnings.filterwarnings("ignore")

## Preprocess the Data

In [2]:
df_train = pd.read_csv(r'data\train.csv')
test = pd.read_csv(r'data\test.csv')

In [3]:
preprocessor = Preprocessor()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PrinceEGY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PrinceEGY\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PrinceEGY\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
df_train['text'] = preprocessor(df_train.text)
df_train['text']

0       [deed, reason, earthquake, may, allah, forgive...
1           [forest, fire, near, la, ronge, sask, canada]
2       [resident, asked, shelter, place, notified, of...
3       [13000, people, receive, wildfire, evacuation,...
4       [got, sent, photo, ruby, alaska, smoke, wildfi...
                              ...                        
7608    [two, giant, crane, holding, bridge, collapse,...
7609    [ariaahrary, thetawniest, control, wild, fire,...
7610                [m194, 0104, utc5km, volcano, hawaii]
7611    [police, investigating, ebike, collided, car, ...
7612    [latest, home, razed, northern, california, wi...
Name: text, Length: 7613, dtype: object

## Calculate Probabilty with N-grams

### Adding start and end tokens

In [5]:
df_train['text'] = df_train.text.map(lambda x: ["<S>"] + x + ["<E>"])
df_train.text.head()

0    [<S>, deed, reason, earthquake, may, allah, fo...
1    [<S>, forest, fire, near, la, ronge, sask, can...
2    [<S>, resident, asked, shelter, place, notifie...
3    [<S>, 13000, people, receive, wildfire, evacua...
4    [<S>, got, sent, photo, ruby, alaska, smoke, w...
Name: text, dtype: object

#### Calculate tokens frequencies

In [6]:
# Flatten the token as putting them all in single list
def flatten_tokens(df):
    tokens = [token for item in df for token in item]      
    return tokens

In [7]:
tokens = flatten_tokens(df_train.text)
tokens_fd = nltk.FreqDist(tokens)
tokens_fd.most_common(10)

[('<S>', 7613),
 ('<E>', 7613),
 ('fire', 350),
 ('like', 347),
 ('im', 299),
 ('amp', 298),
 ('get', 255),
 ('u', 246),
 ('new', 224),
 ('via', 220)]

#### Get Bigrams and calculate their frequencies

In [8]:
bigrams = df_train.text.apply(lambda x:list(ngrams(x, 2)))
bigrams[:5]

0    [(<S>, deed), (deed, reason), (reason, earthqu...
1    [(<S>, forest), (forest, fire), (fire, near), ...
2    [(<S>, resident), (resident, asked), (asked, s...
3    [(<S>, 13000), (13000, people), (people, recei...
4    [(<S>, got), (got, sent), (sent, photo), (phot...
Name: text, dtype: object

In [9]:
# Flatten the bigrams as putting them all in single list
def flatten_bigrams(df):
    bigrams = [bigram for item in df for bigram in item]
    return bigrams

In [10]:
flattened = flatten_bigrams(bigrams)
len(flatten_bigrams(bigrams))

79384

In [11]:
fd_bi = nltk.FreqDist(flattened)
fd_bi.most_common(10)

[(('<S>', 'new'), 74),
 (('<S>', 'im'), 70),
 (('suicide', 'bomber'), 60),
 (('burning', 'building'), 58),
 (('fire', '<E>'), 58),
 (('news', '<E>'), 53),
 (('\x89û', '<E>'), 50),
 (('look', 'like'), 49),
 (('body', 'bag'), 48),
 (('<S>', 'rt'), 47)]

### Estimating the probabiltiy of a word sequence
P(x1, x2, ..., xn) = P(x1)P(x2|x1)...P(xn|x1,...xn-1)

In [12]:
def prob_calc(sentence, verbose = False):
    probs = []
    for bigram in sentence:
        # Probabilty of current bigram = frequency(bigram) / (frequency(first token of bigram) == number of words prefix to second token of bigram)
        prob = fd_bi[bigram] / tokens_fd[bigram[0]] 
        probs.append(prob)
        if verbose:
            print(f"P({bigram[0]} | {bigram[1]}) = {prob:.3}")
    if verbose:
        print("="*40)
    
    res = probs[0] 
    sen = " ".join([word[0] for word in sentence])
    if verbose:
        print(f"P({sen}) = ({probs[0]:.4})",end="")
    for prob in probs[1:]:
        res *= prob
        if verbose:
            print(f" * ({prob:.4})", end= "")
    if verbose:
        print(f" = {res:.2}")
    return res

In [13]:
probs = []
for bigram in bigrams:
    probs.append(prob_calc(bigram))

In [14]:
df_train['prob_word_seq'] = probs
df_train.head(10)

Unnamed: 0,id,keyword,location,text,target,prob_word_seq
0,1,,,"[<S>, deed, reason, earthquake, may, allah, fo...",1,6.155135e-12
1,4,,,"[<S>, forest, fire, near, la, ronge, sask, can...",1,4.860113e-10
2,5,,,"[<S>, resident, asked, shelter, place, notifie...",1,1.87942e-15
3,6,,,"[<S>, 13000, people, receive, wildfire, evacua...",1,6.319668e-13
4,7,,,"[<S>, got, sent, photo, ruby, alaska, smoke, w...",1,4.566091e-14
5,8,,,"[<S>, rockyfire, update, california, hwy, 20, ...",1,4.370935e-21
6,10,,,"[<S>, flood, disaster, heavy, rain, cause, fla...",1,4.122996e-18
7,13,,,"[<S>, im, top, hill, see, fire, wood, <E>]",1,5.836946e-13
8,14,,,"[<S>, there, emergency, evacuation, happening,...",1,6.344293e-14
9,15,,,"[<S>, im, afraid, tornado, coming, area, <E>]",1,6.484445e-10


### Print the likely hood of the first 10 senteces

In [15]:
for bigram in bigrams:
    prob_calc(bigram, verbose = True)
    print("\n")

P(<S> | deed) = 0.000131
P(deed | reason) = 0.5
P(reason | earthquake) = 0.0323
P(earthquake | may) = 0.0189
P(may | allah) = 0.0341
P(allah | forgive) = 0.111
P(forgive | u) = 0.5
P(u | <E>) = 0.0813
P(<S> deed reason earthquake may allah forgive u) = (0.0001314) * (0.5) * (0.03226) * (0.01887) * (0.03409) * (0.1111) * (0.5) * (0.0813) = 6.2e-12


P(<S> | forest) = 0.000657
P(forest | fire) = 0.424
P(fire | near) = 0.0114
P(near | la) = 0.0185
P(la | ronge) = 0.0357
P(ronge | sask) = 1.0
P(sask | canada) = 1.0
P(canada | <E>) = 0.231
P(<S> forest fire near la ronge sask canada) = (0.0006568) * (0.4242) * (0.01143) * (0.01852) * (0.03571) * (1.0) * (1.0) * (0.2308) = 4.9e-10


P(<S> | resident) = 0.000394
P(resident | asked) = 0.125
P(asked | shelter) = 0.111
P(shelter | place) = 0.333
P(place | notified) = 0.0323
P(notified | officer) = 1.0
P(officer | evacuation) = 0.027
P(evacuation | shelter) = 0.0192
P(shelter | place) = 0.333
P(place | order) = 0.0323
P(order | expected) = 0.0286