In [34]:
import re
import string

from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize 

# import nltk
# nltk.download('punkt')

In [13]:
"""
File that contain all the util functions for data preprocessing and more
"""
import pandas as pd 


DATA_PATH = './data/'

def read_data():
    """
    :return x_train, y_train, x_test: return the data from the CSV
    """
    X_train = pd.read_csv(DATA_PATH+'train.csv', index_col= 0)
    x_train = pd.DataFrame(X_train.text)
    y_train = pd.DataFrame(X_train.target)
    
    X_test = pd.read_csv(DATA_PATH+'test.csv', index_col= 0)
    x_test = pd.DataFrame(X_test.text)
    
    return x_train, y_train, x_test

In [31]:
x_train, y_train, x_test = read_data()

In [32]:
x_train

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
1,Our Deeds are the Reason of this #earthquake M...
4,Forest fire near La Ronge Sask. Canada
5,All residents asked to 'shelter in place' are ...
6,"13,000 people receive #wildfires evacuation or..."
7,Just got sent this photo from Ruby #Alaska as ...
...,...
10869,Two giant cranes holding a bridge collapse int...
10870,@aria_ahrary @TheTawniest The out of control w...
10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
10872,Police investigating after an e-bike collided ...


In [70]:
x_train, y_train, x_test = read_data()
def preprocess(df):
    """
    :param DataFrame: Data frame to preprocess
    :return Data frame: Data frame with preprocessed data
    """
    stop_words = set(stopwords.words('english')) 
    def stopwords_p(w):
        l = []
        for i in w:
            if i in stop_words:
                pass
            else:
                # Removing puntuation
                i = "".join([char for char in i if char not in string.punctuation])
                i = i.lower()
                l.append(i)
        return l
                
    df['text'] = df['text'].apply(lambda w: word_tokenize(w))
    df['text'] = df['text'].apply(lambda w: stopwords_p(w))
    return df
X_train = preprocess(x_train)
X_test = preprocess(x_test)

In [71]:
X_train

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
1,"[our, deeds, reason, , earthquake, may, allah,..."
4,"[forest, fire, near, la, ronge, sask, , canada]"
5,"[all, residents, asked, shelter, place, , noti..."
6,"[13000, people, receive, , wildfires, evacuati..."
7,"[just, got, sent, photo, ruby, , alaska, smoke..."
...,...
10869,"[two, giant, cranes, holding, bridge, collapse..."
10870,"[, ariaahrary, , thetawniest, the, control, wi..."
10871,"[m194, , 0104, utc, , , 5km, s, volcano, hawai..."
10872,"[police, investigating, ebike, collided, car, ..."


In [72]:
X_test

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
0,"[just, happened, terrible, car, crash]"
2,"[heard, , earthquake, different, cities, , sta..."
3,"[forest, fire, spot, pond, , geese, fleeing, a..."
9,"[apocalypse, lighting, , , spokane, , wildfires]"
11,"[typhoon, soudelor, kills, 28, china, taiwan]"
...,...
10861,"[earthquake, safety, los, angeles, ûò, safety..."
10865,"[storm, ri, worse, last, hurricane, , my, city..."
10868,"[green, line, derailment, chicago, http, , tco..."
10874,"[meg, issues, hazardous, weather, outlook, , h..."


<h1>Model generation</h1>

In [43]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [73]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_train.text)]
model = Doc2Vec(documents, vector_size=128, window=2, min_count=1, workers=4)

In [74]:
# model.save('model_gensim')
model.infer_vector(['Forest', 'fire'])

array([-5.31083439e-04, -3.47237452e-03, -1.81245501e-03, -3.29910754e-03,
        3.20520415e-03, -1.83071103e-03,  1.63253979e-03,  2.59165769e-03,
        1.07914489e-03, -1.15226234e-04, -2.63314927e-03,  3.32772336e-03,
        2.19855621e-03,  3.22562340e-03, -8.63855064e-04,  2.97020725e-03,
        4.77295229e-03,  2.95861601e-03, -1.81688310e-03, -2.29791133e-03,
        2.30206596e-03,  4.04450810e-03,  3.20339738e-03, -3.89711699e-03,
       -1.99540448e-03,  9.27370624e-04, -2.99169193e-03, -2.56610406e-03,
        3.21050966e-03, -3.71962856e-03,  4.42132819e-03, -6.65751635e-04,
        5.30018122e-04,  4.03984636e-03,  1.30636850e-03,  2.67045409e-03,
       -1.97954476e-03,  8.59163847e-05,  1.80690549e-03,  2.48648133e-03,
        1.46912085e-03, -2.33582291e-03, -4.07175976e-04,  4.37115319e-04,
       -9.99105745e-04, -4.08000173e-03,  1.13466452e-03, -5.24787116e-04,
        1.00491347e-03, -2.46716756e-03, -3.51842633e-03, -1.36427247e-04,
        7.19917531e-04,  

In [75]:
X_train['x'] = X_train['text'].apply(lambda x: model.infer_vector(x))
X_train

Unnamed: 0_level_0,text,x
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[our, deeds, reason, , earthquake, may, allah,...","[0.022450453, -0.014068791, 0.02320005, 0.0056..."
4,"[forest, fire, near, la, ronge, sask, , canada]","[0.014047214, -0.011620407, 0.022000136, 0.007..."
5,"[all, residents, asked, shelter, place, , noti...","[0.050940637, -0.034008563, 0.05417287, 0.0113..."
6,"[13000, people, receive, , wildfires, evacuati...","[0.021716835, -0.014088228, 0.030080479, 0.009..."
7,"[just, got, sent, photo, ruby, , alaska, smoke...","[0.028881235, -0.025734458, 0.036264557, 0.011..."
...,...,...
10869,"[two, giant, cranes, holding, bridge, collapse...","[0.01230052, -0.011069448, 0.0139979655, 0.006..."
10870,"[, ariaahrary, , thetawniest, the, control, wi...","[0.024502154, -0.020831138, 0.027590899, 0.010..."
10871,"[m194, , 0104, utc, , , 5km, s, volcano, hawai...","[0.0273669, -0.0153999645, 0.026747, 0.0034684..."
10872,"[police, investigating, ebike, collided, car, ...","[0.045911968, -0.034719035, 0.05424538, 0.0084..."


In [76]:
X_test['x'] = X_test['text'].apply(lambda x: model.infer_vector(x))
X_test

Unnamed: 0_level_0,text,x
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[just, happened, terrible, car, crash]","[0.005252943, -0.0033986638, 0.013179226, 0.00..."
2,"[heard, , earthquake, different, cities, , sta...","[0.016773919, -0.014460153, 0.022829268, 0.008..."
3,"[forest, fire, spot, pond, , geese, fleeing, a...","[0.010359415, -0.0069285394, 0.016006121, 0.00..."
9,"[apocalypse, lighting, , , spokane, , wildfires]","[0.03452758, -0.02148733, 0.037540324, 0.00732..."
11,"[typhoon, soudelor, kills, 28, china, taiwan]","[0.023359133, -0.014642485, 0.029015407, 0.005..."
...,...,...
10861,"[earthquake, safety, los, angeles, ûò, safety...","[0.019650364, -0.015222559, 0.027727272, 0.005..."
10865,"[storm, ri, worse, last, hurricane, , my, city...","[0.0056765066, -0.00045179916, 0.009486522, 0...."
10868,"[green, line, derailment, chicago, http, , tco...","[0.0007466636, -0.0023661624, -0.0018179565, 0..."
10874,"[meg, issues, hazardous, weather, outlook, , h...","[0.020138718, -0.013476475, 0.017437631, 0.002..."


In [93]:
x_train = X_train.x.to_list()
x_test = X_test.x.to_list()

y_train = y_train.target.to_list()

In [81]:
# x_test[0:2]

<h1>Starting with the SVM</h1>

In [82]:
from sklearn import svm

model = svm.SVC()

In [88]:
model.fit(x_train, y_train)

SVC()

In [98]:
item = x_train[3]
label = y_train[3]
print(label)

1


In [99]:
model.predict([item])

array([0])

In [None]:
def stats(model, X,y):
    preds = []
    for item in X:
        preds.append(model.predict([item]))
    
    TP=0; FP=0; TN=0; FN=0
    for i in range(len(preds)):
        if preds[i] == 1 and y[i] == 1:
            TP += 1
        elif preds[i] == 0 and y[i] == 0:
            TN += 1
        elif preds[i] == 0 and y[i] == 1:
            FN += 1
        elif preds[i] == 1 and y[i] == 1:
            FP += 1
    acc = (TP+TN) / (TP+TN+FP+FN) if (TP+TN+FP+FN) > 0 else 0
    rec = (TP) / (TP+FN) if (TP+FN) > 0 else 0
    pre = (TP) / (TP+FP) if (TP+FP) > 0 else 0
    
    