In [123]:
import re
import string

from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize 

# %pip install textblob
from textblob import TextBlob

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# import nltk
# nltk.download('punkt')

In [32]:
"""
File that contain all the util functions for data preprocessing and more
"""
import pandas as pd 


DATA_PATH = './data/'

X = pd.read_csv(DATA_PATH+'train.csv', index_col= 0)
Y = pd.read_csv(DATA_PATH+'test.csv', index_col= 0)

In [33]:
X.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [34]:
Y.head()

Unnamed: 0_level_0,keyword,location,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,,Just happened a terrible car crash
2,,,"Heard about #earthquake is different cities, s..."
3,,,"there is a forest fire at spot pond, geese are..."
9,,,Apocalypse lighting. #Spokane #wildfires
11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [35]:
def preprocess(df):
    """
    :param DataFrame: Data frame to preprocess
    :return Data frame: Data frame with preprocessed data
    """
    stop_words = set(stopwords.words('english')) 
    def stopwords_p(w):
        l = []
        for i in w:
            if i in stop_words:
                pass
            else:
                # Removing puntuation
                #i = "".join([char for char in i if char not in string.punctuation])
                i = re.sub('[^a-zA-Z0-9]', '', i)
                i = i.lower()
                if len(i) > 0:
                    l.append(i)
        return l
                
    df['text_prep'] = df['text'].apply(lambda w: word_tokenize(w))
    df['text_prep'] = df['text_prep'].apply(lambda w: stopwords_p(w))
    return df
train = preprocess(X)
test = preprocess(Y)

In [36]:
train = shuffle(train)
train.head()

Unnamed: 0_level_0,keyword,location,text,target,text_prep
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7761,police,,Police expand search for missing pregnant woma...,0,"[police, expand, search, missing, pregnant, wo..."
7670,panic,"Palm Bay, FL (Kissimmee)",Panic over: Patient in Alabama tests negative ...,1,"[panic, patient, alabama, tests, negative, ebo..."
583,arson,EARTH,Owner of Chicago-Area Gay Bar Admits to Arson ...,0,"[owner, chicagoarea, gay, bar, admits, arson, ..."
7689,panic,Narnia,I added a video to a @YouTube playlist http://...,0,"[i, added, video, youtube, playlist, http, tco..."
9888,traumatised,Londonstan,ÛÏ@_keits: @LIVA_GOTTA get a gold chain you'l...,0,"[keits, livagotta, get, gold, chain, ll, under..."


In [37]:
test.head()

Unnamed: 0_level_0,keyword,location,text,text_prep
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,,Just happened a terrible car crash,"[just, happened, terrible, car, crash]"
2,,,"Heard about #earthquake is different cities, s...","[heard, earthquake, different, cities, stay, s..."
3,,,"there is a forest fire at spot pond, geese are...","[forest, fire, spot, pond, geese, fleeing, acr..."
9,,,Apocalypse lighting. #Spokane #wildfires,"[apocalypse, lighting, spokane, wildfires]"
11,,,Typhoon Soudelor kills 28 in China and Taiwan,"[typhoon, soudelor, kills, 28, china, taiwan]"


<h1>Model generation</h1>

In [20]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [21]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(train.text)]
model = Doc2Vec(documents, vector_size=256, window=4, min_count=0, workers=4)

In [38]:
train['x'] = train['text_prep'].apply(lambda x: model.infer_vector(x))
train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['x'] = train['text_prep'].apply(lambda x: model.infer_vector(x))


Unnamed: 0_level_0,keyword,location,text,target,text_prep,x
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7761,police,,Police expand search for missing pregnant woma...,0,"[police, expand, search, missing, pregnant, wo...","[-0.007308758, 0.005901744, 0.0036629573, 0.00..."
7670,panic,"Palm Bay, FL (Kissimmee)",Panic over: Patient in Alabama tests negative ...,1,"[panic, patient, alabama, tests, negative, ebo...","[-0.019752247, 0.014266182, 0.010357015, 0.009..."
583,arson,EARTH,Owner of Chicago-Area Gay Bar Admits to Arson ...,0,"[owner, chicagoarea, gay, bar, admits, arson, ...","[-0.010665309, 0.010349484, 0.00755486, 0.0057..."
7689,panic,Narnia,I added a video to a @YouTube playlist http://...,0,"[i, added, video, youtube, playlist, http, tco...","[-0.0047078, 0.004301085, 0.004400423, 0.00467..."
9888,traumatised,Londonstan,ÛÏ@_keits: @LIVA_GOTTA get a gold chain you'l...,0,"[keits, livagotta, get, gold, chain, ll, under...","[-0.011215143, 0.008989504, 0.0096160155, 0.00..."


In [40]:
ind_train = train.index.to_list()
ind_train

[7761,
 7670,
 583,
 7689,
 9888,
 4682,
 4715,
 3487,
 3281,
 744,
 314,
 1989,
 6961,
 8708,
 8297,
 3566,
 8009,
 5560,
 4700,
 6466,
 5908,
 6933,
 298,
 5445,
 7463,
 2687,
 8394,
 1201,
 9062,
 5092,
 3643,
 7356,
 10385,
 9701,
 4086,
 5884,
 8938,
 3899,
 5085,
 2941,
 9732,
 9387,
 4849,
 3980,
 361,
 7934,
 2963,
 1613,
 4499,
 2414,
 7372,
 8966,
 7129,
 7285,
 2793,
 4430,
 5222,
 6841,
 3841,
 8606,
 5412,
 625,
 870,
 8135,
 7517,
 4457,
 4688,
 9221,
 4276,
 2998,
 2158,
 5315,
 6268,
 9225,
 9089,
 7456,
 6565,
 3982,
 3543,
 9806,
 407,
 8106,
 2517,
 7080,
 4641,
 6208,
 3125,
 6737,
 687,
 6811,
 527,
 9731,
 10415,
 5018,
 380,
 1709,
 2792,
 9595,
 1514,
 9047,
 8817,
 6631,
 5673,
 9435,
 53,
 638,
 6956,
 3036,
 3145,
 4948,
 568,
 8767,
 9395,
 1350,
 5032,
 9087,
 6171,
 982,
 5098,
 249,
 7821,
 849,
 6134,
 2555,
 3499,
 9669,
 10331,
 1590,
 2970,
 316,
 1051,
 1052,
 10448,
 4550,
 1181,
 451,
 6138,
 9016,
 5525,
 6597,
 5575,
 3162,
 1990,
 4934,
 7943,
 

In [41]:
test['x'] = test['text_prep'].apply(lambda x: model.infer_vector(x))
test

Unnamed: 0_level_0,keyword,location,text,text_prep,x
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,,,Just happened a terrible car crash,"[just, happened, terrible, car, crash]","[-0.0053129955, 0.005087275, 0.0034982103, 0.0..."
2,,,"Heard about #earthquake is different cities, s...","[heard, earthquake, different, cities, stay, s...","[-0.010875889, 0.0075846906, 0.005635406, 0.00..."
3,,,"there is a forest fire at spot pond, geese are...","[forest, fire, spot, pond, geese, fleeing, acr...","[-0.0034923425, 0.0014190368, 0.0035825502, 0...."
9,,,Apocalypse lighting. #Spokane #wildfires,"[apocalypse, lighting, spokane, wildfires]","[-0.003320934, 0.003930992, 0.0015746179, 0.00..."
11,,,Typhoon Soudelor kills 28 in China and Taiwan,"[typhoon, soudelor, kills, 28, china, taiwan]","[-0.009974931, 0.0058606705, 0.00433252, 0.006..."
...,...,...,...,...,...
10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,"[earthquake, safety, los, angeles, safety, fas...","[-0.0026034634, 0.0032635338, 0.00012493004, -..."
10865,,,Storm in RI worse than last hurricane. My city...,"[storm, ri, worse, last, hurricane, my, city, ...","[-0.015165645, 0.013136677, 0.009227871, 0.007..."
10868,,,Green Line derailment in Chicago http://t.co/U...,"[green, line, derailment, chicago, http, tcout...","[-0.0071160384, 0.0038670176, 0.0033742343, 0...."
10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,"[meg, issues, hazardous, weather, outlook, hwo...","[-0.002654015, 0.0022429759, 0.0034386967, 0.0..."


In [42]:
train.head()

Unnamed: 0_level_0,keyword,location,text,target,text_prep,x
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7761,police,,Police expand search for missing pregnant woma...,0,"[police, expand, search, missing, pregnant, wo...","[-0.007308758, 0.005901744, 0.0036629573, 0.00..."
7670,panic,"Palm Bay, FL (Kissimmee)",Panic over: Patient in Alabama tests negative ...,1,"[panic, patient, alabama, tests, negative, ebo...","[-0.019752247, 0.014266182, 0.010357015, 0.009..."
583,arson,EARTH,Owner of Chicago-Area Gay Bar Admits to Arson ...,0,"[owner, chicagoarea, gay, bar, admits, arson, ...","[-0.010665309, 0.010349484, 0.00755486, 0.0057..."
7689,panic,Narnia,I added a video to a @YouTube playlist http://...,0,"[i, added, video, youtube, playlist, http, tco...","[-0.0047078, 0.004301085, 0.004400423, 0.00467..."
9888,traumatised,Londonstan,ÛÏ@_keits: @LIVA_GOTTA get a gold chain you'l...,0,"[keits, livagotta, get, gold, chain, ll, under...","[-0.011215143, 0.008989504, 0.0096160155, 0.00..."


In [43]:
train.shape[0]

7613

In [44]:
# x_test[0:2]
train.target.unique()

array([0, 1], dtype=int64)

<h1>Starting with the SVM</h1>

In [330]:
from sklearn import svm

#model = svm.SVC(kernel= 'poly', gamma= 3, random_state= 0, C= 0.8)
model = svm.SVC()

In [331]:
model.fit(train.x.to_list(), train.target.to_list())

SVC()

In [332]:
item = train.x[10]
label = train.target[10]
print(label)

1


In [333]:
model.predict([item])
#model.classes_

array([0])

In [11]:
def stats(model, X,y):
    preds = model.predict(X)
    TP=0; FP=0; TN=0; FN=0

    for i in range(len(preds)):
        if preds[i] == 1 and y[i] == 1:
            TP += 1
        elif preds[i] == 0 and y[i] == 0:
            TN += 1
        elif preds[i] == 0 and y[i] == 1:
            FN += 1
        elif preds[i] == 1 and y[i] == 0:
            FP += 1
            
    acc = (TP+TN) / (TP+TN+FP+FN) if (TP+TN+FP+FN) > 0 else 0
    rec = (TP) / (TP+FN) if (TP+FN) > 0 else 0
    pre = (TP) / (TP+FP) if (TP+FP) > 0 else 0
    print('='*26)
    print('|  acc   |  rec  |  pre  |')
    print('-'*26)
    print(f"| {round(acc*100, 2)}% | {round(rec*100, 2)}% | {round(pre*100, 2)}% |")
    print('='*26)
    return TP, FP, TN, FN

#stats(model, train.x.to_list(), train.target.to_list())

In [12]:
def compare_stats(preds, y):
    TP=0; FP=0; TN=0; FN=0

    for i in range(len(preds)):
        if preds[i] == 1 and y[i] == 1:
            TP += 1
        elif preds[i] == 0 and y[i] == 0:
            TN += 1
        elif preds[i] == 0 and y[i] == 1:
            FN += 1
        elif preds[i] == 1 and y[i] == 0:
            FP += 1
            
    acc = (TP+TN) / (TP+TN+FP+FN) if (TP+TN+FP+FN) > 0 else 0
    rec = (TP) / (TP+FN) if (TP+FN) > 0 else 0
    pre = (TP) / (TP+FP) if (TP+FP) > 0 else 0
    print('='*26)
    print('|  acc   |  rec  |  pre  |')
    print('-'*26)
    print(f"| {round(acc*100, 2)}% | {round(rec*100, 2)}% | {round(pre*100, 2)}% |")
    print('='*26)
    return TP, FP, TN, FN

# stats(model, train.x.to_list(), train.target.to_list())

In [336]:
set(model.predict(train.x.to_list()))

{0, 1}

In [337]:
pred = model.predict(test.x.to_list())

In [338]:
set(pred)

{0}

In [340]:
a = pred.tolist()
pd.DataFrame(data={'id': test.index.to_list(), 'target': a}).to_csv('submission_SVM_doc2vec.csv', index=False)

<h1>Feature Engineering</h1>

In [45]:
train.head()

Unnamed: 0_level_0,keyword,location,text,target,text_prep,x
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7761,police,,Police expand search for missing pregnant woma...,0,"[police, expand, search, missing, pregnant, wo...","[-0.007308758, 0.005901744, 0.0036629573, 0.00..."
7670,panic,"Palm Bay, FL (Kissimmee)",Panic over: Patient in Alabama tests negative ...,1,"[panic, patient, alabama, tests, negative, ebo...","[-0.019752247, 0.014266182, 0.010357015, 0.009..."
583,arson,EARTH,Owner of Chicago-Area Gay Bar Admits to Arson ...,0,"[owner, chicagoarea, gay, bar, admits, arson, ...","[-0.010665309, 0.010349484, 0.00755486, 0.0057..."
7689,panic,Narnia,I added a video to a @YouTube playlist http://...,0,"[i, added, video, youtube, playlist, http, tco...","[-0.0047078, 0.004301085, 0.004400423, 0.00467..."
9888,traumatised,Londonstan,ÛÏ@_keits: @LIVA_GOTTA get a gold chain you'l...,0,"[keits, livagotta, get, gold, chain, ll, under...","[-0.011215143, 0.008989504, 0.0096160155, 0.00..."


In [50]:
train['polarity'] = train['text'].apply(lambda text: TextBlob(text).sentiment.polarity)
train['subjectivity'] = train['text'].apply(lambda text: TextBlob(text).sentiment.subjectivity)
train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['polarity'] = train['text'].apply(lambda text: TextBlob(text).sentiment.polarity)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['subjectivity'] = train['text'].apply(lambda text: TextBlob(text).sentiment.subjectivity)


Unnamed: 0_level_0,keyword,location,text,target,text_prep,x,sentiment,polarity,subjectivity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7761,police,,Police expand search for missing pregnant woma...,0,"[police, expand, search, missing, pregnant, wo...","[-0.007308758, 0.005901744, 0.0036629573, 0.00...","(0.06666666666666665, 0.275)",0.066667,0.275
7670,panic,"Palm Bay, FL (Kissimmee)",Panic over: Patient in Alabama tests negative ...,1,"[panic, patient, alabama, tests, negative, ebo...","[-0.019752247, 0.014266182, 0.010357015, 0.009...","(-0.3, 0.4)",-0.3,0.4
583,arson,EARTH,Owner of Chicago-Area Gay Bar Admits to Arson ...,0,"[owner, chicagoarea, gay, bar, admits, arson, ...","[-0.010665309, 0.010349484, 0.00755486, 0.0057...","(0.4166666666666667, 0.5833333333333334)",0.416667,0.583333
7689,panic,Narnia,I added a video to a @YouTube playlist http://...,0,"[i, added, video, youtube, playlist, http, tco...","[-0.0047078, 0.004301085, 0.004400423, 0.00467...","(0.35, 0.55)",0.35,0.55
9888,traumatised,Londonstan,ÛÏ@_keits: @LIVA_GOTTA get a gold chain you'l...,0,"[keits, livagotta, get, gold, chain, ll, under...","[-0.011215143, 0.008989504, 0.0096160155, 0.00...","(-0.2, 0.3)",-0.2,0.3


<h1>SVM + BoW</h1>

In [51]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC

In [52]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svm', SVC()),
])

In [140]:
X_train = pd.read_csv(DATA_PATH+'train.csv', index_col= 0)
X_test = pd.read_csv(DATA_PATH+'test.csv', index_col= 0)
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train.text, X_train.target, test_size=0.10)

In [149]:
count_vect = CountVectorizer(max_features= 4400)
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(6851, 4400)

In [150]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(6851, 4400)

In [151]:
X_train_tfidf[0]

<1x4400 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [152]:
from sklearn.naive_bayes import MultinomialNB
#clf = MultinomialNB().fit(X_train_tfidf, X_train.target)
clf = SVC().fit(X_train_tfidf, Y_train)

In [162]:
# validation
count_vect = CountVectorizer(max_features= 4400)
X_valid_counts = count_vect.fit_transform(X_valid)
tfidf_transformer = TfidfTransformer()
X_valid_tfidf = tfidf_transformer.fit_transform(X_valid_counts)
X_valid_tfidf.shape

(762, 4400)

In [163]:
pred_svm = clf.predict(X_valid_tfidf)

In [164]:
set(pred_svm)

{0, 1}

In [165]:
compare_stats(pred_svm,Y_valid.to_list())

|  acc   |  rec  |  pre  |
--------------------------
| 52.36% | 8.1% | 46.03% |


(29, 34, 370, 329)

In [166]:
X_test

Unnamed: 0_level_0,keyword,location,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,,Just happened a terrible car crash
2,,,"Heard about #earthquake is different cities, s..."
3,,,"there is a forest fire at spot pond, geese are..."
9,,,Apocalypse lighting. #Spokane #wildfires
11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...
10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
10865,,,Storm in RI worse than last hurricane. My city...
10868,,,Green Line derailment in Chicago http://t.co/U...
10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [167]:
ind = X_test.index.to_list()
ind_t = X_train.index.to_list()

In [168]:
docs_new = X_test.text.to_list()
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

pred = clf.predict(X_new_tfidf)
pred

array([0, 0, 0, ..., 1, 1, 0], dtype=int64)

In [169]:
pred_svm_t = pred.tolist()

In [170]:
solution = pd.DataFrame(data= {'id': ind, 'target': pred_svm_t})

In [171]:
solution.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [172]:
#solution.to_csv('submission_SVC.csv',index= False)

<h1>SVM + BoW + Including Sentiment features</h1>

In [173]:
# pipelining the predictions in one svm
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
import numpy as np

In [174]:
train.head(3)

Unnamed: 0_level_0,keyword,location,text,target,text_prep,x,sentiment,polarity,subjectivity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7761,police,,Police expand search for missing pregnant woma...,0,"[police, expand, search, missing, pregnant, wo...","[-0.007308758, 0.005901744, 0.0036629573, 0.00...","(0.06666666666666665, 0.275)",0.066667,0.275
7670,panic,"Palm Bay, FL (Kissimmee)",Panic over: Patient in Alabama tests negative ...,1,"[panic, patient, alabama, tests, negative, ebo...","[-0.019752247, 0.014266182, 0.010357015, 0.009...","(-0.3, 0.4)",-0.3,0.4
583,arson,EARTH,Owner of Chicago-Area Gay Bar Admits to Arson ...,0,"[owner, chicagoarea, gay, bar, admits, arson, ...","[-0.010665309, 0.010349484, 0.00755486, 0.0057...","(0.4166666666666667, 0.5833333333333334)",0.416667,0.583333


In [175]:
# Classifier data
svm_train = train.text
svm_train_y = train.target.to_list()

svm_train, svm_valid, svm_train_y, svm_valid_y = train_test_split(svm_train, svm_train_y,test_size=0.10)

# New features
svm_polarity = np.array(train.polarity.to_list())
svm_subjectivity = np.array(train.subjectivity.to_list())

In [176]:
len(svm_valid)

762

In [193]:
def get_polarity(x):
    return np.array([TextBlob(t).sentiment.polarity for t in x]).reshape(-1, 1)

def get_subjectivity(x):
    return np.array([TextBlob(t).sentiment.subjectivity for t in x]).reshape(-1, 1)

svm_classifier = Pipeline([
    ('features', FeatureUnion([
        ('pola', Pipeline([
            ('polarity', FunctionTransformer(get_polarity, validate=False))
        ])),
        ('subj', Pipeline([
            ('subjectivity', FunctionTransformer(get_subjectivity, validate=False))
        ])),
        
        ('text', Pipeline([
            ('vect', CountVectorizer(max_features= 12000)),
            ('tfidf', TfidfTransformer()),
        ])),
    ])),
    ('svm', SVC(C=0.8))
])

In [194]:
svm_classifier.fit(svm_train, svm_train_y)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('pola',
                                                 Pipeline(steps=[('polarity',
                                                                  FunctionTransformer(func=<function get_polarity at 0x00000225AD64C430>))])),
                                                ('subj',
                                                 Pipeline(steps=[('subjectivity',
                                                                  FunctionTransformer(func=<function get_subjectivity at 0x00000225AD64C040>))])),
                                                ('text',
                                                 Pipeline(steps=[('vect',
                                                                  CountVectorizer(max_features=12000)),
                                                                 ('tfidf',
                                                                  TfidfTransformer())]))])),
    

In [201]:
k = svm_classifier.predict(svm_valid)

In [202]:
compare_stats(k, svm_valid_y)

|  acc   |  rec  |  pre  |
--------------------------
| 80.58% | 66.04% | 84.0% |


(210, 40, 404, 108)

<h1>XGBoost</h1>

In [348]:
#%pip install xgboost
from xgboost import XGBClassifier

In [349]:
X_train = pd.read_csv(DATA_PATH+'train.csv', index_col= 0)
X_test = pd.read_csv(DATA_PATH+'test.csv', index_col= 0)
X_train

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
10869,,,Two giant cranes holding a bridge collapse int...,1
10870,,,@aria_ahrary @TheTawniest The out of control w...,1
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,,,Police investigating after an e-bike collided ...,1


In [434]:
# xgb = XGBClassifier(random_state=42, seed=2, colsample_bytree=0.5, subsample=0.8) #87.17% | 77.16% | 91.65% 
#xgb = XGBClassifier(random_state=42, seed=2, colsample_bytree=0.5, subsample=1) #87.8% | 76.77% | 93.69% 
#xgb = XGBClassifier(random_state=42, seed=2, colsample_bytree=0.5, subsample=1, max_depth= 10) # 91.5% | 83.22% | 96.52%
#xgb = XGBClassifier(random_state=42, seed=2, colsample_bytree=0.5, subsample=0.9, max_depth= 50) # 98.29% | 96.97% | 99.03%
xgb = XGBClassifier(random_state=42, seed=2, colsample_bytree=0.7, subsample=0.9, max_depth= 100)



In [435]:
xgb.fit(X_train_tfidf, X_train.target)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=100,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=2,
              subsample=0.9, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [442]:
pred_xgb = xgb.predict(X_train_tfidf)
stats(xgb, X_train_tfidf,X_train.target.to_list())
# SVM: | 95.82% | 91.78% | 98.39% |

|  acc   |  rec  |  pre  |
--------------------------
| 98.5% | 97.68% | 98.82% |


(3195, 38, 4304, 76)

In [483]:
docs_new = X_test.text.to_list()
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

pred_xgb_t = xgb.predict(X_new_tfidf).tolist()
solution = pd.DataFrame(data= {'id': ind, 'target': pred_xgb_t})

In [438]:
solution.to_csv('submission_GDB.csv',index= False)

<h1>Multinomial</h1>

In [439]:
from sklearn.naive_bayes import MultinomialNB
m = MultinomialNB().fit(X_train_tfidf, X_train.target)

In [443]:
pred_m = m.predict(X_train_tfidf)
stats(m, X_train_tfidf,X_train.target.to_list())
# SVM: | 95.82% | 91.78% | 98.39% |

|  acc   |  rec  |  pre  |
--------------------------
| 86.77% | 73.77% | 94.18% |


(2413, 149, 4193, 858)

In [482]:
docs_new = X_test.text.to_list()
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

pred_m_t = m.predict(X_new_tfidf).tolist()
solution = pd.DataFrame(data= {'id': ind, 'target': pred_m_t})
solution.to_csv('submission_Multinomial.csv',index= False)

<h1> Boosting </h1>

In [484]:
#df = pd.DataFrame(data= {'id': ind_t,'svm': pred_svm, 'multi': pred_m, 'xgb': pred_xgb})
df = pd.DataFrame(data= {'id': ind,'svm': pred_svm_t, 'multi': pred_m_t, 'xgb': pred_xgb_t})
print(len(pred_m))
print(len(pred_xgb))
print(len(pred_svm))

7613
7613
7613


In [485]:
df.head(3)

Unnamed: 0,id,svm,multi,xgb
0,0,1,1,1
1,2,0,0,0
2,3,1,1,1


In [487]:
df['pred'] = df.apply(lambda x: 1 if(x.svm + x.multi + x.xgb) > 1 else 0, axis= 1)
df.head()

Unnamed: 0,id,svm,multi,xgb,pred
0,0,1,1,1,1
1,2,0,0,0,0
2,3,1,1,1,1
3,9,0,1,0,0
4,11,1,1,0,1


In [475]:
compare_stats(df.pred.to_list(), ind_t)

|  acc   |  rec  |  pre  |
--------------------------
| 100.0% | 100.0% | 100.0% |


(1, 0, 0, 0)

In [488]:
# lets use boosting to predict 
solution = pd.DataFrame(data= {'id': ind, 'target': df.pred.to_list()})
solution.to_csv('submission_bosted.csv',index= False)