In [23]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline

In [24]:
# Read the training set and the separate test set from the working directory.
data = pd.read_csv(filepath_or_buffer='train.csv')
test_data = pd.read_csv(filepath_or_buffer='test.csv')

In [25]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical across kernel restarts.
train_data, validation_data = train_test_split(
    data,
    test_size=0.2,
    random_state=0,
)

In [26]:
# Share of each class in the training split (fractions, not raw counts).
train_data.loc[:, 'target'].value_counts(normalize=True)

target
0    0.567488
1    0.432512
Name: proportion, dtype: float64

In [27]:
# Eyeball ten random training rows. The seed pins which rows are drawn so the
# displayed table is the same on every Restart & Run All.
train_data.sample(10, random_state=0)

Unnamed: 0,id,keyword,location,text,target
3061,4392,earthquake,Global Edition,#earthquake (EMSC): MD 2.9 OFF COAST OF NORTHE...,1
7426,10623,wounded,,Police Officer Wounded Suspect Dead After Exch...,1
2669,3831,detonate,,@WoundedPigeon http://t.co/s9soAeVcVo Detonate...,0
707,1021,blazing,New York,Morgan Silver Dollar 1921 P CH Gem Bu PL Blazi...,0
5010,7146,mudslide,"Crouch End, London",Stu Dorret's mudslide rubber tyre cake may hav...,0
5212,7444,obliterated,,I think I'll get obliterated tonight,0
1579,2280,cliff%20fall,,Currently want to drive my car off a cliff and...,0
3633,5182,fatalities,"Hope Road, Jamaica",'Use our roads wisely and prevent the carnage ...,0
7128,10210,volcano,USA,Japan Aogashima Volcano. By Unknown - Check It...,1
7484,10706,wreck,Baltimore,@girlthatsrio have my uncles wreck their shit,0


In [28]:
# TF-IDF features feed a stacked ensemble of three base classifiers.
# No final_estimator is passed, so StackingClassifier falls back to its
# default meta-learner; cv=6 sets the number of folds used for stacking.
pipeline = make_pipeline(
    TfidfVectorizer(),
    StackingClassifier(
        estimators=[
            # Seeded so the fitted trees are repeatable across kernel restarts.
            (
                'gradient_boosting',
                GradientBoostingClassifier(n_estimators=18, max_depth=8, random_state=0),
            ),
            ('naive_bayes', MultinomialNB()),
            # Seeded because this learner shuffles the training data by default.
            ('passive aggressive', PassiveAggressiveClassifier(random_state=0)),
        ],
        cv=6,
    ),
)

# Fit on the raw tweet text; the vectorizer inside the pipeline handles
# feature extraction. Left as the last expression so the cell displays the
# fitted pipeline.
pipeline.fit(train_data['text'], train_data['target'])

In [29]:
# F1 on the data the pipeline was fitted on (an optimistic upper bound).
train_predictions = pipeline.predict(train_data["text"])
train_f1 = f1_score(train_data["target"], train_predictions)
print(f'Train f1 score: {train_f1}')

Train f1 score: 0.9329896907216495


In [30]:
# F1 on the held-out validation split. The label says "Validation" because
# this is not the separate `test_data` set loaded from test.csv.
validation_predictions = pipeline.predict(validation_data["text"])
validation_f1 = f1_score(validation_data["target"], validation_predictions)
print(f'Validation f1 score: {validation_f1}')

Test f1 score: 0.7461669505962522
