In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.svm import LinearSVC, SVC, NuSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
import random

In [8]:
rand_state = random.seed(12)

train = pd.read_csv('sample_data/train.csv')
test = pd.read_csv('sample_data/test.csv')

train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [9]:
X = train['text']

y = train['target']

test_x = test['text']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = rand_state, shuffle = True)

In [11]:
X_train

Unnamed: 0,text
2740,devastated by today's allegations.
7511,Wreckage 'Conclusively Confirmed' as From MH37...
6473,Hasn't quite sunk in that I saw Johnny Marr an...
2189,Malaysia confirms plane debris washed up on Re...
305,#PBBan (Temporary:300) Russaky89 @'aRmageddon ...
...,...
1909,Disillusioned lead character \nCheck\nHappy go...
1464,#Borrowers concerned at possible #interest rat...
1646,I just collapsed in my bed ugh I'm exhausted
722,@DarrylB1979 yea heard about that..not coming ...


In [20]:
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)
count_train_sub = count_vectorizer.transform(X)
count_sub = count_vectorizer.transform(test_x)
count_nb = MultinomialNB()
count_nb.fit(count_train ,y_train)
count_nb_pred = count_nb.predict(count_test)
count_nb_score = accuracy_score(y_test,count_nb_pred)
print('MNB c_s.: ', count_nb_score)
count_nb_cm = confusion_matrix(y_test, count_nb_pred)
count_nb_cm

MNB c_s.:  0.7925420168067226


array([[925, 158],
       [237, 584]])

In [22]:
count_lsvc = LinearSVC()
count_lsvc.fit(count_train ,y_train)
count_lsvc_pred = count_lsvc.predict(count_test)
count_lsvc_score = accuracy_score(y_test,count_lsvc_pred)
print('LinearSVC c_s.: ', count_lsvc_score)
count_lsvc_cm = confusion_matrix(y_test, count_lsvc_pred)
count_lsvc_cm

LinearSVC c_s.:  0.7836134453781513


array([[911, 172],
       [240, 581]])

In [23]:
count_svc = SVC()
count_svc.fit(count_train ,y_train)
count_svc_pred = count_svc.predict(count_test)
count_svc_score = accuracy_score(y_test,count_svc_pred)
print('SVC c_s.: ', count_svc_score)
count_svc_cm = confusion_matrix(y_test, count_svc_pred)
count_svc_cm

SVC c_s.:  0.7930672268907563


array([[996,  87],
       [307, 514]])

In [27]:
count_nusvc = NuSVC()
count_nusvc.fit(count_train ,y_train)
count_nusvc_pred = count_nusvc.predict(count_test)
count_nusvc_score = accuracy_score(y_test,count_nusvc_pred)
print('NuSVC c_s.: ', count_nusvc_score)
count_nusvc_cm = confusion_matrix(y_test, count_nusvc_pred)
count_nusvc_cm

NuSVC c_s.:  0.7956932773109243


array([[984,  99],
       [290, 531]])

In [30]:
count_lr = LogisticRegression()
count_lr.fit(count_train ,y_train)
count_lr_pred = count_lr.predict(count_test)
count_lr_score = accuracy_score(y_test,count_lr_pred)
print('LogReg c_s.: ', count_lr_score)
count_lr_cm = confusion_matrix(y_test, count_lr_pred)
count_lr_cm

LogReg c_s.:  0.7998949579831933


array([[945, 138],
       [243, 578]])

In [31]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)
tfidf_train_sub = tfidf_vectorizer.transform(X)
tfidf_sub = tfidf_vectorizer.transform(test_x)

tfidf_nb = MultinomialNB()
tfidf_nb.fit(tfidf_train, y_train)
tfidf_nb_pred = tfidf_nb.predict(tfidf_test)
tfidf_nb_score = accuracy_score(y_test,tfidf_nb_pred)
print('MNB tfidf_s.: ', tfidf_nb_score)
tfidf_nb_cm = confusion_matrix(y_test, tfidf_nb_pred)
tfidf_nb_cm

MNB tfidf_s.:  0.7977941176470589


array([[986,  97],
       [288, 533]])

In [34]:
tfidf_svc = LinearSVC()
tfidf_svc.fit(tfidf_train, y_train)
tfidf_svc_pred = tfidf_svc.predict(tfidf_test)
tfidf_svc_score = accuracy_score(y_test,tfidf_svc_pred)
print("LinSVC s.:   %0.8f" % tfidf_svc_score)
svc_cm = confusion_matrix(y_test, tfidf_svc_pred)
svc_cm

LinSVC s.:   0.78518908


array([[894, 189],
       [220, 601]])

In [35]:
tfidf_svc0 = SVC()
tfidf_svc0.fit(tfidf_train, y_train)
tfidf_svc_pred0 = tfidf_svc.predict(tfidf_test)
tfidf_svc_score0 = accuracy_score(y_test,tfidf_svc_pred0)
print("SVC s.:   %0.8f" % tfidf_svc_score0)
svc_cm0 = confusion_matrix(y_test, tfidf_svc_pred0)
classification_report(y_test, tfidf_svc_pred0)
svc_cm0

SVC s.:   0.78518908


array([[894, 189],
       [220, 601]])

In [36]:
tfidf_nusvc = NuSVC()
tfidf_nusvc.fit(tfidf_train, y_train)
tfidf_nusvc_pred = tfidf_nusvc.predict(tfidf_test)
tfidf_nusvc_score = accuracy_score(y_test,tfidf_nusvc_pred)
print("NuSVC s.:   %0.8f" % tfidf_nusvc_score)
nusvc_cm = confusion_matrix(y_test, tfidf_nusvc_pred)
classification_report(y_test, tfidf_nusvc_pred)
nusvc_cm

NuSVC s.:   0.78991597


array([[946, 137],
       [263, 558]])

In [39]:
tfidf_lr = LogisticRegression()
tfidf_lr.fit(tfidf_train, y_train)
tfidf_lr_pred = tfidf_lr.predict(tfidf_test)
tfidf_lr_score = accuracy_score(y_test,tfidf_lr_pred)
print("LogReg s.:   %0.8f" % tfidf_lr_score)
lr_cm = confusion_matrix(y_test, tfidf_lr_pred)
lr_cm

LogReg s.:   0.78623950


array([[948, 135],
       [272, 549]])

In [40]:
sample_sub=pd.read_csv('sample_data/sample_submission.csv')
sample_sub

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


Были опробованы методы обучения NuSCV, MNB, LogisticalRegression и SCV, лучший результат выдал NuSCV.

In [45]:
count_nusvc.fit(count_train_sub ,y)
count_nusvc_sub = count_lr.predict(count_sub)
sub=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':count_nusvc_sub})
sub.to_csv('submission.csv',index=False)