Natural Language Processing with Disaster Tweets: https://www.kaggle.com/c/nlp-getting-started

In [100]:
import warnings
# Suppress every warning for the rest of the session; note that this also hides
# deprecation and data-quality notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [101]:
from collections import Counter

import numpy as mp
import pandas as pd

In [102]:
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import train_test_split

In [103]:
import ssl

# Best-effort workaround: when this Python build exposes the private
# unverified-context factory, install it as the default HTTPS context factory
# (presumably so the nltk downloads in the next cell work without certificates).
# NOTE(review): this affects every default HTTPS context created in the process.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

In [104]:
import nltk
# Fetch the nltk resources used below: 'punkt' for word_tokenize and
# 'stopwords' for the English stop-word list.
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize

from pymorphy2 import MorphAnalyzer
# Single shared analyzer instance, used by pos_count and tfidf_tokenize.
# NOTE(review): pymorphy2 is documented as a Russian/Ukrainian morphological
# analyzer — confirm it is the intended tool for these English tweets.
morph = MorphAnalyzer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gleb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gleb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading data

In [105]:
# Training data: the tweet text is the model input and `target` is the label.
df_train = pd.read_csv('csv/train.csv') # TODO: try with location
texts_train = df_train['text']
target_train = df_train['target']

In [131]:
# Test tweets to predict on, plus the sample submission frame that
# make_submission later fills with predictions and writes back out.
df_test = pd.read_csv('csv/test.csv')
sample_submission = pd.read_csv('csv/sample_submission.csv')
texts_test = df_test['text']

In [107]:
# Preview the first five test tweets.
texts_test.head(5)

0                   Just happened a terrible car crash
1    Heard about #earthquake is different cities, s...
2    there is a forest fire at spot pond, geese are...
3             Apocalypse lighting. #Spokane #wildfires
4        Typhoon Soudelor kills 28 in China and Taiwan
Name: text, dtype: object

# Feature building

### POS features

In [108]:
def pos_count(text):
    """Count part-of-speech tags over the tokens of ``text``.

    The returned dict starts with a zero entry for every tag in
    ``morph.TagClass.PARTS_OF_SPEECH``. For non-empty text, each token from
    ``word_tokenize`` contributes the ``tag.POS`` of its first ``morph.parse``
    result; whatever value that is (including one outside the predefined tag
    set) becomes or increments a key.

    NOTE(review): the merged feature preview in this notebook shows zeros in
    every visible POS column — confirm the analyzer tags these tweets.
    """
    counts = dict.fromkeys(morph.TagClass.PARTS_OF_SPEECH, 0)
    if not text:
        return counts

    token_tags = [morph.parse(token)[0].tag.POS for token in word_tokenize(text)]
    counts.update(Counter(token_tags))
    return counts

In [109]:
# One row of POS counts per training tweet, in the same order as texts_train.
pos_train = pd.DataFrame(list(map(pos_count, texts_train)))

In [110]:
# One row of POS counts per test tweet, in the same order as texts_test.
pos_test = pd.DataFrame(list(map(pos_count, texts_test)))

### TF-IDF

In [111]:
# Show the English stop-word list on a single space-separated line.
print(*nltk.corpus.stopwords.words('english'), sep=' ')

i me my myself we our ours ourselves you you're you've you'll you'd your yours yourself yourselves he him his himself she she's her hers herself it it's its itself they them their theirs themselves what which who whom this that that'll these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don don't should should've now d ll m o re ve y ain aren aren't couldn couldn't didn didn't doesn doesn't hadn hadn't hasn hasn't haven haven't isn isn't ma mightn mightn't mustn mustn't needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't won won't wouldn wouldn't


In [112]:
def tfidf_tokenize(text):
    """Tokenize ``text`` and return the list of normalized tokens.

    Tokens come from ``word_tokenize``; each is replaced by the
    ``normal_form`` of its first ``morph.parse`` result.
    """
    return [morph.parse(token)[0].normal_form for token in word_tokenize(text)]

In [113]:
# TF-IDF vectorizer over unigrams and bigrams, using the custom tokenizer above.
# NOTE(review): the stop-word list contains contractions such as "you're";
# confirm they match what tfidf_tokenize actually emits, otherwise those
# entries never filter anything.
tfidf_vect = TfidfVectorizer(
    tokenizer=tfidf_tokenize,
    max_df=0.95,  # drop terms whose document frequency exceeds 95%
    min_df=3,  # drop terms that occur in fewer than 3 documents
    ngram_range=(1, 2),  # unigrams and bigrams
    max_features=100_000,  # upper bound on vocabulary size
    stop_words=nltk.corpus.stopwords.words('english'),
)

In [114]:
# Fit the vectorizer on the training tweets only and keep the result as a
# sparse-backed DataFrame (columns are labelled by integer position).
tfidf_train = pd.DataFrame.sparse.from_spmatrix(
    tfidf_vect.fit_transform(texts_train)
)

In [115]:
# Reuse the already-fitted vectorizer (transform only, no refit) so the test
# columns line up with the training columns.
tfidf_test = pd.DataFrame.sparse.from_spmatrix(
    tfidf_vect.transform(texts_test)
)

### Merging features

In [116]:
# Place the POS counts and the TF-IDF columns side by side, one row per training tweet.
features_train = pd.concat([pos_train, tfidf_train], axis=1)
# The POS columns carry tag-name labels while the TF-IDF columns carry integer
# positions. scikit-learn only accepts DataFrame feature names when they are all
# strings, so normalise every label to str before the frame reaches an estimator.
features_train.columns = features_train.columns.astype(str)
features_train.head(5)

Unnamed: 0,NOUN,ADJS,NPRO,PRED,INTJ,PRCL,PREP,GRND,NUMR,VERB,...,8283,8284,8285,8286,8287,8288,8289,8290,8291,8292
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [117]:
# Place the POS counts and the TF-IDF columns side by side, one row per test tweet.
features_test = pd.concat([pos_test, tfidf_test], axis=1)
# The POS columns carry tag-name labels while the TF-IDF columns carry integer
# positions. scikit-learn only accepts DataFrame feature names when they are all
# strings, so normalise every label to str before the frame reaches an estimator.
features_test.columns = features_test.columns.astype(str)
features_test.head(5)

Unnamed: 0,NOUN,ADJS,NPRO,PRED,INTJ,PRCL,PREP,GRND,NUMR,VERB,...,8283,8284,8285,8286,8287,8288,8289,8290,8291,8292
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Splitting

In [118]:
# Stratified 90/10 hold-out split over the row labels of the feature frame.
# A fixed random_state makes the split, and therefore every score reported
# below, reproducible across kernel restarts.
train_indices, valid_indices = train_test_split(
    features_train.index,
    test_size=0.1,
    stratify=target_train,
    random_state=42,
)
len(train_indices), len(valid_indices)

(6851, 762)

In [119]:
# Carve out the validation rows first, then narrow the training names to the
# training rows. features_train / target_train are rebound to their subsets
# here, so this cell is meant to run once per kernel session: after it has run,
# the validation labels are no longer present in features_train.
features_valid = features_train.loc[valid_indices]
features_train = features_train.loc[train_indices]
target_valid = target_train[valid_indices]
target_train = target_train[train_indices]

# Utils

In [120]:
def precision_macro(y_true, y_pred):
    """Return macro-averaged precision of ``y_pred`` against ``y_true``."""
    return precision_score(y_true=y_true, y_pred=y_pred, average='macro')


def recall_macro(y_true, y_pred):
    """Return macro-averaged recall of ``y_pred`` against ``y_true``."""
    return recall_score(y_true=y_true, y_pred=y_pred, average='macro')


def f1_macro(y_true, y_pred):
    """Return macro-averaged F1 of ``y_pred`` against ``y_true``."""
    return f1_score(y_true=y_true, y_pred=y_pred, average='macro')


def precision_micro(y_true, y_pred):
    """Return micro-averaged precision of ``y_pred`` against ``y_true``."""
    return precision_score(y_true=y_true, y_pred=y_pred, average='micro')


def recall_micro(y_true, y_pred):
    """Return micro-averaged recall of ``y_pred`` against ``y_true``."""
    return recall_score(y_true=y_true, y_pred=y_pred, average='micro')


def f1_micro(y_true, y_pred):
    """Return micro-averaged F1 of ``y_pred`` against ``y_true``."""
    return f1_score(y_true=y_true, y_pred=y_pred, average='micro')


def get_scores(y_true, y_pred):
    """Evaluate every tracked metric and return ``{display name: score}``.

    Entries are kept in display order. Some names start with a newline so that
    printing them one per line leaves a blank line between metric groups.
    """
    named_scorers = (
        ('Accuracy', accuracy_score),

        ('\nPrecision (macro)', precision_macro),
        ('Recall (macro)', recall_macro),
        ('F1 (macro)', f1_macro),

        ('\nPrecision (micro)', precision_micro),
        ('Recall (micro)', recall_micro),
        ('F1 (micro)', f1_micro),
    )
    scores = {}
    for label, scorer in named_scorers:
        scores[label] = scorer(y_true, y_pred)
    return scores

def score_model(model, X, y):
    """Print each metric from ``get_scores`` for ``model``'s predictions on ``X``.

    ``y`` holds the true labels; every score is printed on its own line with
    four decimal places.
    """
    predictions = model.predict(X)
    report = get_scores(y, predictions)
    for label in report:
        print(f'{label}: {report[label]:.4f}')


# Wrap f1_micro with make_scorer so it can be passed wherever scikit-learn
# expects a scorer object; it is not used elsewhere in the visible cells.
f1_micro_scorer = make_scorer(f1_micro)

In [132]:
def make_submission(model, X, path='csv/submission.csv'):
    """Write ``model``'s predictions for ``X`` to a submission CSV.

    The predictions are stored in the ``target`` column of the notebook-level
    ``sample_submission`` frame, which is updated in place, and that frame is
    then written to ``path`` without its index.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X
        Features passed straight to ``model.predict``; there must be one
        prediction per row of ``sample_submission``.
    path
        Destination CSV file. Defaults to ``'csv/submission.csv'``, the
        location that used to be hard-coded, so existing calls are unchanged.
    """
    sample_submission['target'] = model.predict(X)
    sample_submission.to_csv(path, index=False)

# Model

In [122]:
# Preview the training labels after the split; the index shows the original row labels.
target_train.head(5)

5525    1
5398    0
5723    1
6473    0
4692    1
Name: target, dtype: int64

In [123]:
# Ridge classifier with all-default hyperparameters, fitted on the training split.
ridge_model = RidgeClassifier().fit(X=features_train, y=target_train)

In [124]:
# Scores on the data the model was fitted on (an optimistic reference point).
score_model(ridge_model, features_train, target_train)

Accuracy: 0.9235

Precision (macro): 0.9303
Recall (macro): 0.9154
F1 (macro): 0.9208

Precision (micro): 0.9235
Recall (micro): 0.9235
F1 (micro): 0.9235


In [125]:
# Scores on the held-out validation split; compare with the training scores above.
score_model(ridge_model, features_valid, target_valid)

Accuracy: 0.8110

Precision (macro): 0.8109
Recall (macro): 0.8011
F1 (macro): 0.8044

Precision (micro): 0.8110
Recall (micro): 0.8110
F1 (micro): 0.8110


In [133]:
# Predict on the test features and write the submission CSV.
make_submission(ridge_model, features_test)