In [57]:
import re
import numpy as np
import pandas as pd

from utils import preprocess, plotutils

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn import metrics
from sklearn.pipeline import Pipeline

from tqdm import tqdm

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
DATA_DIR = "./data/"
TRAIN_DATA_FILE = DATA_DIR + "olid-training-v1.0.tsv"

# Prepare data

In [4]:
ori_train_data = pd.read_csv(TRAIN_DATA_FILE, sep='\t')
label2id = {'NOT': 0, 'OFF': 1}

In [5]:
sub_a_label = ori_train_data['subtask_a'].map(label2id).to_list()

In [6]:
%%time

tweet_doc = preprocess.spacy_pipeline(ori_train_data['tweet'].to_list())
tweet_normalized_wo_stop = preprocess.spacy_normalize(tweet_doc, stop_removal=True)
tweet_normalized_with_stop = preprocess.spacy_normalize(tweet_doc, stop_removal=False)

CPU times: total: 1min 22s
Wall time: 1min 23s


Remove the "@user" mask.

In [7]:
tweet_normalized_with_stop = preprocess.remove_user_mask(tweet_normalized_with_stop)
tweet_normalized_wo_stop = preprocess.remove_user_mask(tweet_normalized_wo_stop)

In [35]:
tweets = preprocess.join_as_sentence(tweet_normalized_wo_stop)
#tweets = preprocess.join_as_sentence(tweet_normalized_with_stop)

# Naive Bayes

In [43]:
sentences_train, sentences_test, labels_train, labels_test = train_test_split(tweets, sub_a_label, test_size=0.2, random_state=5246)

y_train = np.asarray(labels_train)
y_test = np.asarray(labels_test)

print("Size of training set: {}".format(len(sentences_train)))
print("Size of test set: {}".format(len(sentences_test)))

Size of training set: 10592
Size of test set: 2648


In [44]:
# Create Document-Term Matrix for differen n-gram sizes
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=10000)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

In [45]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

In [49]:
parameters = {
    'tfidf__max_df': (0.1, 0.2, 0.3),
    'tfidf__max_features': (5000, 10000, 20000),
    'tfidf__ngram_range': ((1, 3), (1, 4), (1, 5)),
    'nb__alpha': (0.01, 0.02, 0.03)
}

In [50]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='recall')
grid_search = grid_search.fit(sentences_train, labels_train)

CPU times: total: 2.84 s
Wall time: 1min 10s


In [51]:
print(grid_search.best_params_)

{'nb__alpha': 0.01, 'tfidf__max_df': 0.1, 'tfidf__max_features': 20000, 'tfidf__ngram_range': (1, 3)}


Below are best params:
```
{'nb__alpha': 0.01, 'tfidf__max_df': 0.1, 'tfidf__max_features': 20000, 'tfidf__ngram_range': (1, 3)}
```

In [55]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=10000, max_df=0.1)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = MultinomialNB(alpha=0.01).fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['NOT', 'OFF']))

              precision    recall  f1-score   support

         NOT       0.74      0.87      0.80      1750
         OFF       0.63      0.42      0.50       898

    accuracy                           0.72      2648
   macro avg       0.69      0.64      0.65      2648
weighted avg       0.70      0.72      0.70      2648



# LogReg

In [77]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression()),
])

In [78]:
parameters = {
    'tfidf__max_df': (0.3, 0.4, 0.5),
    'tfidf__max_features': (3000, 5000, 10000),
    'tfidf__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'logreg__C': (0.1, 1.0, 10.0),
}

In [82]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf)
grid_search = grid_search.fit(sentences_train, labels_train)

CPU times: total: 2.77 s
Wall time: 48.8 s


In [83]:
print(grid_search.best_params_)

{'logreg__C': 1.0, 'tfidf__max_df': 0.3, 'tfidf__max_features': 3000, 'tfidf__ngram_range': (1, 1)}


Below are best params:
```
{'logreg__C': 1.0, 'tfidf__max_df': 0.3, 'tfidf__max_features': 3000, 'tfidf__ngram_range': (1, 1)}
```

In [84]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=3000, max_df=0.3)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = LogisticRegression(C=1.0).fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['NOT', 'OFF']))

              precision    recall  f1-score   support

         NOT       0.76      0.94      0.84      1750
         OFF       0.78      0.41      0.54       898

    accuracy                           0.76      2648
   macro avg       0.77      0.68      0.69      2648
weighted avg       0.76      0.76      0.74      2648

