In [6]:
import re
import numpy as np
import pandas as pd

from utils import preprocess, plotutils
from wordsegment import load, segment
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn import metrics
from sklearn.pipeline import Pipeline

from tqdm import tqdm

In [7]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
DATA_DIR = "./data/"
TRAIN_DATA_FILE = DATA_DIR + "olid-training-v1.0.tsv"

In [9]:
ori_train_data = pd.read_csv(TRAIN_DATA_FILE, sep='\t')

In [56]:
%%time

tweet_doc = preprocess.spacy_pipeline(ori_train_data['tweet'].to_list())
tweet_normalized_wo_stop = preprocess.spacy_normalize(tweet_doc, stop_removal=True)
#tweet_normalized_with_stop = preprocess.spacy_normalize(tweet_doc, stop_removal=False)


Wall time: 1min 21s


In [68]:

#tweets = preprocess.join_as_sentence(tweet_normalized_with_stop)
#tweet_normalized_with_stop = preprocess.remove_user_mask(tweet_normalized_with_stop)
tweet_normalized_wo_stop = preprocess.remove_user_mask(tweet_normalized_wo_stop)
#tweets_normalized_wo_stop_processed = preprocess.convert_abbrev_in_text(tweet_normalized_wo_stop)
tweet_nor_url_wo_abbre_hash = preprocess.remove_url(tweet_normalized_wo_stop)

tweet_nor_wo_abbre_hash = preprocess.remove_hashtag(tweet_nor_url_wo_abbre_hash)

tweets_wo_stop = preprocess.join_as_sentence(tweet_nor_wo_abbre_hash)
#tweets_with_stop = preprocess.join_as_sentence(tweet_normalized_with_stop)
#tweets_word_stop_processed = preprocess.join_as_sentence(tweets_normalized_wo_stop_processed)
#tweets_word_stop_processed = preprocess.process_tweets(tweets_wo_stop)


In [59]:
print(tweets_wo_stop)



# Subtask A

In [64]:
label2id = {'NOT': 0, 'OFF': 1}
sub_a_label = ori_train_data['subtask_a'].map(label2id).to_list()

sentences_train, sentences_test, labels_train, labels_test = train_test_split(tweets_wo_stop, sub_a_label, test_size=0.2, random_state=5246)

y_train = np.asarray(labels_train)
y_test = np.asarray(labels_test)

print("Size of training set: {}".format(len(sentences_train)))
print("Size of test set: {}".format(len(sentences_test)))

Size of training set: 10592
Size of test set: 2648


In [65]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('naive_bayes', ComplementNB()),
])

In [66]:
parameters = {
    'tfidf__max_df': (0.1, 0.2, 0.3, 0.4, 0.5),
    'tfidf__max_features': (5000, 10000, 15000, 20000),
    'tfidf__ngram_range': ((1, 1), (1, 2), (1, 3), (1, 4), (1, 5)),
}

In [10]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='f1_macro')
grid_search = grid_search.fit(sentences_train, labels_train)

Wall time: 14.7 s


In [11]:
print(grid_search.best_params_)

{'tfidf__max_df': 0.2, 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 5)}


In [67]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 5), max_features=5000, max_df=0.2)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = ComplementNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['NOT', 'OFF']))

              precision    recall  f1-score   support

         NOT       0.80      0.75      0.77      1750
         OFF       0.56      0.64      0.60       898

    accuracy                           0.71      2648
   macro avg       0.68      0.69      0.69      2648
weighted avg       0.72      0.71      0.71      2648



# Subtask B

In [17]:
sub_b_idx = list(ori_train_data[ori_train_data['subtask_b'].notna()].index)

In [18]:
tweets = [t for i, t in enumerate(tweets) if i in sub_b_idx]

In [19]:
label2id = {'UNT': 0, 'TIN': 1}

In [20]:
sub_b_label = ori_train_data.loc[sub_b_idx]['subtask_b'].map(label2id).to_list()

In [21]:
sentences_train, sentences_test, labels_train, labels_test = train_test_split(tweets, sub_b_label, test_size=0.2, random_state=5246)

y_train = np.asarray(labels_train)
y_test = np.asarray(labels_test)

print("Size of training set: {}".format(len(sentences_train)))
print("Size of test set: {}".format(len(sentences_test)))

Size of training set: 3520
Size of test set: 880


In [22]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('naive_bayes', ComplementNB()),
])

In [23]:
parameters = {
    'tfidf__max_df': (0.1, 0.2, 0.3, 0.4, 0.5),
    'tfidf__max_features': (5000, 10000, 15000, 20000),
    'tfidf__ngram_range': ((1, 1), (1, 2), (1, 3), (1, 4), (1, 5)),
}

In [25]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='f1_macro')
grid_search = grid_search.fit(sentences_train, labels_train)

Wall time: 8.45 s


In [26]:
print(grid_search.best_params_)

{'tfidf__max_df': 0.3, 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 5)}


In [27]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 5), max_features=5000, max_df=0.3)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = ComplementNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['UNT', 'TIN']))

              precision    recall  f1-score   support

         UNT       0.33      0.14      0.19       117
         TIN       0.88      0.96      0.92       763

    accuracy                           0.85       880
   macro avg       0.61      0.55      0.56       880
weighted avg       0.81      0.85      0.82       880



# Subtask C

In [None]:
sub_c_idx = list(ori_train_data[ori_train_data['subtask_c'].notna()].index)

In [None]:
tweets = [t for i, t in enumerate(tweets_with_stop) if i in sub_c_idx]

In [None]:
label2id = {'IND': 0, 'GRP': 1, 'OTH': 2}

In [None]:
sub_c_label = ori_train_data.loc[sub_c_idx]['subtask_c'].map(label2id).to_list()

In [None]:
sentences_train, sentences_test, labels_train, labels_test = train_test_split(tweets, sub_c_label, test_size=0.2, random_state=5246)

y_train = np.asarray(labels_train)
y_test = np.asarray(labels_test)

print("Size of training set: {}".format(len(sentences_train)))
print("Size of test set: {}".format(len(sentences_test)))

In [None]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('naive_bayes', ComplementNB()),
])

In [None]:
parameters = {
    'tfidf__max_df': (0.1, 0.2, 0.3, 0.4, 0.5),
    'tfidf__max_features': (5000, 10000, 15000, 20000),
    'tfidf__ngram_range': ((1, 1), (1, 2), (1, 3), (1, 4), (1, 5)),
}

In [None]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='f1_macro')
grid_search = grid_search.fit(sentences_train, labels_train)


In [None]:
print(grid_search.best_params_)

In [None]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 5), max_features=5000, max_df=0.2)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = ComplementNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['IND', 'GRP', 'OTH']))