In [2]:
import re
import numpy as np
import pandas as pd

from utils import preprocess, plotutils

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn import metrics
from sklearn.pipeline import Pipeline

from tqdm import tqdm

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
DATA_DIR = "./data/"
TRAIN_DATA_FILE = DATA_DIR + "olid-training-v1.0.tsv"

# Prepare data

In [5]:
ori_train_data = pd.read_csv(TRAIN_DATA_FILE, sep='\t')

In [6]:
%%time

tweet_doc = preprocess.spacy_pipeline(ori_train_data['tweet'].to_list())
tweet_normalized_wo_stop = preprocess.spacy_normalize(tweet_doc, stop_removal=True)
tweet_normalized_with_stop = preprocess.spacy_normalize(tweet_doc, stop_removal=False)

CPU times: user 1min 1s, sys: 426 ms, total: 1min 2s
Wall time: 1min 2s


Remove the "@user" mask.

In [7]:
tweet_normalized_with_stop = preprocess.remove_user_mask(tweet_normalized_with_stop)
tweet_normalized_wo_stop = preprocess.remove_user_mask(tweet_normalized_wo_stop)

In [8]:
tweets_wo_stop = preprocess.join_as_sentence(tweet_normalized_wo_stop)
tweets_with_stop = preprocess.join_as_sentence(tweet_normalized_with_stop)

# LogReg

## sub a

In [9]:
label2id = {'NOT': 0, 'OFF': 1}
sub_a_label = ori_train_data['subtask_a'].map(label2id).to_list()

In [10]:
sentences_train, sentences_test, labels_train, labels_test = train_test_split(tweets_wo_stop, sub_a_label, test_size=0.2, random_state=5246)

y_train = np.asarray(labels_train)
y_test = np.asarray(labels_test)

print("Size of training set: {}".format(len(sentences_train)))
print("Size of test set: {}".format(len(sentences_test)))

Size of training set: 10592
Size of test set: 2648


In [11]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression(solver='liblinear')),
])

In [12]:
parameters = {
    'tfidf__max_df': (0.1, 0.2, 0.3),
    'tfidf__max_features': (10000, 15000),
    'tfidf__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'logreg__C': (1.0, 10.0, 100.0)
}

In [12]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='f1_macro')
grid_search = grid_search.fit(sentences_train, labels_train)

CPU times: user 1.49 s, sys: 241 ms, total: 1.74 s
Wall time: 9.64 s


In [13]:
print(grid_search.best_params_)

{'logreg__C': 10.0, 'tfidf__max_df': 0.2, 'tfidf__max_features': 15000, 'tfidf__ngram_range': (1, 1)}


Below are best params:
```
{'logreg__C': 10.0, 'tfidf__max_df': 0.2, 'tfidf__max_features': 15000, 'tfidf__ngram_range': (1, 1)}
```

In [14]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=15000, max_df=0.2)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = LogisticRegression(solver='liblinear', C=10.0).fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['NOT', 'OFF']))

              precision    recall  f1-score   support

         NOT       0.79      0.87      0.83      1750
         OFF       0.68      0.54      0.60       898

    accuracy                           0.76      2648
   macro avg       0.74      0.71      0.72      2648
weighted avg       0.75      0.76      0.75      2648



### sub a_SVM

In [13]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('SVM', SVC(kernel='rbf')),
])

parameters = {
    'tfidf__max_df': (0.1, 0.2, 0.3),
    'tfidf__max_features': (10000, 15000),
    'tfidf__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'SVM__C': (0.6, 0.8, 1.0)
}

In [14]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='f1_macro')
grid_search = grid_search.fit(sentences_train, labels_train)

print(grid_search.best_params_)

{'SVM__C': 1.0, 'tfidf__max_df': 0.2, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 1)}
CPU times: user 11.1 s, sys: 608 ms, total: 11.7 s
Wall time: 3min 17s


In [17]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=10000, max_df=0.2)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = SVC(C = 1.0, kernel = "rbf").fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['NOT', 'OFF']))

              precision    recall  f1-score   support

         NOT       0.75      0.95      0.84      1750
         OFF       0.81      0.38      0.52       898

    accuracy                           0.76      2648
   macro avg       0.78      0.67      0.68      2648
weighted avg       0.77      0.76      0.73      2648



### sub a_Random Forest

In [12]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('RandomForest', RandomForestClassifier(criterion='gini', n_jobs=-1, class_weight="balanced")),
])

parameters = {
    'tfidf__max_df': (0.1, 0.2, 0.3),
    'tfidf__max_features': (10000, 15000),
    'tfidf__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'RandomForest__n_estimators': (80, 100, 150, 200)
}

In [13]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='f1_macro')
grid_search = grid_search.fit(sentences_train, labels_train)

print(grid_search.best_params_)

{'RandomForest__n_estimators': 200, 'tfidf__max_df': 0.1, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 3)}
CPU times: user 13.3 s, sys: 357 ms, total: 13.7 s
Wall time: 4min 34s


In [12]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000, max_df=0.1)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = RandomForestClassifier(n_estimators=150, criterion='gini').fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['NOT', 'OFF']))

              precision    recall  f1-score   support

         NOT       0.78      0.91      0.84      1750
         OFF       0.74      0.49      0.59       898

    accuracy                           0.77      2648
   macro avg       0.76      0.70      0.72      2648
weighted avg       0.77      0.77      0.76      2648



In [15]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=10000, max_df=0.1)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = RandomForestClassifier(n_estimators=200, criterion='gini', n_jobs=-1, class_weight="balanced").fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['NOT', 'OFF']))

              precision    recall  f1-score   support

         NOT       0.78      0.92      0.84      1750
         OFF       0.75      0.48      0.59       898

    accuracy                           0.77      2648
   macro avg       0.76      0.70      0.71      2648
weighted avg       0.77      0.77      0.75      2648



## sub b

In [29]:
sub_b_idx = list(ori_train_data[ori_train_data['subtask_b'].notna()].index)
tweets = [t for i, t in enumerate(tweets_with_stop) if i in sub_b_idx]
label2id = {'UNT': 0, 'TIN': 1}
sub_b_label = ori_train_data.loc[sub_b_idx]['subtask_b'].map(label2id).to_list()

In [30]:
sentences_train, sentences_test, labels_train, labels_test = train_test_split(tweets, sub_b_label, test_size=0.2, random_state=5246)

y_train = np.asarray(labels_train)
y_test = np.asarray(labels_test)

print("Size of training set: {}".format(len(sentences_train)))
print("Size of test set: {}".format(len(sentences_test)))

Size of training set: 3520
Size of test set: 880


In [31]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression(solver='liblinear')),
])

In [32]:
parameters = {
    'tfidf__max_df': (0.1, 0.2, 0.3),
    'tfidf__max_features': (10000, 15000, 20000),
    'tfidf__ngram_range': ((1, 2), (1, 3), (1, 4)),
    'logreg__C': (1.0, 10.0, 100.0)
}

In [33]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='f1_macro')
grid_search = grid_search.fit(sentences_train, labels_train)
print(grid_search.best_params_)

{'logreg__C': 100.0, 'tfidf__max_df': 0.3, 'tfidf__max_features': 15000, 'tfidf__ngram_range': (1, 2)}
CPU times: user 1.33 s, sys: 98.4 ms, total: 1.43 s
Wall time: 13.2 s


Below are best params:
```
{'logreg__C': 100.0, 'tfidf__max_df': 0.2, 'tfidf__max_features': 15000, 'tfidf__ngram_range': (1, 3)}
```

In [34]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=15000, max_df=0.2)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = LogisticRegression(solver='liblinear', C=100.0).fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['UNT', 'TIN']))

              precision    recall  f1-score   support

         UNT       0.37      0.12      0.18       117
         TIN       0.88      0.97      0.92       763

    accuracy                           0.86       880
   macro avg       0.62      0.54      0.55       880
weighted avg       0.81      0.86      0.82       880



### sub b_SVM

In [70]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('SVM', SVC(kernel='rbf')),
])

parameters = {
    'tfidf__max_df': (0.1, 0.2, 0.3),
    'tfidf__max_features': (10000, 15000, 20000),
    'tfidf__ngram_range': ((1, 2), (1, 3), (1, 4)),
    'SVM__C': (0.6, 0.8, 1.0)
}

In [71]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='f1_macro')
grid_search = grid_search.fit(sentences_train, labels_train)
print(grid_search.best_params_)

{'SVM__C': 1.0, 'tfidf__max_df': 0.1, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 3)}
CPU times: user 1.78 s, sys: 226 ms, total: 2.01 s
Wall time: 39.2 s


In [72]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=10000, max_df=0.1)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = SVC(C = 1.0, kernel = "rbf").fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['UNT', 'TIN']))

              precision    recall  f1-score   support

         UNT       0.40      0.02      0.03       117
         TIN       0.87      1.00      0.93       763

    accuracy                           0.87       880
   macro avg       0.63      0.51      0.48       880
weighted avg       0.81      0.87      0.81       880



### sub b_Random Forest

In [18]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('RandomForest', RandomForestClassifier(criterion='gini', n_jobs=-1, class_weight="balanced")),
])

parameters = {
    'tfidf__max_df': (0.1, 0.2, 0.3),
    'tfidf__max_features': (10000, 15000, 20000),
    'tfidf__ngram_range': ((1, 2), (1, 3), (1, 4)),
    'RandomForest__n_estimators': (80, 100, 150, 200)
}

In [19]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='f1_macro')
grid_search = grid_search.fit(sentences_train, labels_train)
print(grid_search.best_params_)

{'RandomForest__n_estimators': 150, 'tfidf__max_df': 0.3, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2)}
CPU times: user 2.81 s, sys: 224 ms, total: 3.04 s
Wall time: 1min 45s


In [75]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 4), max_features=20000, max_df=0.1)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = RandomForestClassifier(n_estimators=80, criterion='gini', n_jobs=-1, class_weight="balanced").fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['UNT', 'TIN']))

              precision    recall  f1-score   support

         UNT       0.44      0.07      0.12       117
         TIN       0.87      0.99      0.93       763

    accuracy                           0.86       880
   macro avg       0.66      0.53      0.52       880
weighted avg       0.82      0.86      0.82       880



In [20]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000, max_df=0.3)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = RandomForestClassifier(n_estimators=150, criterion='gini', n_jobs=-1, class_weight="balanced").fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['UNT', 'TIN']))

              precision    recall  f1-score   support

         UNT       0.30      0.18      0.23       117
         TIN       0.88      0.94      0.91       763

    accuracy                           0.84       880
   macro avg       0.59      0.56      0.57       880
weighted avg       0.80      0.84      0.82       880



## sub c

In [21]:
sub_c_idx = list(ori_train_data[ori_train_data['subtask_c'].notna()].index)
tweets = [t for i, t in enumerate(tweets_with_stop) if i in sub_c_idx]
label2id = {'IND': 0, 'GRP': 1, 'OTH': 2}
sub_c_label = ori_train_data.loc[sub_c_idx]['subtask_c'].map(label2id).to_list()

In [22]:
sentences_train, sentences_test, labels_train, labels_test = train_test_split(tweets, sub_c_label, test_size=0.2, random_state=5246)

y_train = np.asarray(labels_train)
y_test = np.asarray(labels_test)

print("Size of training set: {}".format(len(sentences_train)))
print("Size of test set: {}".format(len(sentences_test)))

Size of training set: 3100
Size of test set: 776


In [23]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression(solver='liblinear')),
])

parameters = {
    'tfidf__max_df': (0.1, 0.2, 0.3),
    'tfidf__max_features': (15000, 20000, 25000),
    'tfidf__ngram_range': ((1, 3), (1, 4), (1, 5), (1, 6)),
    'logreg__C': (10.0, 100.0, 1000.0)
}

In [81]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='f1_macro')
grid_search = grid_search.fit(sentences_train, labels_train)

print(grid_search.best_params_)

{'logreg__C': 100.0, 'tfidf__max_df': 0.3, 'tfidf__max_features': 25000, 'tfidf__ngram_range': (1, 6)}
CPU times: user 2.93 s, sys: 325 ms, total: 3.26 s
Wall time: 30.4 s


Below are best params:
```
{'logreg__C': 100.0, 'tfidf__max_df': 0.2, 'tfidf__max_features': 25000, 'tfidf__ngram_range': (1, 5)}
```

In [82]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 5), max_features=25000, max_df=0.2)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = LogisticRegression(solver='liblinear', C=100.0).fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['IND', 'GRP', 'OTH']))

              precision    recall  f1-score   support

         IND       0.79      0.84      0.81       500
         GRP       0.55      0.60      0.57       203
         OTH       0.26      0.08      0.12        73

    accuracy                           0.70       776
   macro avg       0.53      0.51      0.50       776
weighted avg       0.68      0.70      0.69       776



### sub c_SVM

In [83]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('SVM', SVC(kernel='rbf')),
])

parameters = {
    'tfidf__max_df': (0.1, 0.2, 0.3),
    'tfidf__max_features': (15000, 20000, 25000),
    'tfidf__ngram_range': ((1, 3), (1, 4), (1, 5), (1, 6)),
    'SVM__C': (0.6, 0.8, 1.0)
}

In [84]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='f1_macro')
grid_search = grid_search.fit(sentences_train, labels_train)

print(grid_search.best_params_)

{'SVM__C': 1.0, 'tfidf__max_df': 0.3, 'tfidf__max_features': 15000, 'tfidf__ngram_range': (1, 6)}
CPU times: user 3.83 s, sys: 345 ms, total: 4.18 s
Wall time: 2min 1s


In [85]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 6), max_features=15000, max_df=0.3)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = SVC(C = 1.0, kernel = "rbf").fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['IND', 'GRP', 'OTH']))

              precision    recall  f1-score   support

         IND       0.73      0.93      0.82       500
         GRP       0.61      0.40      0.49       203
         OTH       1.00      0.01      0.03        73

    accuracy                           0.71       776
   macro avg       0.78      0.45      0.44       776
weighted avg       0.72      0.71      0.66       776



### sub c_Random Forest

In [25]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('RandomForest', RandomForestClassifier(criterion='gini', n_jobs=-1, class_weight="balanced")),
])

parameters = {
    'tfidf__max_df': (0.1, 0.2, 0.3),
    'tfidf__max_features': (15000, 20000, 25000),
    'tfidf__ngram_range': ((1, 3), (1, 4), (1, 5), (1, 6)),
    'RandomForest__n_estimators': (80, 100, 150, 200),
}

In [26]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='f1_macro')
grid_search = grid_search.fit(sentences_train, labels_train)

print(grid_search.best_params_)

{'RandomForest__n_estimators': 200, 'tfidf__max_df': 0.2, 'tfidf__max_features': 20000, 'tfidf__ngram_range': (1, 3)}
CPU times: user 4.72 s, sys: 453 ms, total: 5.17 s
Wall time: 2min 54s


In [28]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=20000, max_df=0.2)

X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = RandomForestClassifier(n_estimators=200, criterion='gini', n_jobs=-1, class_weight="balanced").fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred, target_names=['IND', 'GRP', 'OTH']))

              precision    recall  f1-score   support

         IND       0.77      0.92      0.84       500
         GRP       0.64      0.57      0.60       203
         OTH       1.00      0.01      0.03        73

    accuracy                           0.74       776
   macro avg       0.80      0.50      0.49       776
weighted avg       0.76      0.74      0.70       776

