In [112]:
import re
import numpy as np
import pandas as pd

from utils import preprocess, plotutils

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn import metrics
from sklearn.pipeline import Pipeline

from tqdm import tqdm

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (such as the local `utils` package) are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

In [3]:
# Input locations: the training TSV sits directly under the data directory.
DATA_DIR = "./data/"
TRAIN_DATA_FILE = f"{DATA_DIR}olid-training-v1.0.tsv"

# Prepare data

In [291]:
# Load the tab-separated training file into a DataFrame
# (read_table uses a tab delimiter by default).
ori_train_data = pd.read_table(TRAIN_DATA_FILE)

In [6]:
%%time

# Run the project's spaCy pipeline once over the raw 'tweet' column; both
# normalized variants below are derived from this single result.
tweet_doc = preprocess.spacy_pipeline(ori_train_data['tweet'].to_list())
# Two normalizations of the same documents: one with stop_removal=True and
# one with stop_removal=False.
tweet_normalized_wo_stop = preprocess.spacy_normalize(tweet_doc, stop_removal=True)
tweet_normalized_with_stop = preprocess.spacy_normalize(tweet_doc, stop_removal=False)

CPU times: total: 1min 22s
Wall time: 1min 23s


Remove the "@user" mask.

In [7]:
# Remove the "@user" mask from both normalized variants.
# Note: each result is bound back to the name it was computed from, so
# re-running this cell passes already-processed data through
# remove_user_mask a second time.
tweet_normalized_with_stop = preprocess.remove_user_mask(tweet_normalized_with_stop)
tweet_normalized_wo_stop = preprocess.remove_user_mask(tweet_normalized_wo_stop)

In [289]:
# Turn each normalized variant into the text form passed to the vectorizer
# (presumably one joined string per tweet -- confirm in preprocess.join_as_sentence).
# The calls run in the same order as before: without stop words, then with.
tweets_wo_stop, tweets_with_stop = (
    preprocess.join_as_sentence(normalized)
    for normalized in (tweet_normalized_wo_stop, tweet_normalized_with_stop)
)

# Logistic Regression (TF-IDF features)

## Subtask A (NOT vs. OFF)

In [292]:
# Subtask A label encoding.
# The mapping is defined in the same cell, *before* it is applied: previously
# the lookup ran in an earlier cell than the definition, so on a fresh kernel
# it raised NameError (or silently reused a mapping left over from another
# subtask, which maps every label to NaN).
label2id = {'NOT': 0, 'OFF': 1}
sub_a_label = ori_train_data['subtask_a'].map(label2id).to_list()

In [294]:
# Hold out 20% of the subtask A tweets; the fixed seed keeps the split repeatable.
sentences_train, sentences_test, labels_train, labels_test = train_test_split(
    tweets_wo_stop,
    sub_a_label,
    test_size=0.2,
    random_state=5246,
)

# NumPy versions of the label lists, used for the final fit and report.
y_train, y_test = np.asarray(labels_train), np.asarray(labels_test)

print("Size of training set: {}".format(len(sentences_train)))
print("Size of test set: {}".format(len(sentences_test)))

Size of training set: 10592
Size of test set: 2648


In [222]:
# Two-step estimator searched by the grid: TF-IDF features followed by a
# liblinear logistic regression. The step names prefix the grid parameter keys.
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("logreg", LogisticRegression(solver="liblinear")),
    ]
)

In [227]:
# Search space for subtask A; keys follow the "<step>__<param>" convention
# of the pipeline steps named 'tfidf' and 'logreg'.
parameters = dict(
    tfidf__max_df=(0.1, 0.2, 0.3),
    tfidf__max_features=(10000, 15000),
    tfidf__ngram_range=((1, 1), (1, 2), (1, 3)),
    logreg__C=(1.0, 10.0, 100.0),
)

In [228]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='f1_macro')
grid_search = grid_search.fit(sentences_train, labels_train)

CPU times: total: 1.47 s
Wall time: 21.7 s


In [225]:
# Show the parameter combination selected by the grid search.
print(grid_search.best_params_)

{'logreg__C': 10.0, 'tfidf__max_df': 0.2, 'tfidf__max_features': 15000, 'tfidf__ngram_range': (1, 1)}


Best parameters found by the grid search:
```
{'logreg__C': 10.0, 'tfidf__max_df': 0.2, 'tfidf__max_features': 15000, 'tfidf__ngram_range': (1, 1)}
```

In [229]:
# Refit on the whole training split using the best parameters listed above
# (hard-coded here), then evaluate once on the held-out test split.
tfidf_vectorizer = TfidfVectorizer(max_df=0.2, max_features=15000, ngram_range=(1, 1))

# The vocabulary and IDF weights are learned from the training sentences only.
X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = LogisticRegression(C=10.0, solver='liblinear')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=['NOT', 'OFF'],
    )
)

              precision    recall  f1-score   support

         NOT       0.78      0.87      0.83      1750
         OFF       0.68      0.53      0.60       898

    accuracy                           0.76      2648
   macro avg       0.73      0.70      0.71      2648
weighted avg       0.75      0.76      0.75      2648



## Subtask B (UNT vs. TIN)

In [129]:
# Index labels of the rows that have a (non-null) subtask B annotation.
sub_b_idx = ori_train_data.index[ori_train_data['subtask_b'].notna()].tolist()

In [200]:
# Keep only the tweets annotated for subtask B, in the same order as sub_b_idx.
# Previously this cell filtered `tweets` in terms of itself: that fails on a
# fresh kernel (name not yet defined) and returns a different result on every
# re-run. It now always reads from the full preprocessed corpus.
# NOTE(review): the corpus used for the recorded run cannot be recovered from
# the notebook; `tweets_wo_stop` is assumed here (as in subtask A) -- confirm.
# Direct indexing assumes ori_train_data keeps its default 0..n-1 index, so
# index labels equal list positions (the enumerate-based filter relied on the
# same assumption). It also avoids a linear list scan per tweet.
tweets = [tweets_wo_stop[i] for i in sub_b_idx]

In [201]:
# Subtask B label encoding (rebinds the shared `label2id` name).
label2id = dict(UNT=0, TIN=1)

In [202]:
# Integer-encode the subtask B labels of the annotated rows, in sub_b_idx order.
# Rows and column are selected in a single .loc call.
sub_b_label = ori_train_data.loc[sub_b_idx, 'subtask_b'].map(label2id).to_list()

In [203]:
# Hold out 20% of the subtask B tweets; the fixed seed keeps the split repeatable.
sentences_train, sentences_test, labels_train, labels_test = train_test_split(
    tweets,
    sub_b_label,
    test_size=0.2,
    random_state=5246,
)

# NumPy versions of the label lists, used for the final fit and report.
y_train, y_test = np.asarray(labels_train), np.asarray(labels_test)

print("Size of training set: {}".format(len(sentences_train)))
print("Size of test set: {}".format(len(sentences_test)))

Size of training set: 3520
Size of test set: 880


In [208]:
# Fresh, unfitted TF-IDF -> liblinear logistic regression pipeline for the
# subtask B grid search. The step names prefix the grid parameter keys.
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("logreg", LogisticRegression(solver="liblinear")),
    ]
)

In [213]:
# Search space for subtask B; keys follow the "<step>__<param>" convention
# of the pipeline steps named 'tfidf' and 'logreg'.
parameters = dict(
    tfidf__max_df=(0.1, 0.2, 0.3),
    tfidf__max_features=(10000, 15000, 20000),
    tfidf__ngram_range=((1, 2), (1, 3), (1, 4)),
    logreg__C=(1.0, 10.0, 100.0),
)

In [210]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='f1_macro')
grid_search = grid_search.fit(sentences_train, labels_train)

CPU times: total: 1.73 s
Wall time: 25.4 s


In [211]:
# Show the parameter combination selected by the subtask B grid search.
print(grid_search.best_params_)

{'logreg__C': 100.0, 'tfidf__max_df': 0.2, 'tfidf__max_features': 15000, 'tfidf__ngram_range': (1, 3)}


Best parameters found by the grid search:
```
{'logreg__C': 100.0, 'tfidf__max_df': 0.2, 'tfidf__max_features': 15000, 'tfidf__ngram_range': (1, 3)}
```

In [212]:
# Refit on the whole subtask B training split using the best parameters listed
# above (hard-coded here), then evaluate once on the held-out test split.
tfidf_vectorizer = TfidfVectorizer(max_df=0.2, max_features=15000, ngram_range=(1, 3))

# The vocabulary and IDF weights are learned from the training sentences only.
X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = LogisticRegression(C=100.0, solver='liblinear')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=['UNT', 'TIN'],
    )
)

              precision    recall  f1-score   support

         UNT       0.34      0.14      0.20       117
         TIN       0.88      0.96      0.92       763

    accuracy                           0.85       880
   macro avg       0.61      0.55      0.56       880
weighted avg       0.81      0.85      0.82       880



## Subtask C (IND vs. GRP vs. OTH)

In [230]:
# Index labels of the rows that have a (non-null) subtask C annotation.
sub_c_idx = ori_train_data.index[ori_train_data['subtask_c'].notna()].tolist()

In [290]:
# Keep only the tweets annotated for subtask C (stop words retained), in the
# same order as sub_c_idx so they stay aligned with the labels built from it.
# Direct indexing replaces the per-tweet `i in sub_c_idx` list scan, which was
# quadratic in the corpus size. It assumes ori_train_data keeps its default
# 0..n-1 index, so index labels equal list positions (the enumerate-based
# filter relied on the same assumption).
tweets = [tweets_with_stop[i] for i in sub_c_idx]

In [265]:
# Subtask C label encoding (rebinds the shared `label2id` name).
label2id = dict(IND=0, GRP=1, OTH=2)

In [266]:
# Integer-encode the subtask C labels of the annotated rows, in sub_c_idx order.
# Rows and column are selected in a single .loc call.
sub_c_label = ori_train_data.loc[sub_c_idx, 'subtask_c'].map(label2id).to_list()

In [267]:
# Hold out 20% of the subtask C tweets; the fixed seed keeps the split repeatable.
sentences_train, sentences_test, labels_train, labels_test = train_test_split(
    tweets,
    sub_c_label,
    test_size=0.2,
    random_state=5246,
)

# NumPy versions of the label lists, used for the final fit and report.
y_train, y_test = np.asarray(labels_train), np.asarray(labels_test)

print("Size of training set: {}".format(len(sentences_train)))
print("Size of test set: {}".format(len(sentences_test)))

Size of training set: 3100
Size of test set: 776


In [280]:
# Fresh, unfitted TF-IDF -> liblinear logistic regression pipeline for the
# subtask C grid search. The step names prefix the grid parameter keys.
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("logreg", LogisticRegression(solver="liblinear")),
    ]
)

In [281]:
# Search space for subtask C; keys follow the "<step>__<param>" convention
# of the pipeline steps named 'tfidf' and 'logreg'.
parameters = dict(
    tfidf__max_df=(0.1, 0.2, 0.3),
    tfidf__max_features=(15000, 20000, 25000),
    tfidf__ngram_range=((1, 3), (1, 4), (1, 5), (1, 6)),
    logreg__C=(10.0, 100.0, 1000.0),
)

In [282]:
%%time

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5246)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=kf, scoring='f1_macro')
grid_search = grid_search.fit(sentences_train, labels_train)

CPU times: total: 3.48 s
Wall time: 1min 30s


In [283]:
# Show the parameter combination selected by the subtask C grid search.
print(grid_search.best_params_)

{'logreg__C': 100.0, 'tfidf__max_df': 0.2, 'tfidf__max_features': 25000, 'tfidf__ngram_range': (1, 5)}


Best parameters found by the grid search:
```
{'logreg__C': 100.0, 'tfidf__max_df': 0.2, 'tfidf__max_features': 25000, 'tfidf__ngram_range': (1, 5)}
```

In [286]:
# Refit on the whole subtask C training split using the best parameters listed
# above (hard-coded here), then evaluate once on the held-out test split.
tfidf_vectorizer = TfidfVectorizer(max_df=0.2, max_features=25000, ngram_range=(1, 5))

# The vocabulary and IDF weights are learned from the training sentences only.
X_train = tfidf_vectorizer.fit_transform(sentences_train)
X_test = tfidf_vectorizer.transform(sentences_test)

model = LogisticRegression(C=100.0, solver='liblinear')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=['IND', 'GRP', 'OTH'],
    )
)

              precision    recall  f1-score   support

         IND       0.79      0.85      0.82       500
         GRP       0.56      0.60      0.58       203
         OTH       0.27      0.08      0.13        73

    accuracy                           0.71       776
   macro avg       0.54      0.51      0.51       776
weighted avg       0.68      0.71      0.69       776

