In [None]:
import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Load the `cardiffnlp/tweet_topic_single` dataset by name.
# Later cells index the result by split name ("train_all" for fitting,
# "test_2021" for evaluation) and read the "text" and "label" columns of each split.
dataset = load_dataset('cardiffnlp/tweet_topic_single')

In [None]:
# Baseline model: raw token counts ('vect') fed into a linear classifier
# trained with SGD using hinge loss and an L2 penalty ('clf').
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

# Look each split / column up once instead of re-indexing the dataset
# for every call below.
train_split = dataset["train_all"] # type: ignore
test_split = dataset["test_2021"] # type: ignore
test_labels = test_split["label"] # type: ignore

text_clf.fit(train_split["text"], train_split["label"]) # type: ignore

predicted = text_clf.predict(test_split["text"]) # type: ignore

# First printed value: accuracy (fraction of test predictions equal to the label).
print(np.mean(predicted == test_labels))

# Second printed value: macro-averaged F1 on the same predictions.
print(f1_score(test_labels, predicted, average='macro'))

In [None]:
# Same classifier settings as the count-based pipeline, with a TF-IDF
# re-weighting step ('tfidf') inserted between the vectorizer and the classifier.
text_clf_tfidf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    (
        'clf',
        SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            random_state=42,
            max_iter=5,
            tol=None,
        ),
    ),
])

# Fit on the "train_all" split.
train_split = dataset["train_all"] # type: ignore
text_clf_tfidf.fit(train_split["text"], train_split["label"]) # type: ignore

# Evaluate on the "test_2021" split.
test_split = dataset["test_2021"] # type: ignore
predicted = text_clf_tfidf.predict(test_split["text"]) # type: ignore
test_labels = test_split["label"] # type: ignore

# Accuracy: mean of the element-wise prediction/label matches.
accuracy = np.mean(predicted == test_labels)
print(accuracy)

# Macro-averaged F1 over the same predictions.
macro_f1 = f1_score(test_labels, predicted, average='macro')
print(macro_f1)

In [None]:
# Variant of the count-based model using CountVectorizer(binary=True),
# i.e. presence/absence features instead of raw term counts.
# NOTE: this rebinds `text_clf`, replacing the pipeline assigned to that
# name in an earlier cell.
text_clf = Pipeline([
    ('vect', CountVectorizer(binary=True)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

# Look each split / column up once instead of re-indexing the dataset
# for every call below.
train_split = dataset["train_all"] # type: ignore
test_split = dataset["test_2021"] # type: ignore
test_labels = test_split["label"] # type: ignore

text_clf.fit(train_split["text"], train_split["label"]) # type: ignore

predicted = text_clf.predict(test_split["text"]) # type: ignore

# First printed value: accuracy (fraction of test predictions equal to the label).
print(np.mean(predicted == test_labels))

# Second printed value: macro-averaged F1 on the same predictions.
print(f1_score(test_labels, predicted, average='macro'))

In [None]:
# Binary (presence/absence) counts followed by TF-IDF re-weighting and the
# same SGD classifier settings used elsewhere in this notebook.
# NOTE: this rebinds `text_clf_tfidf`; on a top-to-bottom run this is the
# pipeline the later grid-search cell receives.
text_clf_tfidf = Pipeline(steps=[
    ('vect', CountVectorizer(binary=True)),
    ('tfidf', TfidfTransformer()),
    (
        'clf',
        SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            random_state=42,
            max_iter=5,
            tol=None,
        ),
    ),
])

# Fit on the "train_all" split.
train_split = dataset["train_all"] # type: ignore
text_clf_tfidf.fit(train_split["text"], train_split["label"]) # type: ignore

# Evaluate on the "test_2021" split.
test_split = dataset["test_2021"] # type: ignore
predicted = text_clf_tfidf.predict(test_split["text"]) # type: ignore
test_labels = test_split["label"] # type: ignore

# Accuracy: mean of the element-wise prediction/label matches.
accuracy = np.mean(predicted == test_labels)
print(accuracy)

# Macro-averaged F1 over the same predictions.
macro_f1 = f1_score(test_labels, predicted, average='macro')
print(macro_f1)

In [None]:
# Hyper-parameter grid for the pipeline search. Keys use the
# `<pipeline step>__<parameter>` form:
#   vect__ngram_range -> n-gram ranges to try on the 'vect' step
#   clf__alpha        -> alpha values to try on the 'clf' step
# 3 n-gram ranges x 4 alphas = 12 candidate settings.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__alpha': (1e-1, 1e-2, 1e-3, 1e-4),
}


In [None]:
# Grid search over `parameters` for the `text_clf_tfidf` pipeline, with cv=5
# and n_jobs=-1, fitted on the "train_all" split.
# NOTE(review): `text_clf_tfidf` is whichever pipeline was bound to that name
# most recently; on a top-to-bottom run that is the binary-count TF-IDF one.
# NOTE(review): no `scoring` argument is passed, so candidates are ranked by
# GridSearchCV's default scorer rather than the macro-F1 printed at the end
# of this cell -- confirm that is intended.
train_split = dataset["train_all"] # type: ignore
gs_clf = GridSearchCV(
    estimator=text_clf_tfidf,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
).fit(train_split["text"], train_split["label"]) # type: ignore

# Report the selected value for every searched parameter, in name order.
for param_name in sorted(parameters):
    best_value = gs_clf.best_params_[param_name]
    print("%s: %r" % (param_name, best_value))


# Score the search object's predictions on the "test_2021" split.
test_split = dataset["test_2021"] # type: ignore
predicted = gs_clf.predict(test_split["text"]) # type: ignore
test_labels = test_split["label"] # type: ignore

# Accuracy: mean of the element-wise prediction/label matches.
accuracy = np.mean(predicted == test_labels)
print(accuracy)

# Macro-averaged F1 over the same predictions.
macro_f1 = f1_score(test_labels, predicted, average='macro')
print(macro_f1)