In [None]:
from datasets import load_dataset
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

In [None]:
# Load the `cardiffnlp/tweet_topic_single` dataset via `datasets.load_dataset`.
# Later cells read its "train_all" and "test_2021" splits.
DATASET_NAME = 'cardiffnlp/tweet_topic_single'
dataset = load_dataset(DATASET_NAME)

In [None]:
# Bag-of-words features: the vocabulary is learned from the training split
# only, and the test split is projected onto that same vocabulary.
train_texts = dataset["train_all"]["text"]  # type: ignore
test_texts = dataset["test_2021"]["text"]  # type: ignore

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_texts)
X_test_counts = count_vect.transform(test_texts)

# TF-IDF re-weighting, fitted on the training counts.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
# Baseline: multinomial Naive Bayes trained and evaluated on raw token counts.
train_labels = dataset["train_all"]["label"]  # type: ignore
test_labels = dataset["test_2021"]["label"]  # type: ignore

clf = MultinomialNB()
clf.fit(X_train_counts, train_labels)
predicted = clf.predict(X_test_counts)

# Accuracy (fraction of exact label matches), then macro-averaged F1.
print(np.mean(predicted == test_labels))
print(f1_score(test_labels, predicted, average='macro'))



In [None]:
# Multinomial Naive Bayes trained and evaluated on TF-IDF features.
train_labels = dataset["train_all"]["label"]  # type: ignore
test_labels = dataset["test_2021"]["label"]  # type: ignore

clf = MultinomialNB().fit(X_train_tfidf, train_labels)

# The classifier was trained on TF-IDF weights, so the test counts must go
# through the same fitted transformer before prediction. Scoring the raw counts
# (as this cell did before) feeds the model features on a different scale than
# it was trained on.
# NOTE: `tfidf_transformer` must be the instance fitted on `X_train_counts`;
# a later cell rebinds that name, so run the notebook top to bottom.
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
predicted = clf.predict(X_test_tfidf)

# Accuracy (fraction of exact label matches), then macro-averaged F1.
print(np.mean(predicted == test_labels))
print(f1_score(test_labels, predicted, average='macro'))

In [None]:
# Binary bag-of-words features (`binary=True`: every non-zero count becomes 1).
# The vocabulary is learned from the training split only.
train_texts = dataset["train_all"]["text"]  # type: ignore
test_texts = dataset["test_2021"]["text"]  # type: ignore

count_vect = CountVectorizer(binary=True)
X_train_counts_b = count_vect.fit_transform(train_texts)
X_test_counts_b = count_vect.transform(test_texts)

# TF-IDF re-weighting, fitted on the binary training matrix.
tfidf_transformer = TfidfTransformer()
X_train_tfidf_b = tfidf_transformer.fit_transform(X_train_counts_b)

In [None]:
# Multinomial Naive Bayes trained and evaluated on binary token indicators.
train_labels = dataset["train_all"]["label"]  # type: ignore
test_labels = dataset["test_2021"]["label"]  # type: ignore

clf = MultinomialNB()
clf.fit(X_train_counts_b, train_labels)
predicted = clf.predict(X_test_counts_b)

# Accuracy (fraction of exact label matches), then macro-averaged F1.
print(np.mean(predicted == test_labels))
print(f1_score(test_labels, predicted, average='macro'))


In [None]:
# Multinomial Naive Bayes trained and evaluated on TF-IDF weights computed
# from the binary count matrix.
train_labels = dataset["train_all"]["label"]  # type: ignore
test_labels = dataset["test_2021"]["label"]  # type: ignore

clf = MultinomialNB().fit(X_train_tfidf_b, train_labels)

# The classifier was trained on TF-IDF weights, so the binary test matrix must
# go through the same fitted transformer before prediction. Previously the raw
# binary counts were scored, which does not match the training features.
# NOTE: `tfidf_transformer` must be the instance fitted on `X_train_counts_b`.
X_test_tfidf_b = tfidf_transformer.transform(X_test_counts_b)
predicted = clf.predict(X_test_tfidf_b)

# Accuracy (fraction of exact label matches), then macro-averaged F1.
print(np.mean(predicted == test_labels))
print(f1_score(test_labels, predicted, average='macro'))

In [None]:
import logging

import nltk

# Fetch the NLTK resources this notebook downloads up front; a later cell
# uses the WordNet lemmatizer.
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)

# Timestamped INFO-level logging for the rest of the notebook.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Raw training texts; the following cells turn them into token lists.
docs = dataset["train_all"]["text"]  # type: ignore

In [None]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Split on runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

# Build the token lists straight from the raw training split instead of
# mutating `docs` in place. This keeps the cell idempotent: re-running it used
# to fail because `docs` already held token lists, which have no `.lower()`.
# It also no longer relies on the object returned by `datasets` supporting
# item assignment.
#
# Per document: lowercase, tokenize, then drop
#   * purely numeric tokens (tokens that merely contain digits are kept), and
#   * tokens that are only one character long.
docs = [
    [
        token
        for token in tokenizer.tokenize(text.lower())
        if not token.isnumeric() and len(token) > 1
    ]
    for text in dataset["train_all"]["text"]  # type: ignore
]

In [None]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

# Replace every token with its WordNet lemma, using the lemmatizer's default
# settings; the document structure (one token list per document) is unchanged.
lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

In [None]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Build the token <-> id mapping from the tokenized documents.
dictionary = Dictionary(docs)  # type: ignore

# Drop tokens that occur in fewer than 20 documents, or in more than 70% of
# the documents.
min_doc_count = 20
max_doc_fraction = 0.7
dictionary.filter_extremes(no_below=min_doc_count, no_above=max_doc_fraction)

In [None]:
# Bag-of-words representation of the documents: one `doc2bow` result per
# tokenized document, in the same order as `docs`.
corpus = list(map(dictionary.doc2bow, docs))

n_unique_tokens = len(dictionary)
n_documents = len(corpus)
print('Number of unique tokens: %d' % n_unique_tokens)
print('Number of documents: %d' % n_documents)

In [None]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 6
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
random_state = 42  # Fixed seed so that re-running the notebook reproduces the same topics.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    # Pass the setting declared above. The call used to hard-code
    # `eval_every=1`, which silently re-enabled the perplexity evaluation
    # that `eval_every = None` was meant to switch off.
    eval_every=eval_every,
    random_state=random_state,
)

In [None]:
from pprint import pprint

# Each entry of `top_topics` is indexed below as (topic representation, coherence).
top_topics = model.top_topics(corpus)

# Average topic coherence: the coherence scores of all topics summed up,
# divided by the number of topics.
coherence_total = sum(coherence for _, coherence in top_topics)
avg_topic_coherence = coherence_total / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

In [None]:
from sklearn.metrics import accuracy_score

docs_test2021_label = list(dataset["test_2021"]["label"])  # type: ignore

# Apply the same preprocessing as for the training documents: lowercase,
# tokenize, drop purely numeric and one-character tokens, then lemmatize.
# Built as a new list rather than by mutating the dataset column in place, so
# the cell can be re-run safely.
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
docs_test2021_text = [
    [
        lemmatizer.lemmatize(token)
        for token in tokenizer.tokenize(text.lower())
        if not token.isnumeric() and len(token) > 1
    ]
    for text in dataset["test_2021"]["text"]  # type: ignore
]

# Prepare the testing data
test_corpus = [dictionary.doc2bow(doc) for doc in docs_test2021_text]

# Infer topic distributions for the test documents
test_topic_distributions = [model[doc] for doc in test_corpus]

# Most probable topic per test document.
test_topics = [max(doc, key=lambda x: x[1])[0] for doc in test_topic_distributions]

# LDA is unsupervised: its topic ids are arbitrary and carry no relation to the
# dataset's label ids, so comparing them directly yields a meaningless score.
# Align the two by mapping every topic to the label that is most frequent among
# the training documents dominated by that topic.
# `corpus` is in the same order as the "train_all" split it was built from.
# Assumes labels are non-negative integer class ids - TODO confirm.
train_labels = np.asarray(dataset["train_all"]["label"])  # type: ignore
train_topics = np.asarray(
    [max(model[bow], key=lambda x: x[1])[0] for bow in corpus]
)
n_labels = int(max(train_labels.max(), max(docs_test2021_label))) + 1
votes = np.zeros((model.num_topics, n_labels), dtype=int)
np.add.at(votes, (train_topics, train_labels), 1)
# A topic that dominates no training document falls back to label 0.
topic_to_label = votes.argmax(axis=1)

# Convert topic distributions into labels
predicted_labels = [int(topic_to_label[topic]) for topic in test_topics]

# Calculate accuracy
accuracy = accuracy_score(docs_test2021_label, predicted_labels)
print("Accuracy:", accuracy)

print("f1_score:", f1_score(docs_test2021_label, predicted_labels, average='macro'))  # type: ignore