In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

# Display names for the report rows; the evaluation functions in this notebook
# pass this list as `target_names` to sklearn's classification_report.
# NOTE(review): presumably index-aligned with the sorted `category_code`
# values (first name <-> smallest code) -- confirm against the dataset.
target_names = [
    "Big Tech & Startups",
    "Science & Futuristic Technology",
    "Programming, Design & Data Science",
]

In [None]:
def load_datasets(prefix, data_dir='data'):
    """Load the training, validation and test pickles for one dataset.

    The files are expected at ``{data_dir}/{prefix}_training.pkl``,
    ``{data_dir}/{prefix}_validation.pkl`` and ``{data_dir}/{prefix}_test.pkl``.

    Args:
        prefix: File-name prefix that identifies the dataset
            (the notebook uses "articles" and "sentences").
        data_dir: Directory that contains the pickle files. Defaults to
            ``'data'``, relative to the current working directory.

    Returns:
        A ``(train_df, validation_df, test_df)`` tuple holding whatever
        objects ``pd.read_pickle`` returns for the three files.

    Raises:
        FileNotFoundError: If any of the three files is missing.
    """
    # SECURITY: unpickling can execute arbitrary code. Only point this at
    # files that come from a trusted source.
    train_df, validation_df, test_df = (
        pd.read_pickle(f'{data_dir}/{prefix}_{split}.pkl')
        for split in ('training', 'validation', 'test')
    )

    return train_df, validation_df, test_df

In [None]:
# Read the pickled train / validation / test splits for both granularities.
(
    articles_train_df,
    articles_validation_df,
    articles_test_df,
) = load_datasets("articles")
(
    sentences_train_df,
    sentences_validation_df,
    sentences_test_df,
) = load_datasets("sentences")

# Article level: plain Python lists of the text and its category code.
articles_training_data, articles_training_labels = (
    articles_train_df['text'].tolist(),
    articles_train_df['category_code'].tolist(),
)
articles_test_data, articles_test_labels = (
    articles_test_df['text'].tolist(),
    articles_test_df['category_code'].tolist(),
)

# Sentence level: same two columns, taken from the sentence frames.
sentences_training_data, sentences_training_labels = (
    sentences_train_df['text'].tolist(),
    sentences_train_df['category_code'].tolist(),
)
sentences_test_data, sentences_test_labels = (
    sentences_test_df['text'].tolist(),
    sentences_test_df['category_code'].tolist(),
)

In [None]:
def train_and_evaluate_multinomial_nb(training_data, training_labels, test_data, test_labels):
    """Train a bag-of-words Multinomial Naive Bayes model and report test metrics.

    Args:
        training_data: Documents used to fit the CountVectorizer vocabulary
            and to train the classifier.
        training_labels: Class label of each training document.
        test_data: Documents to classify; they are vectorized with the
            vocabulary learned from ``training_data``.
        test_labels: True class label of each test document.

    Returns:
        The report produced by sklearn's ``classification_report`` for the
        test predictions, with rows named by the module-level ``target_names``.
    """
    vectorizer = CountVectorizer()
    # The vocabulary comes from the training split only; the test split is
    # merely transformed with it.
    training_features = vectorizer.fit_transform(training_data)
    test_features = vectorizer.transform(test_data)

    classifier = MultinomialNB()
    classifier.fit(training_features, training_labels)
    predictions = classifier.predict(test_features)

    # Pin the report rows to the classes seen during training. Without
    # `labels`, the label set is inferred from test_labels/predictions, and
    # the call raises ValueError whenever that set does not have exactly
    # len(target_names) entries (e.g. a class missing from the test split).
    return classification_report(
        test_labels,
        predictions,
        labels=classifier.classes_,
        target_names=target_names,
    )

In [None]:
def train_and_evaluate_logistic_regression(training_data, training_labels, test_data, test_labels):
    """Train a bag-of-words logistic regression model and report test metrics.

    Token counts from a CountVectorizer are passed through a MaxAbsScaler
    before reaching the LogisticRegression classifier.

    Args:
        training_data: Documents used to fit the CountVectorizer vocabulary
            and to train the scaler + classifier pipeline.
        training_labels: Class label of each training document.
        test_data: Documents to classify; they are vectorized with the
            vocabulary learned from ``training_data``.
        test_labels: True class label of each test document.

    Returns:
        The report produced by sklearn's ``classification_report`` for the
        test predictions, with rows named by the module-level ``target_names``.
    """
    vectorizer = CountVectorizer()
    # The vocabulary comes from the training split only; the test split is
    # merely transformed with it.
    training_features = vectorizer.fit_transform(training_data)
    test_features = vectorizer.transform(test_data)

    # The scaler lives inside the pipeline, so it is fitted on the training
    # features and re-applied to the test features at predict time.
    pipeline = make_pipeline(MaxAbsScaler(), LogisticRegression())
    pipeline.fit(training_features, training_labels)
    predictions = pipeline.predict(test_features)

    # Pin the report rows to the classes seen during training. Without
    # `labels`, the label set is inferred from test_labels/predictions, and
    # the call raises ValueError whenever that set does not have exactly
    # len(target_names) entries (e.g. a class missing from the test split).
    return classification_report(
        test_labels,
        predictions,
        labels=pipeline.classes_,
        target_names=target_names,
    )

In [None]:
def train_and_evaluate_tfidf_logistic_regression(training_data, training_labels, test_data, test_labels):
    """Train a TF-IDF logistic regression model and report test metrics.

    Args:
        training_data: Documents used to fit the TfidfVectorizer and to
            train the classifier.
        training_labels: Class label of each training document.
        test_data: Documents to classify; they are vectorized with the
            vectorizer fitted on ``training_data``.
        test_labels: True class label of each test document.

    Returns:
        The report produced by sklearn's ``classification_report`` for the
        test predictions, with rows named by the module-level ``target_names``.
    """
    vectorizer = TfidfVectorizer()
    # The vectorizer is fitted on the training split only; the test split is
    # merely transformed with it.
    training_features = vectorizer.fit_transform(training_data)
    test_features = vectorizer.transform(test_data)

    classifier = LogisticRegression()
    classifier.fit(training_features, training_labels)
    predictions = classifier.predict(test_features)

    # Pin the report rows to the classes seen during training. Without
    # `labels`, the label set is inferred from test_labels/predictions, and
    # the call raises ValueError whenever that set does not have exactly
    # len(target_names) entries (e.g. a class missing from the test split).
    return classification_report(
        test_labels,
        predictions,
        labels=classifier.classes_,
        target_names=target_names,
    )

In [None]:
# Run every model on the article-level split and print its report under a heading.
for heading, evaluate in (
    ("MultinomialNB for articles:", train_and_evaluate_multinomial_nb),
    ("Logistic Regression for articles:", train_and_evaluate_logistic_regression),
    ("TF-IDF Logistic Regression for articles:", train_and_evaluate_tfidf_logistic_regression),
):
    print(heading)
    report = evaluate(
        articles_training_data,
        articles_training_labels,
        articles_test_data,
        articles_test_labels,
    )
    print(report)

In [None]:
# Run every model on the sentence-level split and print its report under a heading.
for heading, evaluate in (
    ("MultinomialNB for sentences:", train_and_evaluate_multinomial_nb),
    ("Logistic Regression for sentences:", train_and_evaluate_logistic_regression),
    ("TF-IDF Logistic Regression for sentences:", train_and_evaluate_tfidf_logistic_regression),
):
    print(heading)
    report = evaluate(
        sentences_training_data,
        sentences_training_labels,
        sentences_test_data,
        sentences_test_labels,
    )
    print(report)