In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

# Human-readable class names used for the classification reports and as the
# tick labels of the confusion-matrix heatmaps.
# plot_confusion_matrix treats list position i as integer label i, so the
# order of this list must line up with the label encoding.
# NOTE(review): presumably the positions match the `category_code` values
# (0, 1, 2) stored in the pickled datasets — confirm against preprocessing.
target_names = [
    "Big Tech & Startups",
    "Science & Futuristic Technology",
    "Programming, Design & Data Science",
]

In [None]:
def load_datasets(prefix, data_dir='data'):
    """Load the training, validation and test splits stored for one corpus.

    Each split is read with ``pd.read_pickle`` from
    ``{data_dir}/{prefix}_{split}.pkl`` where ``split`` is ``training``,
    ``validation`` and ``test`` respectively.

    Parameters
    ----------
    prefix : str
        File-name prefix identifying the corpus, e.g. ``"articles"``.
    data_dir : str or path-like, default ``'data'``
        Directory that holds the pickle files. The default keeps the
        original behaviour of reading from ``data/`` relative to the
        current working directory.

    Returns
    -------
    tuple
        ``(train_df, validation_df, test_df)`` -- the three unpickled
        objects, in that order (the rest of the notebook expects DataFrames).

    Raises
    ------
    FileNotFoundError
        If any of the three pickle files does not exist.
    """
    # SECURITY: unpickling can execute arbitrary code, so only point this
    # function at files produced by a trusted preprocessing step.
    train_df, validation_df, test_df = (
        pd.read_pickle(f'{data_dir}/{prefix}_{split}.pkl')
        for split in ('training', 'validation', 'test')
    )
    return train_df, validation_df, test_df

In [None]:
def get_texts_and_categories(dataframe):
    """Split a dataset frame into parallel lists of texts and labels.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``'text'`` and ``'category_code'``.

    Returns
    -------
    tuple of (list, list)
        The ``'text'`` column and the ``'category_code'`` column, each
        converted to a plain Python list and kept in row order.
    """
    texts, categories = (
        dataframe[column].tolist() for column in ('text', 'category_code')
    )
    return texts, categories

In [None]:
# Read the three pickled splits for each corpus.
articles_train_df, articles_validation_df, articles_test_df = load_datasets("articles")
sentences_train_df, sentences_validation_df, sentences_test_df = load_datasets("sentences")

# Articles: turn every split into a (texts, labels) pair of plain lists.
(
    (articles_training_data, articles_training_labels),
    (articles_validation_data, articles_validation_labels),
    (articles_test_data, articles_test_labels),
) = map(
    get_texts_and_categories,
    (articles_train_df, articles_validation_df, articles_test_df),
)

# Sentences: same extraction, same split order.
(
    (sentences_training_data, sentences_training_labels),
    (sentences_validation_data, sentences_validation_labels),
    (sentences_test_data, sentences_test_labels),
) = map(
    get_texts_and_categories,
    (sentences_train_df, sentences_validation_df, sentences_test_df),
)

In [None]:
def train_and_evaluate_multinomial_nb(training_data, training_labels, test_data):
    """Fit a bag-of-words Multinomial Naive Bayes model and predict ``test_data``.

    The vocabulary is learned from ``training_data`` only; ``test_data`` is
    encoded with that same fitted ``CountVectorizer``.

    Parameters
    ----------
    training_data : iterable of str
        Raw training documents.
    training_labels : sequence
        One label per training document.
    test_data : iterable of str
        Raw documents to classify.

    Returns
    -------
    array-like
        Predicted label for every entry of ``test_data``. Despite the
        function name, no metric is computed here; scoring is left to the
        caller.
    """
    bag_of_words = CountVectorizer()
    train_counts = bag_of_words.fit_transform(training_data)

    model = MultinomialNB().fit(train_counts, training_labels)
    return model.predict(bag_of_words.transform(test_data))

In [None]:
def train_and_evaluate_logistic_regression(training_data, training_labels, test_data):
    """Fit a bag-of-words logistic regression and predict ``test_data``.

    Documents are converted to token counts with ``CountVectorizer``,
    rescaled with ``MaxAbsScaler`` and classified by ``LogisticRegression``
    (default settings). Every step is fitted on the training data only and
    then applied unchanged to ``test_data``.

    Parameters
    ----------
    training_data : iterable of str
        Raw training documents.
    training_labels : sequence
        One label per training document.
    test_data : iterable of str
        Raw documents to classify.

    Returns
    -------
    array-like
        Predicted label for every entry of ``test_data``. Despite the
        function name, no metric is computed here; scoring is left to the
        caller.
    """
    # A single pipeline runs vectorise -> scale -> classify, so the same
    # fitted transformers are reused at prediction time.
    model = make_pipeline(CountVectorizer(), MaxAbsScaler(), LogisticRegression())
    model.fit(training_data, training_labels)
    return model.predict(test_data)

In [None]:
def train_and_evaluate_tfidf_logistic_regression(training_data, training_labels, test_data):
    """Fit a TF-IDF logistic regression model and predict ``test_data``.

    The ``TfidfVectorizer`` is fitted on ``training_data`` only and reused
    to encode ``test_data``; the classifier is a ``LogisticRegression``
    with default settings.

    Parameters
    ----------
    training_data : iterable of str
        Raw training documents.
    training_labels : sequence
        One label per training document.
    test_data : iterable of str
        Raw documents to classify.

    Returns
    -------
    array-like
        Predicted label for every entry of ``test_data``. Despite the
        function name, no metric is computed here; scoring is left to the
        caller.
    """
    tfidf = TfidfVectorizer()
    train_weights = tfidf.fit_transform(training_data)

    model = LogisticRegression().fit(train_weights, training_labels)
    return model.predict(tfidf.transform(test_data))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(true_labels, predicted_labels, target_names, title):
    """Show a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    true_labels : sequence of int
        Ground-truth labels.
    predicted_labels : sequence of int
        Labels produced by a model, aligned with ``true_labels``.
    target_names : sequence of str
        Display name per class; position ``i`` names integer label ``i``.
    title : str
        Figure title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Pin the label set to 0..n-1 so the matrix always has one row and one
    # column per entry of target_names, even when a class never occurs.
    label_ids = list(range(len(target_names)))
    matrix = confusion_matrix(true_labels, predicted_labels, labels=label_ids)

    plt.figure(figsize=(6, 6))
    heatmap_axes = sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        xticklabels=target_names,
        yticklabels=target_names,
        cmap="YlGnBu",
    )
    heatmap_axes.set_title(title)
    heatmap_axes.set_xlabel("Predicted")
    heatmap_axes.set_ylabel("True")
    plt.show()

In [None]:
from TLDR.dummy_predictor import DummyPredictor

# Baseline predictor; the same instance is reused for the sentence-level
# baseline further down.
# NOTE(review): DummyPredictor.evaluate is assumed to return one predicted
# label per input text -- confirm in TLDR/dummy_predictor.py.
dummy_predictor = DummyPredictor()

print("Baseline for articles:")
predictions_dummy_articles = dummy_predictor.evaluate(articles_test_data)
print(
    classification_report(
        articles_test_labels,
        predictions_dummy_articles,
        target_names=target_names,
    )
)
plot_confusion_matrix(
    articles_test_labels,
    predictions_dummy_articles,
    target_names,
    title='Confusion Matrix for Baseline Articles',
)

In [None]:
# Bag-of-words Multinomial Naive Bayes, trained and scored on articles.
print("MultinomialNB for articles:")
predictions_mnb_articles = train_and_evaluate_multinomial_nb(
    articles_training_data,
    articles_training_labels,
    articles_test_data,
)
print(
    classification_report(
        articles_test_labels,
        predictions_mnb_articles,
        target_names=target_names,
    )
)
plot_confusion_matrix(
    articles_test_labels,
    predictions_mnb_articles,
    target_names,
    title='Confusion Matrix for MultinomialNB Articles',
)

In [None]:
# Bag-of-words logistic regression, trained and scored on articles.
print("Logistic Regression for articles:")
predictions_lr_articles = train_and_evaluate_logistic_regression(
    articles_training_data,
    articles_training_labels,
    articles_test_data,
)
print(
    classification_report(
        articles_test_labels,
        predictions_lr_articles,
        target_names=target_names,
    )
)
plot_confusion_matrix(
    articles_test_labels,
    predictions_lr_articles,
    target_names,
    title='Confusion Matrix for Logistic Regression Articles',
)

In [None]:
# TF-IDF logistic regression, trained and scored on articles.
print("TF-IDF Logistic Regression for articles:")
predictions_tfidf_lr_articles = train_and_evaluate_tfidf_logistic_regression(
    articles_training_data,
    articles_training_labels,
    articles_test_data,
)
print(
    classification_report(
        articles_test_labels,
        predictions_tfidf_lr_articles,
        target_names=target_names,
    )
)
plot_confusion_matrix(
    articles_test_labels,
    predictions_tfidf_lr_articles,
    target_names,
    title='Confusion Matrix for TF-IDF Logistic Regression Articles',
)

In [None]:
# Baseline predictor applied to the sentence-level test set; relies on the
# dummy_predictor instance created in the article baseline cell.
print("Baseline for sentences:")
predictions_dummy_sentences = dummy_predictor.evaluate(sentences_test_data)
print(
    classification_report(
        sentences_test_labels,
        predictions_dummy_sentences,
        target_names=target_names,
    )
)
plot_confusion_matrix(
    sentences_test_labels,
    predictions_dummy_sentences,
    target_names,
    title='Confusion Matrix for Baseline Sentences',
)

In [None]:
# Bag-of-words Multinomial Naive Bayes, trained and scored on sentences.
print("MultinomialNB for sentences:")
predictions_mnb_sentences = train_and_evaluate_multinomial_nb(
    sentences_training_data,
    sentences_training_labels,
    sentences_test_data,
)
print(
    classification_report(
        sentences_test_labels,
        predictions_mnb_sentences,
        target_names=target_names,
    )
)
plot_confusion_matrix(
    sentences_test_labels,
    predictions_mnb_sentences,
    target_names,
    title='Confusion Matrix for MultinomialNB Sentences',
)

In [None]:
# Bag-of-words logistic regression, trained and scored on sentences.
print("Logistic Regression for sentences:")
predictions_lr_sentences = train_and_evaluate_logistic_regression(
    sentences_training_data,
    sentences_training_labels,
    sentences_test_data,
)
# The report must describe the logistic-regression predictions (not the
# MultinomialNB ones) so that it agrees with the confusion matrix below.
print(
    classification_report(
        sentences_test_labels,
        predictions_lr_sentences,
        target_names=target_names,
    )
)
plot_confusion_matrix(
    sentences_test_labels,
    predictions_lr_sentences,
    target_names,
    title='Confusion Matrix for Logistic Regression Sentences',
)

In [None]:
# TF-IDF logistic regression, trained and scored on sentences.
print("TF-IDF Logistic Regression for sentences:")
predictions_tfidf_lr_sentences = train_and_evaluate_tfidf_logistic_regression(
    sentences_training_data,
    sentences_training_labels,
    sentences_test_data,
)
print(
    classification_report(
        sentences_test_labels,
        predictions_tfidf_lr_sentences,
        target_names=target_names,
    )
)
plot_confusion_matrix(
    sentences_test_labels,
    predictions_tfidf_lr_sentences,
    target_names,
    title='Confusion Matrix for TF-IDF Logistic Regression Sentences',
)

In [None]:
def plot_performance_metrics(reports, model_names, title='Performance Metrics Comparison', ylim=(0.3, 1)):
    """Plot macro-averaged precision, recall and F1 for several models.

    Parameters
    ----------
    reports : iterable of dict
        One ``classification_report(..., output_dict=True)`` result per
        model; only the ``'macro avg'`` entry of each report is read.
    model_names : sequence of str
        Bar-group label for each report, in the same order as ``reports``.
    title : str, default ``'Performance Metrics Comparison'``
        Figure title.
    ylim : tuple of (float, float) or None, default ``(0.3, 1)``
        Lower and upper bound of the y-axis. Bars for scores below the
        lower bound are cut off, so pass ``(0, 1)`` when a model (for
        example a weak baseline) may score under 0.3, or ``None`` to let
        matplotlib pick the range.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    metrics = ['precision', 'recall', 'f1-score']

    # One row per model, one column per metric.
    rows = [
        [report['macro avg'][metric] for metric in metrics]
        for report in reports
    ]
    scores = pd.DataFrame(rows, columns=metrics, index=model_names)

    # Style the Axes returned by DataFrame.plot directly rather than going
    # through pyplot's implicit "current axes".
    ax = scores.plot(kind='bar', figsize=(10, 6))
    ax.set_title(title)
    ax.set_xlabel('Models')
    ax.set_ylabel('Score')
    ax.legend(loc='upper left')
    if ylim is not None:
        ax.set_ylim(*ylim)

    plt.show()

In [None]:
# Dictionary-form classification reports for the article-level models, in
# the order: baseline, MultinomialNB, logistic regression, TF-IDF LR.
articles_reports = [
    classification_report(articles_test_labels, predictions, output_dict=True)
    for predictions in (
        predictions_dummy_articles,
        predictions_mnb_articles,
        predictions_lr_articles,
        predictions_tfidf_lr_articles,
    )
]
(
    report_dummy_articles,
    report_mnb_articles,
    report_lr_articles,
    report_tfidf_lr_articles,
) = articles_reports

# Same four models, scored on the sentence-level test set.
sentences_reports = [
    classification_report(sentences_test_labels, predictions, output_dict=True)
    for predictions in (
        predictions_dummy_sentences,
        predictions_mnb_sentences,
        predictions_lr_sentences,
        predictions_tfidf_lr_sentences,
    )
]
(
    report_dummy_sentences,
    report_mnb_sentences,
    report_lr_sentences,
    report_tfidf_lr_sentences,
) = sentences_reports

# Display names, aligned position-by-position with both report lists.
model_names = [
    'Baseline',
    'MultinomialNB',
    'Logistic Regression',
    'TF-IDF Logistic Regression',
]

In [None]:
# Macro-averaged precision / recall / F1 of every model on the article test set.
plot_performance_metrics(reports=articles_reports, model_names=model_names)

In [None]:
# Macro-averaged precision / recall / F1 of every model on the sentence test set.
plot_performance_metrics(reports=sentences_reports, model_names=model_names)