In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

from src.data.train_dataset_EDA import load_train_with_features, load_test_with_features
from src.features.tfidf import fit_tfidf_on_clean_text_column
from src.data.dataset import Label_map  


In [None]:
# Load both splits through the project's feature-loading helpers.
train_df = load_train_with_features()
test_df = load_test_with_features()

# Report the (rows, columns) shape of each split.
print(f"Train df: {train_df.shape}")
print(f"Test df: {test_df.shape}")

# Preview the first rows of the training frame as the cell output.
train_df.head()

In [None]:
# Count examples per class in each split, ordered by label value.
train_counts = train_df["label"].value_counts().sort_index()
test_counts = test_df["label"].value_counts().sort_index()

# Align both splits on the union of their labels so that each pair of bars
# refers to the same class. A class absent from one split becomes a zero-height
# bar instead of shifting the remaining bars or causing a length mismatch.
all_labels = train_counts.index.union(test_counts.index)
train_counts = train_counts.reindex(all_labels, fill_value=0)
test_counts = test_counts.reindex(all_labels, fill_value=0)

# Tick labels of the form "<label id>: <label name>".
labels = [f"{i}: {Label_map[i]}" for i in all_labels]

x = np.arange(len(labels))
width = 0.4  # bar width; train/test bars sit side by side around each tick

fig, ax = plt.subplots()
ax.bar(x - width / 2, train_counts.values, width, label="train")
ax.bar(x + width / 2, test_counts.values, width, label="test")
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=30, ha="right")
ax.set_title("Class distribution: train vs test")
ax.set_ylabel("count")
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# The helper returns the fitted vectorizer together with the training matrix
# and training labels.
# NOTE(review): it takes no arguments, so presumably it loads the training
# data itself -- confirm it uses the same rows as train_df.
vectorizer, X_train, y_train = fit_tfidf_on_clean_text_column()

# Test targets and raw test text, then project the text with the already
# fitted vectorizer (transform only, no refit).
y_test = test_df["label"].values
X_test_text = test_df["clean_text"]
X_test = vectorizer.transform(X_test_text)

print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")


In [None]:
# Logistic regression on the TF-IDF features; class_weight="balanced"
# reweights classes inversely to their frequency in y_train.
model = LogisticRegression(class_weight="balanced", solver="lbfgs", max_iter=2000)

# fit() returns the estimator itself, so `model` is the fitted classifier and
# the chained predict() yields the test-set predictions.
y_pred = model.fit(X_train, y_train).predict(X_test)


In [None]:
# Per-class precision / recall / F1 on the test set, as a nested dict keyed by
# str(label) plus the aggregate rows.
# The label set is passed explicitly so the report covers the same classes as
# the confusion matrix and the per-class F1 chart (which index the report by
# str(label) for every key of Label_map), even when a class is missing from
# y_test / y_pred. zero_division=0 reports 0 for such classes.
report_dict = classification_report(
    y_test,
    y_pred,
    labels=sorted(Label_map.keys()),
    output_dict=True,
    zero_division=0,
)

# One row per class / aggregate, one column per metric.
report_df = pd.DataFrame(report_dict).T
report_df


In [None]:
# Fix the class order once so matrix rows/columns and tick labels agree.
class_ids = sorted(Label_map.keys())

cm = confusion_matrix(y_test, y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=[Label_map[i] for i in class_ids],
)

fig, ax = plt.subplots()
disp.plot(ax=ax, xticks_rotation=30)
# Title the matrix axes explicitly: disp.plot also adds a colorbar axes to the
# figure, so pyplot's "current axes" is not a reliable target for the title.
ax.set_title("Confusion Matrix (Test)")
fig.tight_layout()
plt.show()


In [None]:
# Fix the class order once so scores and tick labels agree.
class_ids = sorted(Label_map.keys())

# The report is keyed by str(label). A class with no entry in the report gets
# NaN (drawn as no bar) instead of raising KeyError.
f1_per_class = [
    report_dict.get(str(i), {}).get("f1-score", np.nan) for i in class_ids
]
class_names = [Label_map[i] for i in class_ids]

x = np.arange(len(class_names))

fig, ax = plt.subplots()
ax.bar(x, f1_per_class)
ax.set_xticks(x)
ax.set_xticklabels(class_names, rotation=30, ha="right")
ax.set_ylim(0, 1.0)  # F1 is bounded in [0, 1]
ax.set_title("Per-class F1 score (Test)")
ax.set_ylabel("F1 score")
fig.tight_layout()
plt.show()
