In [None]:

# Version 4 - Multiple models and evaluation
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Load the pre-cleaned corpus. pandas reads empty CSV fields back as NaN and
# TfidfVectorizer rejects NaN documents, so treat missing text as an empty
# document instead of letting vectorization fail.
df = pd.read_csv('dataset.csv')
texts = df['clean_text'].fillna('')
y = df['label']

# Split the raw text BEFORE vectorizing so the vocabulary and IDF weights are
# learned from training documents only. Fitting TF-IDF on the full corpus
# leaks test-set statistics into the features and makes the scores optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix kept under its original name for any later code that
# reads `X`. It is encoded with the train-only vocabulary/IDF and is not used
# in the evaluation below.
X = vectorizer.transform(texts)

models = {
    # Headroom over the default 100 iterations so the solver is less likely
    # to stop before converging on sparse, high-dimensional TF-IDF features.
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    # SGD shuffles the training data each epoch; seed it so reported metrics
    # are reproducible, matching the seeded split above.
    'SGD Classifier': SGDClassifier(random_state=42),
}

# Fit every model on the same split and keep its test predictions so later
# steps (e.g. the confusion matrix) do not have to call predict() again.
predictions = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    predictions[name] = preds
    print(f'--- {name} ---')
    print(classification_report(y_test, preds))

# Confusion matrix visualization for Logistic Regression
lr_preds = predictions['Logistic Regression']
# Same label set and order confusion_matrix would infer on its own (sorted
# union of true and predicted labels), made explicit so the tick labels are
# guaranteed to line up with the matrix rows/columns.
class_labels = sorted(set(y_test) | set(lr_preds))
cm = confusion_matrix(y_test, lr_preds, labels=class_labels)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
# confusion_matrix puts true labels on rows and predicted labels on columns.
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix - Logistic Regression')
plt.show()
