In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib
from utils.text_preprocessing import clean_text
import matplotlib.pyplot as plt
import seaborn as sns

# Read the labelled news articles from disk.
df = pd.read_csv('data/fake_or_real_news.csv')

# Target vector: the label 'REAL' is encoded as 1, every other label value as 0.
y = df['label'].apply(lambda label: int(label == 'REAL'))

# Feature column: run each article body through the project's clean_text helper
# and store the result back on the frame, then expose it as X.
df['text'] = df['text'].apply(clean_text)
X = df['text']

# Split the raw documents BEFORE vectorizing. Fitting TF-IDF on the full corpus
# lets the held-out articles influence the vocabulary, the IDF weights and the
# max_df pruning, which leaks test information into training and inflates the
# reported accuracy. The split parameters (test_size, random_state) are unchanged.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# max_df=0.7: ignore terms whose document frequency exceeds 70% of the fitted documents.
tfidf = TfidfVectorizer(max_df=0.7)
X_train = tfidf.fit_transform(X_train_text)  # learn vocabulary + IDF from training docs only
X_test = tfidf.transform(X_test_text)        # reuse the fitted vocabulary; never refit on test data

# Fit a logistic-regression classifier with default hyper-parameters on the training split.
# (fit returns the estimator itself, so the chained form binds the fitted model.)
model = LogisticRegression().fit(X_train, y_train)

# Persist both artifacts: the classifier and the vectorizer that produced its features.
for artifact, destination in (
    (model, 'models/logistic_model.pkl'),
    (tfidf, 'models/tfidf_vectorizer.pkl'),
):
    joblib.dump(artifact, destination)

# Score the classifier on the held-out split.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.2f}")

# Pin the label order so the matrix is always 2x2 with row/column 0 = fake and
# 1 = real, even if one class happens to be absent from y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Name the classes on both axes; bare 0/1 ticks are ambiguous to a reader.
class_names = ["FAKE", "REAL"]
sns.heatmap(cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()


ModuleNotFoundError: No module named 'utils'