In [106]:
# install dependencies (run in a notebook or terminal)
!pip install --quiet scikit-learn pandas matplotlib seaborn joblib

# 1. Load the dataset and carve out a stratified hold-out set
import pandas as pd
from sklearn.model_selection import train_test_split

URL = "https://huggingface.co/datasets/mrm8488/fake-news/resolve/main/fake_news.csv"

# The code below reads the 'text' and 'label' columns of this CSV.
# Per the original author's note, label is 0=REAL, 1=FAKE.
df = pd.read_csv(URL)

# Hold out 10% of the rows for the final evaluation, stratified on the label
# column; the fixed random_state makes the partition repeatable.
texts, labels = df["text"], df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    stratify=labels,
    random_state=42,
    test_size=0.1,
)

# 2. Turn the raw text into TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams and bigrams, English stop words removed, at most 10000 features.
tfidf_options = dict(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=10000,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# The vectorizer is fitted on the training texts only; the hold-out texts are
# transformed with that already-fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 3. Tune the regularisation parameter C with a 5-fold grid search
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid = {"C": [0.01, 0.1, 1, 10]}
base_model = LogisticRegression(max_iter=1000, solver="liblinear")

# NOTE(review): the TF-IDF vectorizer was fitted on all of X_train before this
# search, so each validation fold has already contributed to the features.
# The reported CV score may therefore be slightly optimistic; wrapping the
# vectorizer and the classifier in a Pipeline would avoid that.
clf = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    verbose=1,
)
clf.fit(X_train_vec, y_train)

# Report the winning C and its mean cross-validated F1 score.
best_c = clf.best_params_["C"]
print("Best C:", best_c)
print("Best CV F1:", clf.best_score_)

# 4. Score the tuned model on the hold-out set
# (confusion_matrix is imported here but not called in this section.)
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the TF-IDF features of the hold-out texts, then print per-class
# precision / recall / F1 with four decimal places.
y_pred = clf.predict(X_test_vec)
report = classification_report(y_test, y_pred, digits=4)
print(report)

# 5. Persist the fitted vectorizer and the best estimator from the search
import joblib

# Both artifacts are needed together at inference time: the vectorizer turns
# text into features, the model classifies those features.
joblib.dump(value=vectorizer, filename="tfidf_vectorizer.pkl")

# Kept as the final expression so the notebook still displays its return value.
best_model = clf.best_estimator_
joblib.dump(value=best_model, filename="logreg_model.pkl")


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best C: 10
Best CV F1: 0.9940844729105451
              precision    recall  f1-score   support

           0     0.9935    0.9916    0.9925      2142
           1     0.9923    0.9940    0.9932      2348

    accuracy                         0.9929      4490
   macro avg     0.9929    0.9928    0.9929      4490
weighted avg     0.9929    0.9929    0.9929      4490



['logreg_model.pkl']

In [109]:
from google.colab import files

# Trigger a browser download for each saved artifact:
# the vectorizer first, then the trained model.
for artifact in ("tfidf_vectorizer.pkl", "logreg_model.pkl"):
    files.download(artifact)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [110]:
import pandas as pd

# Must run after clf.fit(X_train_vec, y_train): tabulate the grid-search
# results and write them to disk without the index column.
cv_results = pd.DataFrame(data=clf.cv_results_)
cv_results.to_csv(path_or_buf="cv_results.csv", index=False)


In [111]:
# Reload the saved results from disk; this rebinds cv_results to the copy
# parsed back from the CSV file.
cv_results = pd.read_csv(filepath_or_buffer="cv_results.csv")


In [112]:
from google.colab import files

# Send the cross-validation results file to the browser as a download.
results_csv = "cv_results.csv"
files.download(results_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>