In [1]:
# 1. Imports
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings("ignore")

In [2]:
# 2. Load datasets
true_df = pd.read_csv("True.csv", usecols=["title"])
fake_df = pd.read_csv("Fake.csv", usecols=["title"])

true_df["label"] = 1
fake_df["label"] = 0

df = pd.concat([true_df, fake_df]).sample(frac=1).reset_index(drop=True)

print(df.head())

                                               title  label
0  REPUBLICAN SENATOR Sends Letter to FBI Directo...      0
1   Robert DeNiro Axes Anti-Vaxxer Movie From Tri...      0
2  Half of central Congo's 1.5 million displaced ...      1
3  Crucial details of Republican tax plan in flux...      1
4  FORMER ASST FBI DIRECTOR WARNS ANTI-TRUMP KABA...      0


In [3]:
# 3. Split data
X_train, X_test, y_train, y_test = train_test_split(
    df["title"], df["label"], test_size=0.2, random_state=42
)

In [4]:
# 4. Define pipeline
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english")),
    ("clf", LogisticRegression(solver="liblinear", max_iter=1000))
])

In [5]:
# 5. Define grid for hyperparameter tuning
param_grid = {
    "tfidf__max_df": [0.5, 0.7, 0.9],
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "clf__C": [0.1, 1, 10],
    "clf__penalty": ["l1", "l2"]
}

In [6]:
# 6. GridSearchCV to tune model
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring="accuracy", n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


0,1,2
,estimator,Pipeline(step...liblinear'))])
,param_grid,"{'clf__C': [0.1, 1, ...], 'clf__penalty': ['l1', 'l2'], 'tfidf__max_df': [0.5, 0.7, ...], 'tfidf__ngram_range': [(1, ...), (1, ...)]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,1000


In [7]:
# 7. Best model evaluation
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("✅ Best Parameters:", grid_search.best_params_)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

✅ Best Parameters: {'clf__C': 10, 'clf__penalty': 'l2', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}
✅ Accuracy: 0.9583704363312555

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96      4593
           1       0.97      0.95      0.96      4391

    accuracy                           0.96      8984
   macro avg       0.96      0.96      0.96      8984
weighted avg       0.96      0.96      0.96      8984


Confusion Matrix:
 [[4448  145]
 [ 229 4162]]


In [8]:
# 8. Save final model and vectorizer
joblib.dump(best_model.named_steps["clf"], "improved_fake_news_model.pkl")
joblib.dump(best_model.named_steps["tfidf"], "improved_vectorizer.pkl")

['improved_vectorizer.pkl']

In [9]:
#Imports
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

#Loading data
true_df = pd.read_csv("True_augmented_authentic.csv")[["text"]]
fake_df = pd.read_csv("Fake_enhanced.csv")[["text"]]

true_df["label"] = 1
fake_df["label"] = 0

df = pd.concat([true_df, fake_df]).sample(frac=1).reset_index(drop=True)

#Text cleaning
def clean_text(text):
    text = re.sub(r"http\S+", "", text)                # Removes URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)            # Removes punctuation/numbers
    text = re.sub(r"\s+", " ", text)                   # Normalizes whitespace
    return text.lower()

df["text"] = df["text"].astype(str).apply(clean_text)

#Data splitting
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

#TF-IDF Vectorizer
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.5,
    min_df=5,
    max_features=30000,
    ngram_range=(1, 2)
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

#Model: Voting Classifier
lr = LogisticRegression(C=10, penalty="l2", solver="liblinear", max_iter=1000)
nb = MultinomialNB()

voting_clf = VotingClassifier(estimators=[
    ("lr", lr),
    ("nb", nb)
], voting="soft")

voting_clf.fit(X_train_vec, y_train)
y_pred = voting_clf.predict(X_test_vec)

#Evaluation
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

#Saving model to import to VSCode
joblib.dump(voting_clf, "fake_news_voting_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

  fake_df = pd.read_csv("Fake_enhanced.csv")[["text"]]


✅ Accuracy: 0.9867586513853344

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4725
           1       0.99      0.99      0.99      4262

    accuracy                           0.99      8987
   macro avg       0.99      0.99      0.99      8987
weighted avg       0.99      0.99      0.99      8987

Confusion Matrix:
 [[4663   62]
 [  57 4205]]


['tfidf_vectorizer.pkl']