# In [1]:
import pandas as pd
import joblib
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# Load the two source datasets from the working directory.
data_fake = pd.read_csv('Fake.csv')
data_true = pd.read_csv('True.csv')

# Attach the target label to every row of each frame.
data_fake['label'] = 0  # Fake news
data_true['label'] = 1  # Real news

# Stack both classes into a single frame, then shuffle the rows so the two
# classes are interleaved. The shuffle is seeded: without a fixed
# random_state the row order differs on every run, which would make the
# seeded train/test split further down non-reproducible.
data = pd.concat([data_fake, data_true], axis=0)
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Preprocessing function
def clean_text(text):
    """Normalize a piece of raw text.

    Lower-cases the text, replaces every non-word character (anything
    ``\\W`` matches, i.e. not a letter, digit or underscore) with a space,
    collapses each run of whitespace into a single space and strips the
    ends.

    Args:
        text: The raw text. ``None`` and float NaN (the value pandas uses
            for a missing cell) are treated as empty text; any other
            non-string value is converted with ``str`` first.

    Returns:
        The cleaned string, or ``''`` when the input is missing.
    """
    # NaN is the only float that is not equal to itself; checking it this
    # way avoids calling .lower() on a missing value, which would raise.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'\W', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

# Run every article body through clean_text and store the result back in
# the same column.
data['text'] = data['text'].map(clean_text)

# Features are the cleaned text; targets are the label column.
x, y = data['text'], data['label']

# Hold out 25% of the rows for testing. The fixed random_state makes the
# split deterministic for a given input frame.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

# Fit the TF-IDF vectorizer on the training text only, then reuse the
# fitted vectorizer to transform the held-out text.
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

# Train models
# Each classifier is fitted on the same TF-IDF training matrix and labels.
LR = LogisticRegression()
LR.fit(xv_train, y_train)

# Seeded like the two ensembles below: a decision tree permutes the
# candidate features at each split, so without a fixed random_state the
# fitted tree can differ from run to run.
DT = DecisionTreeClassifier(random_state=0)
DT.fit(xv_train, y_train)

GB = GradientBoostingClassifier(random_state=0)
GB.fit(xv_train, y_train)

RF = RandomForestClassifier(random_state=0)
RF.fit(xv_train, y_train)

# Persist every fitted estimator, plus the fitted TF-IDF vectorizer needed
# to transform new text the same way, as joblib files. The paths are
# relative, so the files land in the current working directory.
_artifacts = {
    "logistic_regression.pkl": LR,
    "decision_tree.pkl": DT,
    "gradient_boosting.pkl": GB,
    "random_forest.pkl": RF,
    "tfidf_vectorizer.pkl": vectorization,
}
for _path, _obj in _artifacts.items():
    joblib.dump(_obj, _path)

print("Models and vectorizer saved successfully!")

# Output: Models and vectorizer saved successfully!
