In [5]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.svm import SVC

# Load the datasets: every line of each file becomes one row of text.
# The encoding is passed explicitly so the decoded text does not depend on
# the platform's default locale encoding.
# NOTE(review): assumes both files are UTF-8 encoded - confirm.
with open('/content/fake.txt', encoding='utf-8') as f:
    fake_news = f.readlines()

with open('/content/real.txt', encoding='utf-8') as f:
    real_news = f.readlines()

# Create DataFrame: one 'text' column per source, labelled 0 (fake.txt)
# and 1 (real.txt).
data_fake = pd.DataFrame(fake_news, columns=['text'])
data_fake['label'] = 0
data_real = pd.DataFrame(real_news, columns=['text'])
data_real['label'] = 1

# Stack the fake rows first, then the real rows, under a fresh 0..n-1 index.
# The resulting frame is therefore ordered by label.
data = pd.concat([data_fake, data_real], ignore_index=True)

# Preprocess the text data
def preprocess(text):
    """Return ``text`` with leading and trailing whitespace removed.

    Interior whitespace is left untouched.
    """
    cleaned = text.strip()
    return cleaned

# Clean every row of text. Stripping is idempotent, so re-running this cell
# leaves the column unchanged.
data['text'] = data['text'].apply(preprocess)

# Generate TF-IDF features, keeping at most 5000 vocabulary terms.
# NOTE(review): the vectorizer is fitted on the full corpus, before the
# train/test splits made later in this file, so held-out rows influence the
# vocabulary and IDF weights.
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(data['text'])

# Dense feature array and the matching label column.
X = tfidf_matrix.toarray()
y = data['label']


In [7]:
# Split the data (random and stratified). Both splits hold out 30% of the
# rows and use the same seed; only the second preserves the class ratio of y.
X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_train_strat, X_test_strat, y_train_strat, y_test_strat = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Define the models, keyed by the name used in the printed reports.
models = {
    'GaussianNB': GaussianNB(),
    'MultinomialNB': MultinomialNB(),
    'BernoulliNB': BernoulliNB(),
    # Seeded so that the forest's accuracy is reproducible across runs,
    # matching the seeded splits above.
    'RandomForest': RandomForestClassifier(random_state=42),
    'SVM_linear': SVC(kernel='linear'),
    'SVM_rbf': SVC(kernel='rbf')
}

# Function to train and evaluate models
def evaluate_models(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training split and score it on the test split.

    For each entry the accuracy and a classification report are printed.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator offering ``fit`` and ``predict``.
        The estimators are fitted in place.
    X_train, y_train
        Features and labels passed to ``fit``.
    X_test, y_test
        Features passed to ``predict`` and the labels the predictions are
        compared against.

    Returns
    -------
    dict
        Display name -> accuracy on the test split.
    """
    scores = {}
    for label, estimator in models.items():
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_test)

        acc = accuracy_score(y_test, predicted)
        scores[label] = acc

        print(f'{label} Accuracy: {acc}')
        print(classification_report(y_test, predicted))
    return scores

# Run the same evaluation on both hold-out splits. The accuracy dicts
# returned by evaluate_models are not kept; only the printed reports are used.
for heading, split in (
    ("Random Split Evaluation:",
     (X_train_rand, X_test_rand, y_train_rand, y_test_rand)),
    ("\nStratified Split Evaluation:",
     (X_train_strat, X_test_strat, y_train_strat, y_test_strat)),
):
    print(heading)
    evaluate_models(models, *split)

# Grid Search for Random Forest: 10-fold cross-validated accuracy on the
# stratified training split, over every n_estimators x max_depth combination.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}
grid_search = GridSearchCV(
    # Seeded so candidates are compared without run-to-run forest noise and
    # the selected parameters are reproducible.
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=10,
    scoring='accuracy',
    n_jobs=-1,  # use all available cores
)
grid_search.fit(X_train_strat, y_train_strat)
print(f'Best parameters for Random Forest: {grid_search.best_params_}')
# Report the score that earned the selection, so the choice can be judged.
print(f'Best cross-validated accuracy: {grid_search.best_score_}')

# 10-fold cross-validation (random and stratified)
def cross_validate_models(models, X, y):
    """Print each model's mean 10-fold CV accuracy under two fold schemes.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator accepted by ``cross_val_score``.
    X
        Feature matrix passed to ``cross_val_score``.
    y
        Labels passed to ``cross_val_score``.

    Returns
    -------
    None
        Results are only printed.
    """
    # An integer ``cv`` makes scikit-learn use StratifiedKFold for
    # classifiers, so "random" folds must be requested explicitly. Shuffling
    # matters: without it, label-ordered rows would yield near single-class
    # folds. The seed keeps the folds identical for every model.
    random_folds = KFold(n_splits=10, shuffle=True, random_state=42)
    stratified_folds = StratifiedKFold(n_splits=10)

    for name, model in models.items():
        # Random 10-fold CV
        scores_random = cross_val_score(model, X, y, cv=random_folds)
        print(f'{name} 10-fold CV Random Accuracy: {np.mean(scores_random)}')

        # Stratified 10-fold CV
        scores_stratified = cross_val_score(model, X, y, cv=stratified_folds)
        print(f'{name} 10-fold CV Stratified Accuracy: {np.mean(scores_stratified)}')

# Cross-validate every model on the full feature matrix and label column.
print("\nCross-Validation Evaluation:")
cross_validate_models(models=models, X=X, y=y)


Random Split Evaluation:
GaussianNB Accuracy: 0.7357142857142858
              precision    recall  f1-score   support

           0       0.67      0.73      0.70       414
           1       0.79      0.74      0.76       566

    accuracy                           0.74       980
   macro avg       0.73      0.73      0.73       980
weighted avg       0.74      0.74      0.74       980

MultinomialNB Accuracy: 0.7918367346938775
              precision    recall  f1-score   support

           0       0.89      0.58      0.70       414
           1       0.75      0.95      0.84       566

    accuracy                           0.79       980
   macro avg       0.82      0.76      0.77       980
weighted avg       0.81      0.79      0.78       980

BernoulliNB Accuracy: 0.8346938775510204
              precision    recall  f1-score   support

           0       0.94      0.65      0.77       414
           1       0.79      0.97      0.87       566

    accuracy                     