In [None]:
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB  # Import MultinomialNB
from sklearn.svm import SVC  # Import SVC
from sklearn.linear_model import LogisticRegression


In [None]:
# Read the two source CSVs: one holding the fake articles, one the real ones.
fake, real = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/Fake.csv", "/content/True.csv")
)


In [None]:
# Tag every row with its class: 1 for the fake-news frame, 0 for the real one.
for frame, target in ((fake, 1), (real, 0)):
    frame['label'] = target


In [None]:
# Stack both classes into one frame, shuffle the rows reproducibly
# (fixed seed) and renumber the index from 0.
df = (
    pd.concat([fake, real], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
def clean_text(text):
    """Normalize one document for bag-of-words vectorization.

    Steps, in order: lowercase, strip URLs, strip ``<...>`` tags, strip
    ``[...]`` bracketed spans, delete ASCII punctuation, drop every token
    that contains a digit, then collapse runs of whitespace.

    Args:
        text: The raw document. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string. Missing input yields ``""`` so that empty cells
        do not inject the literal tokens "none"/"nan" into the corpus.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
    text = re.sub(r'<.*?>', '', text)                  # HTML-like tags
    text = re.sub(r'\[.*?\]', '', text)                # [bracketed] spans
    # Punctuation is deleted, not replaced by a space, so "well-known"
    # becomes "wellknown".
    text = re.sub(rf"[{re.escape(string.punctuation)}]", '', text)
    text = re.sub(r'\w*\d\w*', '', text)               # tokens containing digits
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run the cleaner over every article body and keep the result in a new
# column, leaving the raw 'text' column untouched.
df['cleaned_text'] = df['text'].map(clean_text)

In [None]:
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
X = tfidf.fit_transform(df['cleaned_text'])
y = df['label']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline: logistic regression with default settings, fitted on the training
# features and used to predict the held-out split.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)


In [None]:
# Score the baseline predictions, then print the three summaries.
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
baseline_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", baseline_accuracy)
print("\nClassification Report:\n", baseline_report)
print("\nConfusion Matrix:\n", baseline_confusion)

In [None]:
# Candidate classifiers for the grid search, keyed by display name.
# The same names key `param_grids`.
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000)
models['Naive Bayes'] = MultinomialNB()

In [None]:
# Hyper-parameter grids, keyed by the same display names as `models`.
# The 'clf__' prefix addresses the pipeline step named 'clf'.
param_grids = {
    'Logistic Regression': {'clf__C': [0.1, 1, 10]},
    'Naive Bayes': {'clf__alpha': [0.5, 1.0, 1.5]},
}

In [None]:

!pip install scikit-learn
# Import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV # Make sure GridSearchCV is also imported


In [None]:
# Grid-search each candidate with 3-fold CV on the training features, then
# report how the best configuration does on the held-out split.
for name, model in models.items():
    print(f"\n🔍 Training and tuning {name}...")

    # X_train is already a TF-IDF matrix, so the pipeline has a single
    # classifier step; its name 'clf' is what the grid keys are prefixed with.
    grid = GridSearchCV(
        estimator=Pipeline(steps=[('clf', model)]),
        param_grid=param_grids[name],
        cv=3,
        n_jobs=-1,
        verbose=1,
    )
    grid.fit(X_train, y_train)
    print(f"✅ Best Parameters for {name}: {grid.best_params_}")

    y_pred = grid.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    print(f"\n📊 Accuracy: {test_accuracy:.4f}")
    print("\n📋 Classification Report:\n", classification_report(y_test, y_pred))
    print("🧮 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Single-step pipeline holding a seeded random forest; no vectorizer step,
# matching the other pipelines in this notebook. The step is named 'clf' so
# the 'clf__*' grid keys resolve.
rf_classifier = RandomForestClassifier(random_state=42)
rf_pipeline = Pipeline(steps=[('clf', rf_classifier)])

In [None]:
# Random-forest search space: 2 x 3 x 2 = 12 combinations.
rf_params = {}
rf_params['clf__n_estimators'] = [100, 200]
rf_params['clf__max_depth'] = [None, 10, 20]
rf_params['clf__min_samples_split'] = [2, 5]



In [None]:
# 3-fold grid search over the random-forest settings, using all CPU cores.
rf_grid = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_params,
    cv=3,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Report the winning configuration and predict the held-out split with it.
print("\n🌲 Random Forest Classifier Results:")
print(f"Best Params: {rf_grid.best_params_}")
y_pred_rf = rf_grid.predict(X_test)


In [None]:

# Held-out metrics for the tuned random forest.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)

print("📊 Accuracy:", rf_accuracy)
print("\n📋 Classification Report:\n", rf_report)
print("🧮 Confusion Matrix:\n", rf_confusion)

In [None]:
import joblib

# Save
# NOTE(review): the random forest is saved as "best" without being compared
# against the other tuned models; confirm it is the one you want to ship.
joblib.dump(rf_grid.best_estimator_, "best_fake_news_model.pkl")
# The saved pipeline contains only the classifier step, so it can only score
# TF-IDF features. Persist the fitted vectorizer too, otherwise the model
# cannot be applied to new raw text after a restart.
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

# Load later (only load pickles you created yourself: unpickling runs code)
# model = joblib.load("best_fake_news_model.pkl")
# tfidf = joblib.load("tfidf_vectorizer.pkl")
# model.predict(tfidf.transform([clean_text(raw_article)]))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw the random-forest confusion matrix once (the cell previously repeated
# the same plotting code three times, producing three identical figures).
# Tick labels follow the encoding assigned at load time: 0 = real, 1 = fake.
class_names = ['Real (0)', 'Fake (1)']
cm = confusion_matrix(y_test, y_pred_rf, labels=[0, 1])

fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Random Forest Confusion Matrix')
plt.show()


In [None]:
# Write the frame (including the added columns) to CSV without the row index.
cleaned_csv_path = "cleaned_fake_news.csv"
df.to_csv(cleaned_csv_path, index=False)


In [None]:
# Build a per-article results table for the held-out rows.
# `X_test` is a sparse TF-IDF matrix, not text, so it cannot serve as the
# 'Text' column. `y_test` still carries the row labels of `df`, so use its
# index to pull the matching cleaned article text instead.
test_text = df.loc[y_test.index, 'cleaned_text']
results_df = pd.DataFrame({
    'Text': test_text,
    'Actual Label': y_test,
    'Predicted Label': y_pred_rf,  # positional array, same order as y_test
})
results_df.to_csv("model_results.csv", index=False)


In [None]:
# Colab-only: relies on the `google.colab` package, so this cell will not run
# in a plain Jupyter kernel. Sends the results CSV written above to the browser.
from google.colab import files
files.download('model_results.csv')  # triggers download popup


In [None]:
df = pd.read_csv('Fake.csv')
# ... cleaning steps ...
df['text'] = df['text'].apply(clean_text)  #

In [None]:
df.to_csv('cleaned_fake_news.csv', index=False)

In [None]:
# Colab-only: relies on the `google.colab` package. Downloads whichever
# 'cleaned_fake_news.csv' was written most recently.
from google.colab import files
files.download('cleaned_fake_news.csv')