In [7]:
# Install required libraries (run only if not installed)
!pip install pandas scikit-learn nltk

# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords

# -----------------------------
# Step 1: Load CSV files directly
# -----------------------------
# Every row read from Fake.csv is labelled 1 and every row read from
# True.csv is labelled 0; the label is attached per file before merging.
fake_df = pd.read_csv('Fake.csv')
fake_df['label'] = 1

true_df = pd.read_csv('True.csv')
true_df['label'] = 0

# Stack the fake rows above the real rows and renumber the index from 0.
df = pd.concat((fake_df, true_df), ignore_index=True)

# Preview the first 5 rows of the combined frame.
print(df.head(5))

# -----------------------------
# Step 2: Clean text
# -----------------------------
# Lazily-built English stop-word set shared by every clean_text() call.
_STOP_WORDS = None


def _get_stop_words():
    """Return the NLTK English stop-word set, building it on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clean_text(text):
    """Normalize raw article text into a space-separated string of tokens.

    The text is lowercased, every character other than ``a``-``z`` and
    whitespace is removed, and the remaining whitespace-separated words
    that appear in the NLTK English stop-word list are dropped.

    Args:
        text: The raw text. Non-string values are converted with ``str()``.
            ``None`` and float NaN (a missing cell in a DataFrame column)
            are treated as empty text.

    Returns:
        The cleaned words joined by single spaces; ``''`` when nothing
        survives cleaning or when the input is missing.
    """
    # Missing values would otherwise be stringified into the literal
    # tokens "none" / "nan" and leak into the features.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()  # lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation/numbers

    # The stop-word set is built once and reused; this function is applied
    # to every row of the dataset, so rebuilding it per call is wasted work.
    stop_words = _get_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Store the cleaned version of each article body alongside the raw text.
df['clean_text'] = df['text'].apply(clean_text)

# -----------------------------
# Step 3: Train/Test split
# -----------------------------
# Features are the cleaned text; the target is the 1 (fake) / 0 (real) label.
X, y = df['clean_text'], df['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# -----------------------------
# Step 4: Vectorize text
# -----------------------------
# The vectorizer is fitted on the training text only; the test text is
# transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

# -----------------------------
# Step 5: Train ML model
# -----------------------------
# fit() returns the fitted estimator itself, so creation and training chain.
model = LogisticRegression().fit(X_train_vec, y_train)

# -----------------------------
# Step 6: Evaluate model
# -----------------------------
# Predict labels for the held-out test vectors.
y_pred = model.predict(X_test_vec)

# Print each metric under its heading, comparing true labels to predictions.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("\nConfusion Matrix:\n", confusion_matrix),
    ("\nClassification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred))

# -----------------------------
# Step 7: Test new article
# -----------------------------
def predict_fake_news(article):
    """Classify a single article with the fitted vectorizer and model.

    The article is cleaned with ``clean_text``, transformed by the
    module-level ``vectorizer`` and passed to the module-level ``model``.

    Args:
        article: The raw article text to classify.

    Returns:
        ``"Fake News"`` when the predicted label is 1, otherwise
        ``"Real News"``.
    """
    # transform() expects a collection of documents, hence the one-item list.
    features = vectorizer.transform([clean_text(article)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Fake News"
    return "Real News"

# Example: classify one hand-written headline and print the verdict.
new_article = "President announces new healthcare policy amid controversy."
verdict = predict_fake_news(new_article)
print("Prediction:", verdict)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  label  
0  December 31, 2017      1  
1  December 31, 2017      1  
2  December 30, 2017      1  
3  December 29, 2017      1  
4  December 25, 2017      1  
Accuracy: 0.9889755011135858

Confusion Matrix:
 [[4199   48]
 [  51 4682]]

Classification Report