In [1]:
pip install requests beautifulsoup4





In [2]:
#!/usr/bin/env python
# coding: utf-8

# Required libraries
import requests
from bs4 import BeautifulSoup
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd

# Read the labelled news corpus, index it by the CSV's unnamed id column,
# and split the target column off from the remaining features
df = pd.read_csv('fake_or_real_news.csv').set_index('Unnamed: 0')
y = df['label']
df = df.drop(columns='label')

# Hold out a third of the articles; stratifying on the label keeps the class
# proportions the same in the training and test splits
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    y,
    test_size=0.33,
    random_state=53,
    stratify=y,
)

# Learn unigram + bigram TF-IDF weights from the training split only, then
# project the held-out split onto that same vocabulary
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

# Fit the logistic-regression classifier on the TF-IDF features
model = LogisticRegression(C=1, max_iter=1000, random_state=42)
model.fit(tfidf_train, y_train)

def fetch_article_text(url, timeout=10):
    """Fetch the main text content from a news article URL.

    The page is downloaded, parsed as HTML, and the text of every ``<p>``
    element is joined with single spaces.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without it a stalled server would block
            the call indefinitely.

    Returns:
        The concatenated paragraph text. This is an empty string when the
        page contains no ``<p>`` elements.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses so the body of a 403/404/500 page is
    # never mistaken for article text.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return ' '.join(para.get_text() for para in soup.find_all('p'))

def classify_article(url, threshold=0.4):
    """Classify a news article as FAKE or REAL given its URL.

    The article text is fetched, vectorised with the fitted
    ``tfidf_vectorizer`` and scored by ``model``.

    Args:
        url: Address of the article page to classify.
        threshold: Cut-off applied to the predicted probability of the
            "FAKE" class. Probabilities strictly below it yield "REAL",
            everything else yields "FAKE".

    Returns:
        The string "REAL" or "FAKE".

    Raises:
        ValueError: If no text could be extracted from the page, or if the
            model was not trained with a "FAKE" class.
    """
    article_text = fetch_article_text(url)
    if not article_text.strip():
        # An empty document becomes an all-zero TF-IDF vector, so the model
        # would return a label that says nothing about the article.
        raise ValueError(f"No article text could be extracted from {url}")

    article_vector = tfidf_vectorizer.transform([article_text])
    # predict_proba columns are ordered like model.classes_, so locate the
    # "FAKE" column by label instead of assuming it is column 0.
    fake_col = list(model.classes_).index("FAKE")
    prob_fake = model.predict_proba(article_vector)[0][fake_col]
    return "REAL" if prob_fake < threshold else "FAKE"

# Evaluate model on test set.
# Decision rule: an article is labelled FAKE when the predicted probability of
# the "FAKE" class is at least FAKE_THRESHOLD, otherwise REAL.
FAKE_THRESHOLD = 0.4
# predict_proba columns are ordered like model.classes_, so locate the "FAKE"
# column by label instead of assuming it is column 0.
fake_col = list(model.classes_).index("FAKE")
pred_probs = model.predict_proba(tfidf_test)
pred = ["REAL" if prob[fake_col] < FAKE_THRESHOLD else "FAKE" for prob in pred_probs]
score = metrics.accuracy_score(y_test, pred)
print("Accuracy on test set: {:.3f}".format(score))

# Example URL dataset with known labels
# NOTE(review): both entries below point at the same URL but carry opposite
# labels, so exactly one of them is always counted as correct and the reported
# URL accuracy is 0.5 whatever the model predicts. Replace the second entry
# with a distinct, correctly labelled article before relying on this number.
url_data = [
    {"url": "https://www.gadgets360.com/ai/news/tsmc-shipment-sophgo-suspended-huawei-processor-6891763", "label": "REAL"},
    {"url": "https://www.gadgets360.com/ai/news/tsmc-shipment-sophgo-suspended-huawei-processor-6891763", "label": "FAKE"},
    # Add more URLs with known labels
]

# Calculate accuracy for URL-based predictions and print results
correct_predictions = 0
evaluated = 0

for entry in url_data:
    try:
        prediction = classify_article(entry["url"], threshold=FAKE_THRESHOLD)
    except (requests.RequestException, ValueError) as exc:
        # A network failure or an unclassifiable page should not abort the
        # remaining URLs; report it and leave it out of the accuracy figure.
        print(f"URL: {entry['url']} - skipped ({exc})")
        continue
    evaluated += 1
    print(f"URL: {entry['url']} - Predicted: {prediction}, Actual: {entry['label']}")
    if prediction == entry["label"]:
        correct_predictions += 1

if evaluated:
    url_accuracy = correct_predictions / evaluated
    print(f"Accuracy on URL-based data: {url_accuracy:.3f}")
else:
    # Avoid ZeroDivisionError when url_data is empty or every fetch failed.
    url_accuracy = float("nan")
    print("Accuracy on URL-based data: n/a (no URLs could be evaluated)")


Accuracy on test set: 0.890
URL: https://www.gadgets360.com/ai/news/tsmc-shipment-sophgo-suspended-huawei-processor-6891763 - Predicted: FAKE, Actual: REAL
URL: https://www.gadgets360.com/ai/news/tsmc-shipment-sophgo-suspended-huawei-processor-6891763 - Predicted: FAKE, Actual: FAKE
Accuracy on URL-based data: 0.500
