In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download stopwords
nltk.download('stopwords')

# Sample Dataset (Replace with your actual dataset)
# Format should be: text content + label (0 = real, 1 = fake)
data = {
    'text': [
        'Scientists confirm climate change is real and accelerating',
        'Aliens land in New York, government confirms',
        'New study shows benefits of regular exercise',
        'President signs secret deal with foreign power',
        'COVID-19 vaccine proven safe in clinical trials',
        '5G networks spread coronavirus, experts claim'
    ],
    'label': [0, 1, 0, 1, 0, 1]
}
df = pd.DataFrame(data)

# Text preprocessing
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove special characters and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    words = text.split()
    # Remove stopwords and stem
    words = [ps.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply cleaning
df['cleaned_text'] = df['text'].apply(clean_text)

# Split dataset
X = df['cleaned_text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature extraction using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train Logistic Regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predictions
y_pred = model.predict(X_test_tfidf)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Example prediction
new_article = ["New research shows chocolate cures cancer"]
cleaned_article = clean_text(new_article[0])
article_tfidf = tfidf.transform([cleaned_article])
prediction = model.predict(article_tfidf)
print("\nPrediction for new article:", "Fake" if prediction[0] == 1 else "Real")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Desmond\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Accuracy: 0.5

Confusion Matrix:
 [[1 0]
 [1 0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2


Prediction for new article: Real


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
