In [11]:
# Import necessary libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import resample

# Load the df
set
df = pd.read_csv('news.csv')

# Drop unnecessary columns
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

# Remove duplicate rows
df = df.drop_duplicates()

# Preprocess text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.strip()  # Remove leading/trailing spaces
    return text

df['cleaned_text'] = df['text'].apply(preprocess_text)

# Check class distribution
print("Class Distribution:")
print(df['label'].value_counts())

# Handle class imbalance
fake_news = df[df['label'] == 'FAKE']
real_news = df[df['label'] == 'REAL']

real_news_oversampled = resample(real_news,
                                 replace=True,
                                 n_samples=len(fake_news),
                                 random_state=42)

df_balanced = pd.concat([fake_news, real_news_oversampled])

# Update features and labels
X = df_balanced['cleaned_text']
y = df_balanced['label']

# Vectorize text
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Balanced Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# User input for fake news detection
def predict_news(news_text):
    processed_text = preprocess_text(news_text)
    vectorized_text = tfidf_vectorizer.transform([processed_text])
    prediction = model.predict(vectorized_text)
    return prediction[0]

# Interactive prediction
while True:
    user_input = input("Enter a news article to check if it's FAKE or REAL (or type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        print("Exiting the program.")
        break
    prediction = predict_news(user_input)
    print(f"The news article is predicted to be: {prediction}")

Class Distribution:
label
REAL    3154
FAKE    3152
Name: count, dtype: int64
Balanced Model Accuracy: 93.66%
Classification Report:
              precision    recall  f1-score   support

        FAKE       0.93      0.94      0.94       631
        REAL       0.94      0.93      0.94       630

    accuracy                           0.94      1261
   macro avg       0.94      0.94      0.94      1261
weighted avg       0.94      0.94      0.94      1261

Confusion Matrix:
[[596  35]
 [ 45 585]]
Enter a news article to check if it's FAKE or REAL (or type 'exit' to quit): U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sunday’s unity march against terrorism.  Kerry said he expects to arrive in Paris Thursday evening, as he heads home after a week abroad. He said he will fly to France at the conclusion of a series of meetings scheduled for Thursday in Sofia, Bulgaria. He plans to meet the