In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
import re
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Load the data
fake_news_data = pd.read_csv('data/fake.csv')
true_news_data = pd.read_csv('data/true.csv')

# Add labels
fake_news_data['label'] = 1
true_news_data['label'] = 0

In [3]:
# Visualize data

'''
print(fake_news_data.head())
print(true_news_data.head())
'''

'\nprint(fake_news_data.head())\nprint(true_news_data.head())\n'

In [4]:
# Merge data
news_merged = pd.concat([fake_news_data, true_news_data])

#print(news_merged.head())

In [5]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

news_merged['clean_text'] = news_merged['text'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
#print(news_merged.head())
print(news_merged.shape)

(44898, 6)


In [7]:
# Shuffle data
news_merged = shuffle(news_merged, random_state=0)

#print(news_merged.head())

X_train = news_merged.iloc[:, 5]
Y_train = news_merged.iloc[:, 4]

#print(X_train, Y_train)


In [8]:
# Vectorization
vectorization = TfidfVectorizer() 
X_train = vectorization.fit_transform(X_train)

In [9]:
# Splitting data into training set and test set
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=0)

In [10]:
# Ensure Y_train and Y_test are numpy arrays
Y_train = np.array(Y_train)
Y_test = np.array(Y_test)

In [11]:
# Logistic Regression
model = LogisticRegression()
model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)


In [12]:
#Accuracy, precision and error

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
error = np.mean(np.abs(Y_pred - Y_test)) / np.mean(Y_test)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Error: {error}")

Accuracy: 0.9860801781737194
Precision: 0.9864777849323889
Error: 0.026835551738943756
