In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [22]:
# Load the fake news dataset
fake_data = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')
fake_data['label'] = 'FAKE'

# Load the real news dataset
true_data = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')
true_data['label'] = 'REAL'

In [23]:
# Combine the datasets
data = pd.concat([fake_data, true_data], ignore_index=True)

In [24]:
# Split the dataset into training and testing sets
X = data['title'] + ' ' + data['text'] + ' ' + data['subject'] + ' ' + data['date']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# Preprocess the text data using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [26]:
# Train a logistic regression model with epochs
epochs = 10
model = LogisticRegression(max_iter=epochs)
model.fit(X_train_tfidf, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

In [28]:
# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9904231625835189


In [29]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[4692   41]
 [  45 4202]]


In [30]:
# Compute precision
precision = precision_score(y_test, y_pred, pos_label='FAKE')
print("Precision:", precision)

Precision: 0.9905003166561115


In [31]:
# Compute recall
recall = recall_score(y_test, y_pred, pos_label='FAKE')
print("Recall:", recall)

Recall: 0.9913374181280372


In [32]:
# New text to test
new_text = "Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had..."

In [33]:
# Preprocess the new text using TF-IDF vectorization
new_text_tfidf = tfidf_vectorizer.transform([new_text])

In [34]:
# Make predictions on the new text
prediction = model.predict(new_text_tfidf)

In [35]:
# Print the predicted label
print("Predicted Label:", prediction)

Predicted Label: ['FAKE']
