In [None]:
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from google.colab import files

# Upload files from local machine
uploaded = files.upload()

# Load the CSV files
true_df = pd.read_csv('True.csv')
fake_df = pd.read_csv('Fake.csv')

# Label the data
true_df['label'] = 1
fake_df['label'] = 0

# Merge the DataFrames
df = pd.concat([true_df, fake_df], ignore_index=True)

# Shuffle the DataFrame to mix true and fake news
df = df.sample(frac=1).reset_index(drop=True)

# Install NLTK and download required resources
!pip install nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define a cleaning function
def clean_text(text):
    """Lower-case ``text`` and strip punctuation and digit characters.

    Args:
        text: The string to normalise.

    Returns:
        The lower-cased string with every character in ``string.punctuation``
        removed and every character for which ``str.isdigit`` is true removed.
        Whitespace is left as-is, so removed tokens may leave double spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = text.lower().translate(punctuation_table)
    return ''.join(ch for ch in without_punctuation if not ch.isdigit())

def _only_for(expected_type, transform):
    """Return a function applying ``transform`` only to ``expected_type`` values.

    Values of any other type (for example NaN in place of a missing article)
    are passed through unchanged.
    """
    def guarded(value):
        if isinstance(value, expected_type):
            return transform(value)
        return value
    return guarded


# Normalise the article bodies (overwrites the 'text' column with cleaned text)
df['text'] = df['text'].apply(_only_for(str, clean_text))

# Split each cleaned article into word tokens
df['tokens'] = df['text'].apply(_only_for(str, word_tokenize))

# Drop English stop words from every token list
stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].apply(
    _only_for(list, lambda tokens: [tok for tok in tokens if tok not in stop_words])
)

# Reduce each remaining token to its WordNet lemma
lemmatizer = WordNetLemmatizer()
df['tokens'] = df['tokens'].apply(
    _only_for(list, lambda tokens: [lemmatizer.lemmatize(tok) for tok in tokens])
)

# Re-assemble the token lists into space-separated strings for the models
df['processed_text'] = df['tokens'].apply(_only_for(list, ' '.join))

# Persist the preprocessed DataFrame (all columns, no index) to a new CSV file
df.to_csv('preprocessed_data.csv', index=False)

# Send the file back to the local machine through the browser
files.download('preprocessed_data.csv')


Saving True.csv to True.csv
Saving Fake.csv to Fake.csv


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Load the preprocessed data
df = pd.read_csv('preprocessed_data.csv')

# Missing or empty 'processed_text' values are read back from the CSV as NaN.
# Restore them to empty strings before splitting so that every split made in
# this cell contains only strings in that column.
df['processed_text'] = df['processed_text'].fillna('')

# Split the dataset into train (70%), test (20%), and validation (10%)
from sklearn.model_selection import train_test_split

# First hold out 30% of the rows, stratified on the label
train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df['label'], random_state=42)
# Then give ~1/3 of the hold-out (about 10% of all rows) to validation;
# the remaining ~2/3 (about 20% of all rows) is the test set
test_df, val_df = train_test_split(temp_df, test_size=0.33, stratify=temp_df['label'], random_state=42)

# Check the split sizes
print(f"Training set: {len(train_df)}, Testing set: {len(test_df)}, Validation set: {len(val_df)}")


Training set: 31428, Testing set: 9024, Validation set: 4446


In [None]:
# Fill missing values in 'processed_text' column.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form
# mutates an intermediate object, triggers a pandas warning, and does not
# update the DataFrame when Copy-on-Write is enabled.
df['processed_text'] = df['processed_text'].fillna('')

In [None]:
# Split the dataset into train (70%), test (20%), and validation (10%)
from sklearn.model_selection import train_test_split

# Fill missing values in 'processed_text' once, on the full frame, before
# splitting. Calling fillna(inplace=True) on a column selected from each split
# is chained in-place mutation: pandas warns about it and it does not update
# the split frames when Copy-on-Write is enabled. Filling first means all three
# splits inherit the cleaned column.
df = df.assign(processed_text=df['processed_text'].fillna(''))

# First hold out 30% of the rows, stratified on the label
train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df['label'], random_state=42)
# Then give ~1/3 of the hold-out (about 10% of all rows) to validation;
# the remaining ~2/3 (about 20% of all rows) is the test set
test_df, val_df = train_test_split(temp_df, test_size=0.33, stratify=temp_df['label'], random_state=42)

# Sanity check: no split may contain missing text
for split_df in (train_df, test_df, val_df):
    assert split_df['processed_text'].notna().all()

# Check the split sizes
print(f"Training set: {len(train_df)}, Testing set: {len(test_df)}, Validation set: {len(val_df)}")


Training set: 31428, Testing set: 9024, Validation set: 4446


In [None]:
#Logistic Regression Model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV


def print_evaluation(model_name, y_true_train, y_hat_train, y_true_test, y_hat_test):
    """Print train/test classification reports followed by confusion matrices.

    Args:
        model_name: Prefix used in every printed heading.
        y_true_train: True labels of the training split.
        y_hat_train: Predicted labels for the training split.
        y_true_test: True labels of the test split.
        y_hat_test: Predicted labels for the test split.
    """
    print(f"{model_name} - Training Classification Report:")
    print(classification_report(y_true_train, y_hat_train))
    print(f"{model_name} - Testing Classification Report:")
    print(classification_report(y_true_test, y_hat_test))

    print(f"{model_name} - Training Confusion Matrix:")
    print(confusion_matrix(y_true_train, y_hat_train))
    print(f"{model_name} - Testing Confusion Matrix:")
    print(confusion_matrix(y_true_test, y_hat_test))


# Convert text data into TF-IDF features.
# The vocabulary is fitted on the training split only and then re-used for the
# test and validation splits.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['processed_text'])
X_test = vectorizer.transform(test_df['processed_text'])
X_val = vectorizer.transform(val_df['processed_text'])  # not used further in this cell

y_train = train_df['label']
y_test = test_df['label']
y_val = val_df['label']  # not used further in this cell

# Train a Logistic Regression model.
# max_iter is raised above the default because the solver previously stopped at
# its iteration limit before converging.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Evaluate the model
y_pred_train = lr_model.predict(X_train)
y_pred_test = lr_model.predict(X_test)

print_evaluation("Logistic Regression", y_train, y_pred_train, y_test, y_pred_test)

# Hyperparameter Tuning: 5-fold cross-validated search over the inverse
# regularisation strength C, on the training split only
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100]
}
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters:", grid_search.best_params_)
best_lr_model = grid_search.best_estimator_

# Evaluate the best model
y_pred_best_train = best_lr_model.predict(X_train)
y_pred_best_test = best_lr_model.predict(X_test)

print_evaluation("Best Logistic Regression", y_train, y_pred_best_train, y_test, y_pred_best_test)


Logistic Regression - Training Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16436
           1       0.99      0.99      0.99     14992

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

Logistic Regression - Testing Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4720
           1       0.98      0.99      0.99      4304

    accuracy                           0.99      9024
   macro avg       0.99      0.99      0.99      9024
weighted avg       0.99      0.99      0.99      9024

Logistic Regression - Training Confusion Matrix:
[[16237   199]
 [  127 14865]]
Logistic Regression - Testing Confusion Matrix:
[[4654   66]
 [  45 4259]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Logistic Regression - Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16436
           1       1.00      1.00      1.00     14992

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

Best Logistic Regression - Testing Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4720
           1       0.99      0.99      0.99      4304

    accuracy                           0.99      9024
   macro avg       0.99      0.99      0.99      9024
weighted avg       0.99      0.99      0.99      9024

Best Logistic Regression - Training Confusion Matrix:
[[16436     0]
 [    1 14991]]
Best Logistic Regression - Testing Confusion Matrix:
[[4694   26]
 [  24 4280]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#LSTM Model
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable between runs
tf.keras.utils.set_random_seed(42)

MAX_WORDS = 5000  # vocabulary size shared by the tokenizer and the embedding layer
MAX_LEN = 500     # every sequence is padded/truncated to this many tokens

# Tokenize and pad the text data (vocabulary is fitted on the training split only)
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_df['processed_text'])

X_train_seq = tokenizer.texts_to_sequences(train_df['processed_text'])
X_test_seq = tokenizer.texts_to_sequences(test_df['processed_text'])
X_val_seq = tokenizer.texts_to_sequences(val_df['processed_text'])

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)
X_val_pad = pad_sequences(X_val_seq, maxlen=MAX_LEN)

y_train = np.array(train_df['label'])
y_test = np.array(test_df['label'])
y_val = np.array(val_df['label'])

# Build the LSTM model: embedding -> single LSTM layer -> sigmoid output.
# `input_length` is not passed to Embedding: it is deprecated in Keras 3 and
# the model infers the sequence length from the data on the first fit call.
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=MAX_WORDS, output_dim=128))
lstm_model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the LSTM model, monitoring loss/accuracy on the validation split
lstm_model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_val_pad, y_val))

# Evaluate the model.
# predict() returns one probability per row as an (n, 1) array; threshold at 0.5
# and flatten to 1-D so the predictions have the same shape as the label arrays.
y_pred_train = (lstm_model.predict(X_train_pad) > 0.5).astype("int32").ravel()
y_pred_test = (lstm_model.predict(X_test_pad) > 0.5).astype("int32").ravel()

print("LSTM Model - Training Classification Report:")
print(classification_report(y_train, y_pred_train))
print("LSTM Model - Testing Classification Report:")
print(classification_report(y_test, y_pred_test))

# Confusion Matrix
print("LSTM Model - Training Confusion Matrix:")
print(confusion_matrix(y_train, y_pred_train))
print("LSTM Model - Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM Model - Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16436
           1       1.00      1.00      1.00     14992

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

LSTM Model - Testing Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4720
           1       1.00      1.00      1.00      4304

    accuracy                           1.00      9024
   macro avg       1.00      1.00      1.00      9024
weighted avg       1.00      1.00      1.00      9024

LSTM Model - Training Confusion Matrix:
[[16424    12]
 [    6 14986]]
LSTM Model - Testing Confusion Matrix:
[[4711    9]
 [   4 4300]]
