<a href="https://colab.research.google.com/github/Pavan-santhosh-ips/Fake_News_Detection/blob/main/FakeNewsDetectionNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.18-py311-none-any.whl.metadata (7.5 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m 

In [None]:
pip install --upgrade datasets



In [None]:
import pandas as pd
from google.colab import files
uploaded = files.upload()
df = pd.read_csv("news_articles.csv")

Saving news_articles.csv to news_articles.csv


**USING DISTILBERT FOR SEQUENTIAL CLASSIFICATION**

In [None]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from evaluate import load




# Convert labels to binary (1 = Real, 0 = Fake)
df['label'] = df['label'].apply(lambda x: 1 if str(x).lower() == 'real' else 0)

# Combine title and text for better context
df['content'] = df['title'] + " " + df['text']

# Split dataset
train_texts = list(df['content'].astype(str))
train_labels = list(df['label'])

test_texts = list(df['content'].astype(str))
test_labels = list(df['label'])


# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

# Define PyTorch Dataset
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)  # Ensure correct dtype
        return item

train_dataset = NewsDataset(train_encodings, train_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

# Define GPU usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2).to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Reduced batch size for faster training
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
)

# Load evaluation metric
metric = load("accuracy")

# Define compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train model
trainer.train()

# Evaluate model
eval_result = trainer.evaluate()
print("Evaluation Results:", eval_result)

# Save model
model.save_pretrained("fake_news_model")
tokenizer.save_pretrained("fake_news_model")

# Inference function
def predict_fake_news(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits).item()
    return "Real" if prediction == 1 else "Fake"

# Test on a sample
sample_text = "Breaking: Scientists discover a new planet similar to Earth!"
print("Prediction:", predict_fake_news(sample_text))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msadiqsahilbaigmogal2004[0m ([33msadiqsahilbaigmogal2004-vit[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5624,0.425732,0.819656
2,0.4805,0.234938,0.90792
3,0.1697,0.143192,0.95229


Evaluation Results: {'eval_loss': 0.14319203794002533, 'eval_accuracy': 0.9522900763358778, 'eval_runtime': 30.1762, 'eval_samples_per_second': 69.459, 'eval_steps_per_second': 8.682, 'epoch': 3.0}
Prediction: Fake


**USING SVM FOR SEQUENTIAL CLASSIFICATION**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,accuracy_score


# Step 2: Check label distribution

# Step 3: Encode labels ('Fake' = 0, 'Real' = 1)
df = df[df['label'].isin(['Fake', 'Real'])]

# Encode labels
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])


# Step 4: Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label_encoded'],
    test_size=0.2,
    random_state=42
)

# Confirm label balance in train set)

# Combine X_train and y_train into a single DataFrame
train_df = pd.DataFrame({'text': X_train, 'label': y_train}).dropna()
test_df = pd.DataFrame({'text': X_test, 'label': y_test}).dropna()

X_train = train_df['text']
y_train = train_df['label']
X_test = test_df['text']
y_test = test_df['label']

# Step 5: TF-IDF vectorization
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Step 6: Train SVM model
svm_model = LinearSVC()
svm_model.fit(X_train_tfidf, y_train)

# Step 7: Predict and evaluate
y_pred = svm_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label_encoded'] = label_encoder.fit_transform(df['label'])


              precision    recall  f1-score   support

        Fake       0.77      0.86      0.81       248
        Real       0.74      0.60      0.66       162

    accuracy                           0.76       410
   macro avg       0.75      0.73      0.74       410
weighted avg       0.76      0.76      0.75       410

Accuracy: 0.7585


**USING SVM AND RANDOM FOREST AS ENSEMBLE MODEL**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,accuracy_score


# Step 2: Filter valid labels
df = df[df['label'].isin(['Fake', 'Real'])]

# Step 3: Drop rows with missing text
df.dropna(subset=['text', 'label'], inplace=True)

# Step 4: Encode labels ('Fake'=0, 'Real'=1)
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label_encoded'],
    test_size=0.2, random_state=42, stratify=df['label_encoded']
)

# Step 6: TF-IDF vectorization
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Step 7: Define individual models
svm = LinearSVC()
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 8: Combine models using VotingClassifier
ensemble_model = VotingClassifier(
    estimators=[
        ('svm', svm),
        ('rf', rf)
    ],
    voting='hard'  # can use 'soft' if using probabilistic models like Logistic Regression
)

# Step 9: Train ensemble model
ensemble_model.fit(X_train_tfidf, y_train)

# Step 10: Evaluate model
y_pred = ensemble_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


              precision    recall  f1-score   support

        Fake       0.72      0.98      0.83       259
        Real       0.90      0.36      0.51       151

    accuracy                           0.75       410
   macro avg       0.81      0.67      0.67       410
weighted avg       0.79      0.75      0.71       410

Accuracy: 0.7488


**USING ONLY XG-BOOST**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier


# Step 2: Remove rows with missing text or labels
df = df.dropna(subset=['text', 'label'])

# Step 3: Label encoding
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label_encoded'],
    test_size=0.2, stratify=df['label_encoded'], random_state=42
)

# Step 5: TF-IDF vectorization
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Step 6: XGBoost Classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train_tfidf, y_train)

# Step 7: Predict & evaluate
y_pred = xgb_model.predict(X_test_tfidf)

# Step 8: Classification Report
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Step 9: Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

        Fake       0.79      0.83      0.81       259
        Real       0.68      0.62      0.65       151

    accuracy                           0.75       410
   macro avg       0.73      0.72      0.73       410
weighted avg       0.75      0.75      0.75       410

Accuracy: 0.7512


**XGBOOST AND NAIVE BAYES USED AS ENSEMBLE MODEL**

In [None]:
pip install xgboost scikit-learn pandas



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Step 1: Load and clean data
#df = pd.read_csv('/mnt/data/news_articles.csv')
df = df.dropna(subset=['text', 'label'])

# Step 2: Encode labels
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label_encoded'],
    test_size=0.2,
    stratify=df['label_encoded'],
    random_state=42
)

# Step 4: TF-IDF vectorization
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Step 5: Define base models
nb_model = MultinomialNB()
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Step 6: Ensemble with Voting Classifier (soft voting)
ensemble_model = VotingClassifier(
    estimators=[
        ('naive_bayes', nb_model),
        ('xgboost', xgb_model)
    ],
    voting='soft'  # soft voting uses predict_proba
)

# Step 7: Train ensemble
ensemble_model.fit(X_train_tfidf, y_train)

# Step 8: Predict & evaluate
y_pred = ensemble_model.predict(X_test_tfidf)

# Step 9: Output
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")


Parameters: { "use_label_encoder" } are not used.



Classification Report:
               precision    recall  f1-score   support

        Fake       0.75      0.95      0.83       259
        Real       0.83      0.45      0.58       151

    accuracy                           0.76       410
   macro avg       0.79      0.70      0.71       410
weighted avg       0.78      0.76      0.74       410

Accuracy: 0.7634


**USING LSTM AND CUSTOM EMBEDDINGS FOR SEQUENTIAL DATA**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Step 1: Load and clean data
#df = pd.read_csv('/mnt/data/news_articles.csv')
df = df.dropna(subset=['text', 'label'])

# Step 2: Label encoding
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label_encoded'], test_size=0.2,
    stratify=df['label_encoded'], random_state=42
)

# Step 4: Tokenization and padding
max_words = 10000
max_len = 300

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Step 5: Build model with trainable embeddings
model = Sequential([
    Embedding(input_dim=max_words, output_dim=100, input_length=max_len),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 6: Train
model.fit(X_train_pad, y_train, batch_size=32, epochs=5, validation_split=0.1)

# Step 7: Evaluate
y_pred_prob = model.predict(X_test_pad)
y_pred = (y_pred_prob > 0.5).astype("int32")

print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label_encoded'] = label_encoder.fit_transform(df['label'])


Epoch 1/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 1s/step - accuracy: 0.6099 - loss: 0.6702 - val_accuracy: 0.7012 - val_loss: 0.5886
Epoch 2/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 1s/step - accuracy: 0.7654 - loss: 0.4834 - val_accuracy: 0.6707 - val_loss: 0.6423
Epoch 3/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 933ms/step - accuracy: 0.9267 - loss: 0.2213 - val_accuracy: 0.6951 - val_loss: 0.8183
Epoch 4/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 932ms/step - accuracy: 0.9673 - loss: 0.1060 - val_accuracy: 0.6951 - val_loss: 0.9000
Epoch 5/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 941ms/step - accuracy: 0.9845 - loss: 0.0563 - val_accuracy: 0.7134 - val_loss: 0.8666
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 182ms/step
Classification Report:
               precision    recall  f1-score   support

        Fake       0.78      0.79      0.7