<a href="https://colab.research.google.com/github/Rstam59/ds-portfolio/blob/main/Real%26Fake_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
# Opens the Colab upload dialog; the following cell expects a file named kaggle.json
# to be present in the working directory afterwards.
files.upload()  # Upload kaggle.json


In [None]:
# Make a directory for kaggle and move the file
# NOTE(review): `mv` fails on a second run once kaggle.json has already been moved.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

# Install Kaggle CLI tool
!pip install -q kaggle


In [None]:
# Download the dataset using Kaggle CLI
!kaggle datasets download -d razanaqvi14/real-and-fake-news

# Unzip the dataset
!unzip real-and-fake-news.zip


In [None]:
import pandas as pd

# Read both halves of the dataset and tag each with its class: 1 = real, 0 = fake.
real_df = pd.read_csv("True.csv").assign(label=1)
fake_df = pd.read_csv("Fake.csv").assign(label=0)

# Stack the two frames, then shuffle rows reproducibly and renumber the index.
df = (
    pd.concat([real_df, fake_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many rows carry each label.
ax = sns.countplot(data=df, x='label')
ax.set_title("Label Distribution (0: Fake, 1: Real)")
plt.show()

# Same information as class proportions.
label_share = df['label'].value_counts(normalize=True)
print(label_share)

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Whitespace-delimited word counts for the article body and the headline.
for source_col, length_col in (('text', 'text_len'), ('title', 'title_len')):
    df[length_col] = df[source_col].apply(lambda value: len(value.split()))

# One histogram (with KDE overlay) per length column.
for length_col, heading in (
    ('text_len', "Text Length Distribution"),
    ('title_len', "Title Length Distribution"),
):
    plt.figure(figsize=(12, 5))
    sns.histplot(df[length_col], bins=50, kde=True)
    plt.title(heading)
    plt.xlabel("Word Count")
    plt.show()

In [None]:
# Distinct subject values and how often each occurs.
subjects = df['subject']
print("Unique subjects:", subjects.unique())
print(subjects.value_counts())


In [None]:
from wordcloud import WordCloud

# Join all fake and real text
fake_text = " ".join(df[df['label']==0]['text'].astype(str).tolist())
real_text = " ".join(df[df['label']==1]['text'].astype(str).tolist())

plt.figure(figsize=(15, 5))

# Build each cloud once and draw that same object, so the configured options
# (max_words, white background) are the ones actually rendered.
for position, (heading, corpus) in enumerate(
    (("Fake News", fake_text), ("Real News", real_text)), start=1
):
    cloud = WordCloud(max_words=200, background_color="white").generate(corpus)
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(heading)
    plt.axis("off")

plt.show()


# Vectorization

In [None]:
# Row count of the combined frame, before de-duplication on the cleaned text.
len(df)

In [None]:
import re
from sklearn.model_selection import train_test_split


# Concatenate headline and body (space-separated) into a single model input field.
df['content'] = df['title'] + ' ' + df['text']


def clean_text(text):
    """Normalise raw text to lowercase letters separated by single spaces.

    The input is coerced with ``str()``, lowercased, and then passed through
    three substitutions in order: drop ``<...>`` tag-like spans, drop every
    character that is not ``a-z`` or whitespace, and collapse whitespace runs
    to one space. Leading/trailing whitespace is stripped from the result.
    """
    cleaned = str(text).lower()
    substitutions = (
        (r'<.*?>', ''),       # tag-like spans
        (r'[^a-z\s]', ''),    # digits, punctuation, non-ASCII letters
        (r'\s+', ' '),        # whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


# Normalise the combined text, then keep one row per distinct cleaned string.
df['clean_content'] = df['content'].apply(clean_text)
df = df.drop_duplicates(subset='clean_content')

# 80/20 split, stratified on the label so both parts keep the class balance.
features, targets = df['clean_content'], df['label']
X_train, X_val, y_train, y_val = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)

In [None]:
from tensorflow.keras.layers import TextVectorization

MAX_VOCAB_SIZE = 20000
MAX_SEQUENCE_LENGTH = 1200

# Integer token ids, fixed output length, vocabulary capped at MAX_VOCAB_SIZE.
vectorizer_options = dict(
    max_tokens=MAX_VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH,
)
vectorizer = TextVectorization(**vectorizer_options)

# The vocabulary is learned from the training texts only.
vectorizer.adapt(X_train.values)


In [None]:
import tensorflow as tf

BATCH_SIZE = 32
AUTOTUNE = tf.data.AUTOTUNE

# Training pipeline: (text, label) pairs -> shuffle -> batch -> prefetch.
train_ds = (
    tf.data.Dataset.from_tensor_slices((X_train.values, y_train.values))
    .shuffle(1024)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

# Validation pipeline: same, but without shuffling.
val_ds = (
    tf.data.Dataset.from_tensor_slices((X_val.values, y_val.values))
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Input

# Constants
EMBEDDING_DIM = 128

# Raw strings in -> vectorizer -> embedding -> BiLSTM -> dense head -> sigmoid.
layer_stack = [
    Input(shape=(1,), dtype=tf.string),
    vectorizer,
    Embedding(MAX_VOCAB_SIZE, EMBEDDING_DIM),
    Bidirectional(LSTM(64)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
]
model = Sequential(layer_stack)

# Binary classification setup.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the layer/parameter overview.
model.summary()


In [None]:
EPOCHS = 5

# Train on the batched training set, validating after every epoch.
fit_options = dict(validation_data=val_ds, epochs=EPOCHS)
history = model.fit(train_ds, **fit_options)


In [None]:
# evaluate() yields the loss followed by the compiled metric (accuracy).
evaluation = model.evaluate(val_ds)
loss, acc = evaluation
print(f"\nValidation Accuracy: {acc:.4f}")
print(f"Validation Loss: {loss:.4f}")


In [None]:
import matplotlib.pyplot as plt

def plot_history(history):
    """Plot train vs. validation accuracy and loss side by side.

    ``history`` must expose a ``history`` mapping containing the keys
    ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss``.
    """
    panels = (
        ('Accuracy', 'accuracy', 'Train Acc', 'val_accuracy', 'Val Acc'),
        ('Loss', 'loss', 'Train Loss', 'val_loss', 'Val Loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, (heading, train_key, train_label, val_key, val_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[train_key], label=train_label)
        plt.plot(history.history[val_key], label=val_label)
        plt.title(heading)
        plt.legend()

    plt.show()

plot_history(history)


In [None]:
# Model outputs for every validation batch; val_ds is batched without shuffling,
# so the rows stay in the same order as y_val.
preds = model.predict(val_ds)
# Threshold at 0.5: 1 = Real, 0 = Fake.
y_pred = (preds > 0.5).astype(int)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Rows are actual classes, columns are predicted classes (0 = Fake, 1 = Real).
class_names = ["Fake", "Real"]
cm = confusion_matrix(y_val, y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


In [None]:
def predict_sample(text):
    """Classify one raw string with the current ``model`` and print the verdict.

    The text is cleaned with ``clean_text``, wrapped in a one-element string
    tensor, and scored; a score above 0.5 is reported as "Real", otherwise
    "Fake". Nothing is returned.
    """
    batch = tf.constant([clean_text(text)])
    pred = model.predict(batch)[0][0]
    if pred > 0.5:
        label = "Real"
    else:
        label = "Fake"
    print(f"Prediction: {label} ({pred:.2f})")


# Short hand-written probes.
predict_sample("President signs a new healthcare bill into law.")
predict_sample("BREAKING: Aliens seen entering the White House.")
predict_sample("The quick brown fox jumps over the lazy dog.")

# A full-length, already-cleaned article. It is written as adjacent string
# literals inside parentheses so the long text can span several source lines;
# a plain single-quoted literal cannot contain a line break.
predict_sample(
    'state dept subpoenaed documents from clinton foundation report washington reuters us state '
    'department investigators last year issued a subpoena to the bill hillary and chelsea clinton '
    'foundation seeking documents about projects run by the charity that may have required us '
    'government approval when hillary clinton was secretary of state the washington post reported '
    'on thursday a us official said the matter was being investigated by the inspector general the '
    'state departments internal watchdog citing unnamed sources for the report the post said the '
    'subpoena issued in the fall also asked for records related to senior clinton aide huma abedin '
    'who for six months in simultaneously worked for several employers including the state '
    'department the foundation and clintons personal office the report follows a reuters '
    'investigation last year that found the clinton foundations flagship health project did not '
    'submit new or increased payments from at least seven foreign governments to the state '
    'department for review in breach of the ethics agreement clinton signed with the incoming '
    'obama administration in order to become secretary of state clinton who is running for the '
    'democratic nomination in the nov presidential election has been criticized for using a '
    'private email account hosted on a private computer while secretary of state from to a matter '
    'the fbi is investigating spokesmen for clintons campaign and the clinton foundation and a '
    'lawyer for abedin did not immediately respond to reuters requests for comment a spokesman for '
    'the inspector general also declined to comment the post quoted an unnamed foundation '
    'representative as saying the initial document request had been narrowed by investigators and '
    'that the foundation was not the focus of the probe it said there was no indication that the '
    'investigators were looking at clinton the full scope and status of the inquiry conducted by '
    'the state departments inspector general were not clear from the material '
    'correspondence reviewed by the washington post the paper said sources familiar with '
    'investigations into the controversy surrounding clintons private email server said they had '
    'no reason to believe any government agency was conducting any kind of inquiry into possible '
    'criminal violations related to the former secretary of state'
)

In [None]:
# Show the first validation text (the cleaned content string).
X_val.iloc[0]

In [None]:
# Label of that same validation row (1 = real, 0 = fake).
y_val.iloc[0]

In [None]:
from transformers import AutoTokenizer

# Checkpoint name used for this tokenizer and for the classifier loaded further down.
model_ckpt = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# NOTE(review): transformers has already been imported by the previous cell, so this
# upgrade presumably only takes effect after a runtime restart — confirm, or move it
# ahead of the first import.
!pip install --upgrade transformers datasets

In [None]:
from datasets import Dataset

def _as_hf_dataset(texts, labels):
    """Pair texts with labels in a two-column frame and wrap it as a Dataset."""
    return Dataset.from_pandas(pd.DataFrame({'text': texts, 'label': labels}))


# These rebind train_ds / val_ds, which held the tf.data pipelines until now.
train_ds = _as_hf_dataset(X_train, y_train)
val_ds = _as_hf_dataset(X_val, y_val)

In [None]:
# Display the training Dataset's summary repr.
train_ds

In [None]:
def tokenize(example):
    """Tokenize a batch of examples into fixed-length model inputs.

    ``example`` is a mapping with a ``'text'`` entry (a list of strings when
    used with ``Dataset.map(..., batched=True)``). Sequences are truncated and
    padded to the tokenizer's maximum length.

    ``padding='max_length'`` is used rather than ``'longest'``: under a batched
    ``map`` the latter pads to the longest text of each map-batch, so rows from
    different map-batches can have different lengths and cannot be stacked by a
    collator that does no padding of its own.
    """
    return tokenizer(example['text'], truncation=True, padding='max_length')


# Tokenize both splits in batched mode (training split first).
train_ds, val_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, val_ds)
)

In [None]:
# Return these three columns as torch tensors when the datasets are indexed.
TORCH_COLUMNS = ['input_ids', 'attention_mask', 'label']
for split in (train_ds, val_ds):
    split.set_format('torch', columns=TORCH_COLUMNS)

In [None]:
# Drop the raw text and the index column carried over from the pandas frames.
DROPPED_COLUMNS = ['text', '__index_level_0__']
train_ds, val_ds = (
    split.remove_columns(DROPPED_COLUMNS) for split in (train_ds, val_ds)
)

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Two-class sequence classifier initialised from the shared checkpoint.
# Note: this rebinds `model`, which previously held the Keras network.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=2)


training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Mixed precision only when a CUDA device is present, so the cell also
    # runs on a CPU-only runtime (same guard as the pipeline device choice).
    fp16=torch.cuda.is_available(),
)


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    ``pred`` must expose ``label_ids`` (true labels) and ``predictions``
    (per-class scores); the predicted class is the argmax over the last axis.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    accuracy = accuracy_score(y_true, y_hat)
    weighted_f1 = f1_score(y_true, y_hat, average='weighted')
    return {'accuracy': accuracy, 'f1': weighted_f1}


# Wire model, arguments, data and metric callback into a Trainer, then fine-tune.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

trainer.train()

In [None]:
# Final evaluation on the validation split.
metrics = trainer.evaluate()
print(metrics)

# Save model and tokenizer side by side in one directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("fake-news-bert-model")


In [None]:
# LIME supplies the LimeTextExplainer used in the explanation cells that follow.
!pip install -q lime


In [None]:
from transformers import TextClassificationPipeline
from lime.lime_text import LimeTextExplainer
import torch

# Wrap the trained model into a pipeline
# Use the first GPU when CUDA is available, otherwise the CPU (-1).
device_index = 0 if torch.cuda.is_available() else -1

# Wrap the trained model into a pipeline; top_k=None requests scores for all labels.
pipeline = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    top_k=None,
    device=device_index,
)


In [None]:
def _class_probabilities(texts):
    """Score texts with the pipeline and return one row of scores per text.

    Within each row the scores are ordered by label name, giving the explainer
    a stable column order.
    """
    rows = []
    for label_scores in pipeline(texts):
        ordered = sorted(label_scores, key=lambda item: item['label'])
        rows.append([item['score'] for item in ordered])
    return np.array(rows)


explainer = LimeTextExplainer(class_names=["Fake", "Real"])

# Example input
text = "President Trump Said I love Unicorns and they are 100% Real!"

# Explain
exp = explainer.explain_instance(
    text_instance=text,
    classifier_fn=_class_probabilities,
    num_features=10,
)

# Show Explanation
exp.show_in_notebook(text=True)

In [None]:
from accelerate import Accelerator
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch
from tqdm.auto import tqdm


# Manual training loop with Accelerate: same data, batch size and learning rate
# as the Trainer run, operating on whatever `model` currently holds.
accelerator = Accelerator()
device = accelerator.device

train_loader = DataLoader(train_ds, shuffle = True, batch_size = 16)
eval_loader = DataLoader(val_ds, batch_size = 16)

optimizer = AdamW(model.parameters(), lr = 2e-5)

# Let Accelerate place the model, optimizer and loaders on the right device.
model, optimizer, train_loader, eval_loader = accelerator.prepare(model,
                                                                  optimizer, train_loader, eval_loader)

model.train()
for epoch in range(3):
    running_loss = 0.0
    num_batches = 0
    loop = tqdm(train_loader, desc = f"Epoch {epoch}")
    for batch in loop:
        optimizer.zero_grad()
        # The model expects the target under 'labels', the dataset column is 'label'.
        batch['labels'] = batch.pop('label')
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Report the mean over the epoch instead of the last mini-batch's loss,
    # which is a noisy stand-in for how the epoch went.
    print(f"Epoch {epoch} loss: {running_loss / max(num_batches, 1)}")

In [None]:
from sklearn.metrics import accuracy_score

# Switch to evaluation mode and collect predictions over the validation loader.
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in eval_loader:
        # Rename the target key to what the model's forward pass accepts.
        batch['labels'] = batch.pop('label')
        outputs = model(**batch)
        preds = outputs.logits.argmax(dim = -1).detach().cpu().numpy()
        labels = batch['labels'].detach().cpu().numpy()

        all_preds += list(preds)
        all_labels += list(labels)

acc = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {acc}")

In [None]:
# NOTE(review): nbstripout is not referenced by any visible cell (the cleanup cell
# uses nbformat) — confirm whether this install is still needed.
!pip install nbstripout


In [None]:
import nbformat

# Path to your current notebook — this is how Colab stores it
notebook_path = "/content/Real&Fake_news.ipynb"  # ⬅️ Change this to match your filename

# Load the notebook
with open(notebook_path, "r", encoding="utf-8") as f:
    nb = nbformat.read(f, as_version=nbformat.NO_CONVERT)

# OPTIONAL: keep outputs if you want them visible on GitHub
KEEP_OUTPUT = True

# Clean the notebook. Only code cells carry `outputs` / `execution_count`;
# writing those keys onto markdown or raw cells would make the file invalid.
if not KEEP_OUTPUT:
    for cell in nb.cells:
        if cell.get("cell_type") != "code":
            continue
        cell["outputs"] = []
        cell["execution_count"] = None

# Remove broken widget metadata
nb.metadata.pop("widgets", None)

# Save cleaned version
cleaned_path = "/content/cleaned_notebook.ipynb"
with open(cleaned_path, "w", encoding="utf-8") as f:
    nbformat.write(nb, f)

print(f"✅ Cleaned notebook saved to: {cleaned_path}")
