<a href="https://colab.research.google.com/github/Sandesh816/Deep-Learning-Project/blob/main/News_Bias_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**NEWS BIAS DETECTOR**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

**Load Data**

In [None]:
# Download the AllSides balanced-news CSV from the Qbias repository, save it
# locally, and load it into a DataFrame.
filename = "allsides_balanced_news_headlines-texts.csv"

import requests
url = 'https://raw.githubusercontent.com/irgroup/Qbias/refs/heads/main/allsides_balanced_news_headlines-texts.csv'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
res = requests.get(url, allow_redirects=True, timeout=60)
# Fail loudly on an HTTP error instead of silently saving an error page as the CSV.
res.raise_for_status()
with open(filename, 'wb') as file:
    file.write(res.content)

df = pd.read_csv(filename)
print("Shape:", df.shape)
print(df.head(10))
print(f"Columns: {df.columns}")

**Drop the 'Unnamed: 0' column and reset the index**

In [None]:
# Remove the leftover CSV index column and renumber the rows from 0.
# errors="ignore" plus reassignment (instead of inplace=True) keeps this cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore").reset_index(drop=True)
print(df.columns)

In [None]:
# Quick look at the first rows of the DataFrame
first_rows = df.head()
print(first_rows)

**Data Exploration**

In [None]:
# Build one DataFrame per bias label and report how many articles land in each
left_df = df.loc[df["bias_rating"].eq("left")]
right_df = df.loc[df["bias_rating"].eq("right")]
center_df = df.loc[df["bias_rating"].eq("center")]

for bucket_name, bucket_df in (("Left", left_df), ("Right", right_df), ("Center", center_df)):
    print(f"{bucket_name}: {bucket_df.shape}")

In [None]:
# Total whitespace-separated word count per bias bucket
def _total_words(frame):
    """Return the summed word count of the frame's "text" column (missing text counts as 0 words)."""
    words_per_article = frame["text"].fillna("").str.split().apply(len)
    return sum(words_per_article)


left_word_count = _total_words(left_df)
right_word_count = _total_words(right_df)
center_word_count = _total_words(center_df)

for side, word_total in (
    ("Left", left_word_count),
    ("Right", right_word_count),
    ("Center", center_word_count),
):
    print(f"{side} Leaning Articles Word Count:", word_total)

There is a discrepancy in the total word counts of the articles labeled left, right, and center.

In [None]:
# Mean words per article in each bias bucket, as a rough check on dataset quality
average_length_left = left_word_count / len(left_df)
average_length_right = right_word_count / len(right_df)
average_length_center = center_word_count / len(center_df)

for side, average_length in (
    ("left", average_length_left),
    ("right", average_length_right),
    ("center", average_length_center),
):
    print(f"Average length of {side}-leaning articles:", average_length)

In [None]:
# Explore tag counts across articles
def _count_tags(frame):
    """Return the total number of comma-separated tags in the frame's "tags" column.

    Blank pieces are not counted: "".split(",") yields [""], so without this
    filter every article with missing tags would be counted as having one tag.
    """
    tag_pieces = frame["tags"].fillna("").str.split(",")
    tags_per_article = tag_pieces.apply(
        lambda pieces: sum(1 for piece in pieces if piece.strip())
    )
    return sum(tags_per_article)


left_tags_count = _count_tags(left_df)
right_tags_count = _count_tags(right_df)
center_tags_count = _count_tags(center_df)

print("Left Leaning Articles Tags Count:", left_tags_count)
print("Right Leaning Articles Tags Count:", right_tags_count)
print("Center Leaning Articles Tags Count:", center_tags_count)

The tag counts are closer to one another across the three classes.

**Data Preparation**

In [None]:
# Lower-case every string cell in the DataFrame; non-string cells pass through unchanged
def _lowercase_if_text(value):
    """Return value.lower() for strings and the value itself for anything else."""
    if isinstance(value, str):
        return value.lower()
    return value


df = df.map(_lowercase_if_text)
print(df.head())

In [None]:
# NOTE: every line in this cell is commented out, so nothing here runs.
# It is a manual alternative split: shuffle the row positions with NumPy, take
# 20% for test and 10% for validation, and use the remainder for training.
# The later cells use scikit-learn's stratified train_test_split instead.
# Shuffle df and split into X and Y
# all_indices = np.arange(df.shape[0])
# np.random.shuffle(all_indices)

# test_size = int(0.2 * df.shape[0])
# validation_size = int(0.1 * df.shape[0])

# test_indices = all_indices[: test_size]
# validation_indices = all_indices[test_size: test_size + validation_size]
# train_indices = all_indices[test_size + validation_size: ]

# X_train = df.iloc[train_indices].drop(columns = ["bias_rating"]).reset_index(drop = True)
# X_validation = df.iloc[validation_indices].drop(columns = ["bias_rating"]).reset_index(drop = True)
# X_test = df.iloc[test_indices].drop(columns = ["bias_rating"]).reset_index(drop = True)

# y_train = df.iloc[train_indices]["bias_rating"].reset_index(drop = True)
# y_validation = df.iloc[validation_indices]["bias_rating"].reset_index(drop = True)
# y_test = df.iloc[test_indices]["bias_rating"].reset_index(drop = True)

# print(f"X_train shape: {X_train.shape}")
# print(f"X_validation shape: {X_validation.shape}")
# print(f"X_test shape: {X_test.shape}")
# print(f"y_train shape: {y_train.shape}")
# print(f"y_validation shape: {y_validation.shape}")
# print(f"y_test shape: {y_test.shape}")

# print(X_train.head())
# print(y_train.head())

In [None]:
# Separate the inputs (the three text columns) from the target (the bias label).
# No explicit shuffle is done here because train_test_split shuffles the rows.
feature_columns = ['title', 'heading', 'text']
X = df[feature_columns]
y = df["bias_rating"]
for preview in (X.head(), y.head()):
    print(preview)

In [None]:
# Stratified split: 70% train, then the remaining 30% is halved into test and
# validation, keeping the label ratio the same in every part.
from sklearn.model_selection import train_test_split

split_options = {"random_state": 42, "shuffle": True}
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, stratify=y, **split_options
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_holdout, y_holdout, test_size=0.5, stratify=y_holdout, **split_options
)

for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("X_valid", X_valid),
    ("y_train", y_train),
    ("y_test", y_test),
    ("y_valid", y_valid),
):
    print(f"{split_name} shape: {split_data.shape}")

print(X_train.head(2))
print(y_train.head(2))

Resetting indices

In [None]:
# Renumber every split from 0 so rows can be addressed by position-like labels
X_train, X_test, X_valid = [
    frame.reset_index(drop=True) for frame in (X_train, X_test, X_valid)
]
y_train, y_test, y_valid = [
    series.reset_index(drop=True) for series in (y_train, y_test, y_valid)
]

print(X_train.head(2))
print(y_train.head(2))



**Milestone 2**



In [None]:
import tensorflow as tf
import huggingface_hub
from transformers import AutoTokenizer, TFBertModel

In [None]:
# Baseline: load the pretrained uncased BERT-base model and its matching tokenizer
model_name = "bert-base-uncased"
model = TFBertModel.from_pretrained(pretrained_model_name_or_path=model_name)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [None]:
# Tokenize the first training article's text as a sanity check.
# .iloc[0] selects by position, so this works whether or not the index has
# been reset; str() matches the casts used elsewhere in this notebook and
# keeps the tokenizer from failing on a missing (NaN) text.
first_text = str(X_train["text"].iloc[0])
test_tokens = tokenizer.tokenize(first_text)
test_token_ids = tokenizer.convert_tokens_to_ids(test_tokens)
print(test_tokens)
print(test_token_ids)

In [None]:
# Total tokenizer token count per bias bucket, skipping articles with missing text
def _count_tokens(frame):
    """Return the number of tokens the tokenizer produces over the frame's non-missing "text" values."""
    total = 0
    for text in frame["text"]:
        if pd.notna(text):
            total += len(tokenizer.tokenize(str(text)))
    return total


left_token_count = _count_tokens(left_df)
right_token_count = _count_tokens(right_df)
center_token_count = _count_tokens(center_df)

for side, token_total in (
    ("Left", left_token_count),
    ("Right", right_token_count),
    ("Center", center_token_count),
):
    print(f"{side}-leaning articles token count:", token_total)

**Trying BERT Baseline Model**

We will use the process described by Keras to train on a portion of our data: https://keras.io/keras_hub/api/models/bert/bert_text_classifier/

In [None]:
import keras_hub
import tensorflow as tf
from tensorflow import keras
import numpy as np

# Only a small slice of the training split is used for this baseline run.
TRAIN_SUBSET_SIZE = 1000
BERT_BATCH_SIZE = 2

# Grab our train data: the article text of the first TRAIN_SUBSET_SIZE rows, as plain strings
features = X_train["text"].iloc[:TRAIN_SUBSET_SIZE].astype(str).tolist()

# Grab the matching train labels and map string labels to numerical labels
label_mapping = {'left': 0, 'center': 1, 'right': 2}
labels = np.array(
    [label_mapping[label] for label in y_train.iloc[:TRAIN_SUBSET_SIZE]]
)

# Pretrained classifier with one output per bias class.
classifier = keras_hub.models.BertTextClassifier.from_preset(
    "bert_base_en_uncased",
    num_classes=len(label_mapping),
)
# First pass: train with the compile settings that come with the preset.
classifier.fit(x=features, y=labels, batch_size=BERT_BATCH_SIZE)

# Freeze the backbone BEFORE re-compiling so the new `trainable` value is the
# one the compiled model uses.
classifier.backbone.trainable = False
# Re-compile (e.g., with a new learning rate). The accuracy metric is listed
# explicitly: compile() replaces the previous metrics, and without any metric
# evaluate() returns only the loss.
classifier.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer="adam",
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
    jit_compile=True,
)
# Fit again; only the layers outside the frozen backbone are updated.
classifier.fit(x=features, y=labels, batch_size=BERT_BATCH_SIZE)


BERT Baseline Model Evaluation

In [None]:
from sklearn.metrics import classification_report

# Prepare data for evaluation
X_test_list = X_test["text"].astype(str).tolist()  # test texts as a list of strings
y_test_mapped = np.array([label_mapping[label] for label in y_test])  # string labels -> class ids

# Create a tf.data.Dataset for evaluation (not shuffled, so order matches y_test_mapped)
eval_dataset = tf.data.Dataset.from_tensor_slices((X_test_list, y_test_mapped)).batch(2)

# Evaluate the model. return_dict=True always yields a dict with a "loss" key,
# whereas the default return is a bare scalar when no metrics are compiled,
# which would break `loss, accuracy = ...` unpacking.
eval_results = classifier.evaluate(eval_dataset, return_dict=True)
loss = eval_results["loss"]

# Get predictions and turn the per-class scores into predicted class ids
preds = classifier.predict(eval_dataset)
y_pred = np.argmax(preds, axis=1)

# True labels: the dataset preserves order, so the mapped array can be used directly
y_true = y_test_mapped

# Accuracy is computed from the predictions so it does not depend on which
# metrics the classifier was compiled with.
accuracy = float(np.mean(y_pred == y_true))
print(f"Loss: {loss}, Accuracy: {accuracy}")

# Derive the report's class names from label_mapping so names and ids cannot drift apart;
# passing `labels` keeps the report valid even if a class is absent from this split.
class_names = sorted(label_mapping, key=label_mapping.get)
class_ids = [label_mapping[name] for name in class_names]
print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))

**Trying an LSTM custom model**

We also wanted to experiment with a custom LSTM model that we built and trained on our data.

In [None]:
from sklearn.model_selection import train_test_split
import keras_hub
import tensorflow as tf
from tensorflow import keras
import numpy as np
from collections import Counter
import huggingface_hub
from transformers import AutoTokenizer, TFBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
# Prepare the LSTM inputs: token ids padded to a fixed length, plus integer labels.
# NOTE: this cell rebinds X, y, X_train and y_train, which earlier cells bound to
# pandas objects; from here on they are NumPy arrays / an integer Series.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

label_map = {"left": 0, "right": 1, "center": 2}
# Join title, heading and body with spaces; without a separator the last word of
# one field is fused with the first word of the next.
X_text = (
    df["title"].fillna("") + " " + df["heading"].fillna("") + " " + df["text"].fillna("")
).to_list()  # now, we get a list of strings as X_text
y = df["bias_rating"].str.lower().map(label_map)

max_length = 600
X = [
    tokenizer.encode(x, max_length=max_length, truncation=True, add_special_tokens=True)
    for x in X_text
]

# Padding to a common length lets us send batches as a single tensor
X_pad = pad_sequences(X, maxlen=max_length, padding='post', truncating='post').astype("int32")
y_np = y.to_numpy(dtype="int32")

# Fixed seed (the same value the earlier split uses) so the split is reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X_pad, y_np, test_size=0.2, stratify=y_np, random_state=42
)

In [None]:
# Wrap the train and validation arrays in batched, prefetching tf.data pipelines
BATCH = 32

# Training data is shuffled (buffer of 10,000 examples) before batching
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_ds = train_ds.shuffle(10_000)
train_ds = train_ds.batch(BATCH)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

# Validation data keeps its original order
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val))
val_ds = val_ds.batch(BATCH)
val_ds = val_ds.prefetch(tf.data.AUTOTUNE)

In [None]:
# building our bidirectional LSTM model (embedding -> dropout -> biD -> dropout -> biD -> Dense)
vocab_size = tokenizer.vocab_size
num_classes = 3

model = Sequential()
# mask_zero=True marks id 0 as padding (pad_sequences pads with 0 by default) so
# the LSTMs skip the padded tail of each sequence instead of processing up to
# 600 positions of padding after the real text.
model.add(Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True))
model.add(tf.keras.layers.Dropout(0.3))
# return_sequences=True hands the full sequence to the second recurrent layer
model.add(tf.keras.layers.Bidirectional(LSTM(64, return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Bidirectional(LSTM(64)))
# One softmax probability per bias class
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Stop once the monitored validation metric has not improved for 3 epochs and keep the best weights
es = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=[es]
)

Above are the results of our initial test using an LSTM. They suggest that the model is indeed training; however, the validation accuracy is not changing, which indicates that something is off. The model may also need to be trained for many more epochs to achieve better accuracy.

**Evaluate on the Validation Set**

In [None]:
# Get predictions for the held-out validation split (X_val / y_val)
pred_probs = model.predict(X_val, batch_size=32)

# Predicted class id = index of the highest probability
y_pred = np.argmax(pred_probs, axis=1)
y_true = y_val

# Class names must follow the ids in label_map (left=0, right=1, center=2).
# A hard-coded ['left', 'center', 'right'] would swap the center and right rows
# of the report, so the names are derived from label_map itself.
class_names = sorted(label_map, key=label_map.get)
class_ids = [label_map[name] for name in class_names]
print(classification_report(y_true, y_pred,
                            labels=class_ids,
                            target_names=class_names))

**Create requirements.txt**

In [None]:
# Write the installed packages and their versions (the output of `pip freeze`) to requirements.txt
!pip freeze > requirements.txt