In [1]:
import os
import json
import pandas as pd

root_dir = r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\data\BASIL\annotations"
data = []

# Loop through all year folders (2010-2019)
for folder in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder)
    if not os.path.isdir(folder_path):
        continue

    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            json_path = os.path.join(folder_path, filename)
            with open(json_path, "r", encoding="utf-8") as f:
                article = json.load(f)

                # Process phrase-level annotations
                for phrase in article.get("phrase-level-annotations", []):
                    text = phrase.get("txt", "").strip()
                    bias = phrase.get("bias", "").lower()
                    
                    # Binary label: 1 = biased, 0 = neutral
                    label = 0 if bias in ["", "none"] else 1

                    if text:
                        data.append({"text": text, "label": label})

# Convert to DataFrame and save
df = pd.DataFrame(data)
df = df.drop_duplicates()
df.to_csv("basil_phrase_level_binary.csv", index=False)
print(f"✅ Extracted {len(df)} labeled phrases and saved to basil_phrase_level_binary.csv")


✅ Extracted 1643 labeled phrases and saved to basil_phrase_level_binary.csv


In [4]:
import os
import json
import pandas as pd

article_dir = r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\data\BASIL\articles"
annotation_dir = r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\data\BASIL\annotations"

neutral_data = []
max_neutral = 1600

# Walk through year folders
for year in os.listdir(article_dir):
    year_article_path = os.path.join(article_dir, year)
    year_annotation_path = os.path.join(annotation_dir, year)

    if not os.path.isdir(year_article_path):
        continue

    for file in os.listdir(year_article_path):
        if not file.endswith(".json"):
            continue

        article_path = os.path.join(year_article_path, file)
        annotation_path = os.path.join(year_annotation_path, file)

        # Load article text
        with open(article_path, "r", encoding="utf-8") as f:
            article = json.load(f)

        # Flatten lines from all paragraphs
        lines = [line.strip() for para in article.get("body-paragraphs", []) for line in para if line.strip()]

        # Load annotation and collect all biased lines
        annotated_lines = set()
        if os.path.exists(annotation_path):
            with open(annotation_path, "r", encoding="utf-8") as f:
                annotations = json.load(f)
                annotated_lines = set(p.get("txt", "").strip() for p in annotations.get("phrase-level-annotations", []))

        # Add non-annotated lines
        for line in lines:
            if len(neutral_data) >= max_neutral:
                break
            if line not in annotated_lines:
                neutral_data.append({"text": line, "label": 0})

        if len(neutral_data) >= max_neutral:
            break

# Save to CSV
df_neutral = pd.DataFrame(neutral_data).drop_duplicates().head(max_neutral)
df_neutral.to_csv("basil_neutral_lines_1600.csv", index=False)
print(f"✅ Saved {len(df_neutral)} unbiased lines to basil_neutral_lines_1600.csv")


✅ Saved 1589 unbiased lines to basil_neutral_lines_1600.csv


In [5]:
import pandas as pd

# Paths to your files
biased_path = r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\basil_phrase_level_binary.csv"
neutral_path = r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\basil_neutral_lines_1600.csv"

# Load both datasets
df_biased = pd.read_csv(biased_path)
df_neutral = pd.read_csv(neutral_path)

# Optional: keep columns consistent
df_biased = df_biased[['text', 'label']]
df_neutral = df_neutral[['text', 'label']]

# Combine and shuffle
df_combined = pd.concat([df_biased, df_neutral], ignore_index=True).drop_duplicates()
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Save combined dataset
output_path = r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\basil_combined_dataset.csv"
df_combined.to_csv(output_path, index=False)

print(f"✅ Combined dataset saved to:\n{output_path}")
print(f"Total rows: {len(df_combined)} | Biased: {df_combined['label'].sum()} | Neutral: {(df_combined['label'] == 0).sum()}")

✅ Combined dataset saved to:
C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\basil_combined_dataset.csv
Total rows: 3232 | Biased: 1643 | Neutral: 1589


In [2]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load your dataset
data = pd.read_csv(r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\data\BASIL\political_final_data.csv")

# Drop any rows with missing values
data = data.dropna(subset=["text", "label"])

# Ensure labels are int
data["label"] = data["label"].astype(int)

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(data)

# Train/test split
split_dataset = dataset.train_test_split(test_size=0.2)

# Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = split_dataset.map(tokenize_function, batched=True)

# Load DistilBERT for binary classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Define metrics (accuracy + precision/recall/f1)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./results_bias_binary",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True
)


# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate
results = trainer.evaluate()
print("Evaluation Results:", results)


Map:   0%|          | 0/2585 [00:00<?, ? examples/s]

Map:   0%|          | 0/647 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.356366,0.834621,0.813264,0.906615,0.737342
2,No log,0.342302,0.850077,0.831304,0.92278,0.756329
3,No log,0.354399,0.854714,0.840136,0.908088,0.781646
4,No log,0.36343,0.853168,0.848,0.857605,0.838608
5,No log,0.371555,0.853168,0.842454,0.885017,0.803797


Evaluation Results: {'eval_loss': 0.34230175614356995, 'eval_accuracy': 0.8500772797527048, 'eval_f1': 0.831304347826087, 'eval_precision': 0.9227799227799228, 'eval_recall': 0.7563291139240507, 'eval_runtime': 2.2183, 'eval_samples_per_second': 291.662, 'eval_steps_per_second': 9.467, 'epoch': 5.0}


In [20]:
model.save_pretrained("./political_bias_model")
tokenizer.save_pretrained("./political_bias_tokenizer")

('./political_bias_tokenizer\\tokenizer_config.json',
 './political_bias_tokenizer\\special_tokens_map.json',
 './political_bias_tokenizer\\vocab.txt',
 './political_bias_tokenizer\\added_tokens.json',
 './political_bias_tokenizer\\tokenizer.json')