In [1]:
import os
import random

json_dir = r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\data\Article-Bias-Prediction\data\jsons"
max_files = 10_000

# Get all .json file paths
json_files = [f for f in os.listdir(json_dir) if f.endswith('.json')]
total_files = len(json_files)

print(f"📦 Found {total_files} JSON files.")

# Only delete if over the limit
if total_files > max_files:
    files_to_delete = random.sample(json_files, total_files - max_files)

    for filename in files_to_delete:
        file_path = os.path.join(json_dir, filename)
        os.remove(file_path)

    print(f"Deleted {len(files_to_delete)} files to limit the dataset to {max_files} files.")
else:
    print("No need to delete files. Dataset is within the 10,000 file limit.")


📦 Found 10000 JSON files.
No need to delete files. Dataset is within the 10,000 file limit.


In [2]:
import json
import pandas as pd
import os

json_dir = r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\data\Article-Bias-Prediction\data\jsons"

# Map textual bias to numeric labels
bias_map = {
    "left": 0,
    "center": 1,
    "right": 2
}

data = []

# Loop through all JSON files
for filename in os.listdir(json_dir):
    if filename.endswith(".json"):
        with open(os.path.join(json_dir, filename), "r", encoding="utf-8") as f:
            article = json.load(f)
            content = article.get("content", "").strip()
            bias_text = article.get("bias_text", "").lower().strip()

            if content and bias_text in bias_map:
                data.append({
                    "text": content,
                    "label": bias_map[bias_text]
                })

# Create and save DataFrame
df = pd.DataFrame(data)
output_path = r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\article_bias_clean_numeric.csv"
df.to_csv(output_path, index=False)

print(f"✅ Saved {len(df)} cleaned articles with numeric labels to:\n{output_path}")
print(df['label'].value_counts().sort_index())


✅ Saved 10000 cleaned articles with numeric labels to:
C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\article_bias_clean_numeric.csv
label
0    3436
1    2928
2    3636
Name: count, dtype: int64


In [3]:
print(df.head(3))
print(df.columns)

                                                text  label
0  Besides his most recent trip to Quetta , Mr. R...      0
1  Wall Street Journal economics expert Stephen M...      2
2  The left believes Sanders ’ s chances have imp...      1
Index(['text', 'label'], dtype='object')


In [4]:
import pandas as pd
import re

# Load your CSV
data = pd.read_csv(r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\data\Article-Bias-Prediction\political_LNR_final_data.csv")

# Show basic info
print("Before cleaning:")
print(data.info())
print(data["label"].value_counts())

# Drop rows with missing text or labels
data = data.dropna(subset=["text", "label"])

# Drop duplicates
data = data.drop_duplicates(subset=["text"])

# Ensure labels are in {0, 1, 2}
data = data[data["label"].isin([0, 1, 2])]

# Optional: Light text cleaning
def clean_text(text):
    text = str(text)
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^A-Za-z0-9.,!?\"\'\s]", "", text)  # Keep common chars
    return text.strip()

data["text"] = data["text"].apply(clean_text)

# Remove very short texts (less than 5 words)
data["text_length"] = data["text"].apply(lambda x: len(x.split()))
data = data[data["text_length"] >= 5]

# Drop the text_length column (not needed for training)
data = data.drop(columns=["text_length"])

# Reset index
data = data.reset_index(drop=True)

# Final status
print("\nAfter cleaning:")
print(data.info())
print(data["label"].value_counts())

# Optional: Save cleaned version
data.to_csv(r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\article_bias_clean_ready.csv", index=False)


Before cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    10000 non-null  object
 1   label   10000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 156.4+ KB
None
label
2    3636
0    3436
1    2928
Name: count, dtype: int64

After cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    10000 non-null  object
 1   label   10000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 156.4+ KB
None
label
2    3636
0    3436
1    2928
Name: count, dtype: int64


In [2]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

data = pd.read_csv(r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\data\Article-Bias-Prediction\political_bias_LNR_data.csv")  


dataset = Dataset.from_pandas(data)


split_dataset = dataset.train_test_split(test_size=0.2)

# tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

#  tokenization
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

# Apply the tokenization across the dataset
tokenized_datasets = split_dataset.map(tokenize_function, batched=True)

# Load pre-trained  model for multi-class classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

# arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results_bias_binary",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",     
    logging_steps=10,                
    report_to="none",  
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True
)


# trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9007,0.833056,0.627
2,0.7415,0.734979,0.688
3,0.5166,0.707573,0.713
4,0.5685,0.722623,0.7055
5,0.4742,0.708337,0.717


Evaluation Results: {'eval_loss': 0.7075727581977844, 'eval_accuracy': 0.713, 'eval_runtime': 13.0853, 'eval_samples_per_second': 152.843, 'eval_steps_per_second': 9.553, 'epoch': 5.0}


In [None]:
model.save_pretrained("./political_bias_LNR_model")
tokenizer.save_pretrained("./political_bias_LNR_tokenizer")

('./my_political_bias_LNR_tokenizer\\tokenizer_config.json',
 './my_political_bias_LNR_tokenizer\\special_tokens_map.json',
 './my_political_bias_LNR_tokenizer\\vocab.txt',
 './my_political_bias_LNR_tokenizer\\added_tokens.json',
 './my_political_bias_LNR_tokenizer\\tokenizer.json')