In [14]:
import pandas as pd

# Load the full dataset
file_path = r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\data\GenderStance\GenderStance_cleaned.csv"
df = pd.read_csv(file_path)

# Filter for first 1000 male and first 1000 female entries
male_subset = df[df['Gender'] == 'male'].head(1000)
female_subset = df[df['Gender'] == 'female'].head(1000)

# Combine them
balanced_df = pd.concat([male_subset, female_subset], ignore_index=True)

# Save to new CSV
output_path = r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\data\GenderStance\GenderStance_balanced_sample.csv"
balanced_df.to_csv(output_path, index=False)

print(f"Saved balanced sample to:\n{output_path}")

Saved balanced sample to:
C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\data\GenderStance\GenderStance_balanced_sample.csv


In [3]:
import pandas as pd
import os
import time
from openai import OpenAI

# Initialize OpenAI client
client = OpenAI()

# Load your dataset
file_path = r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\data\GenderStance\GenderStance_balanced_sample.csv"
df = pd.read_csv(file_path)

# Prompt builder
def create_prompt(text):
    return f"""You're a gender bias detection assistant. Analyze the following sentence for potential gender bias, ANY AMOUNT OF BIAS SHOULD CAUSE YOU TO SAY BIASED.

Sentence: "{text}"

Respond with exactly one word: "Biased" or "Not Biased"."""

# Run GPT-4o to label bias
def detect_bias(text):
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": create_prompt(text)
            }],
            temperature=0
        )
        reply = response.choices[0].message.content.strip()
        print(f"GPT response: {reply}")
        return reply
    except Exception as e:
        print(f"Error on text: {text[:60]}... → {e}")
        return "Error"

# Loop through entire DataFrame
results = []
for i, row in df.iterrows():
    print(f"\nProcessing {i+1}/{len(df)} → {row['Text'][:60]}...")
    result = detect_bias(row["Text"])
    results.append(result)
    time.sleep(1.2)  # ~50 requests per minute (safe for gpt-4o)

# Add results to DataFrame
df["GPT_Bias_Label"] = results

# Save to output CSV
output_path = r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\data\GenderStance\GenderStance_gpt_labeled.csv"
df.to_csv(output_path, index=False)

print(f"\n✅ All done! Saved GPT-labeled file to:\n{output_path}")



Processing 1/2000 → my wife believes that the involvement of religion in politic...
GPT response: Not Biased

Processing 2/2000 → my mother believes that the involvement of religion in polit...
GPT response: Not Biased

Processing 3/2000 → my mom believes that the involvement of religion in politics...
GPT response: Not Biased

Processing 4/2000 → my wife believes that coercive population control methods en...
GPT response: Not Biased

Processing 5/2000 → my mother believes that coercive population control methods ...
GPT response: Not Biased

Processing 6/2000 → my mom believes that coercive population control methods ena...
GPT response: Not Biased

Processing 7/2000 → my wife believes that punishing news organizations for perce...
GPT response: Not Biased

Processing 8/2000 → my mother believes that punishing news organizations for per...
GPT response: Not Biased

Processing 9/2000 → my mom believes that punishing news organizations for percei...
GPT response: Not Biased

Processin

In [4]:
import pandas as pd

input_path = 'C:/Users/shrit/Desktop/Ml_Projects/DeepRead/DeepRead/data/GenderStance/GenderStance_gpt_labeled.csv'

output_path = 'C:/Users/shrit/Desktop/Ml_Projects/DeepRead/DeepRead/data/GenderStance/gender_final_data.csv'

df = pd.read_csv(input_path)

filtered_df = df[['Text', 'GPT_Bias_Label']]

filtered_df['GPT_Bias_Label'] = filtered_df['GPT_Bias_Label'].apply(lambda x: 1 if x.strip().lower() == 'biased' else 0)

filtered_df.to_csv(output_path, index=False)

print("Filtered and encoded CSV saved successfully!")

Filtered and encoded CSV saved successfully!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['GPT_Bias_Label'] = filtered_df['GPT_Bias_Label'].apply(lambda x: 1 if x.strip().lower() == 'biased' else 0)


In [5]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load your gender bias dataset
df = pd.read_csv(r"C:\Users\shrit\Desktop\Ml_Projects\DeepRead\DeepRead\data\GenderStance\gender_final_data.csv")

# Clean data
df = df.dropna(subset=["Text", "GPT_Bias_Label"])
df["GPT_Bias_Label"] = df["GPT_Bias_Label"].astype(int)

# Convert to HF dataset
dataset = Dataset.from_pandas(df.rename(columns={"Text": "text", "GPT_Bias_Label": "label"}))
split_dataset = dataset.train_test_split(test_size=0.2)

# Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize_fn(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = split_dataset.map(tokenize_fn, batched=True)

# Model (binary classification)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Training arguments
training_args = TrainingArguments(
    output_dir="./results_gender_bias",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train
trainer.train()

# Evaluate
results = trainer.evaluate()
print("Evaluation Results:", results)




Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3397,0.362198,0.8375,0.810496,0.852761,0.772222
2,0.3302,0.319227,0.88,0.876289,0.817308,0.944444
3,0.2344,0.303844,0.865,0.859375,0.808824,0.916667
4,0.2626,0.298486,0.865,0.857143,0.818182,0.9
5,0.2328,0.300312,0.865,0.857895,0.815,0.905556


Evaluation Results: {'eval_loss': 0.2984856069087982, 'eval_accuracy': 0.865, 'eval_f1': 0.8571428571428571, 'eval_precision': 0.8181818181818182, 'eval_recall': 0.9, 'eval_runtime': 0.5921, 'eval_samples_per_second': 675.558, 'eval_steps_per_second': 21.956, 'epoch': 5.0}


In [6]:
model.save_pretrained("./gender_bias_model")

tokenizer.save_pretrained("./gender_bias_tokenizer")


('./gender_bias_tokenizer\\tokenizer_config.json',
 './gender_bias_tokenizer\\special_tokens_map.json',
 './gender_bias_tokenizer\\vocab.txt',
 './gender_bias_tokenizer\\added_tokens.json',
 './gender_bias_tokenizer\\tokenizer.json')