In [1]:
!pip install -r MSTHESIS/week5/requirements.txt

Defaulting to user installation because normal site-packages is not writeable


In [10]:
import pandas as pd
import re
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import torch
import os
from collections import Counter
import time
from datetime import datetime
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sklearn.utils import resample
from sklearn import metrics

In [12]:
RANDOM_SEED = 42
TEST_SIZE = 0.2
SAMPLE_SIZE = 3000
BATCH_SIZE = 32
# os.chdir('./MSTHESIS/week5')


In [13]:

def train_anything(rdf, modelName, uncased=True, num_classes=2, label_col='label'):
    start_time = time.time()
    directory_path = "./models/"+modelName

    os.makedirs(directory_path, exist_ok=True)

    label_encoder = preprocessing.LabelEncoder()
    rdf['label'] = label_encoder.fit_transform(rdf[label_col].tolist())
    print(rdf['label'].value_counts())
    
    train_df, test_df = train_test_split(rdf, test_size=0.2, random_state=RANDOM_SEED)

    CASEMENT_NAME = "distilbert-base-uncased" if uncased else "distilbert-base-cased"

    tokenizer = AutoTokenizer.from_pretrained(CASEMENT_NAME)
        

    def tokenize_data(examples):
        return tokenizer(examples["raw"] if not uncased else examples["clean"], truncation=True)

    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    
    tokenized_train = train_dataset.map(tokenize_data, batched=True)
    tokenized_test = test_dataset.map(tokenize_data, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(CASEMENT_NAME, num_labels=num_classes)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=directory_path+"/results",
        learning_rate=2e-4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        # load_best_model_at_end=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    trainer.save_model(directory_path+"/model")

    #prediction

    predictions = trainer.predict(tokenized_test)

    logits = predictions.predictions 
    predicted_labels = np.argmax(logits, axis=1) 
    true_labels = test_df['label'].values

    # Step 3: Compute Metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average="weighted")

    end_time = time.time()
    end_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    return [modelName, accuracy, precision, recall, f1, start_time, (start_time-end_time), end_date]


In [31]:

def run_all_combos():
    rdf = pd.read_csv("week5_qbias_dataset.csv")
    if not os.path.exists("week5_results.csv"):
        resPD = pd.DataFrame(columns=["Model", "Accuracy", "Precision", "Recall", "F1", "Start_Time", "Duration", "End_Date"])
    else:
        resPD = pd.read_csv("week5_results.csv")
    results = []
    rdf.dropna(subset=['clean'], inplace=True)
    rdf = rdf[rdf['clean'].str.len() > 0]
    
    MODEL_SIZE = "small"
    if MODEL_SIZE == "small":
        rdf = rdf.sample(SAMPLE_SIZE, random_state=RANDOM_SEED)
    
    
    
#     for uncased in [True, False]:
#         results.append(train_anything(rdf, f"{MODEL_SIZE}_multiclass_{'uncased' if uncased else 'cased'}", uncased, 3, 'bias_rating'))
    
    # for uncased in [True, False]:
    #     results.append(train_anything(rdf, f"{MODEL_SIZE}_binary_left{'' if uncased else '_cased'}", uncased, 2, 'label_left'))

    #not left
#     rdf['label_not_left'] = rdf['label_left'].apply(lambda x: 0 if x == 1 else 1)
#     results.append(train_anything(rdf, f"{MODEL_SIZE}_binary_not_left", False, 2, 'label_not_left'))
    
    #right 
    results.append(train_anything(rdf, f"{MODEL_SIZE}_binary_right", False, 2, 'label_right'))

    #center
    results.append(train_anything(rdf, f"{MODEL_SIZE}_binary_center", False, 2, 'label_center'))
    
    #evenly sampled left
    rdf = pd.read_csv("week5_qbias_dataset.csv")
    rdf_yes = rdf[rdf['label_left'] > 0]
    rdf_no = rdf[rdf['label_left'] == 0]
    rdf_yes_resampled = resample(rdf_yes, 
                               replace=False, 
                               n_samples=SAMPLE_SIZE,
                               random_state=RANDOM_SEED)
        
    
    if MODEL_SIZE == "small": 
        rdf_no_resampled = resample(rdf_no, 
                                   replace=False, 
                                   n_samples=SAMPLE_SIZE,
                                   random_state=RANDOM_SEED)
        rdf_even_sample_left = pd.concat([rdf_yes_resampled, rdf_no_resampled], ignore_index = True)
    else:
        rdf_no_resampled = resample(rdf_no, 
                                       replace=False, 
                                       n_samples=rdf_yes.shape[0],
                                       random_state=RANDOM_SEED)
        rdf_even_sample_left = pd.concat([rdf_yes, rdf_no_resampled], ignore_index = True)


    results.append(train_anything(rdf_even_sample_left, f"{MODEL_SIZE}_binary_evenSplit_left", False, 2, 'label_left'))
    
    

    newresPd = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1", "Start_Time", "Duration", "End_Date"])
    pdResults = pd.concat([resPD, newresPd], ignore_index=True)
    pdResults.to_csv("week5_results.csv", index=False)



In [8]:

def eval_anything(rdf, modelName, labelColumn='bias_rating'):
    model_path = "./models/"+modelName+"/model"   

    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    label_encoder = preprocessing.LabelEncoder()
    rdf['label'] = label_encoder.fit_transform(rdf[labelColumn].tolist())
    rdf.dropna(subset=['clean'], inplace=True)
    headlines = rdf["clean"].tolist()
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    batch_size = 32 
    def tokenize_batch(batch_texts):
        return tokenizer(batch_texts, truncation=True, padding=True, return_tensors="pt")

    model.eval()

    all_predictions = []

    with torch.no_grad():
        for i in tqdm(range(0, len(headlines), batch_size), desc="Processing Batches"):
            batch_texts = headlines[i:i+batch_size]  # Get batch
            batch_inputs = tokenize_batch(batch_texts).to(device)  # Tokenize and move to GPU

            outputs = model(**batch_inputs)
            batch_predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()

            all_predictions.extend(batch_predictions)
    
    # Convert back to original bias categories
    decoded_labels = label_encoder.inverse_transform(all_predictions)

    true_labels = rdf['label'].values  # True labels

    accuracy = accuracy_score(true_labels, all_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, all_predictions, average="weighted")

    return [modelName, accuracy, precision, recall, f1]



In [17]:

def eval_all_combos():
    rdf = pd.read_csv("week5_qbias_dataset.csv")
    if not os.path.exists("week5_evals.csv"):
        resPD = pd.DataFrame(columns=["Model", "Accuracy", "Precision", "Recall", "F1", "EvalDate"])
    else:
        resPD = pd.read_csv("week5_results.csv")
    results = []
    train_df, test_df = train_test_split(rdf, test_size=0.2, random_state=RANDOM_SEED)
    rdf = test_df
    rdf.dropna(subset=['clean'], inplace=True)
    rdf = rdf[rdf['clean'].str.len() > 0]
    
    
    # rdf = rdf.sample(SAMPLE_SIZE, random_state=RANDOM_SEED+5)
    
    MODEL_SIZE = ""
    # for uncased in [True, False]:
        # results.append(eval_anything(rdf, f"{MODEL_SIZE}_multiclass_{'uncased' if uncased else 'cased'}", 'bias_rating'))
    results.append(eval_anything(rdf, "multiclass_uncased", 'bias_rating'))
    results.append(eval_anything(rdf, "multiclass_left", 'label_left'))
    
#     for uncased in [True, False]:
#         results.append(eval_anything(rdf, f"{MODEL_SIZE}_binary_left{'' if uncased else '_cased'}", 'label_left'))

#     #not left
#     rdf['label_not_left'] = rdf['label_left'].apply(lambda x: 0 if x == 1 else 1)
#     results.append(eval_anything(rdf, f"{MODEL_SIZE}_binary_not_left", 'label_not_left'))
    
#     #right 
#     results.append(eval_anything(rdf, f"{MODEL_SIZE}_binary_right", 'label_right'))

#     #center
#     results.append(eval_anything(rdf, f"{MODEL_SIZE}_binary_center", 'label_center'))
    
    #evenly sampled center
    rdf = pd.read_csv("week5_qbias_dataset.csv")
    rdf_yes = rdf[rdf['label_left'] > 0]
    rdf_no = rdf[rdf['label_left'] == 0]

    rdf_no_resampled = resample(rdf_no, 
                                   replace=False, 
                                   n_samples=SAMPLE_SIZE,
                                   random_state=RANDOM_SEED)
    rdf_yes_resampled = resample(rdf_yes, 
                                   replace=False, 
                                   n_samples=SAMPLE_SIZE,
                                   random_state=RANDOM_SEED)
    
    rdf_even_sample_left = pd.concat([rdf_yes_resampled, rdf_no_resampled], ignore_index = True)

    # rdf_no_resampled = resample(rdf_no, 
    #                                replace=False, 
    #                                n_samples=rdf_yes.shape[0],
    #                                random_state=RANDOM_SEED)
    
    # rdf_even_sample_left = pd.concat([rdf_yes, rdf_no_resampled], ignore_index = True)

    results.append(eval_anything(rdf_even_sample_left, "binary_evenSplit_left", 'label_left'))
    
    
    newresPd = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1"])
    newresPD["EvalDate"] =  datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    pdResults = pd.concat([resPD, newresPd], ignore_index=True)
    pdResults.to_csv("week5_evals2.csv", index=False)



In [41]:

rdf = pd.read_csv("week5_qbias_dataset.csv")
rdf = rdf.sample(2000, random_state=RANDOM_SEED)
model_path = "./models/binary_right/model"   

model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

label_encoder = preprocessing.LabelEncoder()
rdf['label'] = label_encoder.fit_transform(rdf['label_right'].tolist())
rdf.dropna(subset=['clean'], inplace=True)
headlines = rdf['clean'].tolist()


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 32
def tokenize_batch(batch_texts):
    return tokenizer(batch_texts, truncation=True, padding=True, return_tensors="pt")

model.eval()

all_predictions = []

with torch.no_grad():
    for i in tqdm(range(0, len(headlines), batch_size), desc="Processing Batches"):
        batch_texts = headlines[i:i+batch_size]
        batch_inputs = tokenize_batch(batch_texts).to(device)
        
        outputs = model(**batch_inputs)
        batch_predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        
        all_predictions.extend(batch_predictions)

# Convert back to original bias categories
decoded_labels = label_encoder.inverse_transform(all_predictions)

# Compute metrics
true_labels = rdf['label'].values
accuracy = accuracy_score(true_labels, all_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, all_predictions, average="weighted")

print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")




Processing Batches: 100%|██████████| 63/63 [03:15<00:00,  3.10s/it]

Accuracy: 0.6715, Precision: 0.4509, Recall: 0.6715, F1: 0.5395



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [43]:
# decoded_labels
# true_labels
# all_predictions
# accuracy
# print(rdf['label'].value_counts())
# # print(rdf[['label_center', 'label']].head())


recall_df = pd.DataFrame({
    "text": rdf["clean"].tolist(),  # Original text
    "true_label": rdf["label"].values,  # True labels
    "predicted_label": all_predictions  # Model predictions
})

# recall_df[(recall_df["true_label"] == 0) & (recall_df["predicted_label"] == 0)].shape
# # ['text'][0]
# recall_df['true_label'].value_counts()
# confusion_matrix = metrics.confusion_matrix(true_labels, all_predictions)
# confusion_matrix

label
0    1343
1     657
Name: count, dtype: int64


In [72]:
# recall_df.head()
# recall_df.iloc[4,0]

# srdf = rdf.sample(SAMPLE_SIZE, random_state=RANDOM_SEED)
rdf.head()
# rdf.iloc[3,2]

Unnamed: 0,raw,bias_rating,clean,label_right,label_left,label_center,label
13167,[HEADLINE]Derek Chauvin trial and George Floyd...,left,[headline]derek chauvin trial and george floyd...,0,1,0,0
4068,[HEADLINE]Democratic platform focuses on fixin...,right,[headline]democratic platform focuses on fixin...,1,0,0,1
9857,[HEADLINE]Omicron-specific vaccine boosters ge...,left,[headline]omicron-specific vaccine boosters ge...,0,1,0,0
20210,[HEADLINE]Annoying to the End — Jeff Flake Pub...,right,[headline]annoying to the end — jeff flake pub...,1,0,0,1
7655,[HEADLINE]Oklahoma's new abortion law doesn't ...,center,[headline]oklahoma's new abortion law doesn't ...,0,0,1,0


In [71]:
def predict_single(text, model, tokenizer, label_encoder, device):
    model.eval()  # Set model to evaluation mode
    
    # Tokenize the input text
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
    
    # Run the model to get predictions
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Convert logits to predicted label
    predicted_label = torch.argmax(outputs.logits, dim=-1).cpu().numpy()[0]
    
    # Convert back to original label categories
    decoded_label = label_encoder.inverse_transform([predicted_label])[0]
    
    return decoded_label

# Example usage
text = rdf.iloc[3,2]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
prediction = predict_single(text, model, tokenizer, label_encoder, device)

print(f"Predicted label: {prediction}")


Predicted label: 0


In [None]:
run_all_combos()

In [18]:
eval_all_combos()

Processing Batches:   0%|          | 0/136 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing Batches:   0%|          | 0/136 [00:01<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [8]:
results = pd.read_csv("week5_results.csv")
results.head(10)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,Start_Time,Duration,End_Date
0,multiclass_uncased,1.0,1.0,1.0,1.0,1741472000.0,-34.02383,2025-03-08 17:11:03
1,multiclass_cased,1.0,1.0,1.0,1.0,1741472000.0,-32.221397,2025-03-08 17:11:35
2,binary_left,1.0,1.0,1.0,1.0,1741472000.0,-31.957928,2025-03-08 17:12:07
3,binary_left_cased,1.0,1.0,1.0,1.0,1741472000.0,-31.746595,2025-03-08 17:12:39
4,binary_not_left,1.0,1.0,1.0,1.0,1741472000.0,-31.091362,2025-03-08 17:13:10
5,binary_right,1.0,1.0,1.0,1.0,1741472000.0,-31.184011,2025-03-08 17:13:41
6,binary_center,1.0,1.0,1.0,1.0,1741472000.0,-32.180439,2025-03-08 17:14:13
7,binary_evenSplit_left,0.75,0.5625,0.75,0.642857,1741472000.0,-52.050959,2025-03-08 17:15:06
8,multiclass_uncased,1.0,1.0,1.0,1.0,1741476000.0,-34.511598,2025-03-08 18:16:09
9,multiclass_cased,1.0,1.0,1.0,1.0,1741476000.0,-35.182627,2025-03-08 18:16:44
