
Setting up environment

In [1]:
# Pick the data root for the current runtime: the mounted Google Drive when
# running on Colab, otherwise a local data directory.
if 'google.colab' not in str(get_ipython()):
    basepath = '/home/harpreet/Insync/google_drive_shaannoor/data'
else:
    from google.colab import drive

    # Expose Drive contents under /content/drive before pointing at them.
    drive.mount('/content/drive')
    basepath = '/content/drive/MyDrive'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Load Libraries

In [2]:
from transformers import pipeline
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm

In [3]:
import wandb
from google.colab import userdata
from huggingface_hub import login

In [4]:
# Start a Weights & Biases run; it is recorded under the "zero-shot" project.
wandb.init(
    project="zero-shot",
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpxy230011[0m ([33mpxy230011-the-university-of-texas-at-dallas[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# CHANGE FOLDERS TO WHERE YOU WANT TO SAVE DATA AND MODELS
# All dataset files are resolved relative to the runtime-specific base path.
base_folder = Path(basepath)
data_folder = base_folder.joinpath('datasets/HW6/Datafolder')

In [6]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never embed an access token in the notebook source; anyone who can
# read the file (or its git history) can use it. The token that was previously
# hardcoded in this cell should be treated as leaked and revoked.
#
# Resolution order: HF_TOKEN environment variable -> Colab secret named
# HF_TOKEN -> interactive prompt (input is not echoed).
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    try:
        from google.colab import userdata

        hf_token = userdata.get("HF_TOKEN")
    except Exception:
        # Deliberately broad: covers "not running on Colab" (ImportError) as
        # well as a missing secret or one this notebook has no access to.
        hf_token = None
if not hf_token:
    hf_token = getpass("Hugging Face access token: ")

login(token=hf_token)

Load Dataset

In [7]:
# Locate the three competition CSV files inside the data folder.
train_path = data_folder.joinpath('train.csv')
test_path = data_folder.joinpath('test.csv')
submission_path = data_folder.joinpath('sample_submission.csv')

# Read each file into its own DataFrame (train, test, sample submission).
train_data, test_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, submission_path)
)

In [8]:
# Emotion labels are every training column after the first two
# (the original author notes those two are 'ID' and 'Tweet').
emotion_labels = train_data.iloc[:, 2:].columns

In [9]:
from datasets import Dataset

# Wrap the test DataFrame in a Hugging Face `Dataset` object.
test_dataset = Dataset.from_pandas(df=test_data)

Class weights

In [10]:
# Inverse-frequency weight per label: total rows divided by the number of
# positive rows for that label. The 1e-6 term avoids division by zero when a
# label has no positive rows.
total_samples = len(train_data)
class_weights = {
    label: total_samples / (train_data[label].sum() + 1e-6)
    for label in emotion_labels
}

# Rescale so the largest weight becomes exactly 1.
max_weight = max(class_weights.values())
class_weights = {
    label: weight / max_weight
    for label, weight in class_weights.items()
}

Zero-shot classification

In [11]:
import torch

# Initialize zero-shot classification pipeline with BART-large-mnli.
# Use the first CUDA GPU when one is available and fall back to CPU (-1)
# otherwise, so the cell does not fail on a runtime without a GPU.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0 if torch.cuda.is_available() else -1,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
# Cut the test frame into consecutive row chunks of `batch_size` tweets each;
# the final chunk may be shorter.
batch_size = 4
test_batches = [
    test_data.iloc[start:start + batch_size]
    for start in range(0, len(test_data), batch_size)
]

Predict

In [13]:
# Predict on the test set in batches.
# Each entry appended to `all_predictions` is a {label: 0/1} dict for one tweet,
# in the same order as the rows of the batches.
all_predictions = []

# The candidate label list never changes, so build it once outside the loop.
candidate_labels = emotion_labels.tolist()

for batch in tqdm(test_batches, desc="Processing Batches"):
    batch_texts = batch["Tweet"].tolist()
    results = classifier(
        batch_texts,
        candidate_labels=candidate_labels,
        multi_label=True
    )
    # Defensive: if the pipeline hands back a single result dict instead of a
    # list (e.g. for a one-tweet batch), wrap it so the loop below still works.
    if isinstance(results, dict):
        results = [results]
    for result in results:
        # Re-key the scores by label name (the pipeline returns parallel
        # "labels"/"scores" lists) and scale each by its class weight.
        adjusted_scores = {
            label: score * class_weights[label]
            for label, score in zip(result["labels"], result["scores"])
        }
        # Convert scores to binary predictions (threshold = 0.5 after adjustment),
        # emitted in `emotion_labels` order so every row has identical columns.
        binary_predictions = {label: int(adjusted_scores[label] > 0.5) for label in emotion_labels}
        all_predictions.append(binary_predictions)

Processing Batches:   1%|          | 10/815 [00:09<11:32,  1.16it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing Batches: 100%|██████████| 815/815 [11:59<00:00,  1.13it/s]


Prepare submission file

In [14]:
# Create a DataFrame for the predictions
predictions_df = pd.DataFrame(all_predictions)

# Prepare the submission file
submission = pd.concat([test_data["ID"], predictions_df], axis=1)
submission.to_csv("submission_zershot.csv", index=False)

In [16]:
from google.colab import files

# Send the submission CSV written by the previous cell to the browser
# as a download (Colab only).
files.download(
    "submission_zershot.csv"
)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>