In [None]:
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

In [None]:
# Prefer the GPU when one is available, and bind the handle to a name so that
# later cells (which move tensors with `.to(device)`) can actually use it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Load the cleaned article data from its CSV file.
data_path = "bld/python/data/data_clean.csv"
df = pd.read_csv(data_path)

# Current Approach

In [None]:
# Labels the zero-shot classifier scores each article against.
candidate_labels = [
    "labor supply",
    "labor demand",
    "government intervention",
]

In [None]:
# Checkpoint identifier passed to the pipeline / tokenizer loaders below.
model_name: str = "facebook/bart-large-mnli"

In [None]:
# Name the task explicitly, as every other pipeline in this notebook does,
# instead of leaving it to be inferred from the checkpoint.
pipe = pipeline("zero-shot-classification", model=model_name)

In [None]:
def process_row(row):
    """Classify a single value with the module-level ``pipe``.

    ``row`` is forwarded unchanged as the first argument to ``pipe`` together
    with the module-level ``candidate_labels``; whatever ``pipe`` returns is
    returned as-is.
    """
    classification = pipe(row, candidate_labels=candidate_labels)
    return classification

In [None]:
# Keep only the first 10 rows for quick experiments. `.copy()` makes this an
# independent frame, so adding a column to it later does not write into a
# slice of the full data (which pandas flags with SettingWithCopyWarning).
df = df.iloc[:10].copy()

In [None]:
# Run `process_row` on every article text and store the raw result per row.
article_texts = df["Article text"]
df["Classification"] = article_texts.map(process_row)

In [None]:
# Look up the row labelled 3 with a single `.loc` call rather than chained
# `[...][...]` indexing.
df.loc[3, "Classification"]

# Handle batches of data (not useful)

In [None]:
# Show the working frame as the cell output.
df

In [None]:
batch_size = 16  # Set your desired batch size

# Ceiling division: the number of batches needed to cover every row
num_batches = (len(df) + batch_size - 1) // batch_size

# Split the DataFrame into consecutive row slices and keep all of them.
# (`iloc` clamps a slice that runs past the end, so the last batch may be short.)
batches = [
    df.iloc[batch_start : batch_start + batch_size]
    for batch_start in range(0, len(df), batch_size)
]

# The previous loop overwrote `batch_data` on every pass, so only the final
# slice survived. Keep that name pointing at the last batch for the next cell.
if batches:
    batch_data = batches[-1]

In [None]:
# Show `batch_data` as the cell output.
batch_data

# Convert the text to tensors

In [None]:
import torch

In [None]:
# Select the article text column as a Series.
text_column = df.loc[:, "Article text"]

In [None]:
# Plain Python list of the article texts, in row order.
text_list = text_column.to_list()

In [None]:
# Load the tokenizer for the same checkpoint the pipeline uses, reusing
# `model_name` rather than repeating the identifier.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Tokenize all texts in one call, with padding and truncation enabled,
# returning PyTorch tensors.
tokenizer_options = {
    "padding": True,
    "truncation": True,
    "return_tensors": "pt",
}
tokenized_batch = tokenizer(text_list, **tokenizer_options)

In [None]:
# Pull the two entries used below out of the tokenizer output.
input_ids, attention_mask = (
    tokenized_batch[key] for key in ("input_ids", "attention_mask")
)

In [None]:
# Independent copies of both tensors, detached from any autograd graph.
input_ids_tensor = input_ids.detach().clone()
attention_mask_tensor = attention_mask.detach().clone()

In [None]:
# Show the copied attention mask as the cell output.
attention_mask_tensor

In [None]:
# Zero-shot pipeline built on `model_name` with the tokenizer loaded above;
# `multi_label=True` is passed at construction time.
zero_shot_options = dict(model=model_name, tokenizer=tokenizer, multi_label=True)
classifier = pipeline("zero-shot-classification", **zero_shot_options)

In [None]:
# Pass only the texts and the labels. `attention_mask` is not among the
# documented call arguments of the zero-shot pipeline, and the mask built
# earlier was computed for the bare texts, not for the text/label pairs.
result = classifier(text_list, candidate_labels)

In [None]:
# End the cell with the value itself so the notebook renders it as the cell
# output instead of printing its plain-text form.
result

# New Approach

# Best approach

Note: Lecture 7 is key to my problem

In [None]:
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

In [None]:
# Checkpoint identifier used by the pipeline and tokenizer in this section.
model_name: str = "facebook/bart-large-mnli"

In [None]:
# Use the first CUDA device when available; otherwise pass `None` for `device`.
pipeline_device = "cuda:0" if torch.cuda.is_available() else None
classifier = pipeline(
    "zero-shot-classification",
    model=model_name,
    multi_label=True,
    device=pipeline_device,
)

In [None]:
# Load the tokenizer for the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize(batch):
    """Tokenize the ``"Article text"`` entry of ``batch``.

    Calls the module-level ``tokenizer`` with ``padding="max_length"`` and
    ``truncation=True`` and returns its output unchanged.

    NOTE(review): a later cell defines ``tokenize`` again with
    ``padding=True``; whichever definition ran last is the one in effect.
    """
    texts = batch["Article text"]
    return tokenizer(texts, padding="max_length", truncation=True)

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

In [None]:
# Wrap the frame in a `Dataset`, store it in a `DatasetDict` under one key,
# and read it straight back out under the name the later cells use.
split_name = "my_dataset"
dataset = Dataset.from_pandas(df)
dataset_dict = DatasetDict({split_name: dataset})
torch_data = dataset_dict[split_name]

In [None]:
def pd_to_dataset(df):
    """Convert a pandas DataFrame to a ``datasets.Dataset``.

    Args:
        df: The DataFrame to convert.

    Returns:
        The ``Dataset`` produced by ``Dataset.from_pandas(df)``.
    """
    # The earlier version stored this object in a DatasetDict and immediately
    # read it back out under the same key, which returns the very same
    # Dataset, so the round trip is dropped.
    return Dataset.from_pandas(df)

In [None]:
# Write the dataset to a directory next to the notebook.
output_dir = "./torch_data"
dataset.save_to_disk(output_dir)

In [None]:
# Apply `tokenize` to the dataset in batched mode with `batch_size=None`.
map_options = {"batched": True, "batch_size": None}
df_encoded = torch_data.map(tokenize, **map_options)

In [None]:
# Show the result of the `map` call as the cell output.
df_encoded

In [None]:
# Reload the full cleaned data and restate the candidate labels.
data_path = "bld/python/data/data_clean.csv"
df = pd.read_csv(data_path)
candidate_labels = [
    "labor supply",
    "labor demand",
    "government intervention",
]
import transformers

In [None]:
# Zero-shot pipeline on `model_name`, with `multi_label=True` set at construction.
classifier = pipeline(
    task="zero-shot-classification",
    model=model_name,
    multi_label=True,
)

In [None]:
# NOTE(review): the comma after the first literal makes this a 2-tuple of
# strings, not a single string. The second and third literals have no comma
# between them, so Python joins them into one string (adjacent-literal
# concatenation). Confirm whether one string or two separate texts is intended.
sequence_to_classify = (
    "Tiger Woods: Is this the end of his era? - CNN,Tiger Woods is the rarest of athletes. At his peak, he transcended his sport. People who couldn't care less about golf watched in their millions on Sunday afternoons to see him roar. So the 15-time major champ's announcement that he is calling time on life as a full-time pro feels like the end of an era. ",
    "golf, Tiger Woods: Is this the end of his era? - CNN,Is this the end of the Tiger Woods era?,This story was excerpted from the November 23 edition of CNN's Meanwhile in America, the daily email about US politics for global readers. Click here to read past editions and subscribe. (CNN)Tiger Woods is the rarest of athletes. At his peak, he transcended his sport. People who couldn't care less about golf watched in their millions on Sunday afternoons to see him roar. So the 15-time major champ's announcement that he is calling time on life as a full-time pro feels like the end of an era. Woods, who is recuperating from devastating leg injuries from a car crash, told Golf Digest he would have to be more selective about competition from now on. "
    "I think something that is realistic",
)

In [None]:
# Same zero-shot pipeline as before, built through the `transformers` module namespace.
classifier = transformers.pipeline(task="zero-shot-classification", model=model_name, multi_label=True)

In [None]:
def tokenize(batch):
    """Tokenize the ``"Article text"`` entry of ``batch``.

    Calls the module-level ``tokenizer`` with ``padding=True`` and
    ``truncation=True`` and returns its output unchanged.

    NOTE(review): this replaces an earlier ``tokenize`` in this notebook that
    used ``padding="max_length"``.
    """
    texts = batch["Article text"]
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
# `sequence_to_classify` is a tuple; the pipeline's documented input is a
# string or a list of strings, so hand it a list to get one result per text.
# The call-time `tokenizer=` argument is dropped: it is not a documented call
# parameter, and the pipeline already holds the tokenizer it was built with.
classifier(list(sequence_to_classify), candidate_labels)

In [None]:
# A tuple has no `.map` method. Wrap the texts in a `Dataset` under the
# "Article text" column that `tokenize` reads, then map over that.
sequence_dataset = Dataset.from_dict({"Article text": list(sequence_to_classify)})
df_encoded = sequence_dataset.map(tokenize, batched=True, batch_size=None)

# Random internet approach

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

In [None]:
# Load the sequence-classification model from the "navteca" checkpoint.
navteca_checkpoint = "navteca/bart-large-mnli"
bart_model = AutoModelForSequenceClassification.from_pretrained(navteca_checkpoint)

In [None]:
# Tokenizer loaded from the same "navteca/bart-large-mnli" checkpoint id as `bart_model`.
bart_tokenizer = AutoTokenizer.from_pretrained("navteca/bart-large-mnli")

In [None]:
# Zero-shot pipeline assembled from the already-loaded model and tokenizer objects.
nlp = pipeline(
    task="zero-shot-classification",
    model=bart_model,
    tokenizer=bart_tokenizer,
)

In [None]:
# Short example text to classify.
sequence: str = "i can perform article"

In [None]:
# Candidate labels for the example text.
labels = [
    "writing",
    "management",
    "checking",
]

In [None]:
# Score the example text against each label; the result is the cell output.
nlp(sequence, candidate_labels=labels)

# Manual PyTorch Approach

In [None]:
# Pose the sequence as an NLI premise and each candidate label as a hypothesis.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Resolve the device inside this cell so it does not rely on a name that no
# earlier cell is guaranteed to have defined.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

nli_model = AutoModelForSequenceClassification.from_pretrained(
    "facebook/bart-large-mnli",
).to(device)  # the inputs are moved to `device` below, so the model must be there too
nli_model.eval()
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")

premise = sequence
# One hypothesis per entry of `labels` (previously a single, undefined `label`).
hypotheses = [f"This example is {label}." for label in labels]

# Run every (premise, hypothesis) pair through the model pre-trained on MNLI as
# one padded batch. `truncation="only_first"` replaces the deprecated
# `truncation_strategy` keyword and shortens the premise, not the hypothesis.
x = tokenizer(
    [premise] * len(hypotheses),
    hypotheses,
    return_tensors="pt",
    padding=True,
    truncation="only_first",
)
with torch.no_grad():  # inference only, no gradients needed
    logits = nli_model(**x.to(device))[0]

# we throw away "neutral" (dim 1) and take the probability of
# "entailment" (2) as the probability of the label being true
entail_contradiction_logits = logits[:, [0, 2]]
probs = entail_contradiction_logits.softmax(dim=1)
# One probability per label, in the same order as `labels`.
prob_label_is_true = probs[:, 1]