In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, pipeline

In [None]:
# Pick the compute device once and keep a handle on it; the bare expression
# previously built the device object and immediately threw it away.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
df = pd.read_csv("bld/python/data/data_clean.csv")

# Current Approach

In [None]:
candidate_labels = ["labor supply", "labor demand", "government intervention"]

In [None]:
model_name = "facebook/bart-large-mnli"

In [None]:
pipe = pipeline(model=model_name)

In [None]:
def process_row(row):
    """Classify a single text with the notebook-level zero-shot pipeline.

    Parameters:
    - row: The value handed to `pipe`; in this notebook it is one cell of the
      "Article text" column.

    Returns:
    - Whatever `pipe` returns for that text when scored against the
      notebook-level `candidate_labels`.
    """
    classification = pipe(row, candidate_labels=candidate_labels)
    return classification

In [None]:
# Work on an explicit copy of the first 10 rows: a later cell assigns a new
# column to this frame, which on a plain slice triggers SettingWithCopyWarning.
df = df.iloc[:10].copy()

In [None]:
df["Classification"] = df["Article text"].apply(process_row)

In [None]:
df["Classification"][3]

# (not useful) handle batches of data

In [None]:
df

In [None]:
batch_size = 16  # Set your desired batch size

# Calculate the number of batches needed (ceiling division)
num_batches = (len(df) + batch_size - 1) // batch_size

# Split the DataFrame into consecutive row batches. All batches are kept in a
# list; the previous loop overwrote `batch_data` on every iteration, so only
# the final batch was ever available afterwards.
batches = [
    df.iloc[batch_start : batch_start + batch_size]
    for batch_start in range(0, len(df), batch_size)
]

# Kept for the cells below: the last batch (an empty frame if `df` is empty,
# where the old loop left `batch_data` undefined).
batch_data = batches[-1] if batches else df.iloc[0:0]

In [None]:
batch_data

# put to tensors

In [None]:
import torch

In [None]:
text_column = df["Article text"]

In [None]:
text_list = text_column.tolist()

In [None]:
# Load the tokenizer that belongs to the model used by the pipeline above
# (`model_name` is "facebook/bart-large-mnli"), so the two cannot drift apart.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
tokenized_batch = tokenizer(
    text_list,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [None]:
input_ids = tokenized_batch["input_ids"]
attention_mask = tokenized_batch["attention_mask"]

In [None]:
input_ids_tensor = input_ids.clone().detach()
attention_mask_tensor = attention_mask.clone().detach()

In [None]:
attention_mask_tensor

In [None]:
classifier = pipeline(
    "zero-shot-classification",
    model=model_name,
    tokenizer=tokenizer,
    multi_label=True,
)

In [None]:
# The pipeline tokenizes `text_list` itself and builds its own attention mask;
# `attention_mask=` is not a call parameter of the zero-shot pipeline, so the
# hand-made mask is no longer passed.
result = classifier(text_list, candidate_labels)

In [None]:
print(result)

# New Approach

# Best approach

Note: Lecture 7 is key to my problem

In [None]:
import pandas as pd
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer, pipeline

In [None]:
df = load_from_disk("bld/python/data/data_clean")
candidate_labels = ["labor supply", "labor demand", "government intervention"]

In [None]:
model_name = "facebook/bart-large-mnli"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize(batch):
    """Tokenize the "Article text" entries of a batch.

    Uses the notebook-level `tokenizer`, truncating over-long texts and
    padding every text to the tokenizer's maximum length.

    Parameters:
    - batch: Mapping with an "Article text" entry (a batch passed in by
      `Dataset.map(..., batched=True)` in this notebook).

    Returns:
    - The tokenizer output for those texts.
    """
    texts = batch["Article text"]
    return tokenizer(texts, truncation=True, padding="max_length")

In [None]:
df_encoded = df.map(tokenize, batched=True, batch_size=None)

In [None]:
classifier = pipeline(
    "zero-shot-classification",
    model=model_name,
    multi_label=True,
    device="cuda:0" if torch.cuda.is_available() else None,
)

In [None]:
sequence_to_classify = (
    "Tiger Woods: Is this the end of his era? - CNN,Tiger Woods is the rarest of athletes. At his peak, he transcended his sport. People who couldn't care less about golf watched in their millions on Sunday afternoons to see him roar. So the 15-time major champ's announcement that he is calling time on life as a full-time pro feels like the end of an era. ",
    "golf, Tiger Woods: Is this the end of his era? - CNN,Is this the end of the Tiger Woods era?,This story was excerpted from the November 23 edition of CNN's Meanwhile in America, the daily email about US politics for global readers. Click here to read past editions and subscribe. (CNN)Tiger Woods is the rarest of athletes. At his peak, he transcended his sport. People who couldn't care less about golf watched in their millions on Sunday afternoons to see him roar. So the 15-time major champ's announcement that he is calling time on life as a full-time pro feels like the end of an era. Woods, who is recuperating from devastating leg injuries from a car crash, told Golf Digest he would have to be more selective about competition from now on. "
    "I think something that is realistic",
)

In [None]:
# `sequence_to_classify` is a tuple; the pipeline only treats `list` inputs as
# a batch of separate texts, so convert it to get one result per text.
# `tokenizer=` is dropped: the tokenizer is fixed when the pipeline is built
# and is not a call-time parameter.
classifier(list(sequence_to_classify), candidate_labels)

## Functionize it

#### Reasoning for new model

According to https://huggingface.co/valhalla/distilbart-mnli-12-1, `valhalla/distilbart-mnli-12-1` retains about 90% of the performance of `facebook/bart-large-mnli` while being considerably faster.

In [None]:
import pandas as pd
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer, pipeline

In [None]:
df = load_from_disk("bld/python/data/data_clean")
model_name = "facebook/bart-large-mnli"

In [None]:
model_name_2 = "valhalla/distilbart-mnli-12-1"

In [None]:
from transformers import AutoTokenizer


def zero_shot_labelling(data):
    """Tokenize `data` with the distilbart-mnli-12-1 tokenizer.

    Despite its name, this function only tokenizes; it does not classify.

    Parameters:
    - data: Object exposing `.map(function, batched=..., batch_size=...)`;
      in this notebook a dataset loaded with `load_from_disk`.

    Returns:
    - The result of `data.map`, i.e. `data` with the tokenizer outputs added.
    """
    checkpoint = "valhalla/distilbart-mnli-12-1"
    tok = AutoTokenizer.from_pretrained(checkpoint)

    def _encode(batch):
        # Bind the freshly loaded tokenizer to the notebook-level `_tokenize`.
        return _tokenize(batch, tok)

    return data.map(_encode, batched=True, batch_size=16)  # adjust batch size


# batch of 8: 47.4, padding = True
# batch of 16: 41.3, padding True
# batch of 16: 38.4, padding = "max_length"


def _tokenize(batch, tokenizer):
    return tokenizer(batch["Description"], padding=True, truncation=True, max_length=42)


# Call zero_shot_labelling function

In [None]:
# automodel = AutoModelForSequenceClassification.from_pretrained(model_name)
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer


def zero_shot_labelling(data):
    """Tokenize `data` with the facebook/bart-large-mnli tokenizer.

    Despite its name, this function only tokenizes; it does not classify.

    Parameters:
    - data: Object exposing `.map(function, batched=..., batch_size=...)`;
      in this notebook a dataset loaded with `load_from_disk`.

    Returns:
    - The result of `data.map`, i.e. `data` with the tokenizer outputs added.
    """
    model_name = "facebook/bart-large-mnli"
    # The tokenizer must come from AutoTokenizer; the model class
    # (AutoModelForSequenceClassification) loads weights, not a tokenizer,
    # and cannot be called on raw text.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return data.map(
        lambda batch: _tokenize(batch, tokenizer),
        batched=True,
        batch_size=16,  # adjust batch size
    )
# batch of 8: 47.4, padding = True
# batch of 16: 41.3, padding True
# batch of 16: 38.4, padding = "max_length"

def _tokenize(batch, tokenizer):
    return tokenizer(batch["Description"], padding="max_length", truncation=True)


# Call zero_shot_labelling function
df_encoded = zero_shot_labelling(df)

In [None]:
classifier = pipeline(
    "zero-shot-classification",
    model=model_name_2,
    multi_label=True,
    device="cuda:0" if torch.cuda.is_available() else None,
)

In [None]:
sequence_to_classify = (
    "Tiger Woods: Is this the end of his era? - CNN,Tiger Woods is the rarest of athletes. At his peak, he transcended his sport. People who couldn't care less about golf watched in their millions on Sunday afternoons to see him roar. So the 15-time major champ's announcement that he is calling time on life as a full-time pro feels like the end of an era. ",
    "golf, Tiger Woods: Is this the end of his era? - CNN,Is this the end of the Tiger Woods era?,This story was excerpted from the November 23 edition of CNN's Meanwhile in America, the daily email about US politics for global readers. Click here to read past editions and subscribe. (CNN)Tiger Woods is the rarest of athletes. At his peak, he transcended his sport. People who couldn't care less about golf watched in their millions on Sunday afternoons to see him roar. So the 15-time major champ's announcement that he is calling time on life as a full-time pro feels like the end of an era. Woods, who is recuperating from devastating leg injuries from a car crash, told Golf Digest he would have to be more selective about competition from now on. "
    "I think something that is realistic",
)

In [None]:
candidate_labels = ["labor supply", "labor demand", "government intervention"]

In [None]:
from datasets import Dataset, DatasetDict

df_try = df


def pd_to_dataset(data):
    """Convert a pandas DataFrame into a `datasets.Dataset`.

    Parameters:
    - data (pd.DataFrame): The frame to convert.

    Returns:
    - Dataset: The converted dataset.
    """
    # Previously the dataset was wrapped in a one-entry DatasetDict and read
    # straight back out, which returns the very same object.
    return Dataset.from_pandas(data)


df_encoded = zero_shot_labelling(df_try)

In [None]:
df_encoded

In [None]:
# Pass the raw descriptions as a real `list` so the pipeline treats them as a
# batch of separate texts. `tokenizer=_tokenize` is dropped: it handed a plain
# function to a call that has no call-time tokenizer parameter.
classifier(list(df_encoded["Description"]), candidate_labels)

Need to speed it up:
- Batch size of 8
- Padding can be reduced to speed up computation

## Approach to be faster

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

nli_model = AutoModelForSequenceClassification.from_pretrained(
    "facebook/bart-large-mnli",
)
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")

In [None]:
df_encoded = [{"Article text": str(item["Article text"])} for item in df_encoded]

In [None]:
# `df_encoded` is a list of {"Article text": ...} dicts at this point, so the
# texts are collected first. `tokenizer.encode` handles a single text only;
# calling the tokenizer directly encodes the whole list at once. `truncation=`
# replaces the deprecated `truncation_strategy=` argument.
x = tokenizer(
    [item["Article text"] for item in df_encoded],
    padding=True,
    truncation=True,
    return_tensors="pt",
)["input_ids"]

In [None]:
from transformers import AutoTokenizer


def zero_shot_labelling(data):
    """Tokenize `data` with the distilbart-mnli-12-1 tokenizer.

    Despite its name, this function only tokenizes; it does not classify.

    Parameters:
    - data: Object exposing `.map(function, batched=..., batch_size=...)`.

    Returns:
    - The result of `data.map`, i.e. `data` with the tokenizer outputs added.
    """
    checkpoint = "valhalla/distilbart-mnli-12-1"
    tok = AutoTokenizer.from_pretrained(checkpoint)

    def _encode(batch):
        # Bind the freshly loaded tokenizer to the notebook-level `_tokenize`.
        return _tokenize(batch, tok)

    return data.map(_encode, batched=True, batch_size=16)  # adjust batch size


# batch of 8: 47.4, padding = True
# batch of 16: 41.3, padding True
# batch of 16: 38.4, padding = "max_length"


def _tokenize(batch, tokenizer):
    return tokenizer(batch["Article text"], padding="max_length", truncation=True)


# Call zero_shot_labelling function
df_encoded = zero_shot_labelling(df)

## free up space every time before running the code

In [None]:
from huggingface_hub import scan_cache_dir

# Revisions (commit hashes) to remove from the default Hugging Face cache.
# Each hash is its own list entry: a missing comma previously glued the first
# two hashes into a single string that matches no revision.
revisions_to_delete = [
    "81fd1d6e7847c99f5862c9fb81387956d99ec7aa",
    "e2983b237dccf3ab4937c97fa717319a9ca1a96d",
    "6c0e6080953db56375760c0471a8c5f2929baf11",
]
delete_strategy = scan_cache_dir().delete_revisions(*revisions_to_delete)
print("Will free " + delete_strategy.expected_freed_size_str)


delete_strategy.execute()

# Specify the directory you want to clear the cache for
# NOTE(review): this is still a placeholder path and must be replaced with a
# real cache directory before the lines below can succeed.
cache_directory = "/path/to/your/cache/directory"

# The object returned by scan_cache_dir has no `.clear()` method; to empty a
# cache, collect every cached revision and delete them through the same
# delete_revisions(...).execute() mechanism used above.
cache_info = scan_cache_dir(cache_directory)
all_revisions = [
    revision.commit_hash for repo in cache_info.repos for revision in repo.revisions
]
if all_revisions:
    cache_info.delete_revisions(*all_revisions).execute()

## More cleaning

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("bld/python/data/data_clean.csv")

In [None]:
aList = df

In [None]:
import csv

def iter_stripped_rows(path="cleaned_data.csv"):
    """Yield the rows of a comma-separated file with whitespace stripped.

    The original cell used `yield` at cell level (a syntax error) and read
    the path from an undefined `self`; the logic now lives in a generator
    function that takes the path as a parameter.

    Parameters:
    - path (str): File to read. Defaults to "cleaned_data.csv", the file
      written elsewhere in this notebook.

    Yields:
    - list[str]: One list per row, each cell stripped of surrounding
      whitespace. Quote characters are not interpreted (csv.QUOTE_NONE).
    """
    # newline="" is what the csv module asks for when opening files.
    with open(path, newline="") as f:
        reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_NONE)
        for row in reader:
            yield [cell.strip() for cell in row]

In [None]:
df = df.dropna(how="all")

In [None]:
df.to_csv("cleaned_data.csv", index=False)

In [None]:
df

In [None]:
# Keep only the rows whose first column splits into exactly 4 comma-separated
# fields (i.e. contains exactly 3 commas). The column is addressed by position
# with `.iloc`, and non-string cells (e.g. NaN) are dropped instead of raising
# on `.split`.
has_four_fields = (
    df.iloc[:, 0]
    .map(lambda value: isinstance(value, str) and value.count(",") == 3)
    .astype(bool)
)
df = df[has_four_fields]

## Internet Notepad Approach

In [None]:
from datasets import Dataset, DatasetDict, load_from_disk

df = load_from_disk("bld/python/data/data_clean")

In [None]:
df

In [None]:
df_csv = pd.read_csv("bld/python/data/data_clean.csv")

In [None]:
import torch
from torch.utils.data.dataset import Dataset

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # choose model

In [None]:
df_csv = df_csv[["Article text", "Category"]]
df_csv["Category"] = df_csv["Category"].astype("category")
df_csv["text"] = df_csv["Article text"]
df_csv = df_csv.drop("Article text", axis=1)

In [None]:
# Integer code of each category (requires "Category" to be a categorical
# column). The earlier plain copy of "Category" into "Category_code" was
# overwritten on the very next line and has been removed.
df_csv["Category_code"] = df_csv["Category"].cat.codes

In [None]:
df_csv["label"] = df_csv["Category_code"]
df_csv = df_csv.drop("Category_code", axis=1)
df_csv.head()

In [None]:
max_length = 512


def tokenize_data_2(example):
    """Tokenize the "text" entry of `example` with the notebook-level tokenizer.

    Texts are truncated and padded to the notebook-level `max_length`.

    Parameters:
    - example: Mapping with a "text" entry.

    Returns:
    - The tokenizer output for that text (or those texts).
    """
    text = example["text"]
    options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
    }
    return tokenizer(text, **options)

In [None]:
# `df_csv` is a pandas DataFrame, which has no batched `.map`; convert it to a
# Hugging Face dataset first. The import is aliased because the name `Dataset`
# is rebound to torch's Dataset class earlier in this section.
from datasets import Dataset as HFDataset

# Categorical columns are cast to plain strings before the conversion.
# NOTE(review): assumed necessary because Arrow dictionary types are not
# accepted by `from_pandas` - confirm against the installed datasets version.
df_csv = HFDataset.from_pandas(
    df_csv.astype({"Category": str}),
    preserve_index=False,
).map(tokenize_data_2, batched=True)

# New try

In [None]:
import pandas as pd
import torch

In [None]:
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, pipeline

df = load_from_disk("bld/python/data/data_clean")

In [None]:
df = zero_shot_labelling(df)

In [None]:
from transformers import AutoTokenizer


def zero_shot_labelling(data):
    """Tokenize `data` with the distilbart-mnli-12-1 tokenizer.

    Despite its name, this function only tokenizes; it does not classify.

    Parameters:
    - data: Object exposing `.map(function, batched=..., batch_size=...)`.

    Returns:
    - The result of `data.map`, i.e. `data` with the tokenizer outputs added.
    """
    checkpoint = "valhalla/distilbart-mnli-12-1"
    tok = AutoTokenizer.from_pretrained(checkpoint)

    def _encode(batch):
        # Bind the freshly loaded tokenizer to the notebook-level `_tokenize`.
        return _tokenize(batch, tok)

    return data.map(_encode, batched=True, batch_size=16)  # adjust batch size


# batch of 8: 47.4, padding = True
# batch of 16: 41.3, padding True
# batch of 16: 38.4, padding = "max_length"


def _tokenize(batch, tokenizer):
    return tokenizer(batch["Description"], padding=True, truncation=True, max_length=42)


# Call zero_shot_labelling function

In [None]:
candidate_labels = ["labor supply", "labor demand", "government intervention"]

In [None]:
model_name_2 = "valhalla/distilbart-mnli-12-1"

In [None]:
classifier = pipeline(
    "zero-shot-classification",
    model=model_name_2,
    multi_label=True,
    device="cuda:0" if torch.cuda.is_available() else None,
)

In [None]:
# Pass the raw descriptions as a real `list` so the pipeline treats them as a
# batch of separate texts. `tokenizer=_tokenize` is dropped: it handed a plain
# function to a call that has no call-time tokenizer parameter.
classifier(list(df["Description"]), candidate_labels)

In [None]:
df

## just functions

In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, pipeline

df = load_from_disk("bld/python/data/data_clean")

In [None]:
def select_random_entries(dataframe, num_entries=50, random_state=42):
    """Select a random set of entries from tabular data.

    Parameters:
    - dataframe: The input data; anything `pd.DataFrame(...)` accepts
      (a DataFrame, a list of records, a dict of columns, ...).
    - num_entries (int): The number of random rows to select (default is 50).
    - random_state (int or None): Seed for reproducibility (default is 42).
      With None the draw is not seeded.

    Returns:
    - pd.DataFrame: A DataFrame containing the randomly selected rows.

    Raises:
    - ValueError: If `num_entries` exceeds the number of available rows.
    """
    dataframe = pd.DataFrame(dataframe)

    # Check if num_entries is greater than the total number of rows
    if num_entries > len(dataframe):
        msg = "Number of entries to select cannot exceed the total number of rows."
        raise ValueError(msg)

    # Hand the seed to `sample` instead of calling np.random.seed: the same
    # rows are drawn, but the global NumPy random state is left untouched.
    return dataframe.sample(n=num_entries, random_state=random_state)

In [None]:
select_random_entries(df, num_entries=50, random_state=42)

In [None]:
pd.DataFrame(df)

In [None]:
type(df)

In [None]:
first_100_entries = df.select(range(100))

In [None]:
# `class` is a reserved word and cannot be used as a variable name.
# NOTE(review): `zero_shot_classifier` is defined in a later cell, which must
# be run before this one.
classification = zero_shot_classifier(first_100_entries)

In [None]:
import random

import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, pipeline

seed = 42
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)


def zero_shot_classifier(data):
    """Classify the zero-shot data to receive the labels.

    Every entry of the "Description" column is scored independently against
    the labels "labor supply", "labor demand" and "government intervention"
    (multi-label mode) with the distilbart-mnli-12-6 zero-shot pipeline.

    Parameters:
    - data: Dataset-like object whose `data["Description"]` yields the texts.

    Returns:
    - The pipeline output for the list of descriptions.
    """
    model_name = "valhalla/distilbart-mnli-12-6"
    labels = ["labor supply", "labor demand", "government intervention"]
    classifier = pipeline(
        "zero-shot-classification",
        model=model_name,
        multi_label=True,
        device="cuda:0" if torch.cuda.is_available() else None,
    )
    # The pipeline tokenizes the raw text itself, so the separate tokenization
    # pass that used to run first (its output was never read) and the
    # call-time `tokenizer=` argument are gone. `list(...)` makes sure the
    # pipeline receives a real list and treats the texts as a batch.
    return classifier(list(data["Description"]), labels)


def _zero_shot_labelling(data):
    """Tokenize `data` with the distilbart-mnli-12-6 tokenizer.

    Parameters:
    - data: Object exposing `.map(function, batched=..., batch_size=...)`.

    Returns:
    - The result of `data.map`, i.e. `data` with the tokenizer outputs added.
    """
    checkpoint = "valhalla/distilbart-mnli-12-6"
    tok = AutoTokenizer.from_pretrained(checkpoint)

    def _encode(batch):
        # Bind the freshly loaded tokenizer to the notebook-level `_tokenize`.
        return _tokenize(batch, tok)

    return data.map(_encode, batched=True, batch_size=8)


def _tokenize(batch, tokenizer):
    """Define the tokenizer."""
    return tokenizer(
        batch["Description"],
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

# For reading to know

In [None]:
# run the previous
select_random_entries(df, num_entries=50, random_state=42)

# Training the model

TODO:
- Do the model's probability scores suit the task, or should I transform them to 0 and 1?
- Test and training separation
- Model selection
- Put the head on it

In [None]:
import json

import pandas as pd
import torch
from transformers import AutoTokenizer, pipeline

In [None]:
# Forward slashes work on every platform (including Windows) and match the
# other data paths in this notebook; the backslash-separated path only
# resolved on Windows.
file_path = "bld/python/labelled/data_labelled_subset.json"

df = pd.read_json(file_path)

In [None]:
df

In [None]:
# split the data
# better way with arrays
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df["sequence"],
    df["scores"],
    test_size=0.15,
    random_state=8,
)

In [None]:
# `load_from_disk` expects a directory written by `save_to_disk`, not a JSON
# file; a JSON file is read with the "json" loader of `load_dataset`.
# NOTE(review): assumes the file is a JSON array of records or JSON Lines -
# confirm against how the labelled subset is written.
from datasets import load_dataset

adadd = load_dataset(
    "json",
    data_files="bld/python/labelled/data_labelled_subset.json",
    split="train",
)

In [None]:
from datasets import load_from_disk

data = load_from_disk("bld/python/data/data_clean")
first_100_entries = data.select(range(100))

In [None]:
df = zero_shot_classifier(first_100_entries)

In [None]:
zero_shot_labelling(first_100_entries)

In [None]:
def zero_shot_labelling(data):
    """Tokenize `data` and switch the result to PyTorch output format.

    Despite its name, this function only tokenizes (distilbart-mnli-12-6
    tokenizer); it does not classify.

    Parameters:
    - data: Dataset exposing `.map(...)` whose result supports `.set_format`.

    Returns:
    - The mapped dataset with its format set to "torch" for all columns.
    """
    model_name = "valhalla/distilbart-mnli-12-6"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    df_encoded = data.map(
        lambda batch: _tokenize(batch, tokenizer),
        batched=True,
        batch_size=8,
    )
    # A column-restricted set_format("torch", columns=[...]) call used to run
    # just before this one, but this call replaced its settings, so it was
    # dead code. The effective behaviour (torch format on every column) is
    # unchanged.
    df_encoded.set_format("torch")
    return df_encoded


def _tokenize(batch, tokenizer):
    """Define the tokenizer."""
    return tokenizer(
        batch["Description"],
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

In [None]:
"""Functions for fitting the regression model."""

import random

import torch
from transformers import AutoTokenizer, pipeline


def _zero_shot_labelling(data):
    """Tokenize `data` and switch the result to PyTorch output format.

    Uses the distilbart-mnli-12-6 tokenizer; no classification happens here.

    Parameters:
    - data: Dataset exposing `.map(...)` whose result supports `.set_format`.

    Returns:
    - The mapped dataset with its format set to "torch" for all columns.
    """
    model_name = "valhalla/distilbart-mnli-12-6"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    df_encoded = data.map(
        lambda batch: _tokenize(batch, tokenizer),
        batched=True,
        batch_size=8,
    )
    # A column-restricted set_format("torch", columns=[...]) call used to run
    # just before this one, but this call replaced its settings, so it was
    # dead code. The effective behaviour (torch format on every column) is
    # unchanged.
    df_encoded.set_format("torch")
    return df_encoded


def _tokenize(batch, tokenizer):
    """Define the tokenizer."""
    return tokenizer(
        batch["Description"],
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

In [None]:
import json

# Specify the file path
# Forward slashes work on every platform (including Windows); the
# backslash-separated path only resolved on Windows.
file_path = "bld/python/labelled/data_labelled_subset.json"

# Open and load the JSON file (explicit encoding so the result does not
# depend on the platform default)
with open(file_path, encoding="utf-8") as json_file:
    df = json.load(json_file)

In [None]:
# split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df["sequence"],
    df["scores"],
    test_size=0.15,
    random_state=8,
)

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
max_length = 512


def tokenize_data(df):
    """Tokenize the "sequence" entry of `df` with the notebook-level tokenizer.

    Texts are truncated and padded to the notebook-level `max_length`.

    Parameters:
    - df: Mapping with a "sequence" entry (note that inside this function
      the name shadows the notebook-level `df`).

    Returns:
    - The tokenizer output for those sequences.
    """
    sequences = df["sequence"]
    options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
    }
    return tokenizer(sequences, **options)

In [None]:
# `df` holds the parsed JSON (or a DataFrame) at this point, neither of which
# has a batched `.map`. Go through a DataFrame - which accepts a list of
# records, a dict of columns or an existing frame - and convert that to a
# Hugging Face dataset before tokenizing.
from datasets import Dataset as HFDataset

df = HFDataset.from_pandas(pd.DataFrame(df), preserve_index=False).map(
    tokenize_data,
    batched=True,
)