# Best approach

Note: Lecture 7 is key to my problem

In [None]:
import pandas as pd
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer, pipeline

In [None]:
df = load_from_disk("bld/python/data/data_clean")
candidate_labels = ["labor supply", "labor demand", "government intervention"]

In [None]:
model_name = "facebook/bart-large-mnli"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize(batch):
    """Tokenize the ``"Article text"`` entries of ``batch``.

    Relies on the notebook-level ``tokenizer``. Sequences are encoded with
    ``padding="max_length"`` and ``truncation=True``.
    """
    article_texts = batch["Article text"]
    encoding_options = {"padding": "max_length", "truncation": True}
    return tokenizer(article_texts, **encoding_options)

In [None]:
df_encoded = df.map(tokenize, batched=True, batch_size=None)

In [None]:
classifier = pipeline(
    "zero-shot-classification",
    model=model_name,
    multi_label=True,
    device="cuda:0" if torch.cuda.is_available() else None,
)

In [None]:
sequence_to_classify = (
    "Tiger Woods: Is this the end of his era? - CNN,Tiger Woods is the rarest of athletes. At his peak, he transcended his sport. People who couldn't care less about golf watched in their millions on Sunday afternoons to see him roar. So the 15-time major champ's announcement that he is calling time on life as a full-time pro feels like the end of an era. ",
    "golf, Tiger Woods: Is this the end of his era? - CNN,Is this the end of the Tiger Woods era?,This story was excerpted from the November 23 edition of CNN's Meanwhile in America, the daily email about US politics for global readers. Click here to read past editions and subscribe. (CNN)Tiger Woods is the rarest of athletes. At his peak, he transcended his sport. People who couldn't care less about golf watched in their millions on Sunday afternoons to see him roar. So the 15-time major champ's announcement that he is calling time on life as a full-time pro feels like the end of an era. Woods, who is recuperating from devastating leg injuries from a car crash, told Golf Digest he would have to be more selective about competition from now on. "
    "I think something that is realistic",
)

In [None]:
classifier(sequence_to_classify, candidate_labels, tokenizer=tokenizer)

## Functionize it

#### Reasoning for new model

https://huggingface.co/valhalla/distilbart-mnli-12-1 has 90% of the facebook/bart-large-mnli model's performance but is way faster

In [None]:
import pandas as pd
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer, pipeline

In [None]:
df = load_from_disk("bld/python/data/data_clean")
model_name = "facebook/bart-large-mnli"

In [None]:
model_name_2 = "valhalla/distilbart-mnli-12-1"

In [None]:
from transformers import AutoTokenizer


def zero_shot_labelling(data):
    """Tokenize ``data`` batch-wise with the distilbart-mnli-12-1 tokenizer.

    Loads the tokenizer for ``valhalla/distilbart-mnli-12-1`` and returns the
    result of ``data.map`` with ``_tokenize`` applied to batches of 16.
    """
    checkpoint = "valhalla/distilbart-mnli-12-1"
    mnli_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def encode_batch(batch):
        # Bind the freshly loaded tokenizer to the shared helper.
        return _tokenize(batch, mnli_tokenizer)

    return data.map(encode_batch, batched=True, batch_size=16)


# batch of 8: 47.4, padding = True
# batch of 16: 41.3, padding True
# batch of 16: 38.4, padding = "max_length"


def _tokenize(batch, tokenizer):
    return tokenizer(batch["Description"], padding=True, truncation=True, max_length=42)


# Call zero_shot_labelling function

In [None]:
# automodel = AutoModelForSequenceClassification.from_pretrained(model_name)
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer


def zero_shot_labelling(data):
    """Tokenize ``data`` batch-wise with the bart-large-mnli tokenizer.

    Args:
        data: Object exposing ``map`` (e.g. a ``datasets.Dataset``) whose
            batches contain a ``"Description"`` column.

    Returns:
        The result of ``data.map`` with ``_tokenize`` applied to batches of 16.
    """
    model_name = "facebook/bart-large-mnli"
    # ``_tokenize`` calls this object on raw text, so it has to be a tokenizer;
    # loading the classification model here would fail at call time.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return data.map(
        lambda batch: _tokenize(batch, tokenizer),
        batched=True,
        batch_size=16,  # adjust batch size
    )
# batch of 8: 47.4, padding = True
# batch of 16: 41.3, padding True
# batch of 16: 38.4, padding = "max_length"

def _tokenize(batch, tokenizer):
    return tokenizer(batch["Description"], padding="max_length", truncation=True)


# Call zero_shot_labelling function
df_encoded = zero_shot_labelling(df)

In [None]:
classifier = pipeline(
    "zero-shot-classification",
    model=model_name_2,
    multi_label=True,
    device="cuda:0" if torch.cuda.is_available() else None,
)

In [None]:
sequence_to_classify = (
    "Tiger Woods: Is this the end of his era? - CNN,Tiger Woods is the rarest of athletes. At his peak, he transcended his sport. People who couldn't care less about golf watched in their millions on Sunday afternoons to see him roar. So the 15-time major champ's announcement that he is calling time on life as a full-time pro feels like the end of an era. ",
    "golf, Tiger Woods: Is this the end of his era? - CNN,Is this the end of the Tiger Woods era?,This story was excerpted from the November 23 edition of CNN's Meanwhile in America, the daily email about US politics for global readers. Click here to read past editions and subscribe. (CNN)Tiger Woods is the rarest of athletes. At his peak, he transcended his sport. People who couldn't care less about golf watched in their millions on Sunday afternoons to see him roar. So the 15-time major champ's announcement that he is calling time on life as a full-time pro feels like the end of an era. Woods, who is recuperating from devastating leg injuries from a car crash, told Golf Digest he would have to be more selective about competition from now on. "
    "I think something that is realistic",
)

In [None]:
candidate_labels = ["labor supply", "labor demand", "government intervention"]

In [None]:
from datasets import Dataset, DatasetDict

df_try = df


def pd_to_dataset(data):
    """Convert a pandas DataFrame into a ``datasets.Dataset``.

    Args:
        data (pandas.DataFrame): The frame to convert.

    Returns:
        datasets.Dataset: The dataset built by ``Dataset.from_pandas``.
    """
    # Wrapping the dataset in a DatasetDict only to read the same entry back
    # returned this very object, so the conversion result is returned directly.
    return Dataset.from_pandas(data)


df_encoded = zero_shot_labelling(df_try)

In [None]:
df_encoded

In [None]:
classifier(df_encoded["Description"], candidate_labels, tokenizer=_tokenize)

Need to speed it up:
- Use a batch size of 8
- Padding can be reduced to speed up computation

## Approach to be faster

In [None]:
from huggingface_hub import scan_cache_dir

# Revision hashes to remove from the default Hugging Face cache. Each hash has
# to be its own argument: two adjacent string literals without a comma are
# concatenated by Python into a single (invalid) 80-character hash.
revisions_to_delete = (
    "81fd1d6e7847c99f5862c9fb81387956d99ec7aa",
    "e2983b237dccf3ab4937c97fa717319a9ca1a96d",
    "6c0e6080953db56375760c0471a8c5f2929baf11",
)
delete_strategy = scan_cache_dir().delete_revisions(*revisions_to_delete)
print("Will free " + delete_strategy.expected_freed_size_str)


delete_strategy.execute()

# Specify the directory you want to clear the cache for
cache_directory = "/path/to/your/cache/directory"

# The scan result offers no ``clear`` method; to empty the directory, collect
# every cached revision and delete them through a delete strategy instead.
cache_info = scan_cache_dir(cache_directory)
all_revisions = [
    revision.commit_hash for repo in cache_info.repos for revision in repo.revisions
]
cache_info.delete_revisions(*all_revisions).execute()

# New try

In [None]:
import pandas as pd
import torch

In [None]:
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, pipeline

df = load_from_disk("bld/python/data/data_clean")

In [None]:
df = zero_shot_labelling(df)

In [None]:
from transformers import AutoTokenizer


def zero_shot_labelling(data):
    """Map ``_tokenize`` over ``data`` using the distilbart-mnli-12-1 tokenizer.

    The tokenizer for ``valhalla/distilbart-mnli-12-1`` is loaded on every
    call; ``data.map`` is run in batched mode with a batch size of 16.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained("valhalla/distilbart-mnli-12-1")
    map_options = {"batched": True, "batch_size": 16}  # adjust batch size
    return data.map(
        lambda rows: _tokenize(rows, loaded_tokenizer),
        **map_options,
    )


# batch of 8: 47.4, padding = True
# batch of 16: 41.3, padding True
# batch of 16: 38.4, padding = "max_length"


def _tokenize(batch, tokenizer):
    return tokenizer(batch["Description"], padding=True, truncation=True, max_length=42)


# Call zero_shot_labelling function

In [None]:
candidate_labels = ["labor supply", "labor demand", "government intervention"]

In [None]:
model_name_2 = "valhalla/distilbart-mnli-12-1"

In [None]:
classifier = pipeline(
    "zero-shot-classification",
    model=model_name_2,
    multi_label=True,
    device="cuda:0" if torch.cuda.is_available() else None,
)

In [None]:
classifier(df["Description"], candidate_labels, tokenizer=_tokenize)

In [None]:
df

## just functions

In [None]:
import pandas as pd
import torch
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, pipeline

df = load_from_disk("bld/python/data/data_clean")

In [None]:
first_100_entries = df.select(range(100))

In [None]:
classif = zero_shot_classifier(first_100_entries)

In [None]:
import random

import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, pipeline

seed = 42
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)


def zero_shot_classifier(data):
    """Run zero-shot classification over the ``"Description"`` column of ``data``.

    ``data`` is first passed through ``_zero_shot_labelling``. A
    ``zero-shot-classification`` pipeline for ``valhalla/distilbart-mnli-12-6``
    is then built (multi-label, on ``cuda:0`` when available) and applied to
    the descriptions with three fixed candidate labels.
    """
    tokenized = _zero_shot_labelling(data)

    target_device = "cuda:0" if torch.cuda.is_available() else None
    nli_pipeline = pipeline(
        "zero-shot-classification",
        model="valhalla/distilbart-mnli-12-6",
        multi_label=True,
        device=target_device,
    )

    candidate_labels = ["labor supply", "labor demand", "government intervention"]
    # NOTE(review): ``_tokenize`` is a plain function, not a tokenizer object;
    # confirm that the pipeline call actually makes use of this keyword.
    return nli_pipeline(
        tokenized["Description"],
        candidate_labels,
        tokenizer=_tokenize,
    )


def _zero_shot_labelling(data):
    """Tokenize ``data`` in batches of 8 with the distilbart-mnli-12-6 tokenizer.

    Loads the tokenizer for ``valhalla/distilbart-mnli-12-6`` and returns the
    result of mapping ``_tokenize`` over ``data`` in batched mode.
    """
    mnli_tokenizer = AutoTokenizer.from_pretrained("valhalla/distilbart-mnli-12-6")

    def encode_batch(batch):
        return _tokenize(batch, mnli_tokenizer)

    return data.map(encode_batch, batched=True, batch_size=8)


def _tokenize(batch, tokenizer):
    """Define the tokenizer."""
    return tokenizer(
        batch["Description"],
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

# For reading to know

In [None]:
# run the previous
select_random_entries(df, num_entries=50, random_state=42)

# Training the model

TODO:
- Do the model's probabilities suit the task as they are, or should I transform them to 0 and 1?
- test and training separation
- model selection
- put the head on it

In [None]:
from datasets import load_from_disk

# Forward slashes keep the path usable on every operating system (a backslash
# is only a path separator on Windows) and match the other paths in this notebook.
df = load_from_disk("bld/python/labelled/data_labelled_subset")

In [None]:
df

In [None]:
data = create_dataset_dict_2(df)

In [None]:
data.save_to_disk("bld/python/TrainTest")

In [None]:
import datasets


def _split_dataset(df):
    """Shuffle ``df`` and split it 80/10/10 into train, validation and test sets.

    Args:
        df: Dataset supporting ``shuffle(seed=...)``, ``len`` and slicing, with
            the columns ``"sequence"`` and ``"labels"``.

    Returns:
        dict: The keys ``"train_dataset"``, ``"val_dataset"`` and
        ``"test_dataset"`` mapped to datasets in which ``"sequence"`` has been
        renamed to ``"text"`` and ``"labels"`` to ``"label"``.
    """
    # Fixed seed so that the split is reproducible.
    shuffled = df.shuffle(seed=42)

    total_size = len(shuffled)
    train_end = int(0.8 * total_size)
    val_end = train_end + int(0.1 * total_size)

    # The test split takes whatever remains after train and validation.
    column_slices = {
        "train_dataset": shuffled[:train_end],
        "val_dataset": shuffled[train_end:val_end],
        "test_dataset": shuffled[val_end:],
    }

    return {
        split_name: (
            datasets.Dataset.from_dict(columns)
            .rename_column("sequence", "text")
            .rename_column("labels", "label")
        )
        for split_name, columns in column_slices.items()
    }


def create_dataset_dict_2(df):
    """Split ``df`` via ``_split_dataset`` and wrap the result in a ``DatasetDict``."""
    splits = _split_dataset(df)
    dataset_dict = datasets.DatasetDict(splits)
    return dataset_dict


# Example usage:
# df = Your existing dataset

# Multilabel Problem

In [None]:
import pandas as pd
import torch
from datasets import load_from_disk
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
)

In [None]:
df = load_from_disk("bld/python/TrainTest/TrainTest_data/")

In [None]:
df["train_dataset"][0]

In [None]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    model_ckpt,
    problem_type="multi_label_classification",
)

In [None]:
df

In [None]:
def tokenize_and_encode(df):
    """Tokenize the ``"text"`` entries of ``df`` with truncation enabled.

    Uses the notebook-level ``tokenizer``.
    """
    texts = df["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
cols = df["train_dataset"].column_names
cols

In [None]:
cols = df["train_dataset"].column_names
cols.remove("scores")
df_enc = df.map(tokenize_and_encode, batched=True, remove_columns=cols)
df_enc

Scores and labels are badly named

In [None]:
# cast label IDs to floats
import torch

df_enc.set_format("torch")
df_enc = df_enc.map(
    lambda x: {"float_labels": x["scores"].to(torch.float)},
    remove_columns=["scores"],
).rename_column("float_labels", "scores")

In [None]:
# Resolve the target device inside this cell so that it also runs on a fresh
# kernel; ``device`` was otherwise only assigned in a later cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the tokenizer above is loaded from ``model_ckpt``
# ("distilbert-base-uncased") while this model is "bert-base-uncased" —
# confirm that the mismatch is intended.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=1,
).to(device)

In [None]:
df_enc

In [None]:
args = TrainingArguments(".", num_train_epochs=1)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=df_enc["train_dataset"],
    eval_dataset=df_enc["val_dataset"],
    tokenizer=tokenizer,
)

In [None]:
from torch import nn

In [None]:
df_enc["train_dataset"]["input_ids"]

In [None]:
# Convert input lists to tensors
input_ids = torch.tensor(df_enc["train_dataset"]["input_ids"])
attention_mask = torch.tensor(df_enc["train_dataset"]["attention_mask"])

# Forward pass through the model to get logits
logits = model(input_ids, attention_mask=attention_mask)

# Compute the loss
loss = criterion(
    logits.logits,
    scores.float(),
)  # Use logits.logits to access the raw logits

# Apply a threshold to the logits to determine class predictions (e.g., 0.5)
threshold = 0.5
predictions = (torch.sigmoid(logits.logits) >= threshold).int()

# 'predictions' now contains the predicted classes for each ex

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
df

In [None]:
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=df["train_dataset"],
    eval_dataset=df["val_dataset"],
)

In [None]:
trainer.train()

### Additional model / CURRENT ISSUE

In [None]:
df = load_from_disk("bld/python/TrainTest/TrainTest_data/")

In [None]:
df

In [None]:
df["train_dataset"][0]

In [None]:
import pandas as pd
import torch
from datasets import DatasetDict
from torch import nn
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Initialize a pre-trained tokenizer and model
model_name = "bert-base-uncased"  # You can change this to your desired model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)

# Assuming df_enc contains the dataset in the correct format
# df_enc should look like this:
# DatasetDict({
#     train_dataset: Dataset({
#         features: ['input_ids', 'attention_mask', 'scores'],
#     val_dataset: Dataset({
#         features: ['input_ids', 'attention_mask', 'scores'],
#     test_dataset: Dataset({
#         features: ['input_ids', 'attention_mask', 'scores'],

# Define the loss function for multi-label classification (e.g., BCEWithLogitsLoss)
criterion = nn.BCEWithLogitsLoss()

# Convert text data to input tensors using the tokenizer.
# The encodings get their own variables: writing them back into ``df`` would
# replace the datasets, after which the "scores" column read below (and the
# "text" column on a re-run of this cell) would no longer be available.
train_encodings = tokenizer(
    df["train_dataset"]["text"],
    padding=True,
    truncation=True,
    return_tensors="pt",
)
val_encodings = tokenizer(
    df["val_dataset"]["text"],
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Forward pass through the model to get logits
input_ids = train_encodings["input_ids"]
attention_mask = train_encodings["attention_mask"]
logits = model(input_ids, attention_mask=attention_mask)

# Assuming 'scores' is already in the correct format
scores = torch.tensor(df["train_dataset"]["scores"], dtype=torch.float32)

# Compute the loss
loss = criterion(logits.logits, scores)

# Define your training arguments and trainer and train the model
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,  # Adjust as needed
    evaluation_strategy="steps",
    eval_steps=500,  # Adjust as needed
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=None,  # You can specify a data collator if needed
    train_dataset=df_enc["train_dataset"],  # Use your train_dataset here
    eval_dataset=df_enc["val_dataset"],  # Use your val_dataset here
)

trainer.train()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Assuming df contains your dataset
labels = df["train_dataset"]["label"]

mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(labels)

In [None]:
import torch
from sklearn.metrics import f1_score, precision_score, recall_score
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)

# Load the pre-trained model and tokenizer
model_name = "bert-base-cased"
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(df["train_dataset"]["scores"][0]),
)
tokenizer = BertTokenizer.from_pretrained(model_name)


# Function to preprocess the dataset and return it in the required format
def preprocess_function(examples):
    """Tokenize ``examples["text"]`` and attach the scores as ``labels``.

    Args:
        examples: Mapping with the keys ``"text"`` and ``"scores"``, as handed
            over by ``Dataset.map``.

    Returns:
        dict: ``input_ids`` and ``attention_mask`` from the tokenizer (padded or
        truncated to 128 tokens) plus ``labels`` holding the scores as a
        float32 tensor.
    """
    # No ``return_tensors="pt"`` here: ``map`` is called per example below, and
    # tensor output would wrap every single example in an extra batch
    # dimension (shape (1, 128) instead of (128,)).
    inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    # Convert scores to a tensor (assuming scores are already in the correct format)
    scores = torch.tensor(examples["scores"], dtype=torch.float32)

    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "labels": scores,
    }


# Preprocess the datasets
train_dataset = df["train_dataset"].map(preprocess_function)
val_dataset = df["val_dataset"].map(preprocess_function)
test_dataset = df["test_dataset"].map(preprocess_function)

# Define your training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=500,
    save_total_limit=2,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to="tensorboard",
)


# Define a function to compute metrics
def compute_metrics(p):
    """Compute micro-averaged F1, precision and recall for a prediction object.

    Logits in ``p.predictions`` greater than 0 count as a positive prediction;
    they are compared against ``p.label_ids``.
    """
    binary_predictions = (p.predictions > 0).astype(int)
    references = p.label_ids
    micro_scores = {
        "f1_micro": f1_score(references, binary_predictions, average="micro"),
        "precision_micro": precision_score(
            references,
            binary_predictions,
            average="micro",
        ),
        "recall_micro": recall_score(references, binary_predictions, average="micro"),
    }
    return micro_scores


# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Real Model

In [None]:
import pandas as pd
import torch
from datasets import load_from_disk
from transformers import pipeline

In [None]:
df = load_from_disk("bld/python/TrainTest/TrainTest_data/")

In [None]:
df

In [None]:
train_labels = df["train_dataset"].select_columns(["scores"])

In [None]:
model_name = "bert-base-cased"

In [None]:
df

In [None]:
classifier = pipeline(
    "text-classification",
    model=model_name,
)

In [None]:
# preprocess because values are apparently string and not int

model_output = pd.DataFrame(classifier(df["val_dataset"]["text"]))

In [None]:
max_length = 512


def tokenize_data(df):
    """Tokenize ``df["text"]``, padding and truncating to ``max_length`` tokens.

    Uses the notebook-level ``tokenizer`` and ``max_length``.
    """
    options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
    }
    return tokenizer(df["text"], **options)

In [None]:
from transformers import AutoTokenizer

model_name = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with padding and truncation.

    Uses the notebook-level ``tokenizer``.
    """
    texts = batch["text"]
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
    )


df_encoded = df.map(tokenize, batched=True, batch_size=None)
df_encoded.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"],
)
df_encoded.set_format("torch")
# df_encoded

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
from transformers import AutoModelForSequenceClassification

num_labels = 6
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
).to(device)

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer", num_train_epochs=5)

In [None]:
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(pred):
    """Return accuracy and weighted F1 for a ``(logits, labels)`` pair.

    The predicted class of each row is the argmax over the last logits axis.
    """
    logits, labels = pred
    predicted_classes = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted_classes),
        "f1": f1_score(labels, predicted_classes, average="weighted"),
    }

In [None]:
df

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=df_encoded["train_dataset"],
    eval_dataset=df_encoded["val_dataset"],
)

In [None]:
trainer.train()

# ERROR FIXES

In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi


def authenticate_to_kaggle():
    """Create a ``KaggleApi`` client, authenticate it and return it."""
    kaggle_client = KaggleApi()
    kaggle_client.authenticate()
    return kaggle_client

In [None]:
api = authenticate_to_kaggle()
dataset = "hadasu92/cnn-articles-after-basic-cleaning"
api.dataset_download_files(dataset)

In [None]:
import kaggle

kaggle.api.dataset_download_files(
    dataset,
    path="./bld/python/data",
    unzip=True,
    quiet=False,
)

In [None]:
print(api.dataset_download_files(dataset))

In [None]:
task_load_data_python(path)

In [None]:
path = "bld/python/data/"

In [None]:
import zipfile


def task_load_data_python(path):
    """Download the CNN articles dataset from Kaggle and extract it into ``path``.

    The dataset is downloaded through an authenticated Kaggle client; the
    archive ``cnn-articles-after-basic-cleaning.zip`` is then opened relative
    to the current working directory and extracted.

    Open question from the author: the download takes up to 5 minutes — is
    that due to the internet connection or to the code?

    """
    dataset_slug = "hadasu92/cnn-articles-after-basic-cleaning"
    archive_name = "cnn-articles-after-basic-cleaning.zip"

    kaggle_client = authenticate_to_kaggle()
    kaggle_client.dataset_download_files(dataset_slug)

    with zipfile.ZipFile(archive_name, "r") as archive:
        archive.extractall(path)

In [None]:
import pandas as pd

In [None]:
depends_on = r"src\EN\data_management\data_info.yaml"

In [None]:
# Forward slashes keep the paths usable on every operating system.
articles_csv_1 = "bld/python/data/CNN_Articels_clean/CNN_Articels_clean.csv"
articles_csv_2 = "bld/python/data/CNN_Articels_clean_2/CNN_Articels_clean.csv"


df_1 = pd.read_csv(articles_csv_1)  # need to delete cache here
df_2 = pd.read_csv(articles_csv_2)
data = clean_data(df_1, df_2)
# ``save_to_disk`` needs a target directory; this is the location the other
# cells load the cleaned data from.
data.save_to_disk("bld/python/data/data_clean")

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict


def clean_data(data_1, data_2):
    """Merge two article data sets, clean them and convert them to a dataset.

    Information on the data columns is stored in
    ``data_management/data_info.yaml``; the columns described there are
    'Index', 'Author', 'Date published', 'Category', 'Section', 'url',
    'Headline', 'Description', 'Keywords', 'Second headline' and
    'Article text'.

    Args:
        data_1 (pandas.DataFrame): First data set.
        data_2 (pandas.DataFrame): Second data set with the same columns.

    Returns:
        The value returned by ``_pd_to_dataset`` for the merged frame after
        ``_drop_columns`` has been applied.

    Raises:
        ValueError: If the two frames do not have the same set of columns.

    """
    same_columns = set(data_1.columns) == set(data_2.columns)
    if not same_columns:
        raise ValueError("Both datasets must have the same columns.")

    # Stack the rows of both frames on top of each other.
    stacked = pd.concat([data_1, data_2], axis=0)
    # put this into task
    cleaned = _drop_columns(stacked)
    return _pd_to_dataset(cleaned)


def _drop_columns(data):
    """Drop columns from data set.

    Args:
        data (pandas.DataFrame): The data set.
        columns_to_drop (list): List of columns to drop.

    Returns:
        pandas.DataFrame: The data set without the dropped columns.

    """
    data = data.drop(columns=data["Headline"])
    data = data.dropna()
    return data[~data["Category"].isin(data["sport"])]


def _pd_to_dataset(data):
    """Convert a pandas DataFrame into a ``datasets.Dataset``.

    Args:
        data (pandas.DataFrame): The frame to convert.

    Returns:
        datasets.Dataset: The dataset built by ``Dataset.from_pandas``.
    """
    # Storing the dataset in a DatasetDict just to read the same key back
    # yielded this very object, so it is returned directly.
    return Dataset.from_pandas(data)

In [None]:
@pytask.mark.depends_on(
    {
        "scripts": ["load_data.py"],
    },
)
@pytask.mark.task
@pytask.mark.produces(
    BLD / "python" / "data",
)  # / "cnn-articles-after-basic-cleaning.zip"
def task_load_data(produces):
    """Download the CNN articles dataset from Kaggle and unzip it into ``produces``.

    Args:
        produces: Target directory handed in by pytask.
    """
    api = authenticate_to_kaggle()
    dataset = "hadasu92/cnn-articles-after-basic-cleaning"
    # A single download with ``unzip=True`` is sufficient. The previous version
    # downloaded the archive twice and then opened ``produces`` (a directory)
    # in read mode to write to it, which cannot succeed.
    api.dataset_download_files(
        dataset,
        path=produces,
        unzip=True,
        quiet=False,
    )
        # @pytask.mark.depends_on(BLD / "python" / "data" / "cnn-articles-after-basic-cleaning.zip")


# @pytask.mark.produces(BLD / "python" / "data" / "cnn-articles-after-basic-cleaning.zip")
# def task_unzip(produces):
#    with zipfile.ZipFile("cnn-articles-after-basic-cleaning.zip", "r") as zip_ref:

# CHATGPT

In [None]:
import pandas as pd
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer, pipeline

In [None]:
df = load_from_disk("bld/python/TrainTest/TrainTest_data/")

In [None]:
model_name = "bert-base-uncased"

In [None]:
classifier = pipeline(
    "text-classification",
    model=model_name,
)

In [None]:
model_output = pd.DataFrame(classifier(df["val_dataset"]["text"]))

In [None]:
from datasets import load_from_disk
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name, return_dict=True)


def tokenize(batch):
    """Encode ``batch["text"]`` with the notebook-level ``tokenizer``.

    Padding and truncation are both switched on.
    """
    encoding_options = {"padding": True, "truncation": True}
    return tokenizer(batch["text"], **encoding_options)


df_encoded = df.map(tokenize, batched=True, batch_size=None)
df_encoded.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"],
)
df_encoded.set_format("torch")
df_encoded = df_encoded.remove_columns("classes")

In [None]:
df_encoded

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from transformers import AutoModelForSequenceClassification

model_config = AutoConfig.from_pretrained(model_name, return_dict=True)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=model_config,
).to(device)

Looks good up to here.

- 1: check from here onwards; it worked up to training

- 2: check from here onwards; it worked up to training

In [None]:
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(pred):
    """Compute accuracy and weighted F1 from a ``(logits, labels)`` pair.

    Class predictions are taken as the argmax over the last axis of the logits.
    """
    logits, labels = pred
    class_ids = logits.argmax(axis=-1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(labels, class_ids)
    metrics["f1"] = f1_score(labels, class_ids, average="weighted")
    return metrics

In [None]:
from transformers import TrainingArguments

batch_size = 8
logging_steps = len(df_encoded["train_dataset"]) // batch_size

training_args = TrainingArguments(
    output_dir="results",
    optim="adamw_torch",
    per_device_train_batch_size=batch_size,
    num_train_epochs=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=df_encoded["train_dataset"],
    eval_dataset=df_encoded["val_dataset"],
)

In [None]:
trainer

In [None]:
df_encoded

In [None]:
df_encoded["val_dataset"]["label"]

In [None]:
trainer.train()

another try


In [None]:
df

In [None]:
import pandas as pd
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer

# Define your model name
model_name = "bert-base-uncased"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, return_dict=True)


# Tokenize the dataset
def tokenize_text(example):
    """Tokenize ``example["text"]`` with padding and truncation enabled.

    Uses the notebook-level ``tokenizer``.
    """
    text = example["text"]
    return tokenizer(
        text,
        padding=True,
        truncation=True,
    )


df_encoded = df.map(tokenize_text, batched=True)
df_encoded.set_format("pandas")


# Convert labels to integer class indices
def convert_labels(dataset):
    """Replace every ``"label"`` entry by the index of its largest value.

    The ``"label"`` column of ``dataset`` is overwritten in place and the same
    object is returned.
    """

    def _argmax_index(values):
        return torch.argmax(torch.tensor(values)).item()

    dataset["label"] = dataset["label"].apply(_argmax_index)
    return dataset


df_encoded["train_dataset"] = convert_labels(df_encoded["train_dataset"])
df_encoded["val_dataset"] = convert_labels(df_encoded["val_dataset"])
df_encoded["test_dataset"] = convert_labels(df_encoded["test_dataset"])

# Convert tokenized text data to numerical features using CountVectorizer
vectorizer = CountVectorizer(
    max_features=1000,
)  # You can adjust the number of features as needed
X_train = vectorizer.fit_transform(df_encoded["train_dataset"]["text"])
X_val = vectorizer.transform(df_encoded["val_dataset"]["text"])
X_test = vectorizer.transform(df_encoded["test_dataset"]["text"])

# Create and train a Random Forest classifier
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)  # You can adjust hyperparameters as needed
clf.fit(X_train, df_encoded["train_dataset"]["label"])

# Make predictions on the validation set
val_preds = clf.predict(X_val)

# Evaluate the model
val_labels = df_encoded["val_dataset"]["label"]
val_f1 = f1_score(val_labels, val_preds, average="weighted")
val_accuracy = accuracy_score(val_labels, val_preds)

print(f"Validation F1 Score: {val_f1}")
print(f"Validation Accuracy: {val_accuracy}")

# Zusatz von den model_pad

In [None]:
class BertForMultilabelSequenceClassification(BertForSequenceClassification):
    """BERT sequence classifier that computes its loss with ``BCEWithLogitsLoss``.

    ``forward`` feeds the inputs through ``self.bert``, applies dropout and the
    classification head to element 1 of the encoder output and, when ``labels``
    are given, computes a binary cross-entropy-with-logits loss over
    ``self.num_labels`` outputs.
    """

    def __init__(self, config):
        super().__init__(config)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Return the logits and, if ``labels`` is given, the loss.

        With ``return_dict`` enabled (explicitly or through the config) a
        ``SequenceClassifierOutput`` is returned, otherwise a tuple that starts
        with the loss when one was computed.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the encoder output serves as the pooled representation.
        logits = self.classifier(self.dropout(encoder_outputs[1]))

        if labels is None:
            loss = None
        else:
            bce_with_logits = torch.nn.BCEWithLogitsLoss()
            loss = bce_with_logits(
                logits.view(-1, self.num_labels),
                labels.float().view(-1, self.num_labels),
            )

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        output = (logits,) + encoder_outputs[2:]
        return output if loss is None else (loss, *output)

### Subset Selection

In [None]:
import numpy as np
import pandas as pd
from datasets import load_from_disk

In [None]:
# Read benchmark.csv from the build directory (the same path that the pytask
# task kept at the end of this notebook declares as its product).
benchmark = pd.read_csv("bld/python/data/benchmark.csv")

In [None]:
# Display the "__index_level_0__" column of the benchmark frame.
benchmark["__index_level_0__"]

In [None]:
# Display the (rows, columns) shape of the benchmark frame.
benchmark.shape

In [None]:
# Read the seed-42 classification CSV. The relative path uses forward
# slashes so it resolves on POSIX systems as well as on Windows; the previous
# backslash-separated form only worked on Windows.
adwaw = pd.read_csv("src/EN/data/seed_42_classification.csv")

In [None]:
# Display the frame read from seed_42_classification.csv.
adwaw

In [None]:
# Load the dataset stored on disk at bld/python/data/data_clean.
whole_data = load_from_disk("bld/python/data/data_clean")

In [None]:
# Convert the loaded dataset into a pandas DataFrame.
large_df = pd.DataFrame(whole_data)

In [None]:
# Draw the random subset using the function's default arguments.
# NOTE(review): in the part of the notebook reviewed here,
# select_random_entries is only defined in later cells. Unless it is also
# defined or imported further up, this call raises NameError on a fresh
# kernel run top to bottom - confirm and move the definition above this cell.
df = select_random_entries(large_df)

In [None]:
# Display the "__index_level_0__" column of the selected subset.
df["__index_level_0__"]

In [None]:
def select_random_entries(dataframe, num_entries=50, random_state=42):
    """Pick rows of a table at random via its ``"__index_level_0__"`` column.

    NOTE(review): a later cell of this notebook defines a function with the
    same name; once that cell runs, it replaces this definition.

    Parameters:
        dataframe: Anything ``pd.DataFrame`` accepts; it must contain a
            ``"__index_level_0__"`` column.
        num_entries (int): How many identifiers to draw (default is 50).
        random_state (int or None): Seed passed to ``np.random.seed`` before
            drawing (default is 42). With ``None`` the global NumPy random
            state is used as it currently is.

    Returns:
        pd.DataFrame: The rows whose ``"__index_level_0__"`` value is among the
        drawn identifiers, in their original row order.

    Raises:
        ValueError: If ``num_entries`` is larger than the number of rows.

    """
    frame = pd.DataFrame(dataframe)

    # The global NumPy generator is (re)seeded first, even if the size check
    # below ends up raising.
    if random_state is not None:
        np.random.seed(random_state)

    if len(frame) < num_entries:
        raise ValueError(
            "Number of entries to select cannot exceed the total number of rows.",
        )

    # Draw identifiers without replacement, then keep every matching row.
    id_column = frame["__index_level_0__"]
    chosen_ids = np.random.choice(id_column, size=num_entries, replace=False)
    keep_row = id_column.isin(chosen_ids)
    return frame.loc[keep_row]

In [None]:
import numpy as np
import pandas as pd


def select_random_entries(dataframe, num_entries=50, random_state=42):
    """Return exactly ``num_entries`` randomly chosen rows of ``dataframe``.

    Identifiers are drawn from the ``"__index_level_0__"`` column with
    ``np.random.choice`` and the matching rows are then passed through
    ``DataFrame.sample`` so that the result has exactly ``num_entries`` rows
    (in shuffled order).

    Parameters:
        dataframe: Anything ``pd.DataFrame`` accepts; it must contain a
            ``"__index_level_0__"`` column.
        num_entries (int): The number of rows to return (default is 50).
        random_state (int or None): Seed for reproducibility (default is 42).
            When not ``None`` it is passed to ``np.random.seed`` (which
            reseeds NumPy's global generator) and to ``DataFrame.sample``.

    Returns:
        pd.DataFrame: A DataFrame containing exactly ``num_entries`` rows.

    Raises:
        ValueError: If ``num_entries`` exceeds the number of rows.

    """
    dataframe = pd.DataFrame(dataframe)

    if random_state is not None:
        np.random.seed(random_state)

    if num_entries > len(dataframe):
        msg = "Number of entries to select cannot exceed the total number of rows."
        raise ValueError(msg)

    if len(dataframe) == num_entries:
        # Every row is needed, so no identifiers have to be drawn.
        candidates = dataframe
    else:
        drawn_ids = np.random.choice(
            dataframe["__index_level_0__"],
            size=num_entries,
            replace=False,
        )
        # The filter keeps every row whose identifier was drawn. That is a
        # superset of the num_entries drawn rows, so there are always at least
        # num_entries candidates and no top-up loop is required (the former
        # "select more" loop could never execute).
        candidates = dataframe[dataframe["__index_level_0__"].isin(drawn_ids)]

    # Trim to exactly num_entries rows (relevant when identifiers repeat) and
    # shuffle the result.
    return candidates.sample(n=num_entries, random_state=random_state)

In [None]:
# Kept for possible later use.


@pytask.mark.depends_on(
    {
        "scripts": ["clean_data.py"],
        "data_info": SRC / "data_management" / "data_info.yaml",
        "Article_1": (
            BLD / "python" / "data" / "CNN_Articels_clean" / "CNN_Articels_clean.csv"
        ),
        "Article_2": (
            BLD / "python" / "data" / "CNN_Articels_clean_2" / "CNN_Articels_clean.csv"
        ),
        "Seed42_hand_classification": SRC / "data" / "seed_42_classification.csv",
    },
)
@pytask.mark.produces(BLD / "python" / "data" / "benchmark.csv")
def task_select_data(depends_on, produces):
    """Clean the two article CSVs, keep 50 random entries and write them out.

    Reads the "Article_1" and "Article_2" CSV files and the "data_info" YAML
    from ``depends_on``, runs ``clean_data`` on them, selects 50 entries with
    ``select_random_entries`` (seed 42) and saves the result as CSV to
    ``produces``.

    NOTE(review): "Seed42_hand_classification" is declared as a dependency but
    is never read in this function, so no hand classification is added here -
    confirm whether that step is still missing.
    """
    articles_first = pd.read_csv(depends_on["Article_1"])  # need to delete cache here
    articles_second = pd.read_csv(depends_on["Article_2"])
    cleaning_info = read_yaml(depends_on["data_info"])

    cleaned = clean_data(articles_first, articles_second, cleaning_info)
    benchmark_subset = select_random_entries(cleaned, num_entries=50, random_state=42)

    benchmark_subset.to_csv(produces)

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
conda-lock 2.1.2 requires click>=8.0, but you have click 7.1.2 which is incompatible.