In [1]:
pip install transformers


[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)


In [3]:
MAX_LEN = 512
roberta_checkpoint = "roberta-large"
mistral_checkpoint = "mistralai/Mistral-7B-v0.1"
llama_checkpoint = "Mikael110/llama-2-7b-guanaco-fp16"

In [5]:
pip install accelerate -U

[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
from transformers import AutoModelForSequenceClassification # Load a pre-trained model with a sequence classification header
import torch
llama_model =  AutoModelForSequenceClassification.from_pretrained(
  pretrained_model_name_or_path=llama_checkpoint,
  num_labels=3,
  device_map="auto",
  offload_folder="offload",
  trust_remote_code=True
)

  return self.fget.__get__(instance, owner)()
Loading checkpoint shards:  50%|█████     | 1/2 [00:05<00:05,  5.42s/it]


KeyboardInterrupt: 

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(llama_checkpoint, use_fast = True)


In [8]:
pip install scikit-learn

[0mNote: you may need to restart the kernel to use updated packages.


In [9]:
pip install pandas

[0mNote: you may need to restart the kernel to use updated packages.


In [6]:
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [7]:
df = pd.read_csv("cleaned_data.csv", encoding='ISO-8859-1')


In [8]:
df['text'].fillna("Missing text", inplace=True)  # Replace nulls with a placeholder string

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['text'].fillna("Missing text", inplace=True)  # Replace nulls with a placeholder string


In [9]:
# Check data types in the text column
print(df['text'].apply(type).value_counts())

text
<class 'str'>    38448
Name: count, dtype: int64


In [10]:
from sklearn.model_selection import train_test_split

# Split data into training and remaining data
train_df, remaining_df = train_test_split(df, test_size=0.2, random_state=42)

# Split remaining data into validation and test sets
val_df, test_df = train_test_split(remaining_df, test_size=0.5, random_state=42)

In [11]:
import pandas as pd
import numpy as np

# Display unique values before cleaning
print("Unique labels before cleaning:", train_df['label'].unique())

# Clean labels: Only keep valid categories, set others to NaN
valid_labels = ['Left Wing', 'Right Wing', 'Neutral']
train_df['label'] = train_df['label'].apply(lambda x: x if x in valid_labels else np.nan)
val_df['label'] = val_df['label'].apply(lambda x: x if x in valid_labels else np.nan)
test_df['label'] = test_df['label'].apply(lambda x: x if x in valid_labels else np.nan)

# Option to drop NaNs if your dataset allows
# train_df.dropna(subset=['label'], inplace=True)
# val_df.dropna(subset=['label'], inplace=True)
# test_df.dropna(subset=['label'], inplace=True)

# Display unique values after cleaning
print("Unique labels after cleaning:", train_df['label'].unique())

Unique labels before cleaning: ['Left Wing' 'Neutral' 'Right Wing' nan ' whenever I leave the West'
 '01/25/2022 18:45:00'
 ' and that I may be better off then they are because I still have elders that I can go to who will make me feel at home for a while as they cleanse me. Sometimes I find myself wondering']
Unique labels after cleaning: ['Left Wing' 'Neutral' 'Right Wing' nan]


In [12]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# Fit the encoder on the training data and transform all datasets
train_df['label'] = encoder.fit_transform(train_df['label'].astype(str))
val_df['label'] = encoder.transform(val_df['label'].astype(str))
test_df['label'] = encoder.transform(test_df['label'].astype(str))

# Check transformed labels
print("Encoded labels:", train_df['label'].unique())

Encoded labels: [0 1 2 3]


In [13]:
import torch

def tokenize_data(df):
    texts = df['text'].astype(str).tolist()  # Ensure text data is in string format
    labels = df['label'].tolist()  # Extract labels from the DataFrame

    # Tokenize the text data and ensure it is properly padded and truncated
    tokenized = tokenizer(texts, padding="max_length", truncation=True, max_length=256, return_tensors="pt")

    # Add the labels to the tokenized data structure
    tokenized['labels'] = torch.tensor(labels, dtype=torch.long)  # Ensure labels are in a tensor format

    return tokenized


In [14]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])


In [15]:
from torch.utils.data import DataLoader

train_dataset = tokenize_data(train_df)  # Assuming train_df is your DataFrame with training data
train_dataset = TextDataset(train_dataset)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [16]:
# model_id = 'Mikael110/llama-2-7b-guanaco-fp16'

In [17]:
from transformers import AutoConfig

# Manually define class names if they are known
class_names = ['LeftWing', 'Neutral', 'RightWing', 'nan']  # replace with your actual class names
num_labels=len(class_names)
# Create id2label mapping
id2label = {i: name for i, name in enumerate(class_names)}
config = AutoConfig.from_pretrained(model_id, num_labels=len(class_names), id2label=id2label)
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

number of labels: 4
the labels: ['LeftWing', 'Neutral', 'RightWing', 'nan']




In [18]:
repository_id = 'harshal-11/Llama-7b-PoliticalBias-Finetune'

In [19]:
#Define evaluation metrics
import evaluate
import numpy as np
def compute_metrics(eval_pred):
    # All metrics are already predefined in the HF `evaluate` package
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")
    f1_metric= evaluate.load("f1")
    accuracy_metric = evaluate.load("accuracy")

    logits, labels = eval_pred # eval_pred is the tuple of predictions and labels returned by the model
    predictions = np.argmax(logits, axis=-1)
    precision = precision_metric.compute(predictions=predictions, references=labels)["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels)["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    # The trainer is expecting a dictionary where the keys are the metrics names and the values are the scores. 
    return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy}

In [38]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=repository_id,  # Ensure this directory exists or is creatable
    num_train_epochs=5,
    per_device_train_batch_size=8,
    
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir=f'./{repository_id}/logs',  # Ensure this directory exists or is creatable
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to = "none"
    # sharded_ddp = False,
    # Change to "tensorboard" if you want to use TensorBoard
    # push_to_hub=False,  # Set to True if you want to push to Hugging Face Hub
    # Uncomment and add your Hugging Face Hub credentials and configurations if needed
)



In [21]:
val_dataset = TextDataset(val_df)

In [34]:
from transformers import AutoModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_id)


In [39]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [36]:
trainer.train()

/opt/conda/conda-bld/pytorch_1702400410390/work/aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.
/opt/conda/conda-bld/pytorch_1702400410390/work/aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [2,0,0] Assertion `t >= 0 && t < n_classes` failed.


RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasGemmEx( handle, opa, opb, m, n, k, &falpha, a, CUDA_R_16F, lda, b, CUDA_R_16F, ldb, &fbeta, c, CUDA_R_16F, ldc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)`

In [25]:
import torch
torch.cuda.empty_cache()
