In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from IPython.display import clear_output
!pip install peft==0.8.2
!pip install bitsandbytes==0.42.0
!pip install accelerate==0.26.1
!pip install datasets==2.16.1
!pip install GPUtil
!pip install transformers==4.38.0
clear_output()

In [None]:
import os
import warnings

from transformers import set_seed

# --- Reproducibility -------------------------------------------------------
SEED = 123
set_seed(SEED)

# --- Runtime noise ---------------------------------------------------------
warnings.filterwarnings('ignore')
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# --- Paths -----------------------------------------------------------------
# NOTE(review): the data-loading cells read from '/kaggle/input/banfake2',
# not from INPUT_DIR - confirm which dataset directory is intended.
INPUT_DIR = '/kaggle/input/banfake'
DIR = '/kaggle/working/'

# --- Problem size ----------------------------------------------------------
NUM_WORKERS = os.cpu_count()
NUM_CLASSES = 2

# --- Training / LoRA hyperparameters ---------------------------------------
EPOCHS = 5
R = 64
LORA_ALPHA = 32
LORA_DROPOUT = 0.1
BATCH_SIZE = 2

# Identifier handed to the from_pretrained() calls below. It is left empty in
# this notebook and has to be filled in before the notebook can run.
MODEL_ID = ''  # model name

## Load the Dataset

In [None]:
from datasets import load_dataset, load

# CSV file behind each split; the evaluation file is exposed under the 'test' key.
train_file_path = "/kaggle/input/banfake2/train.csv"
eval_file_path = '/kaggle/input/banfake2/eval.csv'
split_files = {'train': train_file_path, 'test': eval_file_path}

dataset = load_dataset('csv', data_files=split_files)

# Column renames the author left disabled:
# dataset = dataset.rename_column("Label", "label")
# dataset = dataset.rename_column("Data", "text")

# Show the split structure, then one row per split and the first five training rows.
print(dataset, dataset.keys())
dataset["train"][0], dataset['test'][0], dataset['train'][:5]

## Baseline Accuracy

In [None]:
from collections import Counter

train_split = dataset['train']
test_split = dataset['test']

# Split sizes and per-label frequencies.
train_len = len(train_split)
test_len = len(test_split)
train_dataset_label_counts = Counter(train_split['label'])
test_dataset_label_counts = Counter(test_split['label'])

print(f"Train dataset: {train_len} samples, {train_dataset_label_counts}")
print(f"Test dataset: {test_len} samples, {test_dataset_label_counts}")

# (label, count) of the most frequent label in the test split; always
# predicting that label yields the accuracy printed below.
test_majority_class = test_dataset_label_counts.most_common(1)[0]
majority_count = test_majority_class[1]
baseline_accuracy = majority_count / test_len

print(f"Baseline accuracy: {baseline_accuracy:.2%}")

## Tokenization

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to MODEL_ID.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Reuse the end-of-sequence token as the padding token (the alternative of
# adding a dedicated '[PAD]' token is kept below, disabled).
tokenizer.pad_token = tokenizer.eos_token
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Show which side padding is applied on and which token is used for it.
print(tokenizer.padding_side, tokenizer.pad_token)

In [None]:
# Every example is truncated to this many tokens.
MAX_TOKENS = 20


def _tokenize_batch(batch):
    """Tokenize the "text" column of a batch, truncating to MAX_TOKENS tokens."""
    return tokenizer(batch["text"], truncation=True, max_length=MAX_TOKENS)


# Apply the tokenizer to every split in batched mode.
tokenized_dataset = {
    split: dataset[split].map(_tokenize_batch, batched=True)
    for split in dataset.keys()
}

tokenized_dataset["train"], tokenized_dataset["test"]

## Load and set up the model

In [None]:
from transformers import AutoModelForSequenceClassification, GPTNeoXForCausalLM

# Classification head with NUM_CLASSES outputs; weights are loaded in 8-bit.
quantized_load_options = {
    "num_labels": NUM_CLASSES,
    "load_in_8bit": True,
}
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID, **quantized_load_options
)

# Use the end-of-sequence id as the padding id, matching the tokenizer setup
# where pad_token is set to eos_token.
eos_id = model.config.eos_token_id
model.config.pad_token_id = eos_id

print(model.config.pad_token_id)

In [None]:
# Print the model's repr to inspect its module structure.
print(model)

### Vanilla Model to LoRA Model

In [None]:
# Upgrade peft to the newest available release.
# NOTE(review): the first cell pins peft==0.8.2; this unpinned upgrade replaces
# that version, so later `from peft import ...` lines run against whatever
# release is current at execution time - confirm this is intended.
!pip install --upgrade peft


In [None]:
# prepare_model_for_kbit_training is the maintained replacement for the
# deprecated prepare_model_for_int8_training: it is available in the pinned
# peft==0.8.2 as well as in newer releases (which the preceding
# `pip install --upgrade peft` cell may install and which no longer ship the
# old name).
from peft import prepare_model_for_kbit_training

# Prepare the 8-bit quantized model for adapter training.
model = prepare_model_for_kbit_training(model)

model

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA settings for a sequence-classification task; adapters are requested on
# every linear layer via the 'all-linear' shortcut.
lora_settings = dict(
    task_type=TaskType.SEQ_CLS,
    target_modules='all-linear',
    r=R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
)
lora_config = LoraConfig(**lora_settings)
lora_config

In [None]:
# Wrap the prepared model with the adapters described by lora_config; the
# trailing expression displays the wrapped model.
lora_model = get_peft_model(model, lora_config)
lora_model

In [None]:
# Print the trainable-parameter summary of the wrapped model.
lora_model.print_trainable_parameters()

## Train the Model

In [None]:
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments


def compute_metrics(eval_pred):
    """Compute classification accuracy from the Trainer's evaluation output.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is an
            array of per-class scores whose last axis indexes the classes, or
            a tuple whose first element is that array (the case when a model
            returns extra outputs next to the logits). ``labels`` holds the
            integer class ids.

    Returns:
        dict: ``{"accuracy": <float>}``, the fraction of examples whose
        highest-scoring class equals the label.
    """
    predictions, labels = eval_pred
    # Extra model outputs arrive bundled in a tuple; the scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_classes = np.argmax(predictions, axis=-1)
    # Cast to a plain Python float so the metric logs/serializes cleanly.
    return {"accuracy": float((predicted_classes == labels).mean())}

# Hyperparameters and bookkeeping for the run: evaluate and checkpoint once
# per epoch, and reload the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="./data/",
    num_train_epochs=EPOCHS,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=10,
    report_to="none",
)

# Pads each batch with the tokenizer when the batch is assembled.
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=padding_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate on the Trainer's eval_dataset before any training has been run,
# giving a reference point for the later evaluation.
print("Evaluating the Model Before Training!")
trainer.evaluate()

In [None]:
# Run the training loop configured on the Trainer.
print("Training the Model")
trainer.train()

## Evaluate the Model

In [None]:
# Evaluate again on the Trainer's eval_dataset now that training has finished.
print("Evaluating the trained model")
trainer.evaluate()

In [None]:
# Save the wrapped model to the relative directory 'fine-tuned-model'.
# NOTE(review): for a PEFT-wrapped model this presumably stores the adapter
# weights rather than the full base model - confirm before relying on it.
print("Saving the model!")
lora_model.save_pretrained('fine-tuned-model')

## Making Predictions

In [None]:
from transformers import pipeline

# Build the inference pipeline around the fine-tuned model. The tokenizer
# object configured earlier (pad_token set to eos_token) is passed directly;
# passing the MODEL_ID string instead would load a fresh tokenizer that lacks
# that padding setup.
clf = pipeline("text-classification", model=lora_model, tokenizer=tokenizer)

In [None]:
import pandas as pd

# Held-out test file used for the final predictions.
test_csv_path = "/kaggle/input/banfake2/test.csv"
test_df = pd.read_csv(test_csv_path)

# Preview the first rows.
display(test_df.head())

In [None]:
from tqdm import tqdm
import torch

# Predicted integer class id for each row of test_df, in row order.
predictions = []

print("Making prediction on the test dataset...")

# The text lives in the 'text' column - the same column name used for
# tokenization during training and when the predictions frame is built.
# NOTE(review): training inputs were truncated to max_length=20 tokens, while
# the pipeline receives the full text here - confirm this is intended.
for text in tqdm(test_df['text'].values):
    prediction = clf(text)
    # The pipeline returns a list with one dict per input; its label has the
    # form '<prefix>_<class id>', and the id after the underscore is kept.
    prediction = int(prediction[0]['label'].split('_')[1])
    predictions.append(prediction)



In [None]:
from sklearn.metrics import classification_report

# Ground-truth labels from the test frame, compared against the pipeline
# predictions; metrics are reported with four decimal places.
true_labels = test_df['label'].values
report = classification_report(y_true=true_labels, y_pred=predictions, digits=4)
print(report)

In [None]:
import pandas as pd

# One row per test example: the input text, the true label and the model's
# predicted class id.
prediction_columns = {
    'text': test_df['text'].values,
    'label': test_df['label'].values,
    'predictions': predictions,
}
predictions_df = pd.DataFrame(prediction_columns)

print(predictions_df)

# Persist the table without the index column.
predictions_df.to_csv('predictions.csv', index=False)
