<a href="https://colab.research.google.com/github/Shakilkhan24/Playground_DL/blob/main/pytorch_fine_tuning_hf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# FINETUNING WITH PYTORCH



In [None]:
!pip install transformers


In [2]:
!pip install evaluate
!pip install datasets

In [29]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



```
 PREPARING CUSTOM DATA FOR FINE-TUNING A HUGGING FACE MODEL
```



# preprocessing methods
AutoTokenizer (text), AutoImageProcessor (image),

AutoFeatureExtractor (audio), AutoProcessor (multimodal)

In [27]:
class MyDataset(Dataset):
    """Map-style dataset that tokenizes rows of a DataFrame for BERT.

    The DataFrame must provide a ``'text'`` column (the string to tokenize)
    and a ``'label'`` column (the integer class id).

    Args:
        df: DataFrame holding the ``'text'`` and ``'label'`` columns.
        max_length: Sequence length every example is padded/truncated to.
    """

    def __init__(self, df, max_length=128):
        self.df = df
        self.max_length = max_length
        self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Return the number of rows (examples) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{'input': {name: tensor}, 'label': tensor}`` for row ``idx``.

        Every tensor under ``'input'`` is 1-D with ``max_length`` elements so
        that the default DataLoader collation stacks them to
        ``(batch, max_length)``.
        """
        row = self.df.iloc[idx]
        encoding = self.tokenizer(
            row['text'],
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # return_tensors='pt' adds a leading batch dimension of size 1 to each
        # tensor; drop it, otherwise batching yields (batch, 1, seq) tensors
        # that the model cannot consume.
        features = {name: tensor.squeeze(0) for name, tensor in encoding.items()}
        # Class indices must be int64 for the cross-entropy loss.
        label = torch.tensor(row['label'], dtype=torch.long)
        return {'input': features, 'label': label}



In [None]:
def prepare_data(filepath):
    """Read a CSV file and split it into training and validation arrays.

    The CSV must contain a ``'text'`` column and a ``'label'`` column.

    Args:
        filepath: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        A dict with the keys ``'X_train'``, ``'y_train'``, ``'X_val'`` and
        ``'y_val'`` holding the split text and label arrays.
    """
    frame = pd.read_csv(filepath)
    texts = frame['text'].values
    labels = frame['label'].values

    # 80/20 split with a fixed seed, stratified on the label column.
    text_train, text_val, label_train, label_val = train_test_split(
        texts,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=frame['label'],
    )
    return dict(
        X_train=text_train,
        y_train=label_train,
        X_val=text_val,
        y_val=label_val,
    )





```
TRAINING AND PREDICTING
```



# tqdm and learning-rate scheduler

In [28]:

def _batch_to_device(batch, device):
    """Move one collated batch to ``device`` as (input_ids, attention_mask, labels)."""
    input_ids = batch['input']['input_ids']
    attention_mask = batch['input']['attention_mask']
    # Flatten to (batch, seq): a dataset that tokenizes with return_tensors='pt'
    # may hand over (batch, 1, seq) tensors, which the model rejects.
    input_ids = input_ids.reshape(input_ids.size(0), -1).to(device)
    attention_mask = attention_mask.reshape(attention_mask.size(0), -1).to(device)
    labels = batch['label'].to(device)
    return input_ids, attention_mask, labels


def _evaluate(model, data_loader, device):
    """Return the fraction of examples in ``data_loader`` classified correctly."""
    model.eval()  # switch off dropout for evaluation
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = _batch_to_device(batch, device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            predictions = torch.argmax(logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    # Count over examples (not a mean of per-batch accuracies) so that a short
    # final batch is not over-weighted; an empty loader yields NaN, not a crash.
    return correct / total if total else float('nan')


def main(filepath, num_epochs=3, batch_size=32, learning_rate=1e-5, num_labels=2):
    """Fine-tune ``bert-base-uncased`` for sequence classification on a CSV dataset.

    Args:
        filepath: Path of the CSV file, forwarded to ``prepare_data``.
        num_epochs: Number of passes over the training split.
        batch_size: Batch size for both the training and validation loaders.
        learning_rate: Peak learning rate of the one-cycle schedule.
        num_labels: Number of target classes of the classification head.

    Returns:
        The fine-tuned model.

    Raises:
        ValueError: If the training split yields no batches.
    """
    raw_data = prepare_data(filepath)
    train_data = MyDataset(pd.DataFrame({'text': raw_data['X_train'], 'label': list(raw_data['y_train'])}))
    val_data = MyDataset(pd.DataFrame({'text': raw_data['X_val'], 'label': list(raw_data['y_val'])}))

    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    total_steps = len(train_loader) * num_epochs
    if total_steps <= 0:
        raise ValueError("Training requires at least one batch and one epoch.")

    # OneCycleLR needs either total_steps, or epochs AND steps_per_epoch; the
    # schedule covers every optimizer step, so it is stepped once per batch.
    lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        total_steps=total_steps,
        pct_start=0.3,
        anneal_strategy='cos',
        cycle_momentum=True,
        div_factor=25.0,
        final_div_factor=10000.0,
    )

    global_step = 0

    for epoch in range(num_epochs):
        print(f"\n\nEpoch {epoch + 1}/{num_epochs}\n-------------------------------")
        model.train()  # re-enable dropout after the previous epoch's evaluation
        for batch in train_loader:
            input_ids, attention_mask, labels = _batch_to_device(batch, device)

            optimizer.zero_grad()
            # Passing labels makes the model compute the cross-entropy loss itself.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            global_step += 1
            if global_step % 10 == 0:
                print(f"Global Step: {global_step}, Loss: {loss.item()}")

        avg_evaluation_accuracy = _evaluate(model, val_loader, device)
        print(f"Validation Average Accuracy: {avg_evaluation_accuracy}")

    return model

if __name__ == "__main__":
    # Intentionally a no-op when run as a script. To start fine-tuning,
    # uncomment the call below and point it at a CSV file that has 'text'
    # and 'label' columns.
    # main('/path/to/custom/dataset.csv')
    pass