In [1]:
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install -U bertviz
!pip install -U umap-learn
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install beautifulsoup4
!pip install textblob
!pip install mlxtend
!pip install googletrans==3.1.0a0
!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall

Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.46.2
    Uninstalling transformers-4.46.2:
      Successfully uninstalled transformers-4.46.2
Successfully installed transformers-4.46.3
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting mu

In [17]:
from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForSequenceClassification,DataCollatorWithPadding,TrainingArguments,Trainer,AutoModelForCausalLM,DataCollatorForLanguageModeling
from peft import LoraConfig,get_peft_model,prepare_model_for_kbit_training
import torch
import numpy as np
from sklearn.metrics import f1_score,accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

In [33]:
# Load only the two columns used below: the plot description (model input)
# and the comma-separated genre string (prediction target).
IMDB_CSV_URL = "https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/IMDB-Movie-Data.csv"
df = pd.read_csv(
    IMDB_CSV_URL,
    usecols=["Description", "Genre"],
)
df.head()

Unnamed: 0,Genre,Description
0,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...
1,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te..."
2,"Horror,Thriller",Three girls are kidnapped by a man with a diag...
3,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea..."
4,"Action,Adventure,Fantasy",A secret government agency recruits some of th...


In [34]:
# Turn each comma-separated genre string into a list of genre names.
# The isinstance guard keeps this cell idempotent: running it a second time
# leaves already-split lists untouched, whereas `.str.split` applied to a
# column of lists would silently replace every entry with NaN.
df['Genre']=df['Genre'].apply(lambda genres: genres.split(',') if isinstance(genres,str) else genres)

In [35]:
# Fit a binarizer on the per-movie genre lists and build the multi-hot target
# matrix (one column per genre), stored as float32.
multilabel=MultiLabelBinarizer()
genre_indicators=multilabel.fit_transform(df['Genre'])
labels=genre_indicators.astype('float32')

# Plain Python list of plot descriptions, index-aligned with `labels`.
texts=df['Description'].tolist()

In [36]:
# Hold out 20% of the examples for evaluation; the fixed random_state keeps
# the split reproducible across runs.
X_train,X_test,y_train,y_test=train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [37]:
# Checkpoint shared by the tokenizer and the classification model.
model_name='distilbert-base-uncased'
tokenizer=AutoTokenizer.from_pretrained(model_name,add_prefix_space=True)

# One output logit per column of the multi-hot label matrix.
num_genres=len(labels[0])
model=AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_genres,
    problem_type='multi_label_classification',
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
class CustomDataset(Dataset):
  """Map-style dataset pairing raw texts with multi-hot label vectors.

  Every item is tokenized on access and returned as a dict with the keys
  ``input_ids``, ``attention_mask`` and ``label``.

  Args:
    text: Indexable collection of input texts.
    label: Indexable collection of label vectors, index-aligned with ``text``.
    tokenizer: Callable tokenizer that returns ``input_ids`` and
      ``attention_mask`` tensors when called with ``return_tensors='pt'``.
    max_length: Number of tokens every example is padded/truncated to.
  """

  def __init__(self,text,label,tokenizer,max_length=300):
    self.text=text
    self.label=label
    self.tokenizer=tokenizer
    self.max_length=max_length

  def __len__(self):
    """Return the number of examples."""
    return len(self.text)

  def __getitem__(self,idx):
    """Tokenize example ``idx`` and return its tensors together with its label."""
    text=str(self.text[idx])
    # Explicit float dtype: the multi-label loss expects float targets even
    # if the caller supplies integer 0/1 indicators.
    label=torch.tensor(self.label[idx],dtype=torch.float32)

    # max_length must be forwarded explicitly; without it, padding='max_length'
    # pads to the tokenizer's own maximum and the constructor argument is ignored.
    encoding=self.tokenizer(
        text,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=self.max_length,
    )
    return{
        # flatten() drops the leading batch dimension added by return_tensors='pt'.
        'input_ids':encoding['input_ids'].flatten(),
        'attention_mask':encoding['attention_mask'].flatten(),
        'label':label
    }

In [54]:
# Wrap each split in a CustomDataset; examples are tokenized one at a time
# when the dataset is indexed.
train_dataset=CustomDataset(text=X_train,label=y_train,tokenizer=tokenizer)
eval_dataset=CustomDataset(text=X_test,label=y_test,tokenizer=tokenizer)

In [64]:
from transformers import EvalPrediction

def multi_label_metrics(logits,labels,threshold=0.3):
  """Compute accuracy and macro-averaged F1 for multi-label predictions.

  Args:
    logits: Array-like of raw model scores, one row per example and one
      column per label.
    labels: Array-like of 0/1 ground-truth indicators with the same shape.
    threshold: Probability above which a label counts as predicted.
      Defaults to 0.3, the cut-off that was previously hard-coded.

  Returns:
    Dict with the keys ``"accuracy"`` and ``"f1"``.
  """
  # Sigmoid turns each logit into an independent per-label probability.
  probs=torch.sigmoid(torch.as_tensor(logits)).detach().cpu().numpy()

  # Binarize in one vectorized step instead of filling a zero array by index.
  y_pred=(probs > threshold).astype(int)
  y_true=np.asarray(labels)

  # A label with no true or predicted positives has an undefined F1;
  # zero_division=0 scores it as 0 (the value the default also produces)
  # without emitting a warning on every evaluation.
  f1 = f1_score(y_true, y_pred, average = 'macro', zero_division=0)
  # For multi-label input, accuracy_score requires every label of an example
  # to match, so it is much stricter than the per-label F1.
  accuracy=accuracy_score(y_true,y_pred)

  return {
      "accuracy":accuracy,
      "f1": f1
  }

In [65]:
def compute_metrics(p:EvalPrediction):
  """Adapt an evaluation result to the ``multi_label_metrics`` helper.

  Args:
    p: Evaluation result exposing ``predictions`` (the logits, or a tuple
      whose first element is the logits) and ``label_ids``.

  Returns:
    The metrics dict produced by ``multi_label_metrics``.
  """
  # The type check must use the `tuple` class itself: `tuple()` is an empty
  # tuple instance, and isinstance(x, ()) is False for every x, so tuple
  # predictions were never unpacked.
  preds=p.predictions[0] if isinstance(p.predictions,tuple) else p.predictions
  labels=p.label_ids
  return multi_label_metrics(preds,labels)

In [66]:
# Collator that assembles individual examples into padded batches using the
# notebook's tokenizer.
data_collator=DataCollatorWithPadding(tokenizer)

In [67]:
# Training configuration: evaluate and save a checkpoint once per epoch so
# the best checkpoint can be reloaded when training ends.
# `eval_strategy` is the current name of the deprecated `evaluation_strategy`
# keyword; since the install cell upgrades transformers to the latest release,
# the old spelling may be rejected.
args = TrainingArguments(
    output_dir = './results',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True
)

# Wire the model, datasets, batching and metric computation together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [68]:
# Run fine-tuning; leaving the result as the cell's last expression displays
# the returned training summary.
train_output=trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.258679,0.05,0.332754
2,No log,0.239331,0.1,0.432209
3,0.175800,0.24462,0.13,0.430993
4,0.175800,0.242203,0.1,0.445825
5,0.099400,0.247443,0.13,0.431334


TrainOutput(global_step=1000, training_loss=0.1375798225402832, metrics={'train_runtime': 258.764, 'train_samples_per_second': 15.458, 'train_steps_per_second': 3.865, 'total_flos': 530039685120000.0, 'train_loss': 0.1375798225402832, 'epoch': 5.0})

In [74]:
# Score a single, hand-written plot summary with the fine-tuned model.
text = "Carol Danvers gets her powers entangled with those of Kamala Khan and Monica Rambeau, forcing them to work together to save the universe."

# Place the inputs on whatever device the model is on instead of assuming a
# GPU, so the cell also runs on CPU-only machines.
encoded=tokenizer(text,return_tensors='pt').to(model.device)

# Inference only: switch off dropout and skip gradient tracking.
model.eval()
with torch.no_grad():
  outputs=model(**encoded)

In [82]:
# Convert the logits of the single example into per-genre probabilities.
sigmoid=torch.nn.Sigmoid()
probs = sigmoid(outputs.logits[0].cpu())

# Flag every genre whose probability exceeds 0.1 as a 0/1 float vector.
above_cutoff=(probs > 0.1).numpy()
preds=np.where(above_cutoff,1.0,0.0)

# Map the single multi-hot row back to genre names.
multilabel.inverse_transform(preds.reshape(1,-1))

[('Action', 'Adventure', 'Comedy', 'Drama', 'Fantasy')]