In [1]:
!pip install portalocker
!pip install transformers
!pip install datasets



In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.optim import AdamW
from torch.nn.functional import cross_entropy

In [3]:
# Preprocess Data
class MovieDataset(Dataset):
    """Map-style dataset that tokenizes movie texts for genre classification.

    Every sample is padded AND truncated to exactly ``max_len`` tokens, so all
    items have the same shape and can be stacked into a batch by a DataLoader.

    Args:
        texts: Indexable collection of input texts, one per sample.
        genres: Indexable collection of integer class labels aligned with
            ``texts``; its length defines the dataset length.
        tokenizer: Hugging Face style tokenizer, invoked as
            ``tokenizer(text, **kwargs)``. It must already have a pad token.
        max_len: Fixed number of tokens per encoded sample.
    """

    def __init__(self, texts, genres, tokenizer, max_len):
        self.text = texts
        self.genres = genres
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples (taken from the label collection)."""
        return len(self.genres)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return it as a dict of tensors.

        Returns:
            dict with keys ``'text'`` (the raw string), ``'input_ids'`` and
            ``'attention_mask'`` (1-D tensors of length ``max_len``) and
            ``'labels'`` (scalar ``torch.long`` tensor).
        """
        text = str(self.text[item])
        genre = self.genres[item]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            # Without truncation, texts longer than max_len keep their full
            # length and batches of unequal-length tensors cannot be stacked.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # The tokenizer returns a (1, max_len) batch; drop the batch axis.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(genre, dtype=torch.long),
        }

In [4]:
# Read the training CSV and preview its first five rows (head's default).
datasetRAW = pd.read_csv('train.csv')
datasetRAW.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,50185,Entity Project,A director and her friends renting a haunted h...,horror
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action


In [5]:
# Pull the three columns used downstream out of the raw frame as plain lists.
genre, synop, name = (
    datasetRAW[column].tolist()
    for column in ('genre', 'synopsis', 'movie_name')
)

# Model input per movie: lower-cased title and synopsis joined by " : ".
nameSynop = [
    title.lower() + " : " + plot.lower()
    for title, plot in zip(name, synop)
]

In [6]:
# Assign each genre string an integer id in order of first appearance and
# translate the genre column into that id space.
GenreToLabel = dict()
genreLabels = []
for curGenre in genre:
    # setdefault only stores the next free id when the genre is new; the
    # default is evaluated before insertion, so ids run 0, 1, 2, ...
    label = GenreToLabel.setdefault(curGenre, len(GenreToLabel))
    genreLabels.append(label)
# Number of distinct genres seen.
cnt = len(GenreToLabel)

In [7]:
# Bundle the integer labels with their matching input texts.
datasetDict = {
    'label': genreLabels,
    'text': nameSynop,
}

In [12]:
# Tokenizer configuration
MAX_LEN = 300  # token length handed to the datasets built below

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [13]:
# Split data: 80% train / 20% test.
# A fixed random_state makes the partition (and therefore the reported
# accuracy) reproducible across kernel restarts.
train_texts, test_texts, train_genres, test_genres = train_test_split(
    datasetDict['text'], datasetDict['label'], test_size=0.2, random_state=42
)

In [14]:
# Datasets: one MovieDataset per split, sharing the tokenizer and length limit.
train_dataset = MovieDataset(
    texts=train_texts, genres=train_genres, tokenizer=tokenizer, max_len=MAX_LEN
)
test_dataset = MovieDataset(
    texts=test_texts, genres=test_genres, tokenizer=tokenizer, max_len=MAX_LEN
)

# Data loaders: only the training batches are shuffled.
BATCH_SIZE = 8
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Model: the classification head gets one output per distinct label id.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_genres = len(set(datasetDict['label']))
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=num_genres)
# Keep the model's pad id in line with the tokenizer, which pads with EOS.
model.config.pad_token_id = model.config.eos_token_id
model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=10, bias=False)
)

In [1]:
# Training: fine-tune every model parameter with AdamW for a fixed number of epochs.
num_epochs = 2
optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        # Move the tensors the model consumes onto the training device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Passing labels makes the model compute the loss itself.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()

NameError: ignored

In [None]:
# Evaluation: collect predictions over the test loader and report accuracy.
all_preds, all_true = [], []
model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit in each row.
        preds = outputs.logits.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_true.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_true, all_preds)
print(f"Accuracy: {accuracy}")