## Loading Libraries

In [66]:
import pandas as pd
import numpy as np
import torch
import evaluate

In [67]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs
# end-to-end on machines where CUDA is not available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Loading Dataset

In [68]:
# Processed training sample; the path is relative to the notebook's working directory.
TRAIN_SAMPLE_CSV = '../data/processed/train_sample_processed.csv'
df = pd.read_csv(TRAIN_SAMPLE_CSV)

In [69]:
# Distinct values of the OpenStatus column (np.unique returns them sorted).
statuses = np.unique(df['OpenStatus'].values)

# Lookup tables in both directions: class index -> name and name -> class index.
id2label = dict(enumerate(statuses))
label2id = {name: class_id for class_id, name in id2label.items()}

In [70]:

from custom_dataset import GithubDataset
from torch.utils.data import random_split
from datasets import Dataset

# Create the pytorch dataset: wrap the loaded DataFrame in the project-local
# GithubDataset so it can be split with torch.utils.data.random_split below.
full_dataset = GithubDataset(df)

In [71]:
# Fixed seed so the 70% / 25% / 5% split is identical on every
# Restart & Run All; unseeded, each run would train and evaluate on
# different rows and results could not be compared between runs.
SPLIT_SEED = 42
train_subset, validation_subset, test_subset = random_split(
  full_dataset,
  [0.7, 0.25, 0.05],
  generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Convert each torch Subset into a Hugging Face Dataset. `subset[:]` is
# expected to yield a dict of columns that Dataset.from_dict can consume.
# The subsets keep their own names so a name is never rebound to a different type.
train_dataset = Dataset.from_dict(train_subset[:])
validation_dataset = Dataset.from_dict(validation_subset[:])
test_dataset = Dataset.from_dict(test_subset[:])

In [72]:
# Feature columns dropped from every split.
columns_to_remove = [
  'tags_onehot',
  'unrecognized_tags_count',
  'reputation',
  'undeleted_answers',
  'user_life_days',
  'title'
]

# Old column name -> new column name, applied to every split.
column_renames = {"text_content": "text", "status": "label"}


def _rename_and_prune(split):
  """Return `split` with its columns renamed per `column_renames` and `columns_to_remove` dropped."""
  renamed = split.rename_columns(column_renames)
  return renamed.remove_columns(columns_to_remove)


train_dataset = _rename_and_prune(train_dataset)
validation_dataset = _rename_and_prune(validation_dataset)
test_dataset = _rename_and_prune(test_dataset)

### Tokenizing Data

In [73]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded.
TOKENIZER_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [74]:
# Maximum number of tokens kept per text; longer texts are truncated and
# shorter ones are padded up to this length.
MAX_TEXT_CONTENT = 128

def tokenize_func(batch):
  """Tokenize a batch of examples and attach integer class labels.

  Args:
    batch: mapping with a 'text' list of strings and a 'label' list of
      class names, as supplied by `Dataset.map(..., batched=True)`.

  Returns:
    The tokenizer output for `batch['text']` with an extra 'labels' entry
    holding `label2id[label]` for every example.

  Raises:
    KeyError: if a label is not present in `label2id`.
  """
  # Pad to the fixed MAX_TEXT_CONTENT rather than to the longest text of the
  # current map batch: with padding=True, rows coming from different map
  # batches could have different lengths and could not be collated together.
  # No return_tensors is requested because Dataset.map stores plain lists anyway.
  tokenized_batch = tokenizer(
    batch['text'],
    padding="max_length",
    truncation=True,
    max_length=MAX_TEXT_CONTENT,
  )
  tokenized_batch["labels"] = [label2id[label] for label in batch["label"]]
  return tokenized_batch

In [75]:
# Tokenize the train, validation and test splits (in that order) with the same batched mapper.
tokenized_train_dataset, tokenized_validation_dataset, tokenized_test_dataset = (
  split.map(tokenize_func, batched=True)
  for split in (train_dataset, validation_dataset, test_dataset)
)

Map: 100%|██████████| 98191/98191 [00:15<00:00, 6323.46 examples/s]
Map: 100%|██████████| 35068/35068 [00:05<00:00, 6141.25 examples/s]
Map: 100%|██████████| 7013/7013 [00:01<00:00, 6013.00 examples/s]


#### COMMENT - IDEAS

We should probably:
- retrain the whole model from scratch (probably a smaller one), with
- a better tokenizer, built from the ground up so that it includes the names of specific technologies (languages, frameworks, IDEs, etc.)

## Model - Custom

In [76]:
from custom_model import AutoCompositeModel

# Build the project-local composite model and move its parameters to `device`.
# The device is also passed to the constructor; see custom_model for how it is used there.
model = AutoCompositeModel(device).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [77]:
# Sanity check: display the dtype of the weights of the first entry in model.my_new_layers.
model.my_new_layers[0].weight.dtype

torch.float32

In [78]:
# Build a small sample from the first 10 tokenized test rows, cast it to float,
# move it to `device`, and display its dtype for comparison with the layer weights above.
# NOTE(review): `input` shadows the Python builtin of the same name; consider renaming
# once it is confirmed that no later cell relies on this variable.
input = torch.tensor(tokenized_test_dataset['input_ids'][:10]).float().to(device)
input.dtype

torch.float32

## Training Setup

### Data Loaders

In [79]:
from torch.utils.data import DataLoader
# Number of examples per mini-batch, shared by all three loaders.
BATCH_SIZE = 16

# Only the training data is reshuffled every epoch. Validation and test data are
# iterated in a fixed order so evaluation runs are repeatable batch for batch
# (the rows were already randomly assigned to the splits by random_split).
training_loader = DataLoader(tokenized_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validation_loader = DataLoader(tokenized_validation_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(tokenized_test_dataset, batch_size=BATCH_SIZE, shuffle=False)

### Loss Function

In [80]:
# Cross-entropy criterion (default settings); handed to the trainer configuration below.
loss_fn = torch.nn.CrossEntropyLoss()

### Optimizer

In [81]:
# Stochastic gradient descent with momentum (from torch.optim) over all model parameters.
LEARNING_RATE = 0.001
MOMENTUM = 0.9
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

In [82]:
from torchmetrics import Accuracy

# Multiclass accuracy metric, moved to `device` so it can be updated with on-device tensors.
# NOTE(review): num_classes=5 is hard-coded; it presumably has to match len(label2id)
# and the model's output size — confirm if the label set ever changes.
accuracy_metric = Accuracy(task='multiclass', num_classes=5).to(device)

### Trainer Setup

In [83]:
from training_own import Trainer, TrainerConfiguration

# Gather the loaders, optimizer, loss, metric and device the trainer works with,
# then build the configuration object from them in one call.
trainer_settings = dict(
  training_loader=training_loader,
  validation_loader=validation_loader,
  optimizer=optimizer,
  loss_fn=loss_fn,
  accuracy_metric=accuracy_metric,
  device=device,
)
config = TrainerConfiguration(**trainer_settings)

In [84]:
# Bind the model to the configuration; 'input_ids' and 'labels' name the batch
# columns the trainer is told to use as model input and target respectively.
trainer = Trainer(model=model, trainer_configuration=config, input_column='input_ids', output_column='labels')

In [85]:
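# Debug scratch (disabled): take the first training batch, stack its input_ids and print the resulting dtype.
# for i, data in enumerate(training_loader):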
#   if i > 0: break
#   # print(i, torch.FloatTensor(data['input_ids']))
  
#   # a = [torch.FloatTensor([1]).view(1, -1), torch.FloatTensor([2]).view(1, -1)]
#   stacked = torch.stack(data['input_ids']).long()
#   print(stacked.dtype)

#   # model(data['input_ids'])
  

In [86]:
# Number of mini-batches per training epoch (98191 training examples in batches of 16, last batch partial).
len(training_loader)

6137

In [87]:
# Run a single training epoch. logging_frequency=100 corresponds to the
# per-100-batch log lines in the output; evaluate_when_logging=False presumably
# skips validation at those log points (confirm in training_own).
# The call's return value, displayed as the cell result, is a (loss, accuracy) pair.
trainer.train_one_epoch(logging_frequency=100, evaluate_when_logging=False)

 batch 100 training_loss: 1.533371275663376 training_accuracy: 0.42249998450279236
 batch 200 training_loss: 1.3795962047576904 training_accuracy: 0.4949999749660492
 batch 300 training_loss: 1.2972184842824936 training_accuracy: 0.5099999904632568
 batch 400 training_loss: 1.2866777056455612 training_accuracy: 0.503125011920929
 batch 500 training_loss: 1.2555399358272552 training_accuracy: 0.49687498807907104
 batch 600 training_loss: 1.2619350105524063 training_accuracy: 0.49937498569488525
 batch 700 training_loss: 1.2167215526103974 training_accuracy: 0.5074999928474426
 batch 800 training_loss: 1.2119418865442275 training_accuracy: 0.5112499594688416
 batch 900 training_loss: 1.1912512242794038 training_accuracy: 0.53125
 batch 1000 training_loss: 1.2086771321296692 training_accuracy: 0.5081250071525574
 batch 1100 training_loss: 1.1519911015033721 training_accuracy: 0.5531249642372131
 batch 1200 training_loss: 1.1615573501586913 training_accuracy: 0.5324999690055847
 batch 1300

(0.9127078834176063, tensor(0.5987, device='cuda:0'))