## Loading Libraries

In [1]:
import pandas as pd
import numpy as np
import torch
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prefer the GPU when one is available, but fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Scratch check: two 1-D tensors of length 30 become the two columns of a
# (30, 2) matrix.
x, y = torch.randn(30), torch.randn(30)

z = torch.stack((x, y), dim=1)
print(x.shape, y.shape, z.shape)

torch.Size([30]) torch.Size([30]) torch.Size([30, 2])


## Loading Dataset

In [4]:
# Load the training-sample CSV into a DataFrame. The path is relative, so it is
# resolved against the notebook's working directory.
df = pd.read_csv('../data/processed/train_sample_processed.csv')

In [5]:
# Sorted unique values of the 'OpenStatus' column; a label's integer id is its
# position in this array.
statuses = np.unique(df['OpenStatus'].values)

# Two-way lookup tables between integer ids and status labels.
id2label = dict(enumerate(statuses))
label2id = {label: idx for idx, label in id2label.items()}

In [6]:

from custom_dataset import GithubDataset
from torch.utils.data import random_split
from datasets import Dataset

# Wrap the DataFrame in the project's GithubDataset (imported from
# custom_dataset) so it can be split and sliced in the cells below.
full_dataset = GithubDataset(df)

In [7]:
# Seed the split so the train/validation/test partition is the same on every
# run; without an explicit generator `random_split` draws from the global RNG.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)

# 70% train / 25% validation / 5% test.
train_dataset, validation_dataset, test_dataset = random_split(
  full_dataset, [0.7, 0.25, 0.05], generator=split_generator
)

# Convert each torch Subset into a Hugging Face Dataset. `[:]` pulls out the
# whole subset; it is expected to return a column-name -> values mapping
# (depends on GithubDataset.__getitem__ — confirm there).
train_dataset = Dataset.from_dict(train_dataset[:])
validation_dataset = Dataset.from_dict(validation_dataset[:])
test_dataset = Dataset.from_dict(test_dataset[:])

In [8]:
# renaming_dict = {"text_content": "text", "status": "label"}
# The target column is renamed from "status" to "labels" in every split.
renaming_dict = dict(status="labels")

train_dataset, validation_dataset, test_dataset = [
  split.rename_columns(renaming_dict)
  for split in (train_dataset, validation_dataset, test_dataset)
]

### Tokenizing Data

In [9]:
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents

# We create our normalizer, which applies Unicode NFD normalization and then strips accents
normalizer = normalizers.Sequence([NFD(), StripAccents()])

# Sanity check on a sample string. The result is not displayed because this is
# not the last expression of the cell.
normalizer.normalize_str("Héllò? What aré yòü üptò tòday?")

from tokenizers.pre_tokenizers import Whitespace

# We create our pre-tokenizer which will split based on the regex \w+|[^\w\s]+
pre_tokenizer = Whitespace()

In [10]:
from tokenizers.models import WordPiece
from tokenizers import Tokenizer

# Assemble the tokenizer: a WordPiece model whose unknown token is "[UNK]",
# wired to the normalizer and pre-tokenizer created in the previous cell.
wordpiece_model = WordPiece(unk_token="[UNK]")
tokenizer = Tokenizer(wordpiece_model)
tokenizer.pre_tokenizer = pre_tokenizer
tokenizer.normalizer = normalizer

In [11]:
from tokenizers.trainers import WordPieceTrainer
import time

# We create a batch iterator that yields batches of sentences for training our
# tokenizer. Feeding batches is preferred over passing single sentences to the
# tokenizer because it is a lot faster.
def batch_iterator(dataset, batch_size=10000):
  """Yield lists of "<title> <text_content>" strings, `batch_size` rows at a time.

  Args:
    dataset: Sized object whose slices `dataset[a:b]` return a mapping with
      "title" and "text_content" entries of equal length (lists, pandas
      Series, arrays, ...).
    batch_size: Maximum number of rows per yielded batch.

  Yields:
    A list of strings, one per row, joining title and content with one space.
  """
  total = len(dataset)
  for lower_idx in range(0, total, batch_size):
    # Clamp the upper bound so the final, possibly shorter, batch never
    # reaches past the end of the dataset.
    upper_idx = min(lower_idx + batch_size, total)
    # Slice once so the dataset is only materialized once per batch.
    batch = dataset[lower_idx:upper_idx]
    # Build the strings row by row so any iterable column type works, not
    # only ones that support element-wise `+`.
    yield [
      f"{title} {content}"
      for title, content in zip(batch["title"], batch["text_content"])
    ]
        
# We pass in the list of special tokens so that our model knows about them.
# Special tokens receive their ids in list order, so "[PAD]" is placed first to
# get id 0 — the default pad id `enable_padding` uses when none is given. With
# "[UNK]" first, padded positions would carry the unknown-token id instead.
trainer = WordPieceTrainer(special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])

tic = time.perf_counter()
# Now, we do batch training based on the iterator defined earlier; `length` is
# the total number of sequences in the iterator.
tokenizer.train_from_iterator(batch_iterator(full_dataset), trainer=trainer, length=len(full_dataset))
toc = time.perf_counter()
print(f"Elapsed time: {toc - tic:0.4f} seconds")

Elapsed time: 19.2933 seconds


In [12]:
# Bare expression: nothing is displayed for it, since only a cell's last
# expression is echoed.
label2id
# Spot-check the label -> id mapping on the first ten training labels.
[label2id[label] for label in train_dataset['labels'][:10]]

[3, 3, 0, 0, 0, 3, 0, 1, 2, 0]

In [13]:
# Fixed token lengths that the question body and the title are truncated/padded to.
CONTENT_MAX_TEXT_CONTENT = 128
TITLE_MAX_TEXT_CONTENT = 32

def tokenize_func(batch):
  """Convert a batch of raw rows into label ids and fixed-length token ids.

  Args:
    batch: Mapping with 'labels' (label strings that are keys of `label2id`),
      'text_content' and 'title' (sequences of strings).

  Returns:
    Dict with 'labels' (list of ints), 'content_input_ids' and
    'title_input_ids' (int tensors, one row per example).

  Note:
    The module-level `tokenizer` is reconfigured in place; after the call it is
    left with the title truncation/padding settings. No pad id or pad token is
    passed to `enable_padding`, so the tokenizer's padding defaults apply.
  """
  def encode_fixed_length(column, length):
    # Truncate and pad to exactly `length` tokens, then encode the whole column.
    tokenizer.enable_truncation(max_length=length)
    tokenizer.enable_padding(length=length)
    return tokenizer.encode_batch(batch[column])

  label_ids = [label2id[name] for name in batch['labels']]

  content_encodings = encode_fixed_length('text_content', CONTENT_MAX_TEXT_CONTENT)
  title_encodings = encode_fixed_length('title', TITLE_MAX_TEXT_CONTENT)

  return {
    'labels': label_ids,
    'content_input_ids': torch.IntTensor([enc.ids for enc in content_encodings]),
    'title_input_ids': torch.IntTensor([enc.ids for enc in title_encodings]),
  }

In [14]:
# Tokenize every split; in batched mode tokenize_func receives many rows per call.
map_options = {"batched": True}

tokenized_train_dataset = train_dataset.map(tokenize_func, **map_options)
tokenized_validation_dataset = validation_dataset.map(tokenize_func, **map_options)
tokenized_test_dataset = test_dataset.map(tokenize_func, **map_options)

Map: 100%|██████████| 98191/98191 [00:19<00:00, 5107.45 examples/s]
Map: 100%|██████████| 35068/35068 [00:07<00:00, 4920.99 examples/s]
Map: 100%|██████████| 7013/7013 [00:01<00:00, 5087.46 examples/s]


In [15]:
# Inspect the columns and row count of the tokenized training split.
tokenized_train_dataset

Dataset({
    features: ['tags_onehot', 'unrecognized_tags_count', 'reputation', 'undeleted_answers', 'user_life_days', 'title', 'text_content', 'labels', 'content_input_ids', 'title_input_ids'],
    num_rows: 98191
})

In [16]:
# Only the raw text columns are dropped now that they are tokenized. The other
# feature columns ('tags_onehot', 'unrecognized_tags_count', 'reputation',
# 'undeleted_answers', 'user_life_days') are left out of the list and so stay.
columns_to_remove = ['title', 'text_content']

tokenized_train_dataset, tokenized_validation_dataset, tokenized_test_dataset = [
  split.remove_columns(columns_to_remove)
  for split in (tokenized_train_dataset, tokenized_validation_dataset, tokenized_test_dataset)
]

#### COMMENT - IDEAS

We should probably:
- retrain the whole model (probably a smaller one) together with
- a better tokenizer, built from the ground up, that includes the names of specific technologies (languages, frameworks, IDEs, etc.)

## Model - Custom

In [17]:
from custom_model import AutoCompositeModel

# Build the project's composite model. The device is passed to the constructor
# and the module is then also moved onto that device.
model = AutoCompositeModel(device).to(device)

## Training Setup

### Data Loaders

In [18]:
def collate_func(batch):
  """Collate a list of per-example dicts into a single dict of column lists.

  pandas does the pivot: every example becomes a row of a DataFrame, and
  `to_dict(orient="list")` returns `{column: [value for each example]}`.
  """
  rows_as_frame = pd.DataFrame(batch)
  return rows_as_frame.to_dict(orient="list")

In [40]:
from torch.utils.data import DataLoader
# One batch size shared by all three loaders.
BATCH_SIZE = 128

# Only the training data is shuffled; evaluation loaders keep a fixed order so
# validation/test batches are the same on every pass.
training_loader = DataLoader(tokenized_train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_func)
# NOTE(review): unlike the training loader, the two loaders below use the
# default collate function rather than collate_func, so their batches have a
# different structure. Confirm the Trainer's validation path expects that
# before unifying them.
validation_loader = DataLoader(tokenized_validation_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(tokenized_test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [41]:
# Inspect the first example's title token ids and confirm the padded length.
print(tokenized_train_dataset['title_input_ids'][0])
print(len(tokenized_train_dataset['title_input_ids'][0]))

[70, 1711, 18387, 4272, 2991, 6559, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
32


### Loss Function

In [42]:
# Multi-class classification loss; the labels produced by tokenize_func are
# integer class ids.
loss_fn = torch.nn.CrossEntropyLoss()

In [43]:
# tokenized_train_dataset[0]

### Optimizer

In [44]:
# Optimizers specified in the torch.optim package
# optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

# Active choice: SGD with no momentum argument, lr=1e-4 and weight decay 1e-5.
# The alternatives are left commented out around it.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, weight_decay=1e-5)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, weight_decay=1e-5, momentum=0.8)

In [45]:
from torchmetrics import Accuracy

# Multi-class accuracy metric, moved onto the training device.
# NOTE(review): num_classes=5 is hard-coded — confirm it equals len(label2id).
accuracy_metric = Accuracy(task='multiclass', num_classes=5).to(device)

### Trainer Setup

In [46]:
from training_own import Trainer, TrainerConfiguration, get_model_params

# Bundle everything the project's Trainer needs: both data loaders, the
# optimizer, the loss, the accuracy metric and the target device.
# Note that test_loader is not part of this configuration.
config = TrainerConfiguration(
  training_loader=training_loader,
  validation_loader=validation_loader,
  optimizer=optimizer,
  loss_fn=loss_fn,
  accuracy_metric=accuracy_metric,
  device=device
)

In [47]:
# Presumably reports the model's parameter count — confirm against
# get_model_params in training_own.
get_model_params(model)

621

In [48]:
# Check the Python type of the 'reputation' feature in a single example.
type(train_dataset[0]['reputation'])

int

In [49]:
# Columns fed to the model as inputs; 'tags_onehot' is not among them.
INPUT_COLUMNS = [
  'content_input_ids',
  'title_input_ids',
  'unrecognized_tags_count',
  'reputation',
  'undeleted_answers',
  'user_life_days',
]

# NOTE: this rebinds `trainer`, which previously held the WordPieceTrainer used
# to train the tokenizer.
trainer = Trainer(
  model=model,
  trainer_configuration=config,
  input_columns=INPUT_COLUMNS,
  output_column='labels',
)

In [50]:
# Number of batches per training epoch.
len(training_loader)

768

In [51]:
# trainer.train_one_epoch(logging_frequency=100, evaluate_when_logging=False)

In [52]:

# for i, data in enumerate(training_loader):
#   if i > 0:
#     break
    
#   subset = dict((k, data[k]) for k in ('content_input_ids', 'title_input_ids', 'reputation', 'undeleted_answers', 'user_life_days'))
#   print(subset)
  # text_content = torch.tensor(data['content_input_ids'])
  # print(text_content[0])
  # print(type(text_content[0]))
  
  

In [53]:
# validation_dataset['tags_onehot']

In [54]:
# Train for 8 epochs with a logging frequency of 100 (the captured output shows
# one log line per 100 batches); evaluation at each logging step is switched off.
trainer.train_many_epochs(epochs=8, logging_frequency=100, evaluate_when_logging=False)

EPOCH 1:


 batch 100 training_loss: 1.4575096094608306 training_accuracy: 0.49367186427116394
 batch 200 training_loss: 1.464114441871643 training_accuracy: 0.48835936188697815
 batch 300 training_loss: 1.460557327270508 training_accuracy: 0.49406248331069946
 batch 400 training_loss: 1.4604470813274384 training_accuracy: 0.4925781190395355
 batch 500 training_loss: 1.453369024991989 training_accuracy: 0.4918749928474426
 batch 600 training_loss: 1.4505170667171479 training_accuracy: 0.4996093511581421
 batch 700 training_loss: 1.4551690006256104 training_accuracy: 0.4990624785423279


  vlabels = torch.tensor(vdata[self.output_column]).long().to(self.config.device)


LOSS train 1.4551690006256104 valid 1.4543185234069824 ACCURACY train 0.49407076835632324 validation 0.49729862809181213
EPOCH 2:
 batch 100 training_loss: 1.4540981805324555 training_accuracy: 0.492499977350235
 batch 200 training_loss: 1.4532121455669402 training_accuracy: 0.4953906238079071
 batch 300 training_loss: 1.4530854606628418 training_accuracy: 0.4891405999660492
 batch 400 training_loss: 1.4466507160663604 training_accuracy: 0.500781238079071
 batch 500 training_loss: 1.4552346932888032 training_accuracy: 0.4914843738079071
 batch 600 training_loss: 1.4526555836200714 training_accuracy: 0.49367186427116394
 batch 700 training_loss: 1.4487352454662323 training_accuracy: 0.498359352350235


  vlabels = torch.tensor(vdata[self.output_column]).long().to(self.config.device)


LOSS train 1.4487352454662323 valid 1.4489623308181763 ACCURACY train 0.49469810724258423 validation 0.49654531478881836
EPOCH 3:
 batch 100 training_loss: 1.4464309322834015 training_accuracy: 0.4921875
 batch 200 training_loss: 1.4477602052688598 training_accuracy: 0.4951562285423279
 batch 300 training_loss: 1.443357652425766 training_accuracy: 0.4992968738079071
 batch 400 training_loss: 1.4475304186344147 training_accuracy: 0.49335935711860657
 batch 500 training_loss: 1.446607027053833 training_accuracy: 0.49757811427116394
 batch 600 training_loss: 1.4483110284805298 training_accuracy: 0.4945312440395355
 batch 700 training_loss: 1.4462873494625093 training_accuracy: 0.4893749952316284


  vlabels = torch.tensor(vdata[self.output_column]).long().to(self.config.device)


LOSS train 1.4462873494625093 valid 1.4437428712844849 ACCURACY train 0.49478965997695923 validation 0.4966612160205841
EPOCH 4:
 batch 100 training_loss: 1.4424825954437255 training_accuracy: 0.4951562285423279
 batch 200 training_loss: 1.4403975009918213 training_accuracy: 0.5
 batch 300 training_loss: 1.4480377483367919 training_accuracy: 0.48679685592651367
 batch 400 training_loss: 1.4400092935562134 training_accuracy: 0.4966406226158142
 batch 500 training_loss: 1.437623312473297 training_accuracy: 0.4989062249660492
 batch 600 training_loss: 1.4363204324245453 training_accuracy: 0.500781238079071
 batch 700 training_loss: 1.4454254925251007 training_accuracy: 0.48234373331069946


KeyboardInterrupt: 