## Loading Libraries

In [1]:
import pandas as pd
import numpy as np
import torch
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda')

In [3]:
x = torch.randn(30)
y = torch.randn(30)

z = torch.cat([x.reshape(-1,1), y.reshape(-1,1)], 1)
print(x.shape, y.shape, z.shape)

torch.Size([30]) torch.Size([30]) torch.Size([30, 2])


## Loading Dataset

In [4]:
df = pd.read_csv('../data/processed/train_sample_processed.csv')

In [5]:
statuses = np.unique(df['OpenStatus'].values)

id2label = {idx: label for idx, label in enumerate(statuses)}
label2id = {label: idx for idx, label in enumerate(statuses)}

In [6]:

from custom_dataset import GithubDataset
from torch.utils.data import random_split
from datasets import Dataset

# Create the pytorch dataset
full_dataset = GithubDataset(df)

In [7]:
from collections import Counter

full_dataset[0]
train_classes = [label['status'] for label in full_dataset]
Counter(train_classes)
# train_classes

Counter({'open': 70136,
         'not a real question': 30789,
         'off topic': 17530,
         'not constructive': 15659,
         'too localized': 6158})

In [8]:
train_dataset, validation_dataset, test_dataset = random_split(full_dataset, [0.7, 0.25, 0.05])

train_dataset = Dataset.from_dict(train_dataset[:])
validation_dataset = Dataset.from_dict(validation_dataset[:])
test_dataset = Dataset.from_dict(test_dataset[:])

In [9]:
# renaming_dict = {"text_content": "text", "status": "label"}
renaming_dict = {"status": "labels"}

train_dataset = train_dataset.rename_columns(renaming_dict)
validation_dataset = validation_dataset.rename_columns(renaming_dict)
test_dataset = test_dataset.rename_columns(renaming_dict)

### Tokenizing Data

In [10]:
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents

# We create our normalizer which will appy Unicode normalization and strip accents
normalizer = normalizers.Sequence([NFD(), StripAccents()])

normalizer.normalize_str("Héllò? What aré yòü üptò tòday?")

from tokenizers.pre_tokenizers import Whitespace

# We create our pre-tokenizer which will split based on the regex \w+|[^\w\s]+
pre_tokenizer = Whitespace()

In [11]:
from tokenizers.models import WordPiece
from tokenizers import Tokenizer

tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = normalizer
tokenizer.pre_tokenizer = pre_tokenizer

In [12]:
from tokenizers.trainers import WordPieceTrainer
import time

# We will create a batch iterator which will generate a batch of sentences for training
# our tokenizaer. This is the preferred way instead of passing single sentences to the
# tokenizer as it will a lot faster.
def batch_iterator(dataset, batch_size=10000):
  for i in range(0, len(dataset), batch_size):
    lower_idx = i
    # Ensure the upper idx doesn't overflow leading to an 'IndexError'
    upper_idx = i + batch_size if i + batch_size <= len(dataset) else len(dataset)
    text = dataset[lower_idx : upper_idx]["title"] + ' ' + dataset[lower_idx : upper_idx]["text_content"]
    yield text
        
# We pass in the list of special tokens so that our model knows about them.
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

tic = time.perf_counter()
# Now, we do batch training based on our iterator that we defined earlier.
tokenizer.train_from_iterator(batch_iterator(full_dataset), trainer=trainer, length=len(full_dataset))
toc = time.perf_counter()
print(f"Elapsed time: {toc - tic:0.4f} seconds")

Elapsed time: 19.5653 seconds


In [13]:
label2id
[label2id[label] for label in train_dataset['labels'][:10]]

[4, 0, 0, 0, 3, 3, 1, 0, 0, 1]

In [14]:
CONTENT_MAX_TEXT_CONTENT = 128
TITLE_MAX_TEXT_CONTENT = 32

def tokenize_func(batch):
  tokenized_batch = dict()
  
  tokenized_batch['labels'] = [label2id[label] for label in batch['labels']]
  
  tokenizer.enable_truncation(max_length=CONTENT_MAX_TEXT_CONTENT)
  tokenizer.enable_padding(length=CONTENT_MAX_TEXT_CONTENT)
  content_out = tokenizer.encode_batch(batch['text_content'])
  
  tokenizer.enable_truncation(max_length=TITLE_MAX_TEXT_CONTENT)
  tokenizer.enable_padding(length=TITLE_MAX_TEXT_CONTENT)
  title_out = tokenizer.encode_batch(batch['title'])
  
  tokenized_batch['content_input_ids'] = torch.IntTensor(list(map(lambda x: x.ids, content_out)))
  tokenized_batch['title_input_ids'] = torch.IntTensor(list(map(lambda x: x.ids, title_out)))

  return tokenized_batch

In [15]:
tokenized_train_dataset = train_dataset.map(tokenize_func, batched=True)
tokenized_validation_dataset = validation_dataset.map(tokenize_func, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_func, batched=True)

Map: 100%|██████████| 98191/98191 [00:19<00:00, 5020.72 examples/s]
Map: 100%|██████████| 35068/35068 [00:06<00:00, 5198.96 examples/s]
Map: 100%|██████████| 7013/7013 [00:01<00:00, 5346.15 examples/s]


In [16]:
tokenized_train_dataset

Dataset({
    features: ['tags_onehot', 'unrecognized_tags_count', 'reputation', 'undeleted_answers', 'user_life_days', 'title', 'text_content', 'labels', 'content_input_ids', 'title_input_ids'],
    num_rows: 98191
})

In [17]:
columns_to_remove = ['title', 'text_content']

tokenized_train_dataset = tokenized_train_dataset.remove_columns(columns_to_remove)
tokenized_validation_dataset = tokenized_validation_dataset.remove_columns(columns_to_remove)
tokenized_test_dataset = tokenized_test_dataset.remove_columns(columns_to_remove)

#### COMMENT - IDEAS

We probably should:
- retrain the whole model (probably smaller) with
- better tokenizer - built up from the ground including all the names of the specific tech (languages, frameworks, IDEs, etc.)

## Model - Custom

In [18]:
from custom_model import AutoCompositeModel, TagsModel, NumericalPartModel, TextualPartModel

model_tags = TagsModel(device).to(device)

model_numerical = NumericalPartModel(device).to(device)

model_textual = TextualPartModel(device).to(device)

## Training Setup

### Data Loaders

In [19]:
def collate_func(batch):
  fixed_batch = pd.DataFrame(batch).to_dict(orient="list")
  # print(fixed_batch)

  return fixed_batch

In [20]:
from torch.utils.data import DataLoader
training_loader = DataLoader(tokenized_train_dataset, batch_size=128, shuffle=True, collate_fn=collate_func)
validation_loader = DataLoader(tokenized_validation_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(tokenized_test_dataset, batch_size=128, shuffle=True)

In [21]:
print(tokenized_train_dataset['title_input_ids'][0])
print(len(tokenized_train_dataset['title_input_ids'][0]))

[20662, 9780, 3345, 3101, 5895, 3063, 71, 5895, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
32


### Loss Function

In [22]:
loss_fn = torch.nn.CrossEntropyLoss()

### Optimizer

In [23]:
# Optimizers specified in the torch.optim package
# optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

# optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, weight_decay=1e-5, momentum=0.8)


optimizer_tags = torch.optim.Adam(model_tags.parameters(), lr=1e-5)
optimizer_numerical = torch.optim.Adam(model_numerical.parameters(), lr=1e-4)
optimizer_textual = torch.optim.Adam(model_textual.parameters(), lr=1e-3)

In [24]:
from torchmetrics import Accuracy

accuracy_metric = Accuracy(task='multiclass', num_classes=5).to(device)

### Trainer Setup

In [25]:
from training_own import Trainer, TrainerConfiguration, get_model_params

In [26]:
config_tags = TrainerConfiguration(
  training_loader=training_loader,
  validation_loader=validation_loader,
  optimizer=optimizer_tags,
  loss_fn=loss_fn,
  accuracy_metric=accuracy_metric,
  device=device
)
trainer_tags = Trainer(model=model_tags, trainer_configuration=config_tags, input_columns=['tags_onehot'], output_column='labels')

In [27]:
config_numerical = TrainerConfiguration(
  training_loader=training_loader,
  validation_loader=validation_loader,
  optimizer=optimizer_tags,
  loss_fn=loss_fn,
  accuracy_metric=accuracy_metric,
  device=device
)
trainer_numerical = Trainer(model=model_numerical, trainer_configuration=config_numerical, input_columns=['unrecognized_tags_count', 'reputation', 'undeleted_answers', 'user_life_days'], output_column='labels')

In [28]:
config_textual = TrainerConfiguration(
  training_loader=training_loader,
  validation_loader=validation_loader,
  optimizer=optimizer_textual,
  loss_fn=loss_fn,
  accuracy_metric=accuracy_metric,
  device=device
)
trainer_textual = Trainer(model=model_textual, trainer_configuration=config_textual, input_columns=['content_input_ids', 'title_input_ids'], output_column='labels')

In [29]:
print(get_model_params(model_tags), get_model_params(model_numerical), get_model_params(model_textual))

9121 437 744809


In [None]:
# trainer_tags.train_many_epochs(epochs=2, logging_frequency=100, evaluate_when_logging=False)

In [31]:
trainer_numerical.train_many_epochs(epochs=4, logging_frequency=100, evaluate_when_logging=False)

EPOCH 1:
 batch 100 training_loss: 12.820892386436462 training_accuracy: 0.15843749046325684
 batch 200 training_loss: 12.41835602760315 training_accuracy: 0.1633593738079071
 batch 300 training_loss: 12.187669343948365 training_accuracy: 0.16554686427116394
 batch 400 training_loss: 11.869419836997986 training_accuracy: 0.1626562476158142
 batch 500 training_loss: 11.989241285324097 training_accuracy: 0.1633593738079071
 batch 600 training_loss: 12.124117994308472 training_accuracy: 0.1608593761920929
 batch 700 training_loss: 12.126450881958007 training_accuracy: 0.1635156273841858


  vlabels = torch.tensor(vdata[self.output_column]).long().to(self.config.device)


LOSS train 12.126450881958007 valid 12.15694808959961 ACCURACY train 0.16253662109375 validation 0.1639392375946045
EPOCH 2:
 batch 100 training_loss: 11.92748577594757 training_accuracy: 0.16914062201976776
 batch 200 training_loss: 12.173089513778686 training_accuracy: 0.16039061546325684
 batch 300 training_loss: 11.89269522190094 training_accuracy: 0.15632812678813934
 batch 400 training_loss: 12.449279479980468 training_accuracy: 0.1610156148672104
 batch 500 training_loss: 11.762155976295471 training_accuracy: 0.16499999165534973
 batch 600 training_loss: 12.420602140426636 training_accuracy: 0.16437499225139618
 batch 700 training_loss: 13.040624136924743 training_accuracy: 0.16445311903953552


  vlabels = torch.tensor(vdata[self.output_column]).long().to(self.config.device)


LOSS train 13.040624136924743 valid 12.157124519348145 ACCURACY train 0.16268989443778992 validation 0.16393832862377167
EPOCH 3:
 batch 100 training_loss: 11.851780519485473 training_accuracy: 0.162109375
 batch 200 training_loss: 11.997050428390503 training_accuracy: 0.16624999046325684


KeyboardInterrupt: 

In [None]:
from training_own import to_my_tensor
input = dict((k, to_my_tensor(test_dataset[k], device)) for k in ['unrecognized_tags_count', 'reputation', 'undeleted_answers', 'user_life_days'])
res = model_numerical(input)
# test_dataset[0]

res[4]

In [None]:
# trainer_textual.train_many_epochs(epochs=2, logging_frequency=100, evaluate_when_logging=False)