## Loading Libraries

In [1]:
import pandas as pd
import numpy as np
import torch
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda')

## Loading Dataset

In [3]:
df = pd.read_csv('../data/processed/train_sample_processed.csv')

In [4]:
statuses = np.unique(df['OpenStatus'].values)

id2label = {idx: label for idx, label in enumerate(statuses)}
label2id = {label: idx for idx, label in enumerate(statuses)}

In [5]:

from custom_dataset import GithubDataset
from torch.utils.data import random_split
from datasets import Dataset

# Create the pytorch dataset
full_dataset = GithubDataset(df)

In [6]:
train_dataset, validation_dataset, test_dataset = random_split(full_dataset, [0.7, 0.25, 0.05])

train_dataset = Dataset.from_dict(train_dataset[:])
validation_dataset = Dataset.from_dict(validation_dataset[:])
test_dataset = Dataset.from_dict(test_dataset[:])

In [7]:
# renaming_dict = {"text_content": "text", "status": "label"}
renaming_dict = {"status": "labels"}

train_dataset = train_dataset.rename_columns(renaming_dict)
validation_dataset = validation_dataset.rename_columns(renaming_dict)
test_dataset = test_dataset.rename_columns(renaming_dict)

### Tokenizing Data

In [8]:
from transformers import AutoTokenizer

# content_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# titles_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [9]:
# import spacy
# nlp = spacy.load("en_core_web_sm")
# import en_core_web_sm
# tokenizer = en_core_web_sm.load()

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [10]:
train_dataset

Dataset({
    features: ['tags_onehot', 'unrecognized_tags_count', 'reputation', 'undeleted_answers', 'user_life_days', 'title', 'text_content', 'labels'],
    num_rows: 98191
})

In [11]:
label2id
[label2id[label] for label in train_dataset['labels'][:10]]

[0, 2, 0, 3, 2, 3, 0, 3, 2, 1]

In [12]:
CONTENT_MAX_TEXT_CONTENT = 128
TITLE_MAX_TEXT_CONTENT = 32

def tokenize_func(batch):
  tokenized_batch = dict()
  
  tokenized_temp = tokenizer(batch['text_content'], padding=True, truncation=True, max_length=CONTENT_MAX_TEXT_CONTENT, return_tensors="pt")
  tokenized_batch['content_input_ids'] = tokenized_temp['input_ids']
  tokenized_batch['content_attention_mask'] = tokenized_temp['attention_mask']
  
  tokenized_temp = tokenizer(batch['title'], padding=True, truncation=True, max_length=TITLE_MAX_TEXT_CONTENT, return_tensors="pt")
  tokenized_batch['title_input_ids'] = tokenized_temp['input_ids']
  tokenized_batch['title_attention_mask'] = tokenized_temp['attention_mask']
  
  return tokenized_batch

In [13]:
tokenized_train_dataset = train_dataset.map(tokenize_func, batched=True)

Map: 100%|██████████| 98191/98191 [00:21<00:00, 4572.86 examples/s]


In [14]:
tokenized_train_dataset = train_dataset.map(tokenize_func, batched=True)
tokenized_validation_dataset = validation_dataset.map(tokenize_func, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_func, batched=True)

Map: 100%|██████████| 98191/98191 [00:19<00:00, 4979.31 examples/s]
Map: 100%|██████████| 35068/35068 [00:06<00:00, 5228.39 examples/s]
Map: 100%|██████████| 7013/7013 [00:01<00:00, 4897.40 examples/s]


In [15]:
len(tokenized_train_dataset['title_input_ids'][11])


31

In [16]:
tokenized_test_dataset
# tokenized_test_dataset['user_life_days']

Dataset({
    features: ['tags_onehot', 'unrecognized_tags_count', 'reputation', 'undeleted_answers', 'user_life_days', 'title', 'text_content', 'labels', 'content_input_ids', 'content_attention_mask', 'title_input_ids', 'title_attention_mask'],
    num_rows: 7013
})

In [17]:
columns_to_remove = [
  'tags_onehot',
  'unrecognized_tags_count',
  # 'reputation',
  # 'undeleted_answers',
  # 'user_life_days',
  'title',
  'text_content',
]

tokenized_train_dataset = tokenized_train_dataset.remove_columns(columns_to_remove)
tokenized_validation_dataset = tokenized_validation_dataset.remove_columns(columns_to_remove)
tokenized_test_dataset = tokenized_test_dataset.remove_columns(columns_to_remove)

#### COMMENT - IDEAS

We probably should:
- retrain the whole model (probably smaller) with
- better tokenizer - built up from the ground including all the names of the specific tech (languages, frameworks, IDEs, etc.)

## Model - Custom

In [18]:
from custom_model import AutoCompositeModel

model = AutoCompositeModel(device).to(device)

## Training Setup

### Data Loaders

In [19]:
important_features = [
  'content_input_ids',
  'title_input_ids',
  'labels',
]

def collate_func(batch):
  # test = 0
  # print(batch)
  
  # all = batch[:]
  # test = [all[feature] for feature in important_features]
  print(batch[0]['title_input_ids'])
  print(batch[1]['title_input_ids'])
  
  print(len(batch[0]['title_input_ids']))
  print(len(batch[1]['title_input_ids']))
  return batch

In [20]:
from torch.utils.data import DataLoader
training_loader = DataLoader(tokenized_train_dataset, batch_size=16, shuffle=True, collate_fn=collate_func)
validation_loader = DataLoader(tokenized_validation_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(tokenized_test_dataset, batch_size=16, shuffle=True)

In [21]:
for i, data in enumerate(training_loader):
  if i >= 0:
    break
    
  # print(data[1]['title_input_ids'])
  # print(len(data[1]['title_input_ids']))

[101, 2129, 2079, 1045, 4372, 16044, 1998, 21933, 3207, 1037, 2918, 21084, 5164, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[101, 2129, 2000, 3573, 1996, 2765, 1997, 1996, 5227, 1999, 2795, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
32
32


In [22]:
print(tokenized_train_dataset['title_input_ids'][0])
print(len(tokenized_train_dataset['title_input_ids'][0]))

[101, 2358, 2094, 1024, 1024, 4066, 2006, 2019, 2358, 2094, 1024, 1024, 9207, 22963, 2021, 2196, 20736, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
31


In [23]:
# for i, data in enumerate(training_loader):
#   if i < 1:
#     print(data)

### Loss Function

In [24]:
loss_fn = torch.nn.CrossEntropyLoss()

### Optimizer

In [25]:
# Optimizers specified in the torch.optim package
# optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, weight_decay=1e-5)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, weight_decay=1e-5, momentum=0.8)

In [26]:
from torchmetrics import Accuracy

accuracy_metric = Accuracy(task='multiclass', num_classes=5).to(device)

### Trainer Setup

In [27]:
from training_own import Trainer, TrainerConfiguration, get_model_params

config = TrainerConfiguration(
  training_loader=training_loader,
  validation_loader=validation_loader,
  optimizer=optimizer,
  loss_fn=loss_fn,
  accuracy_metric=accuracy_metric,
  device=device
)

In [28]:
get_model_params(model)

692933

In [29]:
trainer = Trainer(model=model, trainer_configuration=config, input_columns=['content_input_ids', 'title_input_ids', 'reputation', 'undeleted_answers', 'user_life_days'], output_column='labels')

In [30]:
len(training_loader)

6137

In [31]:
trainer.train_many_epochs(epochs=8, logging_frequency=100, evaluate_when_logging=False)

EPOCH 1:
[101, 2129, 2000, 4060, 1037, 7809, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[101, 11924, 9130, 1012, 1012, 2129, 2000, 2131, 3229, 18715, 2368, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
32
32
column_name content_input_ids
inputs[column_name] [tensor([  101,  1999,  2023,  2154,  1998,  2287,  1010,  2007,  1037,  2047,
        16839,  4160,  2140, 17881, 20095,  2039,  2296,  5353,  1012,  1012,
         2024,  2045,  2151,  2204,  1998,  2039,  1011,  2000,  1011,  3058,
         4219,  2006,  2129,  2000,  4060,  1037,  7809,  1006,  1055,  1007,
         2008,  2097,  4848,  2115,  4773,  4646,  3791,  1029,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,  

TypeError: len() of a 0-d tensor