## Loading Libraries

In [1]:
import pandas as pd
import numpy as np
import torch
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda')

In [3]:
x = torch.randn(30)
y = torch.randn(30)

z = torch.cat([x.reshape(-1,1), y.reshape(-1,1)], 1)
print(x.shape, y.shape, z.shape)

torch.Size([30]) torch.Size([30]) torch.Size([30, 2])


## Loading Dataset

In [4]:
df = pd.read_csv('../data/processed/train_sample_processed.csv')

In [5]:
statuses = np.unique(df['OpenStatus'].values)

id2label = {idx: label for idx, label in enumerate(statuses)}
label2id = {label: idx for idx, label in enumerate(statuses)}

In [6]:

from custom_dataset import GithubDataset
from torch.utils.data import random_split
from datasets import Dataset

# Create the pytorch dataset
full_dataset = GithubDataset(df)

In [7]:
train_dataset, validation_dataset, test_dataset = random_split(full_dataset, [0.7, 0.25, 0.05])

train_dataset = Dataset.from_dict(train_dataset[:])
validation_dataset = Dataset.from_dict(validation_dataset[:])
test_dataset = Dataset.from_dict(test_dataset[:])

In [8]:
# renaming_dict = {"text_content": "text", "status": "label"}
renaming_dict = {"status": "labels"}

train_dataset = train_dataset.rename_columns(renaming_dict)
validation_dataset = validation_dataset.rename_columns(renaming_dict)
test_dataset = test_dataset.rename_columns(renaming_dict)

### Tokenizing Data

In [9]:
from transformers import AutoTokenizer

# content_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# titles_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [10]:
# import spacy
# nlp = spacy.load("en_core_web_sm")
# import en_core_web_sm
# tokenizer = en_core_web_sm.load()

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [11]:
train_dataset

Dataset({
    features: ['tags_onehot', 'unrecognized_tags_count', 'reputation', 'undeleted_answers', 'user_life_days', 'title', 'text_content', 'labels'],
    num_rows: 98191
})

In [12]:
label2id
[label2id[label] for label in train_dataset['labels'][:10]]

[0, 0, 3, 0, 3, 2, 0, 3, 0, 3]

In [13]:
CONTENT_MAX_TEXT_CONTENT = 128
TITLE_MAX_TEXT_CONTENT = 32

def tokenize_func(batch):
  tokenized_batch = dict()
  
  tokenized_temp = tokenizer(batch['text_content'], padding=True, truncation=True, return_length= max_length=CONTENT_MAX_TEXT_CONTENT, return_tensors="pt")
  tokenized_batch['content_input_ids'] = tokenized_temp['input_ids']
  tokenized_batch['content_attention_mask'] = tokenized_temp['attention_mask']
  
  tokenized_temp = tokenizer(batch['title'], padding=True, truncation=True, max_length=TITLE_MAX_TEXT_CONTENT, return_tensors="pt")
  tokenized_batch['title_input_ids'] = tokenized_temp['input_ids']
  tokenized_batch['title_attention_mask'] = tokenized_temp['attention_mask']
  
  tokenized_batch['labels'] = [label2id[label] for label in batch['labels']]
  
  return tokenized_batch

In [14]:
tokenized_train_dataset = train_dataset.map(tokenize_func, batched=True)

Map: 100%|██████████| 98191/98191 [00:18<00:00, 5428.61 examples/s]


In [15]:
tokenized_train_dataset = train_dataset.map(tokenize_func, batched=True)
tokenized_validation_dataset = validation_dataset.map(tokenize_func, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_func, batched=True)

Map: 100%|██████████| 98191/98191 [00:18<00:00, 5385.28 examples/s]
Map: 100%|██████████| 35068/35068 [00:06<00:00, 5452.39 examples/s]
Map: 100%|██████████| 7013/7013 [00:01<00:00, 5051.04 examples/s]


In [16]:
tokenized_test_dataset['title_input_ids'][131]

[101,
 2339,
 2079,
 7513,
 2404,
 1996,
 1523,
 5310,
 2064,
 25416,
 8630,
 3645,
 2030,
 2972,
 2291,
 2006,
 2029,
 3645,
 2003,
 5361,
 1524,
 11075,
 2006,
 2037,
 3645,
 1021,
 7327,
 2721,
 1029,
 102,
 0,
 0]

In [17]:
columns_to_remove = [
  'tags_onehot',
  'unrecognized_tags_count',
  # 'reputation',
  # 'undeleted_answers',
  # 'user_life_days',
  'title',
  'text_content',
]

tokenized_train_dataset = tokenized_train_dataset.remove_columns(columns_to_remove)
tokenized_validation_dataset = tokenized_validation_dataset.remove_columns(columns_to_remove)
tokenized_test_dataset = tokenized_test_dataset.remove_columns(columns_to_remove)

#### COMMENT - IDEAS

We probably should:
- retrain the whole model (probably smaller) with
- better tokenizer - built up from the ground including all the names of the specific tech (languages, frameworks, IDEs, etc.)

## Model - Custom

In [18]:
from custom_model import AutoCompositeModel

model = AutoCompositeModel(device).to(device)

## Training Setup

### Data Loaders

In [19]:
important_features = [
  'content_input_ids',
  'title_input_ids',
  'labels',
]

def collate_func(batch):
  fixed_batch = pd.DataFrame(batch).to_dict(orient="list")
  # print(fixed_batch)

  return fixed_batch

In [20]:
from torch.utils.data import DataLoader
training_loader = DataLoader(tokenized_train_dataset, batch_size=16, shuffle=True, collate_fn=collate_func)
validation_loader = DataLoader(tokenized_validation_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(tokenized_test_dataset, batch_size=16, shuffle=True)

In [21]:
print(tokenized_train_dataset['title_input_ids'][0])
print(len(tokenized_train_dataset['title_input_ids'][0]))

[101, 16401, 9262, 22483, 3853, 2000, 1046, 4226, 2854, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
32


### Loss Function

In [22]:
loss_fn = torch.nn.CrossEntropyLoss()

### Optimizer

In [23]:
# Optimizers specified in the torch.optim package
# optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, weight_decay=1e-5)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, weight_decay=1e-5, momentum=0.8)

In [24]:
from torchmetrics import Accuracy

accuracy_metric = Accuracy(task='multiclass', num_classes=5).to(device)

### Trainer Setup

In [25]:
from training_own import Trainer, TrainerConfiguration, get_model_params

config = TrainerConfiguration(
  training_loader=training_loader,
  validation_loader=validation_loader,
  optimizer=optimizer,
  loss_fn=loss_fn,
  accuracy_metric=accuracy_metric,
  device=device
)

In [26]:
get_model_params(model)

1364773

In [27]:
trainer = Trainer(model=model, trainer_configuration=config, input_columns=['content_input_ids', 'title_input_ids', 'reputation', 'undeleted_answers', 'user_life_days'], output_column='labels')

In [28]:
len(training_loader)

6137

In [29]:
trainer.train_one_epoch(logging_frequency=100, evaluate_when_logging=False)

ValueError: expected sequence of length 32 at dim 1 (got 27)

In [None]:

# for i, data in enumerate(training_loader):
#   if i > 0:
#     break
    
#   subset = dict((k, data[k]) for k in ('content_input_ids', 'title_input_ids', 'reputation', 'undeleted_answers', 'user_life_days'))
#   print(subset)
  # text_content = torch.tensor(data['content_input_ids'])
  # print(text_content[0])
  # print(type(text_content[0]))
  
  

In [None]:
trainer.train_many_epochs(epochs=8, logging_frequency=1000, evaluate_when_logging=False)