## Loading Libraries

In [154]:
import pandas as pd
import numpy as np
import torch
import evaluate

In [155]:
device = torch.device('cuda')

## Loading Dataset

In [156]:
df = pd.read_csv('../data/processed/train_sample_processed.csv')

In [157]:
statuses = np.unique(df['OpenStatus'].values)

id2label = {idx: label for idx, label in enumerate(statuses)}
label2id = {label: idx for idx, label in enumerate(statuses)}

In [158]:

from custom_dataset import GithubDataset
from torch.utils.data import random_split
from datasets import Dataset

# Create the pytorch dataset
full_dataset = GithubDataset(df)

In [159]:
train_dataset, validation_dataset, test_dataset = random_split(full_dataset, [0.7, 0.25, 0.05])

train_dataset = Dataset.from_dict(train_dataset[:])
validation_dataset = Dataset.from_dict(validation_dataset[:])
test_dataset = Dataset.from_dict(test_dataset[:])

In [160]:
# renaming_dict = {"text_content": "text", "status": "label"}
renaming_dict = {"status": "labels"}

train_dataset = train_dataset.rename_columns(renaming_dict)
validation_dataset = validation_dataset.rename_columns(renaming_dict)
test_dataset = test_dataset.rename_columns(renaming_dict)

### Tokenizing Data

In [161]:
# from transformers import AutoTokenizer

# content_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# titles_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [167]:
# import spacy
# nlp = spacy.load("en_core_web_sm")
import en_core_web_sm
tokenizer = en_core_web_sm.load()

In [202]:
# CONTENT_MAX_TEXT_CONTENT = 128
# TITLE_MAX_TEXT_CONTENT = 32

# def tokenize_func(batch):
#   tokenized_batch = batch
#   tokenized_batch['content_input_ids'] = tokenizer(batch['text_content'], padding=True, truncation=True, max_length=CONTENT_MAX_TEXT_CONTENT, return_tensors="pt")['input_ids']
  
#   tokenized_batch['title_input_ids'] = tokenizer(batch['title'], padding=True, truncation=True, max_length=TITLE_MAX_TEXT_CONTENT, return_tensors="pt")['input_ids']
#   tokenized_batch["labels"] = [label2id[label] for label in batch["labels"]]
#   return tokenized_batch

In [214]:
CONTENT_MAX_TEXT_CONTENT = 128
TITLE_MAX_TEXT_CONTENT = 32

def tokenize_func(elem):
  tokenized_batch = dict()
  
  tokenized_batch['content_input_ids'] = tokenizer(elem['text_content'])
  tokenized_batch['title_input_ids'] = tokenizer(elem['title'])
    
  # tokenized_batch['content_input_ids'] = tokenizer(batch['text_content'], padding=True, truncation=True, max_length=CONTENT_MAX_TEXT_CONTENT, return_tensors="pt")['input_ids']
  
  # tokenized_batch['title_input_ids'] = tokenizer(batch['title'], padding=True, truncation=True, max_length=TITLE_MAX_TEXT_CONTENT, return_tensors="pt")['input_ids']
  
  tokenized_batch["labels"] = label2id[elem['labels']]
  return tokenized_batch

In [215]:
tokenized_train_dataset = train_dataset.map(tokenize_func)

Map:   0%|          | 250/98191 [00:13<1:29:06, 18.32 examples/s]


ArrowInvalid: Could not convert Hey guys, I'm trying to make a 2D Platform style game similar to this game below:

http://www.gameshed.com/Puzzle-Games/Blockdude/play.html

I have finished making most of the graphic, and areas, and collision, but our character is still not able to carry things. I'm confused as to what code to use so that my character can carry the blocks. I need help as to how to make our character carry blocks that are in front of him, provided that the blocks that don't have anything on top of it. This has been confusing me for a week now, and any help would be highly appreciated. :D with type spacy.tokens.doc.Doc: did not recognize Python value type when inferring an Arrow data type

In [81]:
tokenized_train_dataset = train_dataset.map(tokenize_func, batched=True)
tokenized_validation_dataset = validation_dataset.map(tokenize_func, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_func, batched=True)

Map:   0%|          | 0/98191 [00:00<?, ? examples/s]

Map: 100%|██████████| 98191/98191 [00:17<00:00, 5724.80 examples/s]
Map: 100%|██████████| 35068/35068 [00:05<00:00, 5967.95 examples/s]
Map: 100%|██████████| 7013/7013 [00:01<00:00, 5960.71 examples/s]


In [82]:
len(tokenized_train_dataset['title_input_ids'][11])


43

In [83]:
tokenized_test_dataset
# tokenized_test_dataset['user_life_days']

Dataset({
    features: ['tags_onehot', 'unrecognized_tags_count', 'reputation', 'undeleted_answers', 'user_life_days', 'title', 'text_content', 'labels', 'content_input_ids', 'title_input_ids'],
    num_rows: 7013
})

In [84]:
columns_to_remove = [
  'tags_onehot',
  'unrecognized_tags_count',
  # 'reputation',
  # 'undeleted_answers',
  # 'user_life_days',
  'title',
  'text_content',
]

tokenized_train_dataset = tokenized_train_dataset.remove_columns(columns_to_remove)
tokenized_validation_dataset = tokenized_validation_dataset.remove_columns(columns_to_remove)
tokenized_test_dataset = tokenized_test_dataset.remove_columns(columns_to_remove)

#### COMMENT - IDEAS

We probably should:
- retrain the whole model (probably smaller) with
- better tokenizer - built up from the ground including all the names of the specific tech (languages, frameworks, IDEs, etc.)

## Model - Custom

In [85]:
from custom_model import AutoCompositeModel

model = AutoCompositeModel(device).to(device)

## Training Setup

### Data Loaders

In [120]:
important_features = [
  'content_input_ids',
  'title_input_ids',
  'labels',
]

def collate_func(batch):
  # test = 0
  # print(batch)
  
  # all = batch[:]
  # test = [all[feature] for feature in important_features]
  print(batch[0]['title_input_ids'])
  print(batch[1]['title_input_ids'])
  
  print(len(batch[0]['title_input_ids']))
  print(len(batch[1]['title_input_ids']))
  return batch

In [121]:
from torch.utils.data import DataLoader
training_loader = DataLoader(tokenized_train_dataset, batch_size=16, shuffle=True, collate_fn=collate_func)
validation_loader = DataLoader(tokenized_validation_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(tokenized_test_dataset, batch_size=16, shuffle=True)

In [122]:
for i, data in enumerate(training_loader):
  if i >= 0:
    break
    
  # print(data[1]['title_input_ids'])
  # print(len(data[1]['title_input_ids']))

[101, 1039, 1001, 1012, 5658, 1006, 18847, 1007, 5443, 9262, 5443, 18106, 2250, 1006, 2004, 2509, 1007, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[101, 16770, 2128, 15549, 3064, 2005, 1042, 2497, 3931, 21628, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
41
58


In [93]:
print(tokenized_train_dataset['title_input_ids'][0])
print(len(tokenized_train_dataset['title_input_ids'][0]))

[101, 6575, 1999, 2004, 2361, 4438, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
43


In [17]:
# for i, data in enumerate(training_loader):
#   if i < 1:
#     print(data)

### Loss Function

In [18]:
loss_fn = torch.nn.CrossEntropyLoss()

### Optimizer

In [19]:
# Optimizers specified in the torch.optim package
# optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, weight_decay=1e-5)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, weight_decay=1e-5, momentum=0.8)

In [20]:
from torchmetrics import Accuracy

accuracy_metric = Accuracy(task='multiclass', num_classes=5).to(device)

### Trainer Setup

In [21]:
from training_own import Trainer, TrainerConfiguration, get_model_params

config = TrainerConfiguration(
  training_loader=training_loader,
  validation_loader=validation_loader,
  optimizer=optimizer,
  loss_fn=loss_fn,
  accuracy_metric=accuracy_metric,
  device=device
)

In [22]:
get_model_params(model)

692933

In [23]:
trainer = Trainer(model=model, trainer_configuration=config, input_columns=['content_input_ids', 'title_input_ids', 'reputation', 'undeleted_answers', 'user_life_days'], output_column='labels')

In [24]:
len(training_loader)

6137

In [25]:
trainer.train_many_epochs(epochs=8, logging_frequency=100, evaluate_when_logging=False)

EPOCH 1:
column_name content_input_ids
inputs[column_name] [tensor([  101,  1045,  1005,  1049,  2667,  2000,  2131,  1996,  5798,  3200,
         2013, 19557, 14141,  8303,  8654,  2006,  2026, 18059,  1012,  1045,
         1005,  2310,  2246,  2083,  2070, 10287,  2058,  1996,  4773,  1998,
         3262, 26021,  1996,  2168,  3437,  1010,  2029,  1045,  2031,  2699,
         2870,  2004,  2917,  1012,  2021,  2023,  2145,  2987,  1005,  1056,
         2147,  2005,  2033,  2061,  1045,  4687,  2065,  1045,  4771,  2242,
         2842,  1012,  1012,  1012,  1012,  2154, 14192, 20097,  1027,  1031,
         1031,  1031, 24978, 13701, 14192, 20097,  2035, 10085,  1033,  1999,
         4183,  1033,  8285, 16570, 19500,  1033,  1025,  1031,  2969,  1012,
         2154, 14192, 20097,  2275, 13701, 14192,  4017,  1024,  1030,  1000,
        25391,  2213,  1040,  1000,  1033,  1025, 24978,  3367,  4892,  1008,
         2651,  1027,  1031,  2154, 14192, 20097,  5164, 19699,  5358, 13701,
    

RuntimeError: stack expects each tensor to be equal size, but got [49] at entry 0 and [56] at entry 2