## Loading Libraries

In [1]:
import pandas as pd
import numpy as np
import torch
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines without CUDA
# (a hard-coded 'cuda' device only fails later, when a model or tensor is moved onto it).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Loading Dataset

In [3]:
# Read the processed training sample CSV into a DataFrame.
# The path is relative, so it resolves against the kernel's current working directory.
df = pd.read_csv('../data/processed/train_sample_processed.csv')

In [4]:
# Distinct values of the 'OpenStatus' column; np.unique returns them sorted.
statuses = np.unique(df['OpenStatus'].values)

# Two-way lookup between integer class ids (the position in `statuses`) and status labels.
id2label = dict(enumerate(statuses))
label2id = {label: idx for idx, label in id2label.items()}

In [5]:

from custom_dataset import GithubDataset
from torch.utils.data import random_split
from datasets import Dataset

# Create the PyTorch dataset by wrapping the loaded DataFrame in the project-local
# GithubDataset (imported from custom_dataset; its implementation is not shown here).
full_dataset = GithubDataset(df)

In [6]:
# Seed the split so the train/validation/test partition is identical on every run;
# an unseeded random_split draws a new permutation each time this cell is executed.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)

# 70% train / 25% validation / 5% test.
train_subset, validation_subset, test_subset = random_split(
  full_dataset, [0.7, 0.25, 0.05], generator=split_generator
)

# `subset[:]` materialises every row of the subset, and Dataset.from_dict turns the result
# into a Hugging Face Dataset.
# Assumes GithubDataset returns a dict of columns when indexed this way — TODO confirm.
train_dataset = Dataset.from_dict(train_subset[:])
validation_dataset = Dataset.from_dict(validation_subset[:])
test_dataset = Dataset.from_dict(test_subset[:])

In [7]:
# Old column name -> new column name, applied to every split.
column_renames = {"text_content": "text", "status": "label"}

# Columns dropped from every split once the renaming is done.
columns_to_remove = [
  'tags_onehot',
  'unrecognized_tags_count',
  'reputation',
  'undeleted_answers',
  'user_life_days',
  'title'
]


def _rename_and_drop(split):
  """Return `split` with `column_renames` applied and `columns_to_remove` dropped."""
  return split.rename_columns(column_renames).remove_columns(columns_to_remove)


train_dataset = _rename_and_drop(train_dataset)
validation_dataset = _rename_and_drop(validation_dataset)
test_dataset = _rename_and_drop(test_dataset)

## Experimenting With Different Model Architectures

#### COMMENT - IDEAS

We should probably retrain the whole model (likely a smaller one) together with a better tokenizer: one built from the ground up so that it includes the names of specific technologies (languages, frameworks, IDEs, etc.).

### Setting Tokenizer

In [8]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer that belongs to the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [9]:
# Upper bound on the number of tokens kept per example; longer texts are truncated.
MAX_TEXT_CONTENT = 128


def tokenize_func(batch):
  """Tokenize a batch of examples and attach integer class ids.

  Args:
    batch: Mapping with a 'text' entry (the texts to tokenize) and a 'label'
      entry (status labels that must be keys of `label2id`).

  Returns:
    The tokenizer output for `batch['text']` with an extra "labels" entry
    holding the `label2id` id of every label in `batch['label']`.

  Raises:
    KeyError: If a label is not present in `label2id`.
  """
  encoded = tokenizer(
    batch['text'],
    padding=True,  # pads to the longest sequence within this batch
    truncation=True,
    max_length=MAX_TEXT_CONTENT,
    return_tensors="pt",
  )

  label_ids = []
  for label in batch["label"]:
    label_ids.append(label2id[label])
  encoded["labels"] = label_ids

  return encoded

In [10]:
# Raw columns that are dropped once tokenization has added its own columns.
columns_to_remove_tokenized = ['label', 'text']

# Tokenize each split in batches, then drop the raw columns.
tokenized_train_dataset = (
  train_dataset
  .map(tokenize_func, batched=True)
  .remove_columns(columns_to_remove_tokenized)
)
tokenized_validation_dataset = (
  validation_dataset
  .map(tokenize_func, batched=True)
  .remove_columns(columns_to_remove_tokenized)
)
tokenized_test_dataset = (
  test_dataset
  .map(tokenize_func, batched=True)
  .remove_columns(columns_to_remove_tokenized)
)

Map: 100%|██████████| 98191/98191 [00:14<00:00, 6664.21 examples/s]
Map: 100%|██████████| 35068/35068 [00:05<00:00, 6624.60 examples/s]
Map: 100%|██████████| 7013/7013 [00:01<00:00, 6835.16 examples/s]


In [11]:
# Sanity check: show which columns remain in the tokenized training split.
tokenized_train_dataset.column_names

['input_ids', 'attention_mask', 'labels']

In [12]:
# tokenized_train_dataset['label']

### Preparing Metrics

In [13]:
# Load the "accuracy" metric through the `evaluate` library; compute_metrics calls its .compute().
accuracy_metric = evaluate.load("accuracy")

In [14]:
def compute_metrics(eval_pred):
  """Compute accuracy for one evaluation pass.

  Args:
    eval_pred: A pair ``(predictions, labels)``. ``predictions`` holds per-class
      scores and is reduced with an argmax over axis 1 (assumed to be a 2-D
      ``(example, class)`` array — TODO confirm against the model output);
      ``labels`` holds the reference class ids.

  Returns:
    The result of ``accuracy_metric.compute`` for the predicted vs. reference ids.
  """
  # No debug print of `eval_pred` here: it would dump the full score and label
  # arrays to the cell output on every evaluation.
  scores, labels = eval_pred
  # The highest-scoring class is taken as the predicted class id.
  predicted_ids = np.argmax(scores, axis=1)
  return accuracy_metric.compute(predictions=predicted_ids, references=labels)

## Model - Custom

### Creating Model

In [15]:
from custom_model import AutoCompositeModel

# Build the project-local composite model and move it to `device`.
# `device` is also passed to the constructor; how it is used there is not visible from this notebook.
model = AutoCompositeModel(device).to(device)

In [16]:
# Inspect the dtype of the weights of the first entry in `my_new_layers`.
model.my_new_layers[0].weight.dtype

torch.float32

In [17]:
# Token ids of the first ten test examples, cast to float32 and moved to `device`.
# NOTE(review): `input` shadows the Python builtin of the same name; the name is kept
# because later cells may refer to it.
# NOTE(review): token ids are cast to float here, presumably to match the float32 layer
# weights — confirm that the model really expects float ids rather than integer ids.
first_ten_ids = tokenized_test_dataset['input_ids'][:10]
input = torch.tensor(first_ten_ids).float().to(device)
input.dtype

torch.float32