## Loading Libraries

In [2]:
import pandas as pd
import numpy as np
import torch
import evaluate

In [3]:
device = torch.device('cuda')

## Loading Dataset

In [4]:
df = pd.read_csv('../data/processed/train_sample_processed.csv')

In [5]:
statuses = np.unique(df['OpenStatus'].values)

id2label = {idx: label for idx, label in enumerate(statuses)}
label2id = {label: idx for idx, label in enumerate(statuses)}

In [6]:

from custom_dataset import GithubDataset
from torch.utils.data import random_split
from datasets import Dataset

# Create the pytorch dataset
full_dataset = GithubDataset(df)

In [7]:
train_dataset, validation_dataset, test_dataset = random_split(full_dataset, [0.7, 0.25, 0.05])

train_dataset = Dataset.from_dict(train_dataset[:])
validation_dataset = Dataset.from_dict(validation_dataset[:])
test_dataset = Dataset.from_dict(test_dataset[:])

In [8]:
train_dataset = train_dataset.rename_columns({"text_content": "text", "status": "label"})
validation_dataset = validation_dataset.rename_columns({"text_content": "text", "status": "label"})
test_dataset = test_dataset.rename_columns({"text_content": "text", "status": "label"})

columns_to_remove = [
  'tags_onehot',
  'unrecognized_tags_count',
  'reputation',
  'undeleted_answers',
  'user_life_days',
  'title'
]

train_dataset = train_dataset.remove_columns(columns_to_remove)
validation_dataset = validation_dataset.remove_columns(columns_to_remove)
test_dataset = test_dataset.remove_columns(columns_to_remove)

## Experimenting With Different Model Architectures

#### COMMENT - IDEAS

We probably should:
- retrain the whole model (probably smaller) with
- better tokenizer - built up from the ground including all the names of the specific tech (languages, frameworks, IDEs, etc.)

### Setting Tokenizer

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [10]:
MAX_TEXT_CONTENT = 128

def tokenize_func(batch):
  tokenized_batch = tokenizer(batch['text'], padding=True, truncation=True, max_length=MAX_TEXT_CONTENT, return_tensors="pt")
  tokenized_batch["label"] = [label2id[label] for label in batch["label"]]
  return tokenized_batch

In [11]:
tokenized_train_dataset = train_dataset.map(tokenize_func, batched=True)
tokenized_validation_dataset = validation_dataset.map(tokenize_func, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_func, batched=True)

Map:   0%|          | 0/98191 [00:00<?, ? examples/s]

Map: 100%|██████████| 98191/98191 [00:14<00:00, 6962.42 examples/s]
Map: 100%|██████████| 35068/35068 [00:05<00:00, 6789.55 examples/s]
Map: 100%|██████████| 7013/7013 [00:01<00:00, 6404.38 examples/s]


In [12]:
tokenized_train_dataset.column_names

['text', 'label', 'input_ids', 'attention_mask']

### Preparing Metrics

In [13]:
accuracy_metric = evaluate.load("accuracy")

In [14]:
def compute_metrics(eval_pred):
  predictions, labels = eval_pred
  predictions = np.argmax(predictions, axis=1)
  return accuracy_metric.compute(predictions=predictions, references=labels)

## Testing Single Text Model

### Creating Models

In [15]:
# from transformers import AutoModelForSequenceClassification

# model_content = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(statuses), id2label=id2label, label2id=label2id)
# # model_content.to(device)

### Creating Trainer

In [16]:
# from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# training_args = TrainingArguments(
#   output_dir="text_content_model",
#   learning_rate=2e-5,
#   per_device_train_batch_size=16,
#   per_device_eval_batch_size=16,
#   num_train_epochs=5,
#   weight_decay=0.01,
#   evaluation_strategy="epoch",
#   save_strategy="epoch",
#   load_best_model_at_end=True,
# )

# trainer = Trainer(
#   model=model_content,
#   args=training_args,
#   train_dataset=tokenized_train_dataset,
#   eval_dataset=tokenized_test_dataset,
#   tokenizer=tokenizer,
#   data_collator=data_collator,
#   compute_metrics=compute_metrics,
# )

In [17]:
# from transformers import TrainerCallback
# from copy import deepcopy

# class CustomCallback(TrainerCallback):
  
#   def __init__(self, trainer) -> None:
#     super().__init__()
#     self._trainer = trainer
  
#   def on_epoch_end(self, args, state, control, **kwargs):
#     if control.should_evaluate:
#       control_copy = deepcopy(control)
#       self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
#       return control_copy

# trainer.add_callback(CustomCallback(trainer)) 

In [18]:
# trainer.train('./text_content_model/checkpoint-6576')

## Model - Custom

### Creating Model

In [19]:
from custom_model import AutoCompositeModel 
model = AutoCompositeModel(device).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
tokenized_test_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 7013
})

In [23]:
# trainer.predict(tokenized_test_dataset[0])
inputs = torch.tensor([tokenized_test_dataset['input_ids'][0]]).to(device)
output = model(inputs)
output
# model_content(tokenized_test_dataset['input_ids'])

tensor([[ 0.0384,  0.0590, -0.0705, -0.0922, -0.1549, -0.0281, -0.0006, -0.0087,
         -0.1026, -0.0106, -0.1685,  0.0502,  0.0094,  0.0354,  0.0776, -0.0419,
          0.0198, -0.0838, -0.0385,  0.0493, -0.0382,  0.1712,  0.0100,  0.2353,
         -0.0014, -0.0117,  0.0043, -0.0644, -0.0830, -0.0382,  0.0674, -0.0242,
          0.0438,  0.0072,  0.0861, -0.0032, -0.0548,  0.0469, -0.0041, -0.0278,
         -0.0895, -0.0168, -0.0706, -0.0134,  0.0678,  0.0479,  0.0430,  0.0545,
         -0.0979,  0.0881,  0.0892, -0.1212, -0.0217, -0.0655, -0.0575, -0.1099,
          0.0451, -0.0932, -0.0943,  0.1754,  0.0437, -0.0225,  0.1049,  0.0773]],
       device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[ 0.1834, -0.0292, -0.1803,  0.1453, -0.0704]], device='cuda:0',
       grad_fn=<AddmmBackward0>)


tensor([[ 0.1834, -0.0292, -0.1803,  0.1453, -0.0704]], device='cuda:0',
       grad_fn=<AddmmBackward0>)