## Loading Libraries

In [1]:
import pandas as pd
import numpy as np
import torch
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


## Loading Dataset

In [2]:
# Read the pre-processed training sample into a DataFrame.
PROCESSED_TRAIN_CSV = '../data/processed/train_sample_processed.csv'
df = pd.read_csv(PROCESSED_TRAIN_CSV)

In [3]:
# Sorted unique values of the 'OpenStatus' column; a label's position in this
# array is its integer class id.
statuses = np.unique(df['OpenStatus'].values)

# Two-way lookup between class ids and label names.
id2label = dict(enumerate(statuses))
label2id = {label: idx for idx, label in id2label.items()}

In [4]:

from custom_dataset import GithubDataset
from torch.utils.data import random_split
from datasets import Dataset

# Create the pytorch dataset
# NOTE(review): GithubDataset comes from the project-local custom_dataset
# module; presumably it exposes the DataFrame rows as samples — confirm
# against its definition.
full_dataset = GithubDataset(df)

In [5]:

# Performing a reproducible train-test split (75% train / remainder test)
SPLIT_SEED = 42
TRAIN_DATA_SIZE = int(len(full_dataset) * 0.75)
# Take the test size as the remainder: truncating both fractions with int()
# can leave the two sizes short of len(full_dataset), in which case
# random_split raises a ValueError.
TEST_DATA_SIZE = len(full_dataset) - TRAIN_DATA_SIZE
assert TRAIN_DATA_SIZE + TEST_DATA_SIZE == len(full_dataset)

# A seeded generator makes the split identical across kernel restarts.
train_subset, test_subset = random_split(
  full_dataset,
  [TRAIN_DATA_SIZE, TEST_DATA_SIZE],
  generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Materialise each subset as a Hugging Face Dataset. Slicing the subset is
# expected to yield a dict of columns (GithubDataset is defined elsewhere —
# verify). The subsets keep their own names so re-running this cell is safe.
train_dataset = Dataset.from_dict(train_subset[:])
test_dataset = Dataset.from_dict(test_subset[:])

In [6]:
# The tokenization step reads the 'text' and 'label' columns, so rename the
# source columns accordingly and drop the remaining feature columns.
column_renames = {"text_content": "text", "status": "label"}

columns_to_remove = [
  'tags_onehot',
  'unrecognized_tags_count',
  'reputation',
  'undeleted_answers',
  'user_life_days',
  'title',
]

def _select_text_and_label(dataset):
  """Rename the text/status columns and remove the unused feature columns."""
  renamed = dataset.rename_columns(column_renames)
  return renamed.remove_columns(columns_to_remove)

train_dataset = _select_text_and_label(train_dataset)
test_dataset = _select_text_and_label(test_dataset)

## Experimenting With Different Model Architectures

#### COMMENT - IDEAS

We should probably:
- retrain the whole model from scratch (probably a smaller one), together with
- a better tokenizer, built from the ground up to include the names of specific technologies (languages, frameworks, IDEs, etc.)

### Setting Tokenizer

In [7]:
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint name the classifier is loaded from.
TOKENIZER_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [8]:
# Upper bound on the number of tokens kept per example.
MAX_TEXT_CONTENT = 128

def tokenize_func(batch):
  """Tokenize a batch of examples and encode their labels as integer ids.

  Args:
    batch: Mapping with a 'text' list of strings and a 'label' list of label
      names, as supplied by `Dataset.map(..., batched=True)`.

  Returns:
    The tokenizer output for `batch['text']`, truncated to at most
    MAX_TEXT_CONTENT tokens, with an added 'label' entry holding the
    `label2id` index of each label.

  Raises:
    KeyError: If a label is missing from `label2id`.
  """
  # No padding here: padding inside `map` pads every example to the longest
  # sequence of the whole map batch and stores those pad tokens in the
  # dataset. Padding is left to the data collator, which pads per training
  # batch instead.
  tokenized_batch = tokenizer(batch['text'], truncation=True, max_length=MAX_TEXT_CONTENT)
  tokenized_batch["label"] = [label2id[label] for label in batch["label"]]
  return tokenized_batch

In [9]:
# Apply the tokenization function to both splits, train first, in batched mode.
tokenized_train_dataset, tokenized_test_dataset = (
  split.map(tokenize_func, batched=True)
  for split in (train_dataset, test_dataset)
)

Map: 100%|██████████| 105204/105204 [00:14<00:00, 7306.20 examples/s]
Map: 100%|██████████| 35068/35068 [00:04<00:00, 7238.76 examples/s]


In [10]:
# Sanity check: list the columns present after tokenization.
tokenized_train_dataset.column_names

['text', 'label', 'input_ids', 'attention_mask']

### Preparing Metrics

In [11]:
# Load the "accuracy" metric via the `evaluate` library; compute_metrics uses it.
accuracy_metric = evaluate.load("accuracy")

In [12]:
def compute_metrics(eval_pred):
  """Compute accuracy for one evaluation pass.

  Args:
    eval_pred: Pair of (model scores, reference label ids). The scores are
      reduced to class ids by taking the argmax along axis 1.

  Returns:
    The result of `accuracy_metric.compute` for the predicted class ids.
  """
  scores, reference_ids = eval_pred
  return accuracy_metric.compute(
    predictions=np.argmax(scores, axis=1),
    references=reference_ids,
  )

### Creating Models

In [13]:
from transformers import AutoModelForSequenceClassification

# Sequence classifier on top of DistilBERT with one output per status label;
# the label maps are passed along with the model configuration arguments.
model_content = AutoModelForSequenceClassification.from_pretrained(
  "distilbert-base-uncased",
  num_labels=len(statuses),
  id2label=id2label,
  label2id=label2id,
)
# Manual GPU placement is currently disabled:
# model_content.to('cuda')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Creating Trainer

In [14]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

# Hyperparameters for the text-content model run.
OUTPUT_DIR = "text_content_model"
LEARNING_RATE = 2e-5
BATCH_SIZE = 16
NUM_EPOCHS = 1
WEIGHT_DECAY = 0.01

training_args = TrainingArguments(
  output_dir=OUTPUT_DIR,
  learning_rate=LEARNING_RATE,
  per_device_train_batch_size=BATCH_SIZE,
  per_device_eval_batch_size=BATCH_SIZE,
  num_train_epochs=NUM_EPOCHS,
  weight_decay=WEIGHT_DECAY,
  # Evaluate and save on the same per-epoch schedule, and reload the best
  # checkpoint when training finishes.
  evaluation_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True,
)

# Collator that pads the examples of each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
  args=training_args,
  model=model_content,
  tokenizer=tokenizer,
  data_collator=data_collator,
  train_dataset=tokenized_train_dataset,
  eval_dataset=tokenized_test_dataset,
  compute_metrics=compute_metrics,
)

# Kept as the last expression so the cell displays the TrainOutput.
trainer.train()

  0%|          | 0/6576 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  8%|▊         | 501/6576 [01:04<12:57,  7.82it/s]

{'loss': 1.1004, 'learning_rate': 1.8479318734793188e-05, 'epoch': 0.08}


 15%|█▌        | 1001/6576 [02:11<12:44,  7.29it/s]

{'loss': 0.9835, 'learning_rate': 1.6958637469586377e-05, 'epoch': 0.15}


 23%|██▎       | 1501/6576 [03:22<11:57,  7.07it/s]

{'loss': 0.9489, 'learning_rate': 1.5437956204379563e-05, 'epoch': 0.23}


 30%|███       | 2001/6576 [04:34<10:48,  7.05it/s]

{'loss': 0.9539, 'learning_rate': 1.3917274939172751e-05, 'epoch': 0.3}


 38%|███▊      | 2501/6576 [05:44<09:40,  7.02it/s]

{'loss': 0.93, 'learning_rate': 1.2396593673965937e-05, 'epoch': 0.38}


 46%|████▌     | 3001/6576 [06:55<08:45,  6.80it/s]

{'loss': 0.9157, 'learning_rate': 1.0875912408759123e-05, 'epoch': 0.46}


 53%|█████▎    | 3501/6576 [08:07<07:16,  7.05it/s]

{'loss': 0.8966, 'learning_rate': 9.355231143552313e-06, 'epoch': 0.53}


 61%|██████    | 4001/6576 [09:16<05:43,  7.50it/s]

{'loss': 0.9123, 'learning_rate': 7.8345498783455e-06, 'epoch': 0.61}


 68%|██████▊   | 4501/6576 [10:23<04:30,  7.67it/s]

{'loss': 0.8954, 'learning_rate': 6.313868613138686e-06, 'epoch': 0.68}


 76%|███████▌  | 5001/6576 [11:31<03:37,  7.25it/s]

{'loss': 0.8989, 'learning_rate': 4.793187347931874e-06, 'epoch': 0.76}


 84%|████████▎ | 5501/6576 [12:38<02:21,  7.62it/s]

{'loss': 0.8885, 'learning_rate': 3.272506082725061e-06, 'epoch': 0.84}


 91%|█████████▏| 6001/6576 [13:45<01:18,  7.37it/s]

{'loss': 0.8818, 'learning_rate': 1.7518248175182485e-06, 'epoch': 0.91}


 99%|█████████▉| 6501/6576 [14:54<00:11,  6.65it/s]

{'loss': 0.8747, 'learning_rate': 2.3114355231143555e-07, 'epoch': 0.99}


                                                   
100%|██████████| 6576/6576 [16:44<00:00,  6.61it/s]

{'eval_loss': 0.874190092086792, 'eval_accuracy': 0.6687863579331584, 'eval_runtime': 99.2772, 'eval_samples_per_second': 353.233, 'eval_steps_per_second': 22.08, 'epoch': 1.0}


100%|██████████| 6576/6576 [16:45<00:00,  6.54it/s]

{'train_runtime': 1005.6066, 'train_samples_per_second': 104.617, 'train_steps_per_second': 6.539, 'train_loss': 0.9294927717705422, 'epoch': 1.0}





TrainOutput(global_step=6576, training_loss=0.9294927717705422, metrics={'train_runtime': 1005.6066, 'train_samples_per_second': 104.617, 'train_steps_per_second': 6.539, 'train_loss': 0.9294927717705422, 'epoch': 1.0})