## Loading Libraries

In [13]:
import pandas as pd
import numpy as np
import torch
import evaluate

## Loading Dataset

In [14]:
# Relative path of the preprocessed training sample loaded into `df`.
TRAIN_SAMPLE_CSV = '../data/processed/train_sample_processed.csv'
df = pd.read_csv(TRAIN_SAMPLE_CSV)

In [15]:
# Distinct values of the OpenStatus column. np.unique returns them sorted,
# so the integer ids assigned below are stable for the same data.
statuses = np.unique(df['OpenStatus'].values)

# Two-way lookup between integer class ids and OpenStatus values.
# Each mapping is one flat dict (not a list of one-entry dicts), so
# id2label[i] yields the label and label2id[label] yields the id directly.
# tolist() converts NumPy scalars to native Python values so the mappings
# contain plain, serialisable keys and values.
id2label = dict(enumerate(statuses.tolist()))
label2id = {label: idx for idx, label in id2label.items()}

In [16]:
from custom_dataset import GithubDataset

# Wrap the whole DataFrame in the project's dataset class. This is the full
# dataset; it is divided into train and test subsets in a later cell.
full_dataset = GithubDataset(df)
full_dataset

<custom_dataset.GithubDataset at 0x226960ad5d0>

#### Splitting the Dataset into Train and Evaluation Sets

In [23]:
from torch.utils.data import random_split

# Fraction of the examples used for training; the remainder is held out.
TRAIN_FRACTION = 0.75
# Fixed seed so the same train/test partition is produced on every run.
SPLIT_SEED = 42

TRAIN_DATA_SIZE = int(len(full_dataset) * TRAIN_FRACTION)
# Take the test size as the remainder instead of truncating a second product:
# int(n * 0.75) + int(n * 0.25) can be smaller than n (e.g. n = 10 -> 7 + 2),
# and random_split rejects lengths that do not sum to the dataset length.
TEST_DATA_SIZE = len(full_dataset) - TRAIN_DATA_SIZE

train_dataset, test_dataset = random_split(
  full_dataset,
  [TRAIN_DATA_SIZE, TEST_DATA_SIZE],
  generator=torch.Generator().manual_seed(SPLIT_SEED),
)

## Experimenting With Different Model Architectures

#### COMMENT - IDEAS
We should probably:
- retrain the whole model from scratch (probably a smaller one), together with
- a better tokenizer, built from the ground up so that its vocabulary includes the names of specific technologies (languages, frameworks, IDEs, etc.)

### Setting Tokenizer

In [7]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded.
TOKENIZER_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

### Preparing Metrics

In [8]:
# Load the "accuracy" metric through the `evaluate` library; compute_metrics
# calls its .compute() method.
accuracy_metric = evaluate.load("accuracy")

In [9]:
def compute_metrics(eval_pred):
  """Score a batch of predictions with the module-level accuracy metric.

  Args:
    eval_pred: A two-element iterable that unpacks into (predictions, labels).
      The predictions are reduced with argmax along axis 1, so they are
      presumably per-class scores of shape (n_examples, n_classes) -- TODO
      confirm against the model that is eventually passed to the Trainer.

  Returns:
    Whatever ``accuracy_metric.compute`` returns for the predicted class
    indices and the reference labels.
  """
  scores, references = eval_pred
  # Index of the highest score in each row is taken as the predicted class.
  predicted_classes = np.argmax(scores, axis=1)
  return accuracy_metric.compute(predictions=predicted_classes, references=references)

### Creating Models

In [10]:
# from transformers import DistilBertModel

# model_title = DistilBertModel.from_pretrained("distilbert-base-uncased")

In [11]:
from transformers import DistilBertModel

# Pretrained checkpoint used to initialise the model. Judging by the name,
# this is presumably the encoder for the post content (a separate "title"
# model is commented out in the previous cell) -- confirm.
CONTENT_MODEL_CHECKPOINT = "distilbert-base-uncased"
# NOTE(review): this loads the DistilBertModel class rather than a
# sequence-classification variant; confirm a classification head is added
# elsewhere before this model is handed to the Trainer.
model_content = DistilBertModel.from_pretrained(CONTENT_MODEL_CHECKPOINT)

### Creating Trainer

In [12]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

# Batch collator built from the tokenizer loaded earlier; it is passed to the
# (currently commented-out) Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training configuration for the (currently commented-out) Trainer below.
training_args = TrainingArguments(
  # Directory name for training outputs. NOTE(review): looks like a
  # placeholder name -- consider a project-specific one.
  output_dir="my_awesome_model",
  learning_rate=2e-5,
  # Same batch size for training and evaluation.
  per_device_train_batch_size=16,
  per_device_eval_batch_size=16,
  num_train_epochs=2,
  weight_decay=0.01,
  # Evaluate and save once per epoch, using the same strategy for both.
  # NOTE(review): load_best_model_at_end presumably relies on the two
  # strategies matching -- confirm against the TrainingArguments docs for
  # the installed transformers version.
  evaluation_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True,
  # NOTE(review): push_to_hub=True is set; presumably this uploads the model
  # to the Hugging Face Hub and needs configured credentials -- confirm this
  # is intended for an experimental notebook.
  push_to_hub=True,
)

# trainer = Trainer(
#   model=model,
#   args=training_args,
#   train_dataset=tokenized_data["train"],
#   eval_dataset=tokenized_data["test"],
#   tokenizer=tokenizer,
#   data_collator=data_collator,
#   compute_metrics=compute_metrics,
# )

# trainer.train()