# Machine Learning Project - Inappropriate Language Classification - DistilBert

The goal here is to base the classification model on DistilBERT, using Hugging Face's Transformers library to train the model.

## 1. Data processing

1. Get the data
2. Tokenize the data
3. Make the datasets compatible with Hugging Face

In [None]:
from experiment_baseplate import load_split_data

# Unpack the six values returned by the project helper: inputs and labels
# for the train, validation and test splits (in that order).
(
    X_train, y_train,
    X_validate, y_validate,
    X_test, y_test,
) = load_split_data()

In [None]:
from transformers import DistilBertTokenizerFast

# Load the fast tokenizer that ships with the pretrained
# 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased',
)

In [None]:
# Replace each split's raw inputs with its tokenizer encodings.
# Each split is converted to a plain Python list first; sequences are
# truncated and padded by the tokenizer (padding is applied per call, i.e.
# per split).
X_train = tokenizer(
    X_train.tolist(),
    padding=True,
    truncation=True,
)
X_validate = tokenizer(
    X_validate.tolist(),
    padding=True,
    truncation=True,
)
X_test = tokenizer(
    X_test.tolist(),
    padding=True,
    truncation=True,
)

In [None]:
import torch

class FoulDataset(torch.utils.data.Dataset):
    """Pair tokenizer encodings with labels as a PyTorch dataset.

    Each sample is a dict holding one tensor per encoding field plus a
    ``'labels'`` tensor, built on demand in ``__getitem__``.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels without copying them.

        Args:
            encodings: Mapping from field name to a sequence indexable by
                sample position (``.items()`` is called on it).
            labels: Sequence of per-sample labels; its length defines the
                dataset size.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the sample at position ``idx`` as a dict of tensors."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label goes in last, under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
import numpy as np

# Wrap each split in a FoulDataset. The label arrays are reduced to class
# indices by taking the argmax along axis 1
# (presumably the labels are one-hot encoded -- confirm against load_split_data).
train_dataset = FoulDataset(
    encodings=X_train,
    labels=np.argmax(y_train, axis=1),
)
val_dataset = FoulDataset(
    encodings=X_validate,
    labels=np.argmax(y_validate, axis=1),
)
test_dataset = FoulDataset(
    encodings=X_test,
    labels=np.argmax(y_test, axis=1),
)

## 2. Make model

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

# Load the pretrained DistilBERT checkpoint with a 2-class classification head.
# The checkpoint must match the one the tokenizer was loaded from
# ('distilbert-base-uncased'): token ids from one checkpoint's vocabulary are
# not meaningful to a model pretrained with a different vocabulary
# (the previous "bert-base-cased" checkpoint did not match the tokenizer).
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

## 3. Train model

In [None]:
# Hyperparameters: only the output directory is set explicitly; every other
# training argument keeps the library default.
training_args = TrainingArguments(output_dir="bert-out")

# Bind the model, its training arguments, and the train/validation datasets
# into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Start fine-tuning `model` on `train_dataset` using the `trainer` built in
# the previous cell.
trainer.train()