# CAPP 30255 Final Project
## BERT Model
### Piper Kurtz, Wesley Janson, Sam Pavlekovsky

In [33]:
#Installations in case not there
! pip install transformers datasets evaluate
! pip uninstall -y transformers accelerate
! pip install transformers accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Found existing installation: transformers 4.29.2
Uninstalling transformers-4.29.2:
  Successfully uninstalled transformers-4.29.2
Found existing installation: accelerate 0.19.0
Uninstalling accelerate-0.19.0:
  Successfully uninstalled accelerate-0.19.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Using cached transformers-4.29.2-py3-none-any.whl (7.1 MB)
Collecting accelerate
  Using cached accelerate-0.19.0-py3-none-any.whl (219 kB)
Installing collected packages: transformers, accelerate
Successfully installed accelerate-0.19.0 transformers-4.29.2


In [34]:
from transformers import AutoTokenizer # For BERT base
import pandas as pd
from datasets import Dataset # For creating Huggingface style object
import numpy as np
from transformers import DataCollatorWithPadding # For DataCollator
import evaluate # Accuracy metrics
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer # For Training etc.

In [None]:
# IF running on Colab (Wesley)
from google.colab import drive 
drive.mount('/content/gdrive')
PATH = "gdrive/MyDrive/Colab Notebooks/"

In [35]:
#Download BERT base
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [37]:
def load_data(csv):
  # Reads the raw csv file and split into
  # sentences (x) and target (y)
  df = pd.read_csv(csv)  
  return df

data = load_data(f'{PATH}/data.csv')
data=data.rename(columns = {'type':'labels'})
data = data[0:100000]
print(data)

                                                     url  \
0      https://en.wikipedia.org/wiki/Sexhow%20railway...   
1            https://en.wikipedia.org/wiki/Eti%C3%A4inen   
2      https://en.wikipedia.org/wiki/Inverse%20functi...   
3      https://en.wikipedia.org/wiki/Stepping%20on%20...   
4            https://en.wikipedia.org/wiki/Rob%20Bradley   
...                                                  ...   
99995         https://en.wikipedia.org/wiki/Hero%20Squad   
99996  https://en.wikipedia.org/wiki/The%20Regulators...   
99997  https://en.wikipedia.org/wiki/Caledonian%20Estate   
99998  https://en.wikipedia.org/wiki/Course%20%28orie...   
99999  https://en.wikipedia.org/wiki/Table%20Bluff%20...   

                          title  \
0        Sexhow railway station   
1                      Etiäinen   
2      Inverse function theorem   
3             Stepping on Roses   
4                   Rob Bradley   
...                         ...   
99995                Hero Squad   

In [38]:
#Tokenize inputs
def preprocess_function(examples):
    return tokenizer(examples["intro"], truncation=True)

In [39]:
#Create huggingface dataset style object
train, validate, test = \
              np.split(data.sample(frac=1, random_state=42), 
                       [int(.6*len(data)), int(.8*len(data))])
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)
tokenized_df_train = train.map(preprocess_function, batched=True)
tokenized_df_test = test.map(preprocess_function, batched=True)

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [40]:
#Create data collator for feeding into model
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [41]:
#Create generalized accuracy metric from huggingface
accuracy = evaluate.load("accuracy")

In [42]:
#Use general accuracy function to create specific accuracy metric comparing output to true labell

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [43]:
#Establish label dictionaries 
id2label = {0: "HUMAN", 1: "MACHINE"}
label2id = {"HUMAN": 0, "MACHINE": 1}

In [44]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)
training_args = TrainingArguments(
    output_dir="output",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
   # label_names = "labels",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=False,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_df_train,
    eval_dataset=tokenized_df_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

train_output = trainer.train()
train_output

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0,0.0,1.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0
5,0.0,0.0,1.0


TrainOutput(global_step=18750, training_loss=0.0003048582391367821, metrics={'train_runtime': 8465.1912, 'train_samples_per_second': 35.439, 'train_steps_per_second': 2.215, 'total_flos': 2.321696485914317e+16, 'train_loss': 0.0003048582391367821, 'epoch': 5.0})

In [45]:
# get the training and validation loss and accuracy values from the trainer object
train_losses = trainer.history['train_loss']
train_accs = trainer.history['train_acc']
val_losses = trainer.history['eval_loss']
val_accs = trainer.history['eval_acc']

# create a TrainingArguments object and use it to plot the metrics
training_args = TrainingArguments(output_dir='./results')
training_args.plotting_args = {'title': 'Training and Validation loss/accuracy'}

TrainingArguments.plot(
    [train_losses, val_losses],
    xaxis="epoch",
    yaxis="loss",
    metric_name="Loss",
)

TrainingArguments.plot(
    [train_accs, val_accs],
    xaxis="epoch",
    yaxis="accuracy",
    metric_name="Accuracy",
)

In [None]:
train_output['metrics']