# Connect Notebook to Google Drive


In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
from google.colab import drive

In [3]:
drive.mount('/content/gdrive')
root_path = "/content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Install/Import packages 

In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 27.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 56.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 63.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 7.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalli

In [6]:
from transformers import BertTokenizer
import torch
import numpy as np
import pandas as pd

# Read in Dataset

In [7]:
df_raw = pd.read_csv("/content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/data/raw/okcupid_profiles.csv")
df_raw.head(5)
df = df_raw[["sex", "essay0"]]

In [8]:
df_raw.shape #59946, 31)
#df.groupby(["sex"]).size().plot.bar()

(59946, 31)

In [9]:
df_raw.head(2)

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,about me: i would love to think that i was so...,currently working as an international agent fo...,making people laugh. ranting about a good salt...,"the way i look. i am a six foot half asian, ha...","books: absurdistan, the republic, of mice and ...",food. water. cell phone. shelter.,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet! you are ti...
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,...,i am a chef: this is what that means. 1. i am ...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories. my b...,,,i am very open and will share just about anyth...,


# Feature Extraction: Get Word Embedding Vector with BERT 

Tutorial: https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f

https://huggingface.co/docs/transformers/tasks/sequence_classification

BERT input Variables: 
* input_ids: id representation of each token (When decoded: "[CLS] text [SEP] [PAD]..."
* token_typ_ids: Binary mask that identifies in which sequence a token belongs, for a single sequence all token type ids are 0
* attention_mask: Binary mask that identifies whether a token is a real word or just padding




## Preprocess Data

In [None]:
! pip install datasets
from datasets import load_dataset
imdb = load_dataset("imdb")

In [None]:
#Filter out NAs of essay0 (about me in profile text)
df = df.dropna(subset =  ["essay0"])
len(df["essay0"]) #54458, before: 59946

In [27]:
df.head(2)

Unnamed: 0,sex,essay0,female,label
0,m,about me: i would love to think that i was so...,0,0
1,m,i am a chef: this is what that means. 1. i am ...,0,0


In [12]:
#make sex a binary variable 
df['female'] = np.where(df['sex']== 'f', 1, 0) #female = 1, male = 0

In [13]:
#split in train and test data
training_data = df.sample(frac=0.8, random_state=25)
testing_data = df.drop(training_data.index)

In [14]:
train_df = pd.DataFrame({
     "label" : training_data["female"],
     "text" : training_data["essay0"]
})

In [15]:
test_df = pd.DataFrame({
     "label" : testing_data["female"],
     "text" : testing_data["essay0"]
})

In [16]:
test_df.head(2)

Unnamed: 0,label,text
25,0,"hey to all, hope all is well and your having a..."
27,0,"i suck at these things, but here it goes. i'm ..."


In [17]:
train_df.head(2)

Unnamed: 0,label,text
51487,1,hey hey what's goin on everyone? i'm 22 and li...
18,0,some of my favorite things: riding my motorcyc...


In [18]:
#We need this dataset structure
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [19]:
import datasets
from datasets import Dataset

In [20]:
train_dataset = Dataset.from_dict(train_df)
test_dataset = Dataset.from_dict(test_df)
my_dataset_dict = datasets.DatasetDict({"train":train_dataset,"test":test_dataset})

In [21]:
my_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 43566
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 10892
    })
})

In [22]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [23]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [24]:
#Tokenize
tokenized_df = my_dataset_dict.map(preprocess_function, batched=True)



  0%|          | 0/44 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [25]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [26]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2) #2 labels, because female and male 

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

1. Define your training hyperparameters in TrainingArguments.
2. Pass the training arguments to Trainer along with the model, dataset, tokenizer, and data collator.
3. Call train() to fine-tune your model.

1. Define Training Hyperparameters in TrainingArguments
-> Includes all attributes to customize the training 

In [27]:
training_args = TrainingArguments(
    output_dir="./results", #save it in the folder results
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1, #we start with 1 training epoch
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_df["train"],
    eval_dataset=tokenized_df["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [28]:
trainer.train()

***** Running training *****
  Num examples = 43566
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2723
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.


Step,Training Loss
500,0.5335
1000,0.4696
1500,0.4565
2000,0.444
2500,0.4326


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1500/special_toke

TrainOutput(global_step=2723, training_loss=0.4638698940718389, metrics={'train_runtime': 1730.8513, 'train_samples_per_second': 25.17, 'train_steps_per_second': 1.573, 'total_flos': 4793696070812112.0, 'train_loss': 0.4638698940718389, 'epoch': 1.0})

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

#Dictionary that maps the category in the dataframe into the id representation of our label 
labels = {"f": 0,          
          "m": 1}

In [None]:
df["essay0"] = df["essay0"].to_string(index = False)

#print(tokenizer(text, padding = "max_length", max_length = 512, truncation = True, return_tensors = "pt"))

In [None]:
#df['female'] = np.where(df['sex'] == "f", 1, 0)
#print(df["female"])
#df['male'] = np.where(df['sex'] == "m", 1, 0)
#print(df["male"])

In [None]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['sex']] 
        #Call tokenizer to transform input texts into the format BERT expects 
        self.texts = [tokenizer(text,
                               padding='max_length', max_length = 512, truncation=True, #512 is maximum length for tokens in 1 sequence
                                return_tensors="pt") for text in df['essay0']] #pt for pytorch

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [None]:
#my = Dataset(df)

In [None]:
#split data into training, test and validation
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                                         [int(.8 * len(df)), int(.9 * len(df))])

print(len(df_train), len(df_val), len(df_test))

800 100 100


In [None]:
df_train["sex"].shape #43566 

(800,)

In [None]:
df_train["essay0"].shape #43566

(800,)

In [None]:
#BUILD MODEL

from torch import nn
from transformers import BertModel

In [None]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1) #input Word embedding vector of 768, output: 1 (male/female) QUESTION: or is output 2, because 2 labels? Do I have to change sex in 2 variables both hot encoded?
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

#TRAINING

from torch.optim import Adam
from tqdm import tqdm


def train(model, train_data, val_data, learning_rate, epochs):
    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.BCEWithLogitsLoss() #Changed loss function because binary, before multiclass: CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')


EPOCHS = 1
model = BertClassifier()
LR = 1e-6

train(model, df_train, df_val, LR, EPOCHS)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/400 [00:05<?, ?it/s]


ValueError: ignored

In [None]:
#QUESTION:  Target size (torch.Size([2])) must be the same as input size (torch.Size([2, 1])) -> How to have it the same? 

## Evaluate Model on Test Data

# Predict sex with topic probability vector from BERTTopic

In [None]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch import nn
import matplotlib as plt

1. Load processed data

In [None]:
#1. Target vector Y (sex)
df_topic_sex = pd.read_csv("/content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/data/processed/df_topic_sex.csv")

#2. Feature Vector X (topic probabilities)
probs_topic_df = pd.read_csv("/content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/data/processed/probs_topic_df.csv")

In [None]:
df_topic_sex

Unnamed: 0,Profile_text,most_probable_topic,Sex,GenderDummy_F
0,me: would love think kind intellectual: either...,-1,m,0
1,chef: means. 1. workaholic. 2. love cook regar...,126,m,0
2,"i'm ashamed much, writing public text online d...",-1,m,0
3,work library go school. . .,-1,m,0
4,"hey how's going? currently vague profile know,...",-1,m,0
...,...,...,...,...
54453,"vibrant, expressive, caring optimist. love peo...",-1,f,1
54454,i'm nick. never know write myself. i'm sure ha...,-1,m,0
54455,"hello! enjoy traveling, watching movies, hangi...",4,m,0
54456,"""all world balls integrity one take either awa...",-1,m,0


2. Do a train/test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(probs_topic_df, df_topic_sex["GenderDummy_F"], test_size=0.33, random_state=42) #random state to make it reproducible

In [None]:
y_train

48956    1
44255    1
54302    1
8892     1
30910    1
        ..
44732    0
54343    1
38158    1
860      0
15795    0
Name: GenderDummy_F, Length: 36486, dtype: int64

In [None]:
X_train
#we have 231 columns topics and 36486 user profile texts 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,221,222,223,224,225,226,227,228,229,230
48956,0.006437,0.002602,0.001857,0.002368,0.017177,0.005662,0.003787,0.002911,0.003560,0.002875,...,0.003777,0.009289,0.002541,0.004394,0.003110,0.003816,0.004816,0.003605,0.002723,0.003906
44255,0.015904,0.002583,0.002069,0.003215,0.004630,0.003914,0.003355,0.004172,0.001971,0.002839,...,0.014646,0.006029,0.004037,0.002470,0.006432,0.003507,0.005467,0.004518,0.003412,0.002078
54302,0.000468,0.000296,0.000265,0.000411,0.000379,0.001118,0.000764,0.000876,0.000182,0.000621,...,0.000442,0.000494,0.000305,0.000229,0.000582,0.000723,0.000663,0.000308,0.000540,0.000192
8892,0.002839,0.001696,0.001276,0.001463,0.005396,0.002859,0.002215,0.001690,0.013213,0.001648,...,0.002141,0.002678,0.001651,0.007751,0.001759,0.002064,0.002256,0.002172,0.001621,0.014132
30910,0.002335,0.001416,0.001050,0.001219,0.004245,0.002375,0.001846,0.001409,0.011326,0.001357,...,0.001797,0.002178,0.001390,0.007189,0.001471,0.001711,0.001872,0.001824,0.001355,0.011834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44732,0.001199,0.001421,0.001442,0.002190,0.001068,0.002222,0.001859,0.003232,0.000629,0.004768,...,0.001289,0.001310,0.001214,0.000742,0.001828,0.002096,0.001453,0.001090,0.002358,0.000656
54343,0.002473,0.002483,0.002200,0.003310,0.002241,0.004085,0.003064,0.004061,0.001166,0.011586,...,0.002399,0.003040,0.002110,0.001336,0.002960,0.004754,0.002766,0.002149,0.003610,0.001226
38158,0.002794,0.001788,0.001385,0.001576,0.004186,0.003187,0.002491,0.001842,0.013718,0.001734,...,0.002247,0.002524,0.001766,0.010930,0.001890,0.002174,0.002375,0.002191,0.001755,0.012003
860,0.111355,0.002536,0.002299,0.003165,0.009195,0.006392,0.004346,0.004104,0.002840,0.003422,...,0.006764,0.008418,0.003261,0.003478,0.004901,0.003860,0.006064,0.003771,0.003192,0.002986


### Convert X and y labels to numpy

In [None]:
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

In [None]:
X_test = X_test.to_numpy()
X_test = torch.from_numpy(X_test)

In [None]:
y_test = y_test.to_numpy()
y_test = torch.from_numpy(y_test)

### Make X and y labels tensors

In [None]:
X_train = torch.from_numpy(X_train)

In [None]:
X_train.shape

torch.Size([36486, 231])

In [None]:
y_train = torch.from_numpy(y_train)

In [None]:
type(y_train)

torch.Tensor

In [None]:
y_train.shape

torch.Size([36486])

In [None]:
X_train.shape, y_train.shape

(torch.Size([36486, 231]), torch.Size([36486]))

In [None]:
X_train[:5], y_train[:5]

(tensor([[0.0064, 0.0026, 0.0019,  ..., 0.0036, 0.0027, 0.0039],
         [0.0159, 0.0026, 0.0021,  ..., 0.0045, 0.0034, 0.0021],
         [0.0005, 0.0003, 0.0003,  ..., 0.0003, 0.0005, 0.0002],
         [0.0028, 0.0017, 0.0013,  ..., 0.0022, 0.0016, 0.0141],
         [0.0023, 0.0014, 0.0010,  ..., 0.0018, 0.0014, 0.0118]],
        dtype=torch.float64), tensor([1, 1, 1, 1, 1]))

In [None]:
import torch
import torch.nn as nn

## Pytorch Workflow

## 2. Create a model (input, output size, forward pass)

In [None]:
# Make device agnostic code
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [None]:
# 1. Construct a model class that subclasses nn.Module
class NeuralNetwork_binary(nn.Module):
    def __init__(self):
        super().__init__()
        # 2. Create 2 nn.Linear layers capable of handling X and y input and output shapes
        self.layer_1 = nn.Linear(in_features=231, out_features=500) # takes in 231 features (X), produces 500 features QUESTION: How many output features here (meaning how many hidden layers?)
        self.layer_2 = nn.Linear(in_features=500, out_features=500)
        self.layer_3 = nn.Linear(in_features=500, out_features=1) # takes in 500 features, produces 1 feature (y)
        self.relu = nn.ReLU() # <- add in ReLU activation function

    # 3. Define a forward method containing the forward pass computation
    def forward(self, x):
        # Return the output of layer_2, a single feature, the same shape as y
        return self.layer_3(self.relu(self.layer_2(self.relu(self.layer_1(x))))) # computation goes through layer_1 first then the output of layer_1 goes through layer_2

In [None]:
# 4. Create an instance of the model and send it to target device
model_0 = NeuralNetwork_binary().to(device)
model_0

NeuralNetwork_binary(
  (layer_1): Linear(in_features=231, out_features=500, bias=True)
  (layer_2): Linear(in_features=500, out_features=500, bias=True)
  (layer_3): Linear(in_features=500, out_features=1, bias=True)
  (relu): ReLU()
)

2.) Construct loss and optimizer
Iterate this:
3.) Training Loop:
    - forward pass: compute prediction
    - backward pass: gradients
    - Update weights

## Define a Loss Function and Optimizer
Because we have a binary classification problem: Use binary cross entropy as loss function
We use Stochastic Gradient Descent as optimizer

In [None]:
# Create a loss function
# loss_fn = nn.BCELoss() # BCELoss = no sigmoid built-in
loss_fn = nn.BCEWithLogitsLoss()
# Create an optimizer
optimizer = torch.optim.SGD(params=model_0.parameters(),
                            lr=0.1)

## Define a function for calculating accuracy as evaluation metric

In [None]:
# Calculate accuracy (a classification metric)
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item() # torch.eq() calculates where two tensors are equal
    acc = (correct / len(y_pred)) * 100
    return acc

## Training the model
1. Forward Pass: Model goes through all of the training data once
2. Calculate the Loss
3. Set optimizer gradients to zero
4. Perform backpropagation on the Loss
5. Update the parameters with gradient descent

In [None]:
torch.manual_seed(42)

# Set the number of epochs
epochs = 100

# Put data to target device TODO: What does that mean?
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

In [None]:
# Build training and evaluation loop
for epoch in range(epochs):
    ### Training
    model_0.train()

    # 1. Forward pass (model outputs raw logits)
    y_logits = model_0(X_train.float()).squeeze() # squeeze to remove extra `1` dimensions, this won't work unless model and data are on same device
    y_pred = torch.round(torch.sigmoid(y_logits)) # turn logits -> pred probs -> pred labls

    # 2. Calculate loss/accuracy
    loss = loss_fn(y_logits,
                   y_train.float())
    acc = accuracy_fn(y_true=y_train.float(),
                      y_pred=y_pred)

    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Loss backwards
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    ### Testing
    model_0.eval()
    with torch.inference_mode():
        # 1. Forward pass
        test_logits = model_0(X_test.float()).squeeze()
        test_pred = torch.round(torch.sigmoid(test_logits))
        # 2. Caculate loss/accuracy
        test_loss = loss_fn(test_logits,
                            y_test.float())
        test_acc = accuracy_fn(y_true=y_test.float(),
                               y_pred=test_pred)

    # Print out what's happening every 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Accuracy: {acc:.2f}% | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")

Epoch: 0 | Loss: 0.69101, Accuracy: 59.85% | Test loss: 0.68970, Test acc: 60.15%
Epoch: 10 | Loss: 0.68218, Accuracy: 59.85% | Test loss: 0.68116, Test acc: 60.15%
Epoch: 20 | Loss: 0.67785, Accuracy: 59.85% | Test loss: 0.67690, Test acc: 60.15%
Epoch: 30 | Loss: 0.67570, Accuracy: 59.85% | Test loss: 0.67475, Test acc: 60.15%
Epoch: 40 | Loss: 0.67464, Accuracy: 59.85% | Test loss: 0.67365, Test acc: 60.15%
Epoch: 50 | Loss: 0.67412, Accuracy: 59.85% | Test loss: 0.67308, Test acc: 60.15%
Epoch: 60 | Loss: 0.67386, Accuracy: 59.85% | Test loss: 0.67279, Test acc: 60.15%
Epoch: 70 | Loss: 0.67373, Accuracy: 59.85% | Test loss: 0.67263, Test acc: 60.15%
Epoch: 80 | Loss: 0.67367, Accuracy: 59.85% | Test loss: 0.67254, Test acc: 60.15%
Epoch: 90 | Loss: 0.67364, Accuracy: 59.85% | Test loss: 0.67249, Test acc: 60.15%


In [None]:
#QUESTION: How to improve accuracy? Parameter optimization with Grid Search? How to decide on how many layers to have? 