# Connect Notebook to Google Drive


In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
root_path = "/content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien"

# Install/Import packages 

In [None]:
!pip install transformers

In [None]:
!pip install datasets

  Attempting uninstall: multiprocess
    Found existing installation: multiprocess 0.70.13
    Uninstalling multiprocess-0.70.13:
      Successfully uninstalled multiprocess-0.70.13
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.2 asynctest-0.13.0 datasets-2.2.2 dill-0.3.4 frozenlist-1.3.0 fsspec-2022.5.0 multidict-6.0.2 multiprocess-0.70.12.2 responses-0.18.0 urllib3-1.25.11 xxhash-3.0.0 yarl-1.7.2


In [None]:
from transformers import BertTokenizer
import torch
import numpy as np
import pandas as pd
import datasets
from datasets import Dataset
from datasets import load_metric

# Read in Dataset

In [None]:
# Load the raw OkCupid profile export from Google Drive.
df_raw = pd.read_csv("/content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/data/raw/okcupid_profiles.csv")

# Keep only the target (sex) and the first essay. The explicit copy makes `df`
# independent of `df_raw`, so columns added to `df` later never touch the raw frame.
df = df_raw[["sex", "essay0"]].copy()

In [None]:
df_raw.shape #59946, 31)
#df.groupby(["sex"]).size().plot.bar()

(59946, 31)

In [None]:
df_raw.head(2)

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,about me: i would love to think that i was so...,currently working as an international agent fo...,making people laugh. ranting about a good salt...,"the way i look. i am a six foot half asian, ha...","books: absurdistan, the republic, of mice and ...",food. water. cell phone. shelter.,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet! you are ti...
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,...,i am a chef: this is what that means. 1. i am ...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories. my b...,,,i am very open and will share just about anyth...,


# Use BERT to predict text classification (female or male) 

Tutorials: https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f

https://huggingface.co/docs/transformers/tasks/sequence_classification

https://www.google.com/search?q=transfomrer+trainer.train+see+on+one+example&rlz=1C1CHBF_deDE761DE761&oq=transfomrer+trainer.train+see+on+one+example&aqs=chrome..69i57j33i10i160.9172j0j4&sourceid=chrome&ie=UTF-8#kpvalbx=_-AOaYvv9EMfasAeG0ovwDA15

BERT input Variables: 
* input_ids: id representation of each token (when decoded: "[CLS] text [SEP] [PAD]...")
* token_type_ids: Binary mask that identifies which sequence a token belongs to; for a single sequence all token type ids are 0
* attention_mask: Binary mask that identifies whether a token is a real word or just padding




## Preprocess Data

In [None]:
! pip install datasets
#from datasets import load_dataset 
#imdb = load_dataset("imdb") #was used to see how our dataformat needs to look like

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#Filter out NAs of essay0 (about me in profile text)
df = df.dropna(subset =  ["essay0"])
len(df["essay0"]) #54458, before: 59946

54458

In [None]:
# Encode sex as a binary target: female = 1, male = 0.
# `assign` builds a new frame instead of writing a column into the filtered slice,
# which avoids pandas' SettingWithCopyWarning.
df = df.assign(female=np.where(df["sex"] == "f", 1, 0))

In [None]:
df.head(2)

Unnamed: 0,sex,essay0,female
0,m,about me: i would love to think that i was so...,0
1,m,i am a chef: this is what that means. 1. i am ...,0


In [None]:
# Three-way split with a fixed seed: 70% train, 15% test, 15% eval.
split_seed = 25
training_data = df.sample(frac=0.7, random_state=split_seed)  # 38,121 rows

# Everything not drawn for training (30%) is halved into the test and eval sets.
testing_and_eval_data = df.drop(index=training_data.index)
testing_data = testing_and_eval_data.sample(frac=0.5, random_state=split_seed)  # 8168 rows
evaluation_data = testing_and_eval_data.drop(index=testing_data.index)  # 8169 rows

In [None]:
# Training split reduced to the two columns used downstream: `label` and `text`.
train_df = training_data[["female", "essay0"]].rename(
    columns={"female": "label", "essay0": "text"}
)

In [None]:
# Test split reduced to the two columns used downstream: `label` and `text`.
test_df = testing_data[["female", "essay0"]].rename(
    columns={"female": "label", "essay0": "text"}
)

In [None]:
# Eval split reduced to the two columns used downstream: `label` and `text`.
eval_df = evaluation_data[["female", "essay0"]].rename(
    columns={"female": "label", "essay0": "text"}
)

In [None]:
test_df.head(2)
train_df.head(2)
eval_df.head(2)

Unnamed: 0,label,text
57,0,"i grew up in iowa. it gets a bad rap, but let ..."
65,0,i really like meeting new people. small-world ...


In [None]:
# Wrap each split in a Hugging Face Dataset, then collect them under named keys.
train_dataset, test_dataset, eval_dataset = (
    Dataset.from_dict(split_frame) for split_frame in (train_df, test_df, eval_df)
)
dataset_dict = datasets.DatasetDict(
    {"train": train_dataset, "test": test_dataset, "eval": eval_dataset}
)

In [None]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 38121
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 8168
    })
    eval: Dataset({
        features: ['label', 'text'],
        num_rows: 8169
    })
})

## Tokenize the datasets 

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch with the notebook-level ``tokenizer``.

    Truncation is enabled; no padding is requested here.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
#Tokenize
tokenized_df = dataset_dict.map(preprocess_function, batched=True)



  0%|          | 0/39 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

In [None]:
tokenized_df

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 38121
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 8168
    })
    eval: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 8169
    })
})

## Use padding to make sure all have the same length 

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Load the pre-trained model: AutoModelForSequenceClassification (for text classification)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)#2 labels, because female and male 

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifi

# Choose a metric

In [None]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score predictions with the notebook-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    arg-max over the last axis of the logits.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
metric_name = "accuracy"

**1. Define your training hyperparameters in TrainingArguments**

**2. Pass the training arguments to Trainer along with the model, dataset, tokenizer, and data collator**

**3. Call train() to fine-tune your model**

In [None]:
# Hyperparameters for fine-tuning; evaluation and checkpointing both happen once per epoch,
# and the checkpoint with the best `metric_name` score is reloaded when training ends.
training_args = TrainingArguments(
    output_dir="/content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results", #save model in my google drive
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1, 
    weight_decay=0.01,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name
    
)

#do the same for eval data
#look at the Trainer method: when is a batch passed in? how do we get the output?
#pass a batch to the model
#use 1 text example - if it works, take the whole eval dataset (for loop over all data, generate output, look at accuracy (e.g.))
#for tuning: use the test data; at the end, once trained: validate with the eval dataset (otherwise bias)

# Note: the Trainer evaluates on the "test" split during training; the "eval" split is
# held back for the final check further down in the notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_df["train"],
    eval_dataset=tokenized_df["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics = compute_metrics #use accuracy metrics defined above
)


In [None]:
trainer.train()

***** Running training *****
  Num examples = 38121
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2383
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.438,0.432907,0.777302


***** Running Evaluation *****
  Num examples = 8168
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
Saving model checkpoint to /content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results/checkpoint-2383
Configuration saved in /content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results/checkpoint-2383/config.json
Model weights saved in /content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results/checkpoint-2383/pytorch_model.bin
tokenizer config file saved in /content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results/checkpoint-2383/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results/check

TrainOutput(global_step=2383, training_loss=0.4675337278787899, metrics={'train_runtime': 1616.4705, 'train_samples_per_second': 23.583, 'train_steps_per_second': 1.474, 'total_flos': 4209711732759420.0, 'train_loss': 0.4675337278787899, 'epoch': 1.0})

In [None]:
#push model to hub 
model.push_to_hub("my-finetuned-bert")

Cloning https://huggingface.co/NorrisPau/my-finetuned-bert into local empty directory.
Configuration saved in my-finetuned-bert/config.json
Model weights saved in my-finetuned-bert/pytorch_model.bin


Upload file pytorch_model.bin:   0%|          | 3.34k/255M [00:00<?, ?B/s]

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/NorrisPau/my-finetuned-bert
   26647c8..0ea1190  main -> main



'https://huggingface.co/NorrisPau/my-finetuned-bert/commit/0ea11901f08fe59388287577fa7a22847040c517'

## Evaluate the Model 

To evaluate the model, we need to import a metric. We use accuracy 

In [None]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 8168
  Batch size = 16


{'epoch': 3.0,
 'eval_accuracy': 0.7857492654260528,
 'eval_loss': 0.44324198365211487,
 'eval_runtime': 121.3045,
 'eval_samples_per_second': 67.335,
 'eval_steps_per_second': 4.213}

# Reload saved model to run evaluation

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("/content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results/BERT_3 epochs/fine_tuned_BERT_predict_sex_model")

In [None]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [None]:
from transformers import pipeline
classifier = pipeline(task = "sentiment-analysis", model = model,
                      tokenizer = tokenizer)

Check out prediction for one example

In [None]:
classifier("Hi, I study Data Science") #Men

[{'label': 'LABEL_0', 'score': 0.6896018385887146}]

In [None]:
eval_test = eval_dataset["text"]

In [None]:
#to run classifier, we need to truncate the dataset strings to 512 characters 
#reason to do that can be read here: https://github.com/huggingface/transformers/issues/14183
eval_test = [elem[:512] for elem in eval_test]

In [None]:
# Also let the tokenizer cap every input at the model's maximum length: a string of
# 512 characters can still produce more tokens than the model accepts, which would raise.
predictions = classifier(eval_test, truncation=True)

In [None]:
predictions[0:4]

[{'label': 'LABEL_0', 'score': 0.9980983138084412},
 {'label': 'LABEL_0', 'score': 0.8562405109405518},
 {'label': 'LABEL_1', 'score': 0.8592649698257446},
 {'label': 'LABEL_1', 'score': 0.6359561681747437}]

In [None]:
predictions_df = pd.DataFrame(predictions, columns=['label', 'score'])

In [None]:
predictions_df.head()

Unnamed: 0,label,score,female
0,LABEL_0,0.998098,0
1,LABEL_0,0.856241,0
2,LABEL_1,0.859265,1
3,LABEL_1,0.635956,1
4,LABEL_0,0.933948,0


In [None]:
predictions_df["female"] = np.where(predictions_df["label"] == "LABEL_1",1,0)

In [None]:
# Destination for the eval-set predictions on Google Drive (single leading slash).
path = '/content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results/predictions_fine_tuned_model/predictions_eval_df.csv'

# Let pandas open and close the file itself; utf-8-sig prepends a byte-order mark.
predictions_df.to_csv(path, encoding='utf-8-sig')

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
y_pred = predictions_df["female"]
y_true = eval_df["label"]

In [None]:
accuracy_score(y_true, y_pred)

0.770106500183621

In [None]:
text_eval = list(eval_df["text"])

In [None]:
import pandas as pd

In [None]:
# Assign by position, not by index label: `eval_df` still carries the row labels of the
# original profile table while `predictions_df` is numbered from 0, so a label-aligned
# assignment pairs predictions with the wrong rows and leaves NaN where no label matches.
# `to_numpy()` also raises if the two frames do not have the same number of rows.
eval_df["predicted_label"] = predictions_df["female"].to_numpy()

In [None]:
predictions_df

Unnamed: 0,label,score,female
0,LABEL_0,0.998098,0
1,LABEL_0,0.856241,0
2,LABEL_1,0.859265,1
3,LABEL_1,0.635956,1
4,LABEL_0,0.933948,0
...,...,...,...
8164,LABEL_0,0.981501,0
8165,LABEL_1,0.897249,1
8166,LABEL_1,0.996698,1
8167,LABEL_0,0.900165,0


In [None]:
eval_df.loc[eval_df["label"] != eval_df["predicted_label"]]

Unnamed: 0,label,text,predicted_label
65,0,i really like meeting new people. small-world ...,1.0
83,1,"i love many things, laughing however is at the...",0.0
98,1,one day i will mod r/hotchickswithspreadsheets...,0.0
101,0,from san diego to the bay six years ago. gave ...,1.0
108,1,i pride myself on having fun no matter where i...,0.0
...,...,...,...
59907,0,"my name is peter. i'm an oakland native, and i...",
59913,1,i have lived in sf off and on for 7.5 years no...,
59922,1,just kind of a silly girl. super geek. ultra n...,
59931,0,"born in southern india, grew up in dubai, live...",


# Predict sex with topic probability vector from BERTopic

In [None]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch import nn
# The plotting API (plt.plot, plt.subplots, ...) lives in the pyplot submodule,
# not in the top-level matplotlib package.
import matplotlib.pyplot as plt

1. Load processed data

In [None]:
#1. Target vector Y (sex)
df_topic_sex = pd.read_csv("/content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/data/processed/df_topic_sex.csv")

#2. Feature Vector X (topic probabilities)
probs_topic_df = pd.read_csv("/content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/data/processed/probs_topic_df.csv")

In [None]:
df_topic_sex

Unnamed: 0,Profile_text,most_probable_topic,Sex,GenderDummy_F
0,me: would love think kind intellectual: either...,-1,m,0
1,chef: means. 1. workaholic. 2. love cook regar...,126,m,0
2,"i'm ashamed much, writing public text online d...",-1,m,0
3,work library go school. . .,-1,m,0
4,"hey how's going? currently vague profile know,...",-1,m,0
...,...,...,...,...
54453,"vibrant, expressive, caring optimist. love peo...",-1,f,1
54454,i'm nick. never know write myself. i'm sure ha...,-1,m,0
54455,"hello! enjoy traveling, watching movies, hangi...",4,m,0
54456,"""all world balls integrity one take either awa...",-1,m,0


2. Do a train/test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(probs_topic_df, df_topic_sex["GenderDummy_F"], test_size=0.33, random_state=42) #random state to make it reproducible

In [None]:
y_train

48956    1
44255    1
54302    1
8892     1
30910    1
        ..
44732    0
54343    1
38158    1
860      0
15795    0
Name: GenderDummy_F, Length: 36486, dtype: int64

In [None]:
X_train
#we have 231 columns topics and 36486 user profile texts 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,221,222,223,224,225,226,227,228,229,230
48956,0.006437,0.002602,0.001857,0.002368,0.017177,0.005662,0.003787,0.002911,0.003560,0.002875,...,0.003777,0.009289,0.002541,0.004394,0.003110,0.003816,0.004816,0.003605,0.002723,0.003906
44255,0.015904,0.002583,0.002069,0.003215,0.004630,0.003914,0.003355,0.004172,0.001971,0.002839,...,0.014646,0.006029,0.004037,0.002470,0.006432,0.003507,0.005467,0.004518,0.003412,0.002078
54302,0.000468,0.000296,0.000265,0.000411,0.000379,0.001118,0.000764,0.000876,0.000182,0.000621,...,0.000442,0.000494,0.000305,0.000229,0.000582,0.000723,0.000663,0.000308,0.000540,0.000192
8892,0.002839,0.001696,0.001276,0.001463,0.005396,0.002859,0.002215,0.001690,0.013213,0.001648,...,0.002141,0.002678,0.001651,0.007751,0.001759,0.002064,0.002256,0.002172,0.001621,0.014132
30910,0.002335,0.001416,0.001050,0.001219,0.004245,0.002375,0.001846,0.001409,0.011326,0.001357,...,0.001797,0.002178,0.001390,0.007189,0.001471,0.001711,0.001872,0.001824,0.001355,0.011834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44732,0.001199,0.001421,0.001442,0.002190,0.001068,0.002222,0.001859,0.003232,0.000629,0.004768,...,0.001289,0.001310,0.001214,0.000742,0.001828,0.002096,0.001453,0.001090,0.002358,0.000656
54343,0.002473,0.002483,0.002200,0.003310,0.002241,0.004085,0.003064,0.004061,0.001166,0.011586,...,0.002399,0.003040,0.002110,0.001336,0.002960,0.004754,0.002766,0.002149,0.003610,0.001226
38158,0.002794,0.001788,0.001385,0.001576,0.004186,0.003187,0.002491,0.001842,0.013718,0.001734,...,0.002247,0.002524,0.001766,0.010930,0.001890,0.002174,0.002375,0.002191,0.001755,0.012003
860,0.111355,0.002536,0.002299,0.003165,0.009195,0.006392,0.004346,0.004104,0.002840,0.003422,...,0.006764,0.008418,0.003261,0.003478,0.004901,0.003860,0.006064,0.003771,0.003192,0.002986


### Convert X and y labels to numpy

In [None]:
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

In [None]:
X_test = X_test.to_numpy()
X_test = torch.from_numpy(X_test)

In [None]:
y_test = y_test.to_numpy()
y_test = torch.from_numpy(y_test)

### Make X and y labels tensors

In [None]:
X_train = torch.from_numpy(X_train)

In [None]:
X_train.shape

torch.Size([36486, 231])

In [None]:
y_train = torch.from_numpy(y_train)

In [None]:
type(y_train)

torch.Tensor

In [None]:
y_train.shape

torch.Size([36486])

In [None]:
X_train.shape, y_train.shape

(torch.Size([36486, 231]), torch.Size([36486]))

In [None]:
X_train[:5], y_train[:5]

(tensor([[0.0064, 0.0026, 0.0019,  ..., 0.0036, 0.0027, 0.0039],
         [0.0159, 0.0026, 0.0021,  ..., 0.0045, 0.0034, 0.0021],
         [0.0005, 0.0003, 0.0003,  ..., 0.0003, 0.0005, 0.0002],
         [0.0028, 0.0017, 0.0013,  ..., 0.0022, 0.0016, 0.0141],
         [0.0023, 0.0014, 0.0010,  ..., 0.0018, 0.0014, 0.0118]],
        dtype=torch.float64), tensor([1, 1, 1, 1, 1]))

In [None]:
import torch
import torch.nn as nn

## Pytorch Workflow

## 2. Create a model (input, output size, forward pass)

In [None]:
# Make device agnostic code
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [None]:
# 1. Construct a model class that subclasses nn.Module
class NeuralNetwork_binary(nn.Module):
    """Fully connected binary classifier with two hidden ReLU layers.

    Maps ``in_features`` inputs through two hidden layers of ``hidden_features``
    units each to a single raw logit per sample. No sigmoid is applied inside the
    model, so it is meant to be paired with a logit-based loss such as
    ``nn.BCEWithLogitsLoss``.

    Args:
        in_features: Width of each input vector. Defaults to 231, the number of
            topic-probability columns used in this notebook.
        hidden_features: Number of units in each hidden layer. Defaults to 500.
    """

    def __init__(self, in_features=231, hidden_features=500):
        super().__init__()
        # 2. Three linear layers: input -> hidden -> hidden -> single logit
        self.layer_1 = nn.Linear(in_features=in_features, out_features=hidden_features)
        self.layer_2 = nn.Linear(in_features=hidden_features, out_features=hidden_features)
        self.layer_3 = nn.Linear(in_features=hidden_features, out_features=1)
        self.relu = nn.ReLU()  # non-linearity applied after each hidden layer

    # 3. Define a forward method containing the forward pass computation
    def forward(self, x):
        """Return the raw logits of ``layer_3``; the last dimension has size 1."""
        hidden = self.relu(self.layer_1(x))
        hidden = self.relu(self.layer_2(hidden))
        return self.layer_3(hidden)

In [None]:
# 4. Create an instance of the model and send it to target device
# Seed before construction: the layer weights are drawn randomly when the model is
# built, so seeding only before the training loop leaves the starting weights
# different on every run.
torch.manual_seed(42)
model_0 = NeuralNetwork_binary().to(device)
model_0

NeuralNetwork_binary(
  (layer_1): Linear(in_features=231, out_features=500, bias=True)
  (layer_2): Linear(in_features=500, out_features=500, bias=True)
  (layer_3): Linear(in_features=500, out_features=1, bias=True)
  (relu): ReLU()
)

2.) Construct loss and optimizer
Iterate this:
3.) Training Loop:
    - forward pass: compute prediction
    - backward pass: gradients
    - Update weights

## Define a Loss Function and Optimizer
Because we have a binary classification problem: Use binary cross entropy as loss function
We use Stochastic Gradient Descent as optimizer

In [None]:
# Create a loss function
# loss_fn = nn.BCELoss() # BCELoss = no sigmoid built-in
loss_fn = nn.BCEWithLogitsLoss()
# Create an optimizer
optimizer = torch.optim.SGD(params=model_0.parameters(),
                            lr=0.1)

## Define a function for calculating accuracy as evaluation metric

In [None]:
# Calculate accuracy (a classification metric)
def accuracy_fn(y_true, y_pred):
    """Return the percentage (0-100) of positions where ``y_pred`` equals ``y_true``.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted labels, compared element-wise with ``y_true``.

    Returns:
        Accuracy in percent as a float; ``0.0`` when ``y_pred`` is empty.
    """
    total = len(y_pred)
    if total == 0:
        # Nothing to score; avoids a ZeroDivisionError on empty input.
        return 0.0
    correct = torch.eq(y_true, y_pred).sum().item()  # number of matching positions
    return (correct / total) * 100

## Training the model
1. Forward Pass: Model goes through all of the training data once
2. Calculate the Loss
3. Set optimizer gradients to zero
4. Perform backpropagation on the Loss
5. Update the parameters with gradient descent

In [None]:
# Fix the RNG seed so repeated runs of the cells below are comparable.
torch.manual_seed(42)

# Number of full passes over the training data.
epochs = 100

# Move every tensor to the device chosen above ("cuda" if available, otherwise "cpu");
# the model and the data it is fed have to be on the same device.
X_train = X_train.to(device)
y_train = y_train.to(device)
X_test = X_test.to(device)
y_test = y_test.to(device)

In [None]:
# Build training and evaluation loop
# Cast once up front: the float32 copies of the data do not change between epochs,
# so there is no need to rebuild them on every iteration.
X_train_f32, y_train_f32 = X_train.float(), y_train.float()
X_test_f32, y_test_f32 = X_test.float(), y_test.float()

for epoch in range(epochs):
    ### Training
    model_0.train()

    # 1. Forward pass (model outputs raw logits of shape (N, 1)).
    # Squeeze only the trailing feature dimension so that a single-sample batch
    # keeps its batch dimension and still matches the shape of the labels.
    y_logits = model_0(X_train_f32).squeeze(dim=1)
    y_pred = torch.round(torch.sigmoid(y_logits))  # logits -> probabilities -> 0/1 labels

    # 2. Calculate loss/accuracy (the loss takes raw logits, the accuracy takes labels)
    loss = loss_fn(y_logits, y_train_f32)
    acc = accuracy_fn(y_true=y_train_f32, y_pred=y_pred)

    # 3. Reset gradients accumulated in the previous step
    optimizer.zero_grad()

    # 4. Backpropagate the loss
    loss.backward()

    # 5. Update the parameters
    optimizer.step()

    ### Testing
    model_0.eval()
    with torch.inference_mode():
        # 1. Forward pass
        test_logits = model_0(X_test_f32).squeeze(dim=1)
        test_pred = torch.round(torch.sigmoid(test_logits))
        # 2. Calculate loss/accuracy
        test_loss = loss_fn(test_logits, y_test_f32)
        test_acc = accuracy_fn(y_true=y_test_f32, y_pred=test_pred)

    # Print out what's happening every 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Accuracy: {acc:.2f}% | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")

Epoch: 0 | Loss: 0.69101, Accuracy: 59.85% | Test loss: 0.68970, Test acc: 60.15%
Epoch: 10 | Loss: 0.68218, Accuracy: 59.85% | Test loss: 0.68116, Test acc: 60.15%
Epoch: 20 | Loss: 0.67785, Accuracy: 59.85% | Test loss: 0.67690, Test acc: 60.15%
Epoch: 30 | Loss: 0.67570, Accuracy: 59.85% | Test loss: 0.67475, Test acc: 60.15%
Epoch: 40 | Loss: 0.67464, Accuracy: 59.85% | Test loss: 0.67365, Test acc: 60.15%
Epoch: 50 | Loss: 0.67412, Accuracy: 59.85% | Test loss: 0.67308, Test acc: 60.15%
Epoch: 60 | Loss: 0.67386, Accuracy: 59.85% | Test loss: 0.67279, Test acc: 60.15%
Epoch: 70 | Loss: 0.67373, Accuracy: 59.85% | Test loss: 0.67263, Test acc: 60.15%
Epoch: 80 | Loss: 0.67367, Accuracy: 59.85% | Test loss: 0.67254, Test acc: 60.15%
Epoch: 90 | Loss: 0.67364, Accuracy: 59.85% | Test loss: 0.67249, Test acc: 60.15%


In [None]:
#QUESTION: How to improve accuracy? Parameter optimization with Grid Search? How to decide on how many layers to have? 