# Connect Notebook to Google Drive


In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Project folder on the mounted Google Drive.
root_path = "/content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien"

# Install/Import packages 

In [None]:
# Install Hugging Face transformers.
# NOTE(review): no version is pinned here — confirm which release the notebook was run with.
!pip install transformers

In [None]:
!pip install datasets

  Attempting uninstall: multiprocess
    Found existing installation: multiprocess 0.70.13
    Uninstalling multiprocess-0.70.13:
      Successfully uninstalled multiprocess-0.70.13
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.2 asynctest-0.13.0 datasets-2.2.2 dill-0.3.4 frozenlist-1.3.0 fsspec-2022.5.0 multidict-6.0.2 multiprocess-0.70.12.2 responses-0.18.0 urllib3-1.25.11 xxhash-3.0.0 yarl-1.7.2


In [None]:
from transformers import BertTokenizer
import torch
import numpy as np
import pandas as pd
import datasets
from datasets import Dataset
from datasets import load_metric

# Read in Dataset

In [None]:
# Load the raw OkCupid profiles from the project folder on Google Drive.
# The location is built from `root_path` so the Drive folder is defined in one place.
df_raw = pd.read_csv(f"{root_path}/data/raw/okcupid_profiles.csv")

# Keep only the columns needed for classification. `.copy()` makes `df` an
# independent frame, so columns added later are not written into a slice of `df_raw`.
df = df_raw[["sex", "essay0"]].copy()

In [None]:
# Number of rows and columns in the raw table.
df_raw.shape #(59946, 31)
#df.groupby(["sex"]).size().plot.bar()

(59946, 31)

In [None]:
df_raw.head(2)

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,about me: i would love to think that i was so...,currently working as an international agent fo...,making people laugh. ranting about a good salt...,"the way i look. i am a six foot half asian, ha...","books: absurdistan, the republic, of mice and ...",food. water. cell phone. shelter.,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet! you are ti...
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,...,i am a chef: this is what that means. 1. i am ...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories. my b...,,,i am very open and will share just about anyth...,


# Use BERT to classify profile texts by sex (female or male)

Tutorials: https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f

https://huggingface.co/docs/transformers/tasks/sequence_classification

https://www.google.com/search?q=transfomrer+trainer.train+see+on+one+example&rlz=1C1CHBF_deDE761DE761&oq=transfomrer+trainer.train+see+on+one+example&aqs=chrome..69i57j33i10i160.9172j0j4&sourceid=chrome&ie=UTF-8#kpvalbx=_-AOaYvv9EMfasAeG0ovwDA15

BERT input variables:
* input_ids: id representation of each token (when decoded: "[CLS] text [SEP] [PAD] ...")
* token_type_ids: binary mask that identifies which sequence a token belongs to; for a single sequence, all token type ids are 0
* attention_mask: binary mask that identifies whether a token is a real word or just padding

Note: the DistilBERT tokenizer used below returns only input_ids and attention_mask.




## Preprocess Data

In [None]:
# NOTE(review): `datasets` is already installed and imported earlier in this notebook,
# so this install looks redundant — confirm before removing.
! pip install datasets
# The IMDB dataset was loaded once to see what format our data needs to have:
#from datasets import load_dataset 
#imdb = load_dataset("imdb") #was used to see how our dataformat needs to look like

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Drop profiles whose "about me" text (essay0) is missing.
df = df.loc[df["essay0"].notna()]
df.shape[0]  # 54458 rows remain (59946 before filtering)

54458

In [None]:
# Encode sex as a binary target: female = 1, male = 0.
# `assign` returns a new frame instead of writing a column into `df` in place,
# which avoids pandas' SettingWithCopyWarning on a frame obtained by slicing/filtering
# and keeps the cell safe to re-run.
df = df.assign(female=np.where(df["sex"] == "f", 1, 0))

In [None]:
df.head(2)

Unnamed: 0,sex,essay0,female
0,m,about me: i would love to think that i was so...,0
1,m,i am a chef: this is what that means. 1. i am ...,0


In [None]:
# Split into 70% train, 15% test and 15% eval; both random draws use seed 25.
training_data = df.sample(frac=0.7, random_state=25)  # 38,121 rows

# The rows not drawn for training (30%) are shared between test and eval.
testing_and_eval_data = df.loc[~df.index.isin(training_data.index)]

# Half of the remainder becomes the test set, the other half the eval set.
testing_data = testing_and_eval_data.sample(frac=0.5, random_state=25)  # 8168 rows
evaluation_data = testing_and_eval_data.loc[
    ~testing_and_eval_data.index.isin(testing_data.index)
]  # 8169 rows

In [None]:
# Training frame with the column names used from here on: "label" and "text".
train_df = training_data[["female", "essay0"]].rename(
    columns={"female": "label", "essay0": "text"}
)

In [None]:
# Test frame with the column names used from here on: "label" and "text".
test_df = testing_data[["female", "essay0"]].rename(
    columns={"female": "label", "essay0": "text"}
)

In [None]:
# Evaluation frame with the column names used from here on: "label" and "text".
eval_df = evaluation_data[["female", "essay0"]].rename(
    columns={"female": "label", "essay0": "text"}
)

In [None]:
# Only the last expression of a cell is displayed, so just eval_df's preview is shown;
# the first two calls produce no visible output.
test_df.head(2)
train_df.head(2)
eval_df.head(2)

Unnamed: 0,label,text
57,0,"i grew up in iowa. it gets a bad rap, but let ..."
65,0,i really like meeting new people. small-world ...


In [None]:
# Convert the pandas frames to Hugging Face datasets. `from_pandas` is the
# constructor intended for DataFrames; `preserve_index=False` leaves out the
# original row labels so the features are just "label" and "text".
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)

# Bundle the three splits so they can be tokenized together.
dataset_dict = datasets.DatasetDict(
    {"train": train_dataset, "test": test_dataset, "eval": eval_dataset}
)

In [None]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 38121
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 8168
    })
    eval: Dataset({
        features: ['label', 'text'],
        num_rows: 8169
    })
})

## Tokenize the datasets 

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the same checkpoint ("distilbert-base-uncased") that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook's tokenizer.

    Truncation is enabled; no padding argument is passed here.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenize every split; with batched=True the function receives batches of examples.
tokenized_df = dataset_dict.map(function=preprocess_function, batched=True)



  0%|          | 0/39 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

In [None]:
tokenized_df

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 38121
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 8168
    })
    eval: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 8169
    })
})

## Use padding so that all sequences in a batch have the same length

In [None]:
from transformers import DataCollatorWithPadding

# Collator handed to the Trainer below; it pads the tokenized examples when batches are built.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Load the pre-trained model: AutoModelForSequenceClassification (for text classification)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pre-trained checkpoint with a classification head.
# num_labels=2 because there are two classes: female and male.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifi

# Choose a metric

In [None]:
# Accuracy metric object used by `compute_metrics` below.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    The predicted class of each example is the argmax of its logits over the
    last axis; the result is whatever ``metric.compute`` returns.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
metric_name = "accuracy"

**1. Define your training hyperparameters in TrainingArguments**

**2. Pass the training arguments to Trainer along with the model, dataset, tokenizer, and data collator**

**3. Call train() to fine-tune your model**

In [None]:
# Fine-tuning hyperparameters. Checkpoints are written to Google Drive.
training_args = TrainingArguments(
    output_dir="/content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results",
    # optimisation
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluate and save once per epoch, then reload the best checkpoint by accuracy
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

# Working notes:
# - do the same for the eval data
# - look at the Trainer method: when is a batch passed in, and what output comes back?
# - pass a batch to the model
# - start with one text example; if that works, use the whole eval dataset
#   (loop over all data, generate the output, look at e.g. accuracy)
# - for tuning, use the test data; once training is finished, validate with the
#   eval dataset (otherwise the estimate is biased)

# The "test" split is evaluated during training; the "eval" split is kept for the final check.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_df["train"],
    eval_dataset=tokenized_df["test"],
    compute_metrics=compute_metrics,  # accuracy, as defined above
)



In [None]:
# Fine-tune the model with the settings given to the Trainer above.
trainer.train()

***** Running training *****
  Num examples = 38121
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2383
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.438,0.432907,0.777302


***** Running Evaluation *****
  Num examples = 8168
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
Saving model checkpoint to /content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results/checkpoint-2383
Configuration saved in /content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results/checkpoint-2383/config.json
Model weights saved in /content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results/checkpoint-2383/pytorch_model.bin
tokenizer config file saved in /content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results/checkpoint-2383/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results/check

TrainOutput(global_step=2383, training_loss=0.4675337278787899, metrics={'train_runtime': 1616.4705, 'train_samples_per_second': 23.583, 'train_steps_per_second': 1.474, 'total_flos': 4209711732759420.0, 'train_loss': 0.4675337278787899, 'epoch': 1.0})

In [None]:
# Push the model to the Hugging Face Hub under the repository name "my-finetuned-bert".
# NOTE(review): this presumably requires being logged in to the Hub — no login step is visible here.
model.push_to_hub("my-finetuned-bert")

Cloning https://huggingface.co/NorrisPau/my-finetuned-bert into local empty directory.
Configuration saved in my-finetuned-bert/config.json
Model weights saved in my-finetuned-bert/pytorch_model.bin


Upload file pytorch_model.bin:   0%|          | 3.34k/255M [00:00<?, ?B/s]

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/NorrisPau/my-finetuned-bert
   26647c8..0ea1190  main -> main



'https://huggingface.co/NorrisPau/my-finetuned-bert/commit/0ea11901f08fe59388287577fa7a22847040c517'

## Evaluate the Model 

To evaluate the model, we need a metric. We use accuracy.

In [None]:
# Evaluate on the dataset given to the Trainer as `eval_dataset`, i.e. the "test" split.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 8168
  Batch size = 16


{'epoch': 3.0,
 'eval_accuracy': 0.7857492654260528,
 'eval_loss': 0.44324198365211487,
 'eval_runtime': 121.3045,
 'eval_samples_per_second': 67.335,
 'eval_steps_per_second': 4.213}

# Reload the saved model to run the evaluation

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Load a fine-tuned model from Google Drive.
# NOTE(review): no cell in this notebook visibly saves to this "BERT_3 epochs" folder,
# so it is presumably from an earlier 3-epoch run — confirm that it exists.
model = AutoModelForSequenceClassification.from_pretrained("/content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results/BERT_3 epochs/fine_tuned_BERT_predict_sex_model")

In [None]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [None]:
from transformers import pipeline
# Wrap the reloaded model and the tokenizer in an inference pipeline.
# The task is named "sentiment-analysis", but the labels returned here are
# LABEL_0 / LABEL_1, which this notebook maps to male / female.
classifier = pipeline(task = "sentiment-analysis", model = model,
                      tokenizer = tokenizer)

Check the prediction for one example

In [None]:
classifier("Hi, I study Data Science") #Men

[{'label': 'LABEL_0', 'score': 0.6896018385887146}]

In [None]:
eval_test = eval_dataset["text"]

In [None]:
# To run the classifier, each text is cut to its first 512 characters.
# Background: https://github.com/huggingface/transformers/issues/14183
# NOTE(review): this slices characters, not tokens, whereas `preprocess_function`
# truncates at the token level during training — confirm this is intended.
eval_test = [elem[:512] for elem in eval_test]

In [None]:
# Classify the full evaluation texts and let the pipeline's tokenizer truncate
# them to the model's 512-token limit. Cutting the strings to 512 characters
# instead keeps only a small part of each essay and does not match the
# token-level truncation used during fine-tuning.
predictions = classifier(list(eval_dataset["text"]), truncation=True, max_length=512)

In [None]:
predictions[0:4]

[{'label': 'LABEL_0', 'score': 0.9980983138084412},
 {'label': 'LABEL_0', 'score': 0.8562405109405518},
 {'label': 'LABEL_1', 'score': 0.8592649698257446},
 {'label': 'LABEL_1', 'score': 0.6359561681747437}]

In [None]:
# One row per prediction, with the pipeline's "label" and "score" fields as columns.
predictions_df = pd.DataFrame(predictions, columns=['label', 'score'])

In [None]:
predictions_df.head()

Unnamed: 0,label,score,female
0,LABEL_0,0.998098,0
1,LABEL_0,0.856241,0
2,LABEL_1,0.859265,1
3,LABEL_1,0.635956,1
4,LABEL_0,0.933948,0


In [None]:
# Map the pipeline's label strings onto the binary target: LABEL_1 -> 1 (female), otherwise 0.
predictions_df["female"] = np.where(predictions_df["label"].eq("LABEL_1"), 1, 0)

In [None]:
import os

# Destination of the prediction table on Google Drive (single leading slash).
path = '/content/gdrive/MyDrive/Machine_Learning_NLP_Nora_Pauelsen_TU_Wien/model results/predictions_fine_tuned_model/predictions_eval_df.csv'

# Create the target folder if it is missing so the write does not fail on a fresh Drive.
os.makedirs(os.path.dirname(path), exist_ok=True)

# Let pandas open the file itself: it then handles newlines correctly and
# applies the requested encoding (UTF-8 with BOM).
predictions_df.to_csv(path, encoding='utf-8-sig')

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Predicted and true binary labels (1 = female, 0 = male) for the accuracy computation.
# NOTE(review): the two Series carry different indexes; this assumes they are
# compared by position — confirm.
y_pred = predictions_df["female"]
y_true = eval_df["label"]

In [None]:
accuracy_score(y_true, y_pred)

0.770106500183621

In [None]:
# Evaluation texts as a plain Python list.
text_eval = eval_df["text"].tolist()

In [None]:
import pandas as pd

In [None]:
# `predictions_df` has a fresh 0..n-1 index, while `eval_df` still carries the row
# labels of the original profile table. Assigning the Series directly would align
# on those labels: rows get paired with the wrong prediction and rows without a
# matching label become NaN. Assign by position instead.
assert len(predictions_df) == len(eval_df), "expected exactly one prediction per evaluation row"
eval_df["predicted_label"] = predictions_df["female"].to_numpy()

In [None]:
predictions_df

Unnamed: 0,label,score,female
0,LABEL_0,0.998098,0
1,LABEL_0,0.856241,0
2,LABEL_1,0.859265,1
3,LABEL_1,0.635956,1
4,LABEL_0,0.933948,0
...,...,...,...
8164,LABEL_0,0.981501,0
8165,LABEL_1,0.897249,1
8166,LABEL_1,0.996698,1
8167,LABEL_0,0.900165,0


In [None]:
# Show the evaluation rows whose predicted label differs from the true label.
# Rows with a missing (NaN) predicted_label also satisfy this comparison.
eval_df.loc[eval_df["label"] != eval_df["predicted_label"]]

Unnamed: 0,label,text,predicted_label
65,0,i really like meeting new people. small-world ...,1.0
83,1,"i love many things, laughing however is at the...",0.0
98,1,one day i will mod r/hotchickswithspreadsheets...,0.0
101,0,from san diego to the bay six years ago. gave ...,1.0
108,1,i pride myself on having fun no matter where i...,0.0
...,...,...,...
59907,0,"my name is peter. i'm an oakland native, and i...",
59913,1,i have lived in sf off and on for 7.5 years no...,
59922,1,just kind of a silly girl. super geek. ultra n...,
59931,0,"born in southern india, grew up in dubai, live...",
