#### Logitic Regression with LeaveOneGroupOut Cross Validation using BERT

In [4]:
'''Import Libraries'''
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from joblib import load, Parallel, delayed
import regex as re
import pickle as pkl 
from tqdm import tqdm
import random
import warnings
warnings.filterwarnings("ignore")
from datasets import load_dataset

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Sampler, BatchSampler, Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from transformers import AutoModel, AutoTokenizer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score

In [5]:
#read the data
df = pd.read_pickle('data/dataset.pkl')

In [6]:
df.head()

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country,count_president,count_minister,count_scribe
5,2003-06-04,Aktuelle Stunde,5,Heidrun Silhavy,SPÖ,1384.0,False,308.0,Herr Präsident! Herr Vizekanzler! Herr Bundes...,AT-Nationalrat,AUS,0,0,0
7,2003-06-04,Aktuelle Stunde,7,Heidrun Silhavy,SPÖ,1384.0,False,1385.0,"Selbstverständlich, Herr Präsident. Ich nehme...",AT-Nationalrat,AUS,0,0,0
9,2003-06-04,Aktuelle Stunde,9,Herbert Haupt,FPÖ,463.0,False,1562.0,Herr Präsident! Sehr geehrte Damen und Herren...,AT-Nationalrat,AUS,0,1,0
11,2003-06-04,Aktuelle Stunde,11,Walter Tancsits,ÖVP,1329.0,False,638.0,Herr Präsident! Herr Vizekanzler! Frau Staats...,AT-Nationalrat,AUS,0,0,0
13,2003-06-04,Aktuelle Stunde,13,Doris Bures,SPÖ,1384.0,False,836.0,Sehr geehrter Herr Präsident! Meine sehr geeh...,AT-Nationalrat,AUS,0,0,0


In [7]:
def regularized_f1(train_f1, dev_f1, threshold=0.0015):
    """
    Returns development F1 if overfitting is below threshold, otherwise 0.
    """
    return dev_f1 if (train_f1 - dev_f1) < threshold else 0


def save_metrics(*args, path, fname):
    if not os.path.exists(path):
        os.makedirs(path)
    if not os.path.isfile(path + fname):
        with open(path + fname, "w", newline="\n") as f:
            f.write(
                ",".join(
                    [
                        "config",
                        "epoch",
                        "train_loss",
                        "train_acc",
                        "train_f1",
                        "val_loss",
                        "val_acc",
                        "val_f1",
                    ]
                )
            )
            f.write("\n")
    if args:
        with open(path + fname, "a", newline="\n") as f:
            f.write(",".join([str(arg) for arg in args]))
            f.write("\n")

def seed_everything(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


seed_everything(1234)

In [8]:
VOCAB_SIZE = 5_000 #20_000
BATCH_SIZE = 32
NUM_EPOCHS = 15
MAX_LEN = 128 #256
LEARNING_RATE = 1e-4
seed_value = 43
torch.manual_seed(seed_value)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

In [9]:
'''
german BERT 
'''
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
model = AutoModel.from_pretrained("dbmdz/bert-base-german-cased")

Some weights of the model checkpoint at dbmdz/bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


link german BERT hugging face [https://huggingface.co/dbmdz/bert-base-german-cased]

In [16]:
#pip install tqdm boto3 requests regex sentencepiece sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
   ---------------------------------------- 897.5/897.5 kB 4.1 MB/s eta 0:00:00
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1
Note: you may need to restart the kernel to use updated packages.


In [18]:
#pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.0.1
    Uninstalling datasets-3.0.1:
      Successfully uninstalled datasets-3.0.1
Successfully installed datasets-3.6.0
Note: you may need to restart the kernel to use updated packages.


In [10]:
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
assert model.config.output_attention == True
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

Using cache found in C:\Users\sarah/.cache\torch\hub\huggingface_pytorch-transformers_main


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\sarah/.cache\\torch\\hub\\huggingface_pytorch-transformers_main\\hubconf.py'

In [15]:
'''
pytorch including multipe languages
'''
sequence_classification_model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-cased-finetuned-mrpc')
sequence_classification_tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-cased-finetuned-mrpc')


Using cache found in C:\Users\sarah/.cache\torch\hub\huggingface_pytorch-transformers_main


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\sarah/.cache\\torch\\hub\\huggingface_pytorch-transformers_main\\hubconf.py'

link pytorch transformer [https://pytorch.org/hub/huggingface_pytorch-transformers/]