## Testing the ai4privacy model

In [None]:
from transformers import pipeline
import torch

In [None]:
gen = pipeline("token-classification", "Isotonic/distilbert_finetuned_ai4privacy", device=-1, torch_dtype=torch.float32)

In [None]:
text = "My name is Claraly and I live in Berkeley, California."

In [None]:
output = gen(text, aggregation_strategy="simple")

In [None]:
output

In [None]:
def replace_entities(output, text):
    """Mask every detected entity span in ``text`` with its ``[ENTITY_GROUP]`` tag.

    Args:
        output: Iterable of dicts carrying ``"start"`` and ``"end"`` character
            offsets into ``text`` plus an ``"entity_group"`` name.
        text: The string the offsets in ``output`` refer to.

    Returns:
        A copy of ``text`` in which each span is replaced by ``[<entity_group>]``.
        ``text`` is returned unchanged when ``output`` is empty.
    """
    # Work from the right-most span to the left so that replacing one span
    # never shifts the offsets of the spans that are still to be processed.
    # Substituting by offset (instead of str.replace on the word) also keeps
    # repeated entities and words that merely contain an entity intact.
    for span in sorted(output, key=lambda item: item["start"], reverse=True):
        placeholder = "[" + str(span["entity_group"]) + "]"
        text = text[: span["start"]] + placeholder + text[span["end"] :]

    return text

In [None]:
text = """These instructions apply to section-based themes (Responsive 6.0+, Retina 4.0+, Parallax 3.0+ Turbo 2.0+, Mobilia 5.0+). What theme version am I using? On your Collections pages & Featured Collections sections, you can easily show the secondary image of a product on hover by enabling one of the theme's built-in settings! Your Collection pages & Featured Collections sections will now display the secondary product image just by hovering over that product image thumbnail. Does this feature apply to all sections of the theme or just specific ones as listed in the text material?"""

In [None]:
# `text` was reassigned above, so the entity offsets must be recomputed for the
# new string; reusing the earlier `output` would apply offsets of another sentence.
replace_entities(gen(text, aggregation_strategy="simple"), text)

In [None]:
from transformers import pipeline

In [None]:
from transformers import Pipeline

## Adapters test run

In [None]:
import transformers
from adapters import AutoAdapterModel
import adapters.composition as ac


model = AutoAdapterModel.from_pretrained("roberta-base")
wnut_17 = model.load_adapter("AdapterHub/roberta-base-pf-wnut_17", source="hf")
conll2003 = model.load_adapter("AdapterHub/roberta-base-pf-conll2003", source="hf")

model.active_adapters = ac.Parallel(wnut_17, conll2003)

In [None]:
model

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained("roberta-base")

In [None]:
model

In [None]:
text = "My name is Sarah and I live in Berkeley, California."
inputs = tokenizer(text, return_tensors="pt")

In [None]:
inputs = tokenizer(text, return_tensors="pt")

In [None]:
output1, output2  = model(**inputs)

In [None]:
model.get_labels_dict('wnut_17'), model.get_labels_dict('conll2003')

In [None]:
import torch

def analyze_sentence(sentence):
  """Print ``sentence`` annotated with the NER tags of both parallel adapters.

  The sentence is tokenized with the notebook-level ``tokenizer`` and run once
  through the notebook-level ``model``. For each head, every token whose
  predicted label is not ``"O"`` is printed as ``token<label>``.

  Args:
      sentence: The raw text to tag.

  Returns:
      None. One line per adapter is written to stdout.
  """
  tokens = tokenizer.tokenize(sentence)
  input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(tokens))
  outputs = model(input_ids)

  # (adapter whose label dict is used, text printed before the result), listed
  # in the order their head outputs are indexed: outputs[0], then outputs[1].
  heads = (
    ("wnut_17", "NER Wnut_17: "),
    ("conll2003", "NER conll2003: "),
  )
  for head_index, (adapter_name, prefix) in enumerate(heads):
    ner_labels_map = model.get_labels_dict(adapter_name)
    ner_label_ids = torch.argmax(outputs[head_index].logits, dim=2).numpy().squeeze().tolist()
    annotated = []
    for token, label_id in zip(tokens, ner_label_ids):
      # '\u0120' is stripped from each token before printing.
      token = token.replace('\u0120', '')
      label = ner_labels_map[label_id]
      annotated.append(f"{token}<{label}>" if label != "O" else token)
    print(prefix + " ".join(annotated))

  # # Post-process classifier output
  # classifier_labels = model.get_labels_dict(classifier_adapter)
  # label_id = torch.argmax(outputs[1].logits).item()
  # print("Classifier: " + classifier_labels[label_id])
  # print()

In [None]:
analyze_sentence(text)

In [None]:
sentences = [
  "A man in central Germany tried to leave his house by the front door only to find a brick wall there.",
  "The Met Office has issued a yellow weather warning for ice across most of Wales.",
  "A vibrant animation telling stories of indigenous Australia will be projected on to the Sydney Opera House every night at sunset."
]

for sentence in sentences:
  analyze_sentence(sentence)

## LoRA parallel inference

In [None]:
import transformers
from adapters import AutoAdapterModel
import adapters.composition as ac


model = AutoAdapterModel.from_pretrained("roberta-base")
wnut_17 = model.load_adapter("AdapterHub/roberta-base-pf-wnut_17", source="hf")
conll2003 = model.load_adapter("AdapterHub/roberta-base-pf-conll2003", source="hf")

model.active_adapters = ac.Parallel(wnut_17, conll2003)

In [None]:
model.config

## transposition

In [3]:
# BIO tag vocabulary; the position of a tag in this list is its class id.
_AI4PRIVACY_LABELS = [
    "O", "B-PHONEIMEI", "I-PHONEIMEI", "B-JOBAREA", "B-FIRSTNAME",  # 0-4
    "I-FIRSTNAME", "B-VEHICLEVIN", "I-VEHICLEVIN", "B-AGE", "B-GENDER",  # 5-9
    "I-GENDER", "B-HEIGHT", "I-HEIGHT", "B-BUILDINGNUMBER", "I-BUILDINGNUMBER",  # 10-14
    "B-MASKEDNUMBER", "I-MASKEDNUMBER", "B-PASSWORD", "I-PASSWORD", "B-DOB",  # 15-19
    "I-DOB", "B-IPV6", "I-IPV6", "B-NEARBYGPSCOORDINATE", "I-NEARBYGPSCOORDINATE",  # 20-24
    "B-USERAGENT", "I-USERAGENT", "B-TIME", "I-TIME", "B-JOBTITLE",  # 25-29
    "I-JOBTITLE", "B-COUNTY", "B-EMAIL", "I-EMAIL", "B-ACCOUNTNUMBER",  # 30-34
    "I-ACCOUNTNUMBER", "B-PIN", "I-PIN", "B-EYECOLOR", "I-EYECOLOR",  # 35-39
    "B-LASTNAME", "I-LASTNAME", "I-JOBAREA", "B-IPV4", "I-IPV4",  # 40-44
    "B-DATE", "I-DATE", "B-STREET", "I-STREET", "B-CITY",  # 45-49
    "I-CITY", "B-PREFIX", "I-PREFIX", "B-CREDITCARDISSUER", "B-CREDITCARDNUMBER",  # 50-54
    "I-CREDITCARDNUMBER", "I-CREDITCARDISSUER", "B-MIDDLENAME", "B-STATE", "I-STATE",  # 55-59
    "B-VEHICLEVRM", "I-VEHICLEVRM", "B-ORDINALDIRECTION", "B-SEX", "B-JOBTYPE",  # 60-64
    "I-JOBTYPE", "B-CURRENCYCODE", "I-CURRENCYCODE", "B-CURRENCYSYMBOL", "I-AMOUNT",  # 65-69
    "B-ACCOUNTNAME", "I-ACCOUNTNAME", "B-BITCOINADDRESS", "I-BITCOINADDRESS", "B-LITECOINADDRESS",  # 70-74
    "I-LITECOINADDRESS", "B-PHONENUMBER", "I-PHONENUMBER", "B-MAC", "I-MAC",  # 75-79
    "B-CURRENCY", "B-IBAN", "I-IBAN", "B-COMPANYNAME", "I-COMPANYNAME",  # 80-84
    "B-CURRENCYNAME", "I-CURRENCYNAME", "I-CURRENCYSYMBOL", "B-ZIPCODE", "I-ZIPCODE",  # 85-89
    "B-SSN", "I-SSN", "B-AMOUNT", "I-CURRENCY", "B-URL",  # 90-94
    "I-URL", "B-IP", "I-IP", "B-SECONDARYADDRESS", "I-SECONDARYADDRESS",  # 95-99
    "B-USERNAME", "I-USERNAME", "B-ETHEREUMADDRESS", "I-ETHEREUMADDRESS", "B-CREDITCARDCVV",  # 100-104
    "I-CREDITCARDCVV", "I-COUNTY", "I-AGE", "I-MIDDLENAME", "B-BIC",  # 105-109
    "I-BIC",  # 110
]

# Class id (as a string key) -> tag. This must be a plain dict: the previous
# literal ended in "}," which silently turned it into a one-element tuple, so
# len(id2label) was 1 and key lookups failed.
id2label = {str(index): label for index, label in enumerate(_AI4PRIVACY_LABELS)}

# Tag -> integer class id; the exact inverse of id2label.
label2id = {label: index for index, label in enumerate(_AI4PRIVACY_LABELS)}

In [1]:
import transformers
from transformers import AutoModelForTokenClassification, AutoTokenizer
from peft import PeftConfig, PeftModelForTokenClassification, get_peft_model, PeftMixedModel
from adapters import AutoAdapterModel
import adapters.composition as ac 

In [2]:
peft_en = "Isotonic/ai4privacy_v2_adapter_en"
peft_it = "Isotonic/ai4privacy_v2_adapter_it"
peft_config_en = PeftConfig.from_pretrained(peft_en)
peft_config_it = PeftConfig.from_pretrained(peft_it)

In [None]:
peft_config_en.to_dict()

In [4]:
peft_config_en.bias = 'none'
peft_config_it.bias = 'none'

In [5]:
base_model_name_or_path = "distilbert-base-multilingual-cased"

In [6]:
base_model = AutoModelForTokenClassification.from_pretrained(base_model_name_or_path, id2label=id2label, label2id=label2id, num_labels=len(id2label))
tokenizer = AutoTokenizer.from_pretrained(base_model_name_or_path)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
# Wrap `base_model` with the English adapter under the name "lora_en" and put
# the wrapper in eval mode.
peft_model = PeftMixedModel.from_pretrained(base_model, model_id=peft_en, config=peft_config_en, adapter_name="lora_en", id2label=id2label, label2id=label2id, num_labels=len(id2label)).eval()
# NOTE(review): the return value of merge_and_unload() is discarded here, and
# set_adapter() is called on `peft_model` afterwards — confirm whether the
# adapter can still be activated once it has been merged/unloaded.
peft_model.merge_and_unload()
# output = peft_model.add_adapter(adapter_name="lora_it", peft_config=peft_config_it)
# Only the English adapter is activated; the Italian one is left commented out.
peft_model.set_adapter(["lora_en"])
# peft_model.set_adapter(["lora_en", "lora_it"])  # activate both adapters

In [55]:
peft_model.active_adapters

['lora_en']

In [56]:
text = "My name is Sarah and I live in Berkeley, California."
inputs = tokenizer(text, return_tensors="pt")

In [57]:
outputs = peft_model(**inputs)

In [58]:
with torch.no_grad():
    logits = peft_model(**inputs).logits

tokens = inputs.tokens()
predictions = torch.argmax(logits, dim=2)

In [59]:
for token, prediction in zip(tokens, predictions[0].numpy()):
    print((token, id2label[str(prediction)]))

('[CLS]', 'O')
('My', 'O')
('name', 'O')
('is', 'O')
('Sarah', 'O')
('and', 'O')
('I', 'O')
('live', 'O')
('in', 'O')
('Berkeley', 'O')
(',', 'O')
('California', 'O')
('.', 'O')
('[SEP]', 'O')


In [17]:
import torch

def analyze_sentence(sentence):
  """Print ``sentence`` annotated with the labels predicted by ``peft_model``.

  The sentence is encoded with the notebook-level ``tokenizer`` and run through
  the notebook-level ``peft_model``. Each model token whose predicted label is
  not ``"O"`` is printed as ``token<label>``.

  Args:
      sentence: The raw text to tag.

  Returns:
      None. The annotated token sequence is written to stdout.
  """
  # Encode the argument (not the global `text`) so each call tags its own input.
  inputs = tokenizer(sentence, return_tensors="pt")
  with torch.no_grad():
    outputs = peft_model(**inputs)

  # Use the tokens the model actually saw so that tokens and predictions have
  # the same length; whitespace-split words do not line up with them.
  tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())

  # Post-process NER output en
  # ner_labels_map = base_model.get_labels_dict(peft_model_en)
  # Index the batch dimension instead of squeeze() so the result is always a list.
  ner_label_ids = torch.argmax(outputs[0], dim=2)[0].tolist()
  annotated = []
  for token, label_id in zip(tokens, ner_label_ids):
    token = token.replace('\u0120', '')
    # id2label is defined with string keys in one cell and integer keys in
    # another; accept whichever version is currently in the kernel.
    label = id2label[label_id] if label_id in id2label else id2label[str(label_id)]
    annotated.append(f"{token}<{label}>" if label != "O" else token)
  print("NER EN: " + " ".join(annotated))

In [None]:
analyze_sentence(text)

In [None]:
from peft import get_peft_model

model_en = get_peft_model(base_model, peft_config_en, mixed=True, adapter_name="peft_en")
model_it = get_peft_model(base_model, peft_config_it, mixed=True, adapter_name="peft_it")

In [None]:
model_en = model_en.merge_and_unload()
model_it = model_it.merge_and_unload()

In [None]:
model_en.load_adapter(peft_it, adapter_name="peft_it")
model_en.set_adapter(["peft_en", "peft_it"])

In [None]:
en_model = PeftModelForTokenClassification.from_pretrained(base_model, peft_en, id2label=id2label, label2id=label2id, num_labels=len(id2label), config=peft_config_en)

In [None]:
from peft import PeftMixedModel

# Load a real base model: PeftMixedModel.from_pretrained() wraps a model
# object, so neither the `...` placeholder nor the bare model name will do.
base_model = AutoModelForTokenClassification.from_pretrained(base_model_name_or_path, id2label=id2label, label2id=label2id, num_labels=len(id2label))
# load first adapter, which will be called "default"
peft_model = PeftMixedModel.from_pretrained(base_model, peft_en, id2label=id2label, label2id=label2id, num_labels=len(id2label), config=peft_config_en)
# load the second adapter under the name "peft_it"
peft_model.load_adapter(peft_it, adapter_name="peft_it", id2label=id2label, label2id=label2id, num_labels=len(id2label), config=peft_config_it)


In [None]:
# Activate both adapters by the names they were loaded under: the first one is
# "default" and the second was registered as "peft_it" (there is no "other").
peft_model.set_adapter(["default", "peft_it"])

In [None]:
# import safetensors
# import torch

# pt_state_dict = safetensors.torch.load_file("/Users/sripaadsrinivasan/Projects/ai4privacy/notebooks/ai4p_adapter_en/adapter_model.safetensors", device="cpu")
# torch.save(pt_state_dict, "/Users/sripaadsrinivasan/Projects/ai4privacy/notebooks/ai4p_adapter_en/pytorch_adapter.bin")

In [None]:
from adapters import AutoAdapterModel
import adapters.composition as ac


base_model = AutoAdapterModel.from_pretrained(base_model_name_or_path, id2label=id2label, label2id=label2id, num_labels=len(id2label))

In [None]:
import adapters

adapters.init(base_model)

In [None]:
peft_config_en.to_dict()

In [None]:

# from adapters import AdapterSetup, AutoAdapterModel, LoRAConfig
# import adapters.composition as ac

# model = AutoAdapterModel.from_pretrained(base_model_name_or_path, num_labels=len(id2label), id2label=id2label)

# qc = model.load_adapter(peft_model_en)
# sent = model.load_adapter(peft_model_it)

# with AdapterSetup(ac.Parallel([qc, sent])):
#     print(model(**tokenizer("What is AdapterHub?", return_tensors="pt")))

In [None]:
peft_en

In [None]:
# from adapters import LoRAConfig

# config = LoRAConfig(architecture = "lora", r=320, alpha=384, attn_matrices=["q_lin","k_lin","v_lin", "out_lin"])
# base_model.load_adapter(adapter_name_or_path="/Users/sripaadsrinivasan/Projects/ai4privacy/notebooks/ai4p_adapter_en/pytorch_adapter.bin", config=config, num_labels=len(id2label), id2label=id2label, label2id=label2id)


In [None]:
text = "My name is Sarah and I live in Berkeley, California."
inputs = tokenizer(text, return_tensors="pt")

In [None]:
outputs = base_model(**inputs)

In [None]:
outputs

In [None]:
id2label =  {
    0: "O",
    1: "B-PHONEIMEI",
    2: "I-PHONEIMEI",
    3: "B-JOBAREA",
    4: "B-FIRSTNAME",
    5: "I-FIRSTNAME",
    6: "B-VEHICLEVIN",
    7: "I-VEHICLEVIN",
    8: "B-AGE",
    9: "B-GENDER",
    10: "I-GENDER",
    11: "B-HEIGHT",
    12: "I-HEIGHT",
    13: "B-BUILDINGNUMBER",
    14: "I-BUILDINGNUMBER",
    15: "B-MASKEDNUMBER",
    16: "I-MASKEDNUMBER",
    17: "B-PASSWORD",
    18: "I-PASSWORD",
    19: "B-DOB",
    20: "I-DOB",
    21: "B-IPV6",
    22: "I-IPV6",
    23: "B-NEARBYGPSCOORDINATE",
    24: "I-NEARBYGPSCOORDINATE",
    25: "B-USERAGENT",
    26: "I-USERAGENT",
    27: "B-TIME",
    28: "I-TIME",
    29: "B-JOBTITLE",
    30: "I-JOBTITLE",
    31: "B-COUNTY",
    32: "B-EMAIL",
    33: "I-EMAIL",
    34: "B-ACCOUNTNUMBER",
    35: "I-ACCOUNTNUMBER",
    36: "B-PIN",
    37: "I-PIN",
    38: "B-EYECOLOR",
    39: "I-EYECOLOR",
    40: "B-LASTNAME",
    41: "I-LASTNAME",
    42: "I-JOBAREA",
    43: "B-IPV4",
    44: "I-IPV4",
    45: "B-DATE",
    46: "I-DATE",
    47: "B-STREET",
    48: "I-STREET",
    49: "B-CITY",
    50: "I-CITY",
    51: "B-PREFIX",
    52: "I-PREFIX",
    53: "B-CREDITCARDISSUER",
    54: "B-CREDITCARDNUMBER",
    55: "I-CREDITCARDNUMBER",
    56: "I-CREDITCARDISSUER",
    57: "B-MIDDLENAME",
    58: "B-STATE",
    59: "I-STATE",
    60: "B-VEHICLEVRM",
    61: "I-VEHICLEVRM",
    62: "B-ORDINALDIRECTION",
    63: "B-SEX",
    64: "B-JOBTYPE",
    65: "I-JOBTYPE",
    66: "B-CURRENCYCODE",
    67: "I-CURRENCYCODE",
    68: "B-CURRENCYSYMBOL",
    69: "I-AMOUNT",
    70: "B-ACCOUNTNAME",
    71: "I-ACCOUNTNAME",
    72: "B-BITCOINADDRESS",
    73: "I-BITCOINADDRESS",
    74: "B-LITECOINADDRESS",
    75: "I-LITECOINADDRESS",
    76: "B-PHONENUMBER",
    77: "I-PHONENUMBER",
    78: "B-MAC",
    79: "I-MAC",
    80: "B-CURRENCY",
    81: "B-IBAN",
    82: "I-IBAN",
    83: "B-COMPANYNAME",
    84: "I-COMPANYNAME",
    85: "B-CURRENCYNAME",
    86: "I-CURRENCYNAME",
    87: "I-CURRENCYSYMBOL",
    88: "B-ZIPCODE",
    89: "I-ZIPCODE",
    90: "B-SSN",
    91: "I-SSN",
    92: "B-AMOUNT",
    93: "I-CURRENCY",
    94: "B-URL",
    95: "I-URL",
    96: "B-IP",
    97: "I-IP",
    98: "B-SECONDARYADDRESS",
    99: "I-SECONDARYADDRESS",
    100: "B-USERNAME",
    101: "I-USERNAME",
    102: "B-ETHEREUMADDRESS",
    103: "I-ETHEREUMADDRESS",
    104: "B-CREDITCARDCVV",
    105: "I-CREDITCARDCVV",
    106: "I-COUNTY",
    107: "I-AGE",
    108: "I-MIDDLENAME",
    109: "B-BIC",
    110: "I-BIC"
}

In [None]:
import torch

def analyze_sentence(sentence):
  """Print ``sentence`` annotated with the EN and IT predictions of ``base_model``.

  The sentence is encoded with the notebook-level ``tokenizer`` and run once
  through the notebook-level ``base_model``. For each of the two outputs, every
  token whose predicted label is not ``"O"`` is printed as ``token<label>``.

  Args:
      sentence: The raw text to tag.

  Returns:
      None. One line per output ("NER EN: ...", "NER IT: ...") goes to stdout.
  """
  # Encode the argument (not the global `text`) so each call tags its own input.
  inputs = tokenizer(sentence, return_tensors="pt")
  # Take the tokens from the encoded ids so any tokens the tokenizer adds are
  # included and every prediction is paired with the token it belongs to.
  tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
  outputs = base_model(**inputs)

  # NOTE(review): assumes outputs[0] / outputs[1] are the logits of the EN and
  # IT adapters respectively, as the original code did — TODO confirm.
  # ner_labels_map = base_model.get_labels_dict(peft_model_en / peft_model_it)
  for head_index, prefix in enumerate(("NER EN: ", "NER IT: ")):
    ner_label_ids = torch.argmax(outputs[head_index], dim=2).numpy().squeeze().tolist()
    annotated = []
    for token, label_id in zip(tokens, ner_label_ids):
      token = token.replace('\u0120', '')
      # id2label is defined with integer keys in one cell and string keys in
      # another; accept whichever version is currently in the kernel.
      label = id2label[label_id] if label_id in id2label else id2label[str(label_id)]
      annotated.append(f"{token}<{label}>" if label != "O" else token)
    print(prefix + " ".join(annotated))



In [None]:
sentences = [
  "A man in central Germany tried to leave his house by the front door only to find a brick wall there.",
  "Il Met Office ha emesso un'allerta meteo gialla per ghiaccio su gran parte del Galles.",
  "A vibrant animation telling stories of indigenous Australia will be projected on to the Sydney Opera House every night at sunset.",
  "Ogni sera, al tramonto, sulla Sydney Opera House verrà proiettata una vivace animazione che racconta le storie degli indigeni australiani."
]

for sentence in sentences:
  analyze_sentence(sentence)

In [None]:
ad_model = AutoAdapterModel.from_pretrained(base_model_name_or_path, num_labels=len(id2label), id2label=id2label, label2id=label2id
)

In [None]:
import os

inference_model = AutoModelForTokenClassification.from_pretrained(
    base_model_name_or_path, num_labels=len(id2label), id2label=id2label, label2id=label2id
)
# Never hardcode access tokens in a notebook. Read the token from the HF_TOKEN
# environment variable instead (None is passed when it is unset).
# The token that was previously committed here must be revoked.
tokenizer = AutoTokenizer.from_pretrained(base_model_name_or_path, token=os.environ.get("HF_TOKEN"))

In [None]:
from peft import LoraConfig, TaskType 

peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, inference_mode=False, r=320, lora_alpha=384, lora_dropout=0.1, bias="all",
    target_modules=["q_lin","k_lin","v_lin", "out_lin"]
)

In [None]:
# architecture = "lora"
# selfattn_lora = True
# intermediate_lora = False
# output_lora = False
# r = 320
# alpha = 384
# attn_matrices = =["q_lin","k_lin","v_lin", "out_lin"]

In [None]:
from adapters import LoRAConfig

# NOTE(review): attn_matrices uses the DistilBERT module names ("q_lin", ...);
# confirm these are the identifiers this LoRAConfig expects.
config = LoRAConfig(architecture = "lora", r=320, alpha=384, attn_matrices=["q_lin","k_lin","v_lin", "out_lin"], )
# add_adapter() is given only the adapter name and its config; the extra
# `model=` keyword is not part of its signature and has been removed.
ad_model.add_adapter("lora_adapter", config=config)

## Dataset Preparation

In [None]:
def split_list(input_list):
    """Splits a list into sublists based on non-zero integers, preserving their positions.

    Args:
        input_list: The input list containing integers.

    Returns:
        A list of sublists, one per non-zero entry of ``input_list`` and in the
        same order. Each sublist has the length of ``input_list`` and contains
        that single non-zero integer at its original position and zeros
        elsewhere. An input without non-zero entries yields an empty list.
    """
    size = len(input_list)
    sublists = []
    for position, value in enumerate(input_list):
        if value == 0:
            continue
        # Exactly one sublist per non-zero value. The old always-true
        # `i+1 != 0` branch wrote each value a second time and ran past the
        # pre-allocated sublists.
        isolated = [0] * size
        isolated[position] = value
        sublists.append(isolated)

    return sublists

In [None]:
data = [0, 3, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
sublists = split_list(data)

In [None]:
len(sublists[0]), len(data)

In [None]:
# Entity types with the BIO prefix ("B-" / "I-") removed. Sorted so that the
# order, and therefore the labels.txt written below, is the same on every run;
# iterating a set of strings directly gives a run-dependent order.
labels = sorted({x.split("-", 1)[-1] for x in id2label.values()})

In [None]:
# Persist the entity types to labels.txt, one label per line.
with open("labels.txt", "w") as labels_file:
  labels_file.writelines(label + "\n" for label in labels)

In [None]:
# architecture: Optional[str] = "lora"

# selfattn_lora: bool = True
# intermediate_lora: bool = False
# output_lora: bool = False
# leave_out: List[int] = field(default_factory=list)
# r: int = 8
# alpha: int = 8
# dropout: float = 0.0
# attn_matrices: List[str] = field(default_factory=lambda: ["q", "v"])
# composition_mode: str = "add"
# init_weights: str = "lora"
# use_gating: bool = False

In [None]:
# Adapter description: the nested "config" holds the adapter hyper-parameters,
# the outer keys describe the model the adapter belongs to.
ad_config = {
  "config": {
    "adapter_residual_before_ln": False,
    "cross_adapter": False,
    "inv_adapter": None,
    "inv_adapter_reduction_factor": None,
    "leave_out": [],
    "ln_after": False,
    "ln_before": False,
    "mh_adapter": False,
    "non_linearity": "relu",
    "original_ln_after": True,
    "original_ln_before": True,
    "output_adapter": True,
    "reduction_factor": 320,
    "residual_before_ln": True,
    # NOTE(review): this key had no value, which made the whole cell a
    # SyntaxError. None is a placeholder — confirm the intended architecture.
    "architecture": None,
  },
  "hidden_size": 768,
  "model_class": "BertModelWithHeads",
  "model_name": "bert-base-uncased",
  "model_type": "bert",
  "name": "conll2003_ner"
}