In [1]:
import numpy as np
import pandas as pd

import spacy
from spacy.training import offsets_to_biluo_tags
nlp = spacy.load("en_core_web_lg")

from tqdm import trange
import torch
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

from seqeval.metrics import classification_report, accuracy_score, f1_score




In [2]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
doc
for token in doc:
    print(token.text, token.pos_, token.dep_)


Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN dobj
startup NOUN dep
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [3]:
# Adding '\n' to the default spacy tokenizer
prefixes = ('\n', ) + tuple(nlp.Defaults.prefixes)
prefix_regex = spacy.util.compile_prefix_regex(prefixes)
nlp.tokenizer.prefix_search = prefix_regex.search


In [4]:
# Personal Custom Tags Dictionary
entity_dict = {
    'Name': 'NAME', 
    'College Name': 'CLG',
    'Degree': 'DEG',
    'Graduation Year': 'GRADYEAR',
    'Years of Experience': 'YOE',
    'Companies worked at': 'COMPANY',
    'Designation': 'DESIG',
    'Skills': 'SKILLS',
    'Location': 'LOC',
    'Email Address': 'EMAIL'
}

In [5]:
with open('./data.json', 'r', encoding='utf-8') as file:
    for i in range(5):  # Print the first 5 lines
        print(file.readline())


{"content": "Afreen Jamadar\nActive member of IIIT Committee in Third year\n\nSangli, Maharashtra - Email me on Indeed: indeed.com/r/Afreen-Jamadar/8baf379b705e37c6\n\nI wish to use my knowledge, skills and conceptual understanding to create excellent team\nenvironments and work consistently achieving organization objectives believes in taking initiative\nand work to excellence in my work.\n\nWORK EXPERIENCE\n\nActive member of IIIT Committee in Third year\n\nCisco Networking -  Kanpur, Uttar Pradesh\n\norganized by Techkriti IIT Kanpur and Azure Skynet.\nPERSONALLITY TRAITS:\n• Quick learning ability\n• hard working\n\nEDUCATION\n\nPG-DAC\n\nCDAC ACTS\n\n2017\n\nBachelor of Engg in Information Technology\n\nShivaji University Kolhapur -  Kolhapur, Maharashtra\n\n2016\n\nSKILLS\n\nDatabase (Less than 1 year), HTML (Less than 1 year), Linux. (Less than 1 year), MICROSOFT\nACCESS (Less than 1 year), MICROSOFT WINDOWS (Less than 1 year)\n\nADDITIONAL INFORMATION\n\nTECHNICAL SKILLS:\n\n• 

In [6]:
# loading the dataset
df = pd.read_json('./data.json', lines=True)
df.head()

Unnamed: 0,content,annotation,extras,metadata
0,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta...",,"{'first_done_at': 1527844872000, 'last_updated..."
1,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80...",,"{'first_done_at': 1527845028000, 'last_updated..."
2,Anvitha Rao\nAutomation developer\n\n- Email m...,"[{'label': ['Links'], 'points': [{'start': 288...",,"{'first_done_at': 1527744637000, 'last_updated..."
3,arjun ks\nSenior Program coordinator - oracle ...,"[{'label': ['Skills'], 'points': [{'start': 50...",,"{'first_done_at': 1527834843000, 'last_updated..."
4,"Arun Elumalai\nQA Tester\n\nChennai, Tamil Nad...","[{'label': ['Skills'], 'points': [{'start': 19...",,"{'first_done_at': 1527847268000, 'last_updated..."


In [7]:
# Since, 'extras' column contains no information we can drop the column
df = df.drop(['extras'], axis=1)
df.head()

Unnamed: 0,content,annotation,metadata
0,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta...","{'first_done_at': 1527844872000, 'last_updated..."
1,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80...","{'first_done_at': 1527845028000, 'last_updated..."
2,Anvitha Rao\nAutomation developer\n\n- Email m...,"[{'label': ['Links'], 'points': [{'start': 288...","{'first_done_at': 1527744637000, 'last_updated..."
3,arjun ks\nSenior Program coordinator - oracle ...,"[{'label': ['Skills'], 'points': [{'start': 50...","{'first_done_at': 1527834843000, 'last_updated..."
4,"Arun Elumalai\nQA Tester\n\nChennai, Tamil Nad...","[{'label': ['Skills'], 'points': [{'start': 19...","{'first_done_at': 1527847268000, 'last_updated..."


In [8]:
# Since, 'metadata' column contains no information we can drop the column
df = df.drop(['metadata'], axis=1)
df.head()

Unnamed: 0,content,annotation
0,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta..."
1,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80..."
2,Anvitha Rao\nAutomation developer\n\n- Email m...,"[{'label': ['Links'], 'points': [{'start': 288..."
3,arjun ks\nSenior Program coordinator - oracle ...,"[{'label': ['Skills'], 'points': [{'start': 50..."
4,"Arun Elumalai\nQA Tester\n\nChennai, Tamil Nad...","[{'label': ['Skills'], 'points': [{'start': 19..."


In [9]:
def mergeIntervals(intervals):
    sorted_by_lower_bound = sorted(intervals, key=lambda tup: tup[0])
    merged = []

    for higher in sorted_by_lower_bound:
        if not merged:
            merged.append(higher)
        else:
            lower = merged[-1]
            if higher[0] <= lower[1]:
                if lower[2] is higher[2]:
                    upper_bound = max(lower[1], higher[1])
                    merged[-1] = (lower[0], upper_bound, lower[2])
                else:
                    if lower[1] > higher[1]:
                        merged[-1] = lower
                    else:
                        merged[-1] = (lower[0], higher[1], higher[2])
            else:
                merged.append(higher)

    return merged

In [10]:
#intervals = [(1, 3), (2, 6), (8, 10), (15, 18), (17, 20)]

# After merging intervals, the expected result should be:
# [(1, 6), (8, 10), (15, 20)]

notre data est un dictionnaire
qui contient content et annotation
Annotations est une liste qui contient des dictionnaire (chaque dictionnaire est construit par label et points)
points est une liste (je sais pas pourquoi car elle contient 1 seul element) de dictionnaire (start , end , text)

In [11]:
# From 'annotation' column, we are extracting the starting index, ending index, entity label
# So that we can convert the content in BILOU format
#he4i l5edma 3ala ligne bark

def get_entities(df):
    
    entities = []
    
    for i in range(len(df)):
        #nboucliw 3al data lkoll
        entity = []
        if df['annotation'][i] is not None:  # Check if annotation is not None
            for annot in df['annotation'][i]:
                #he4i l5edma 3ala ligne bark (df['annotation'][i] he4a dictionnaire)
                try:
                    ent = entity_dict[annot['label'][0]]
                    start = annot['points'][0]['start']
                    end = annot['points'][0]['end'] + 1
                    entity.append((start, end, ent))
                except:
                    pass
        
            entity = mergeIntervals(entity)
            entities.append(entity)
        else :
            entities.append([])
        
    return entities

In [12]:
# Adding a new column 'entities'
df['entities'] = get_entities(df)
df.head()

Unnamed: 0,content,annotation,entities
0,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta...","[(0, 14, NAME), (62, 68, LOC), (104, 148, EMAI..."
1,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80...","[(0, 12, NAME), (13, 51, DESIG), (54, 60, COMP..."
2,Anvitha Rao\nAutomation developer\n\n- Email m...,"[{'label': ['Links'], 'points': [{'start': 288...","[(0, 11, NAME), (12, 32, DESIG), (56, 97, EMAI..."
3,arjun ks\nSenior Program coordinator - oracle ...,"[{'label': ['Skills'], 'points': [{'start': 50...","[(0, 8, NAME), (9, 35, DESIG), (38, 58, COMPAN..."
4,"Arun Elumalai\nQA Tester\n\nChennai, Tamil Nad...","[{'label': ['Skills'], 'points': [{'start': 19...","[(0, 13, NAME), (14, 24, DESIG), (25, 32, LOC)..."


In [13]:
def get_train_data(df):
    tags = []
    sentences = []

    for i in range(len(df)):
        text = df['content'][i]
        entities = df['entities'][i]
    
        doc = nlp(text)
        #B- (Beginning): The first token of a multi-token entity.
        #I- (Inside): Any token inside a multi-token entity other than the first and last tokens.
        #L- (Last): The last token of a multi-token entity.
        #U- (Unit): A single-token entity.
        #O: Tokens that are not part of any entity.
        tag = offsets_to_biluo_tags(doc, entities)
        tmp = pd.DataFrame([list(doc), tag]).T
        loc = []
        for i in range(len(tmp)):
            if tmp[0][i].text == '.' and tmp[1][i] == 'O':
                loc.append(i)
        loc.append(len(doc))
        #loc fih les positions mta3 el endig mta3 koll jomla
    
        last = 0
        data = []
        
        for pos in loc:
            data.append([list(doc)[last:pos], tag[last:pos]])
            last = pos
        #list(doc)[last:pos] retrieves the tokens from the document doc starting from the index last up to (but not including) the index pos, creating a segment of text.
        #tag[last:pos] retrieves the BILOU tags corresponding to the tokens in the segment of text.
        
        for d in data:
            tag = ['O' if t == '-' else t for t in d[1]]
            if len(set(tag)) > 1:
                sentences.append(d[0])
                tags.append(tag)
    
    return sentences, tags

In [14]:
#sentences hia liste mta3 sentence metkawwna men (tokens) and each token has correspondig tag
#koll manel9aw point "." na3mlou sentence jdida
sentences, tags = get_train_data(df)
len(sentences), len(tags)

Active member of IIIT Committee in ..." with entities "[(0, 14, 'NAME'), (62, 68, 'LOC'), (104, 148, 'EMA...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Operational Analyst (SQL DBA) Enginee..." with entities "[(0, 12, 'NAME'), (13, 51, 'DESIG'), (54, 60, 'COM...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Automation developer

- Email me on In..." with entities "[(0, 11, 'NAME'), (12, 32, 'DESIG'), (56, 97, 'EMA...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Senior Program coordinator - oracle India..." with entities "[(0, 8, 'NAME'), (9, 35, 'DESIG'), (38, 58, 'COMPA...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to c

(3108, 3108)

In [15]:

# Print first few sentences and their corresponding tags
for sent, tag in zip(sentences[:2], tags[:2]):
    print("Sentence:", sent)
    print("Tags:", tag)
    print()


Sentence: [Afreen, Jamadar, 
, Active, member, of, IIIT, Committee, in, Third, year, 

, Sangli, ,, Maharashtra, -, Email, me, on, Indeed, :, indeed.com/r/Afreen-Jamadar/8baf379b705e37c6, 

, I, wish, to, use, my, knowledge, ,, skills, and, conceptual, understanding, to, create, excellent, team, 
, environments, and, work, consistently, achieving, organization, objectives, believes, in, taking, initiative, 
, and, work, to, excellence, in, my, work]
Tags: ['B-NAME', 'L-NAME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'U-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'U-EMAIL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Sentence: [., 

, WORK, EXPERIENCE, 

, Active, member, of, IIIT, Committee, in, Third, year, 

, Cisco, Networking, -,  , Kanpur, ,, Uttar, Pradesh, 

, organized, by, Techkriti, IIT, Kanpur, and, Azure, Skynet]
Tags: ['O', 'O'

In [16]:
#bech nzidou 3tags 
#'X': This tag often represents tokens that are not part of any named entity or class. In other words, it's a placeholder tag for tokens that don't fit into any predefined category.
#[CLS]: This token is a special symbol added to the beginning of input sequences.
#[SEP]: This token is used to separate segments or sentences in the input sequence. 
tag_vals = set(['X', '[CLS]', '[SEP]'])
for i in range(len(tags)):
    tag_vals = tag_vals.union(tags[i])
tag_vals

{'B-CLG',
 'B-COMPANY',
 'B-DEG',
 'B-DESIG',
 'B-EMAIL',
 'B-GRADYEAR',
 'B-LOC',
 'B-NAME',
 'B-SKILLS',
 'B-YOE',
 'I-CLG',
 'I-COMPANY',
 'I-DEG',
 'I-DESIG',
 'I-EMAIL',
 'I-GRADYEAR',
 'I-LOC',
 'I-NAME',
 'I-SKILLS',
 'I-YOE',
 'L-CLG',
 'L-COMPANY',
 'L-DEG',
 'L-DESIG',
 'L-EMAIL',
 'L-GRADYEAR',
 'L-LOC',
 'L-NAME',
 'L-SKILLS',
 'L-YOE',
 'O',
 'U-CLG',
 'U-COMPANY',
 'U-DEG',
 'U-DESIG',
 'U-EMAIL',
 'U-GRADYEAR',
 'U-LOC',
 'U-NAME',
 'U-SKILLS',
 'U-YOE',
 'X',
 '[CLS]',
 '[SEP]'}

In [17]:
tag2idx = {t: i for i, t in enumerate(tag_vals)}
tag2idx

{'B-CLG': 0,
 'X': 1,
 'L-NAME': 2,
 'B-COMPANY': 3,
 'L-CLG': 4,
 'U-SKILLS': 5,
 'B-DEG': 6,
 'L-EMAIL': 7,
 'I-EMAIL': 8,
 'B-YOE': 9,
 'L-SKILLS': 10,
 'U-CLG': 11,
 'I-LOC': 12,
 'I-NAME': 13,
 'I-SKILLS': 14,
 'U-COMPANY': 15,
 'B-SKILLS': 16,
 'I-YOE': 17,
 '[CLS]': 18,
 'U-GRADYEAR': 19,
 'U-YOE': 20,
 'L-COMPANY': 21,
 'B-DESIG': 22,
 'U-EMAIL': 23,
 'L-GRADYEAR': 24,
 'O': 25,
 'I-DEG': 26,
 'U-NAME': 27,
 'B-EMAIL': 28,
 'B-NAME': 29,
 'L-DESIG': 30,
 'I-GRADYEAR': 31,
 'I-DESIG': 32,
 'L-DEG': 33,
 'B-LOC': 34,
 'U-DEG': 35,
 'L-YOE': 36,
 'U-DESIG': 37,
 'B-GRADYEAR': 38,
 '[SEP]': 39,
 'I-COMPANY': 40,
 'L-LOC': 41,
 'I-CLG': 42,
 'U-LOC': 43}

In [18]:
idx2tag = {tag2idx[key] : key for key in tag2idx.keys()}
idx2tag

{0: 'B-CLG',
 1: 'X',
 2: 'L-NAME',
 3: 'B-COMPANY',
 4: 'L-CLG',
 5: 'U-SKILLS',
 6: 'B-DEG',
 7: 'L-EMAIL',
 8: 'I-EMAIL',
 9: 'B-YOE',
 10: 'L-SKILLS',
 11: 'U-CLG',
 12: 'I-LOC',
 13: 'I-NAME',
 14: 'I-SKILLS',
 15: 'U-COMPANY',
 16: 'B-SKILLS',
 17: 'I-YOE',
 18: '[CLS]',
 19: 'U-GRADYEAR',
 20: 'U-YOE',
 21: 'L-COMPANY',
 22: 'B-DESIG',
 23: 'U-EMAIL',
 24: 'L-GRADYEAR',
 25: 'O',
 26: 'I-DEG',
 27: 'U-NAME',
 28: 'B-EMAIL',
 29: 'B-NAME',
 30: 'L-DESIG',
 31: 'I-GRADYEAR',
 32: 'I-DESIG',
 33: 'L-DEG',
 34: 'B-LOC',
 35: 'U-DEG',
 36: 'L-YOE',
 37: 'U-DESIG',
 38: 'B-GRADYEAR',
 39: '[SEP]',
 40: 'I-COMPANY',
 41: 'L-LOC',
 42: 'I-CLG',
 43: 'U-LOC'}

In [19]:
#This code snippet sets up PyTorch to utilize GPU (CUDA) if it's available, 
#and it also determines the number of available GPUs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [20]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [21]:
#bech na3mlou tokenization lkoll word
def get_tokenized_train_data(sentences, tags):

    tokenized_texts = []
    word_piece_labels = []

    #word_list hia each sentence (EX: Sentence: [Afreen, Jamadar, , Active, member, of, IIIT, Committee, in, Third, year] )
    for word_list, label in zip(sentences, tags):
    
        # Add [CLS] at the front
        temp_lable = ['[CLS]']
        temp_token = ['[CLS]']
    
        for word, lab in zip(word_list, label):
            token_list = tokenizer.tokenize(word.text) #koll kelma bech nzidou na9smou lsubwords (tokenization)
            for m, token in enumerate(token_list):
                temp_token.append(token)
                if m == 0:
                    temp_lable.append(lab)   #i4a ken awwel subword n7ottolha el tag mte3ha
                else:
                    temp_lable.append('X')   #b9iet les subwords na3tiwhom tag X
                
        # Add [SEP] at the end
        temp_lable.append('[SEP]')
        temp_token.append('[SEP]')
    
        tokenized_texts.append(temp_token)
        word_piece_labels.append(temp_lable)
    
    return tokenized_texts, word_piece_labels

In [22]:
#tokenized_texts hia liste de liste koll element fiha liste fih les tokens mta3 koll jomla 
tokenized_texts, word_piece_labels = get_tokenized_train_data(sentences, tags)
len(tokenized_texts[0]),len(tokenized_texts)

(87, 3108)

In [23]:
print(tokenized_texts[0])
print(word_piece_labels[0])

['[CLS]', 'A', '##free', '##n', 'Jam', '##ada', '##r', 'Active', 'member', 'of', 'III', '##T', 'Committee', 'in', 'Third', 'year', 'Sang', '##li', ',', 'Maharashtra', '-', 'Em', '##ail', 'me', 'on', 'Indeed', ':', 'indeed', '.', 'com', '/', 'r', '/', 'A', '##free', '##n', '-', 'Jam', '##ada', '##r', '/', '8', '##ba', '##f', '##37', '##9', '##b', '##70', '##5', '##e', '##37', '##c', '##6', 'I', 'wish', 'to', 'use', 'my', 'knowledge', ',', 'skills', 'and', 'conceptual', 'understanding', 'to', 'create', 'excellent', 'team', 'environments', 'and', 'work', 'consistently', 'achieving', 'organization', 'objectives', 'believes', 'in', 'taking', 'initiative', 'and', 'work', 'to', 'excellence', 'in', 'my', 'work', '[SEP]']
['[CLS]', 'B-NAME', 'X', 'X', 'L-NAME', 'X', 'X', 'O', 'O', 'O', 'O', 'X', 'O', 'O', 'O', 'O', 'U-LOC', 'X', 'O', 'O', 'O', 'O', 'X', 'O', 'O', 'O', 'O', 'U-EMAIL', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X

In [24]:
#matrice mta3 tokens_ids
MAX_LEN = 512 #MAX_LEN = 512: This variable specifies the maximum length of the input sequences. Any sequences longer than this length will be truncated, 
                              #and any sequences shorter than this length will be padded.
bs = 8  #batch size
#Each token in the txt is mapped to its unique integer ID.
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
print(len(input_ids[0]))
print(input_ids[0])

Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (679 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (977 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (567 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (1054 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (1231 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length

512
[  101   138 26743  1179 13263  7971  1197 17244  1420  1104  2684  1942
  2341  1107  4180  1214 22409  2646   117 12626   118 18653 11922  1143
  1113 10364   131  5750   119  3254   120   187   120   138 26743  1179
   118 13263  7971  1197   120   129  2822  2087 26303  1580  1830 20829
  1571  1162 26303  1665  1545   146  3683  1106  1329  1139  3044   117
  4196  1105 20046  4287  1106  2561  6548  1264 10152  1105  1250 10887
 11190  2369 11350  6616  1107  1781  7191  1105  1250  1106 14509  1107
  1139  1250   102     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     

In [25]:
#matrice mta3 tags mta3 koll sentence (hna les id a7na 3tinahom bel fct tag2idx)
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in word_piece_labels], maxlen=MAX_LEN, value=tag2idx["O"], 
                     padding="post", dtype="long", truncating="post")
print(len(tags))
print(len(tags[0]))
print(tags[0])

3108
512
[18 29  1  1  2  1  1 25 25 25 25  1 25 25 25 25 43  1 25 25 25 25  1 25
 25 25 25 23  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25
 25 25 25 25 25 25 25 25 25 25 25 25 25 25 39 25 25 25 25 25 25 25 25 25
 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25
 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25
 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25
 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25
 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25
 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25
 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25
 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25
 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25
 25 25 25 25 25 25 25 25 25 25 25 25 25 25

In [26]:
#fel matrice mta3 token_ids i4a ken el valeur >0 te5ou 1
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [27]:
#Training w validation lel : input , tags , masks
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = train_test_split(input_ids, tags, attention_masks, random_state=2020,test_size=0.3)

In [28]:
#By converting the input data, labels, and masks into PyTorch tensors, we ensure that they are in the appropriate format for consumption by PyTorch-based deep learning models, 
#enabling efficient training, validation, and inference processes.
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [29]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags) #TensorDataset: TensorDataset is a PyTorch utility class that allows you to create a dataset from a set of tensors. In this case, it's used to combine the input data, attention masks, and tags or labels into a single dataset.
train_sampler = RandomSampler(train_data) #shuffles the data before sampling, which helps in introducing randomness and avoiding bias during training.
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs) #DataLoader: DataLoader is another PyTorch utility class that provides an iterable over a dataset. It supports various features like batching, shuffling, and parallel data loading. By using DataLoader, we can efficiently iterate over the dataset in batches during training and validation.

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data) #samples data sequentially without shuffling, which is typically used for validation or testing datasets to ensure reproducible results.
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [30]:
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(tag2idx))

In [31]:
#model.cuda(); #bech twalli te5dem 3al gpu

AssertionError: Torch not compiled with CUDA enabled

In [32]:
#L'objectif du Fine-Tuning est alors d'optimiser les performances d'un modèle existant en le ré-entraînant sur des données spécifiques. 
#En ajustant les poids et les paramètres du modèle, cette technique permet à un système d'IA de s'adapter à des tâches plus spécifiques.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())  #This line retrieves all the parameters (weights and biases) of the model along with their names.
    no_decay = ['bias', 'gamma', 'beta']#This is a list of parameter names for which weight decay will not be applied.
    
    #Parameters specified in no_decay list will have a weight decay rate of 0.0, while the rest will have a weight decay rate of 0.01. 
    #This separation is done to apply different weight decay rates to different sets of parameters.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [33]:
epochs = 10
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Ensure labels are of type Long
        b_labels = b_labels.long()
        # forward pass
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # Extract loss from outputs if using a loss function that returns a tuple
        if isinstance(outputs, tuple):
            loss = outputs[0]
        else:
            loss = outputs
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))


Epoch:   0%|          | 0/10 [36:13<?, ?it/s]


KeyboardInterrupt: 