In [1]:
# adapted from Huggingface Question Answering code

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd
import matplotlib.pyplot as plt

import numpy as np; np.random.seed(0)
import seaborn as sns; sns.set_theme()
import os.path


dir0 = '/content/drive/MyDrive/ML1/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!cd /content/drive/MyDrive/ML1/ && ls

datasheets   datasheets2  master.csv  text_from_pdf.csv  train.csv
datasheets1  __MACOSX	  README.MD   train2.csv


In [4]:
df = pd.read_csv(dir0 + 'text_from_pdf.csv')
df[['Mfr Part Number', 'text']]


Unnamed: 0,Mfr Part Number,text
0,"=""1523502""",IP67\n\nGeneral\nNote\n\n IP54. Influences ari
1,RT06102PNH,IPTION\nRELEASED DRAW IPTION\nPART NUMBER\nM ...
2,EXG.1B.305.HLN,IP rating\n\n50\n\nAWG w
3,S21M0C-P10MCC0-65CS,
4,C10WBM-P09XMM0-0000,"IP6K8 / IP6K9K, 9 CO IP6K9K, 9 CONTACTS, IP6..."
...,...,...
2219,S11MC7-P10MCC0-3970,
2220,"=""1559932""",IP67 (When plugged i IP65 (When plugged i IP5...
2221,G11M07-P02LPH0-0040,
2222,RKC 40/16 single pk of 1,IP 67 / NEMA 6P\n-40°


# Question answering (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [5]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.


You will need to setup git, adapt your email and name in the following cell.

In [6]:
!git config --global user.email "jcai@attentivemobile.com"
!git config --global user.name "jcai1"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [7]:
from datasets import Dataset

def to_df(dataset):
  dataset.set_format("pandas")
  df = dataset[:].copy()
  return Dataset.from_pandas(df), df # without change the dataset format


def to_ds(df):
  return Dataset.from_pandas(df)

In [8]:

dir0 = '/content/drive/MyDrive/ML1/'
df = pd.read_csv(dir0 + 'text_from_pdf.csv')

df = df[df['text'].notna()] # need to check if there is text inside images

df['text'] = df['text'].apply(lambda s: s.replace('\n', ' '))

context_col = 'context'
question_col = 'question'
answer_col = 'answers'
answer_text_col = 'answer_text'
answer_start_idx_col = 'answer_start_idx'

train = df[['Mfr Part Number','text', 'IP Rating']].copy()
train.rename(columns={'Mfr Part Number': 'id', 'text': context_col, 'IP Rating': answer_text_col}, inplace=True)
train[question_col] = 'what is the IP code'
train

Unnamed: 0,id,context,answer_text,question
0,"=""1523502""",IP67 General Note IP54. Influences ari,IP67,what is the IP code
1,RT06102PNH,IPTION RELEASED DRAW IPTION PART NUMBER M IP-...,IP67,what is the IP code
2,EXG.1B.305.HLN,IP rating 50 AWG w,IP50,what is the IP code
4,C10WBM-P09XMM0-0000,"IP6K8 / IP6K9K, 9 CO IP6K9K, 9 CONTACTS, IP6...","IP6K8, IP6K9K",what is the IP code
5,RTS716N09P03,IPTION RELEASED DRAW IPTION PART NUMBER M IP-...,IP67,what is the IP code
...,...,...,...,...
2215,"=""1433155""",IP67 protection for IP67 Ambient temper,IP67,what is the IP code
2216,RT06104SNHEC,IPTION RELEASED DRAW IPTION PART NUMBER Q,IP67,what is the IP code
2218,FPG.0B.305.CLAD52,IP rating 50 AWG w,IP50,what is the IP code
2220,"=""1559932""",IP67 (When plugged i IP65 (When plugged i IP5...,"IP65, IP67",what is the IP code


In [9]:
'''
multiple labels
divide the text info multiple text, assign a part number which is used by me
keep it simple, simple item prediction first
 {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}
'''
def get_index(row):
  # search1 = row[context_col].find(row[answer_text_col])
  # search2 = row[context_col].find(row[answer_text_col][2:])
  # if search1 == -1 and search2 != -1:
  # if search2 == -1:
  #   return True
  # return False
  return row[context_col].find(row[answer_text_col][2:]) # ignore the first 2 letters in 'IP67'

def format_answer(row):
  start_idx = row[context_col].find(row[answer_text_col][2:])
  if start_idx == -1:
    return None
  return {'text': [row[answer_text_col][2:]], 'answer_start': [start_idx]}

# remove those with multiple answers (about 10%),
#  since the model might predict only one of them
train['answer_len'] = train[answer_text_col].apply(lambda s: len(s.split(',')))
train1 = train[train['answer_len']==1].copy()
train.shape, train1.shape
train1[answer_col] = train1.apply (lambda row: format_answer(row), axis=1)

#
train2 = train1[train1['answers'].notna()]
fname = dir0 + 'train2.csv'
train2[[context_col, question_col, answer_col]][:10].to_csv(fname, index=False)
train2.shape
train2['start_idx'] = train2['answers'].apply(lambda d: d['answer_start'])
train2[['start_idx']].shape
train2 = train2.reset_index()
train3 = train2[['id', 'context', 'question', 'answers']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train2['start_idx'] = train2['answers'].apply(lambda d: d['answer_start'])


In [10]:
from datasets import load_dataset, DatasetDict

dataset = DatasetDict()

dataset['train'] = Dataset.from_pandas(train3)
dataset


DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 1179
    })
})

In [11]:
# reading from a file, the column will be str instead of a dictionary.
# dataset = load_dataset("csv", data_files=fname)
# dataset


In [12]:
def split_into_train_validation(dataset):
  train_validation_dataset = dataset["train"].train_test_split(train_size=0.8, seed=42)
  train_validation_dataset["validation"] = train_validation_dataset.pop("test")
  return train_validation_dataset

def split_into_train_validation_test(dataset):
  train_test_dataset = dataset["train"].train_test_split(train_size=0.8, seed=42)
  train_validation_test_dataset = train_test_dataset["train"].train_test_split(train_size=0.8, seed=42)

  # Rename the default "test" split to "validation"
  train_validation_test_dataset["validation"] = train_validation_test_dataset.pop("test")
  # Add the "test" set to our `DatasetDict`
  train_validation_test_dataset["test"] = train_test_dataset["test"]
  return train_validation_test_dataset


In [13]:

raw_datasets = split_into_train_validation(dataset)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 943
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 236
    })
})

In [14]:
# from datasets import load_dataset, DatasetDict

# raw_datasets = load_dataset("squad")
# raw_datasets

In [15]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased" # start from this model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [16]:
max_length = 384
stride = 128

def preprocess_training_examples(examples):
    # print(examples)
    # for exa in examples['context']:
    #   print(f'==============context={exa}')
    # for exa in examples['question']:
    #   print(f'question={exa}')
    # for exa in examples['answers']:
    #   print(f'answers={exa}')
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second", # truncate only context, not question
        stride=stride,
        return_overflowing_tokens=True, # indicate which data example it is
        return_offsets_mapping=True, # indicate this token is for which word
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping") # mapping of word and token
    sample_map = inputs.pop("overflow_to_sample_mapping") # don't need it for model
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        #print(f'i={i}, offset={offset}')
        sample_idx = sample_map[i] # which data example
        #print(f'sample_idx={sample_idx}')
        answer = answers[sample_idx]
        #print(f'answer={answer}')

        #answer['text'] = 'IP' + answer.pop('text')

        #print(f'answer={answer}, {type(answer)}')
        #print(f'answer["answer_start"]={answer["answer_start"]}')

        #print(f'start_char will be set to {answer["answer_start"][0]}')
        #break
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i) # question or context

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [17]:
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
len(raw_datasets["train"]), len(train_dataset)

Map:   0%|          | 0/943 [00:00<?, ? examples/s]

(943, 971)

In [18]:
raw_datasets["train"], df = to_df(raw_datasets["train"])
df

Unnamed: 0,id,context,question,answers
0,AHDBM04-24-91PN-059,IPTION DATE BY AP IP68(IM OF WATER FOR IP6...,what is the IP code,"{'answer_start': [45], 'text': ['69K']}"
1,MB6CG-S3,IPTION RELEASE NEW D IP ø26.3 NOTES: UN IP6...,what is the IP code,"{'answer_start': [45], 'text': ['67']}"
2,FGG.2B.316.CLAD99,IP rating 50 AWG w,what is the IP code,"{'answer_start': [12], 'text': ['50']}"
3,"=""1202480086""",IP DEGREE OF PROTECT IP 67 -25°C to + 90° IPT...,what is the IP code,"{'answer_start': [25], 'text': ['67']}"
4,T 3635 001,IP degree Insertion IP 40 Ag 2 (Pin 1+3,what is the IP code,"{'answer_start': [25], 'text': ['40']}"
...,...,...,...,...
938,C091 31G003 200 5 U,IP degree Insertion IP 68 (in mated cond,what is the IP code,"{'answer_start': [25], 'text': ['68']}"
939,T4144035081-000,IPTION A 5 4 A1 IP67 -40~85 $ C II IP6...,what is the IP code,"{'answer_start': [24], 'text': ['67']}"
940,FGG.0B.722.DN,IP rating 50 AWG w,what is the IP code,"{'answer_start': [12], 'text': ['50']}"
941,FGG.0B.554.ZZC,IP rating 50 AWG w,what is the IP code,"{'answer_start': [12], 'text': ['50']}"


In [19]:
train_dataset, train_df = to_df(train_dataset)
train_df

Unnamed: 0,input_ids,token_type_ids,attention_mask,start_positions,end_positions
0,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",29,31
1,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",28,29
2,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",9,9
3,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",17,17
4,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",13,13
...,...,...,...,...,...
966,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",13,13
967,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",15,16
968,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",9,9
969,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",9,9


In [20]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [21]:
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)
len(raw_datasets["validation"]), len(validation_dataset)

Map:   0%|          | 0/236 [00:00<?, ? examples/s]

(236, 240)

In [22]:
raw_datasets["validation"], raw_val_df = to_df(raw_datasets["validation"])
raw_val_df

Unnamed: 0,id,context,question,answers
0,S10LAC-P04MFG0-3200,IP 50 2 2.5 2 Is Protection class: im,what is the IP code,"{'answer_start': [4], 'text': ['50']}"
1,"=""643712100004""",IPTION Würth Elektr IP68 Electrical Pro IPT...,what is the IP code,"{'answer_start': [24], 'text': ['68']}"
2,K20LAC-P03LJG0-520S,IP 50 2 0.9 mm 2 0 Protection class: im,what is the IP code,"{'answer_start': [4], 'text': ['50']}"
3,RTS6BS12N2P03,IPTION AND VOLTAGE IP-CLASS： IP67 AND I IP6...,what is the IP code,"{'answer_start': [34], 'text': ['67']}"
4,T 3260 001,IP degree Insertion IP 40 Ag 2 (Pin 1+3,what is the IP code,"{'answer_start': [25], 'text': ['40']}"
...,...,...,...,...
231,MB7CKN0600-S3-KIT,IPTION DATE BY AP IP67 ( MATED CONDITI IPT...,what is the IP code,"{'answer_start': [24], 'text': ['67']}"
232,RT072028SNH,IPTION DATE BY AP IP--CLASS：IP67. 2.7 IP6...,what is the IP code,"{'answer_start': [34], 'text': ['67']}"
233,RTS6BS16N9PHEC03,IPTION DATE BY AP IP-CLASS：IP67 AND IP IP6...,what is the IP code,"{'answer_start': [33], 'text': ['67']}"
234,MB1CKN0600,IPTION DATE BY AP IP67 ( MATED CONDITI IPT...,what is the IP code,"{'answer_start': [24], 'text': ['67']}"


In [23]:
validation_dataset, val_df = to_df(validation_dataset)
val_df

Unnamed: 0,input_ids,token_type_ids,attention_mask,offset_mapping,example_id
0,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[None, None, None, None, None, None, None, [1,...",S10LAC-P04MFG0-3200
1,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[None, None, None, None, None, None, None, [1,...","=""643712100004"""
2,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[None, None, None, None, None, None, None, [1,...",K20LAC-P03LJG0-520S
3,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[None, None, None, None, None, None, None, [1,...",RTS6BS12N2P03
4,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[None, None, None, None, None, None, None, [1,...",T 3260 001
...,...,...,...,...,...
235,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[None, None, None, None, None, None, None, [1,...",MB7CKN0600-S3-KIT
236,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[None, None, None, None, None, None, None, [1,...",RT072028SNH
237,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[None, None, None, None, None, None, None, [1,...",RTS6BS16N9PHEC03
238,"[101, 1184, 1110, 1103, 14274, 3463, 102, 1427...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[None, None, None, None, None, None, None, [1,...",MB1CKN0600


In [24]:
import torch
from transformers import AutoModelForQuestionAnswering
import collections
import numpy as np
import evaluate

n_best = 20
max_answer_length = 30
predicted_answers = []


In [25]:
metric = evaluate.load("squad")

In [26]:
from tqdm.auto import tqdm


def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [27]:
from huggingface_hub import notebook_login

notebook_login()
#

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [28]:
from transformers import TrainingArguments


model = AutoModelForQuestionAnswering.from_pretrained("bert-base-cased")

args = TrainingArguments(
    output_dir="ip_rating",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=True,
)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# try to reduce training dataset to only 3 examples and see how it impacts performance
train_dataset


Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 971
})

In [30]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=366, training_loss=0.4145439022877177, metrics={'train_runtime': 120.4205, 'train_samples_per_second': 24.19, 'train_steps_per_second': 3.039, 'total_flos': 570868089278976.0, 'train_loss': 0.4145439022877177, 'epoch': 3.0})

In [31]:
validation_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 240
})

In [32]:
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, raw_datasets["validation"])

#{'exact_match': 98.30508474576271, 'f1': 98.30508474576271} # further finetuned
# {'exact_match': 98.30508474576271, 'f1': 98.30508474576271}

# {'exact_match': 99.15254237288136, 'f1': 98.72881355932203} # bert
# {'exact_match': 98.72881355932203, 'f1': 98.30508474576271}

  0%|          | 0/236 [00:00<?, ?it/s]

{'exact_match': 97.88135593220339, 'f1': 97.88135593220339}

In [33]:
trainer.push_to_hub(commit_message="ip_rating")

'https://huggingface.co/jennyc/ip_rating/tree/main/'

In [34]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

train_dataset.set_format("torch")
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    validation_set, collate_fn=default_data_collator, batch_size=8
)

In [35]:
!pip install gradio



In [36]:
!sudo apt-get update
!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install pdftotext

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 338 kB in 3s (113 kB/s)
Reading package lists... Done
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9u

In [37]:
import pdftotext
import re


def find_relevant_string(text, patterns=['IP', 'Ingress', 'Protection', 'International'], length=50):
  res = ''
  for pattern in patterns:
    idx = text.find(pattern)
    indexes = [m.start() for m in re.finditer(pattern, text)]
    for idx in indexes:
      res += ' ' + text[idx: idx+20]
  return res

def read_through_pdftotext(fname):
  with open(fname, "rb") as f:
      pdf = pdftotext.PDF(f)

  # Read all the text into one string
  text = "  ".join(pdf)
  relevant_text = find_relevant_string(text)
  return relevant_text

In [38]:

#!cd /content/drive/MyDrive/ML1/datasheets && ls

In [39]:
from transformers import pipeline

def infer_from_context(context):
  model_checkpoint = "jennyc/ip_rating"
  question_answerer = pipeline("question-answering", model=model_checkpoint)

  question = "What is the IP code?"
  res = question_answerer(question=question, context=context)
  return res['answer']

def pdf_to_ip_rating(fname):
  context = read_through_pdftotext(fname)
  return infer_from_context(context)

In [43]:
# test inference code
pdf_fname = '/content/drive/MyDrive/ML1/datasheets/utgx0005ac-295338215.pdf'
pdf_to_ip_rating(pdf_fname)

'IP68'

In [41]:
def ip_rating(input_file_obj):
  fname = input_file_obj.name
  return pdf_to_ip_rating(fname)

In [42]:
import gradio as gr

demo = gr.Interface(ip_rating, "file", "text")
demo.launch(debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

Keyboard interruption in main thread... closing server.


