In [1]:
import pandas as pd
import json


# Dental dataset
#from datasets import load_dataset
#ds = load_dataset("Lines/Open-Domain-Oral-Disease-QA-Dataset")

# Workbook to read; only its "review" sheet is used.
path = "70-fixed-batch-inference.xlsx"

# Read the sheet and report how many rows it holds before any filtering.
df = pd.read_excel(path, sheet_name="review")
print('Original dataset length', len(df))

# Drop every row whose reference output contains the PARSE_FAILED marker.
# Cells for which str.find yields NaN also compare "!= -1", so they are
# dropped as well.
failed_mask = df['reference_output_69'].str.find('PARSE_FAILED') != -1
failed_rows = df.index[failed_mask.to_numpy()]
df = df.drop(failed_rows)
print('Filtered dataset length', len(df))

# Decode the reference JSON of each remaining row and copy the raw text
# into the 'input' column.
df['output'] = df['reference_output_69'].apply(json.loads)
df['input'] = df['raw_input'].copy()
print("Max input length", df['input'].str.len().max())

# For each top-level key of the decoded outputs, count the rows in which
# that key carries a non-empty value.
c = {}
for parsed in df['output']:
    for name, content in parsed.items():
        c[name] = c.get(name, 0) + (1 if len(content) > 0 else 0)

# Category names in alphabetical order.
categories = sorted(c)

# Validation of the dataset: every reference output must decode to a JSON
# object (dict). Problems are reported and the loop keeps going.
for index, row in df.iterrows():
    try:
        x = json.loads(row['reference_output_69'])
    except (ValueError, TypeError):
        # ValueError includes json.JSONDecodeError; TypeError is raised when
        # the cell is not a str/bytes value (e.g. NaN).
        print('Decoding JSON has failed')
        continue
    if not isinstance(x, dict):
        # A non-dict value (list, str, number, ...) cannot be indexed by key,
        # so report its type instead of attempting x['answer'].
        print('Unexpected JSON type', type(x).__name__, 'at row', index)

# print(df['output'].loc[df.index[2]])
# Report, per category, the number of rows with a non-empty value.
for i in categories:
    print(c[i], '\t', i)

# Multi-hot label per row: 1 when the category has a non-empty value, else 0.
# A category that is absent from a row's output defaults to an empty list and
# is therefore encoded as 0, consistent with how the counts in `c` were built.
df['output_vector'] = df['output'].apply(
    lambda parsed: [int(len(parsed.get(i, [])) > 0) for i in categories]
)
df['output_vector']

Original dataset length 5492
Filtered dataset length 5451
Max input length 2078
2141 	 alignment
395 	 as_previous
1197 	 bite
181 	 crowding
1375 	 dental_features
472 	 finishing
485 	 leveling
272 	 midline
95 	 non_clinical_reason_for_new_order
1174 	 occlusion
1833 	 other_instructions
484 	 overcorrection_aligners
689 	 passive_aligners
1630 	 polite_expressions
55 	 request_for_clin_check
199 	 skip_active_treatment
1272 	 spaces
2648 	 teeth_movements
124 	 tracking
394 	 treatment_length


0       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...
1       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2       [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, ...
3       [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
4       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                              ...                        
5487    [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5488    [1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ...
5489    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...
5490    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5491    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: output_vector, Length: 5451, dtype: object

In [2]:
import plotly.express as px

# Character length of every input text, computed with the vectorized string
# accessor instead of a row-wise apply.
df['input_size'] = df['input'].str.len()

# Distribution of input lengths; title and axis label let the figure stand
# on its own.
fig = px.histogram(
    df,
    x="input_size",
    title="Distribution of input length",
    labels={"input_size": "Input length (characters)"},
)
fig.show()

In [3]:
# Category counts ordered from the most to the least frequent.
d_sorted = dict(sorted(c.items(), key=lambda item: item[1], reverse=True))

# Column-oriented data for plotly: parallel lists of names and counts.
d = {
    'category': list(d_sorted.keys()),
    'values': list(d_sorted.values()),
}

fig = px.histogram(d, x='category', y='values')
fig.show()

In [11]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device count:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print("Device", i, "name:", torch.cuda.get_device_name(i))
print("Device:", device)

# torch.cuda.current_device() raises when CUDA is unavailable, so it is only
# queried on the GPU path; on CPU the device name is simply "cpu".
if device.type == 'cuda':
    gpu_index = torch.cuda.current_device()
    device_name = device.type + ":" + str(gpu_index)
else:
    gpu_index = None
    device_name = device.type
print("Device name:", device_name)

if device.type == 'cuda':
    # Report on the device that was actually selected above.
    gib = 1024**3
    print(torch.cuda.get_device_name(gpu_index))
    print('Total Memory:    ', round(torch.cuda.get_device_properties(gpu_index).total_memory/gib,1), 'GB')
    print('Allocated Memory:', round(torch.cuda.memory_allocated(gpu_index)/gib,1), 'GB')
    print('Cached Memory:   ', round(torch.cuda.memory_reserved(gpu_index)/gib,1), 'GB')


Device count: 1
Device 0 name: NVIDIA GeForce RTX 4070 SUPER
Device: cuda
Device name: cuda:0
NVIDIA GeForce RTX 4070 SUPER
Total Memory:     12.0 GB
Allocated Memory: 0.0 GB
Cached Memory:    0.0 GB


In [18]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Local directories handed to from_pretrained, keyed by a short model label.
folders = {
    "BERT": "bert_multiclass_model_2_last_retrained",
    "ELECTRA": "electra_multiclass_model_2_last_retrained",
    "RoBERTa": "roberta_multiclass_2_last_layers"
}

# Only the first entry of `folders` is loaded; k and v hold its label and
# directory.
k, v = next(iter(folders.items()))
model = AutoModelForSequenceClassification.from_pretrained(v, local_files_only=True, device_map=device_name)
tokenizer = AutoTokenizer.from_pretrained(v, local_files_only=True, device_map=device_name)

In [9]:
from sklearn.model_selection import train_test_split

# Prepare data for training: texts and their label vectors as plain lists.
texts = df['input'].tolist()
labels = df['output_vector'].tolist()

# 20% of the rows are held out for validation; shuffle=False disables
# shuffling, so the split follows the existing row order.
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=0.2,
    shuffle=False,
)

In [13]:
# Write the validation inputs to val_input.csv as a single 'input' column,
# keeping the positional index as the first CSV column.
(
    pd.DataFrame(X_val)
    .rename(columns={0: 'input'})
    .to_csv('val_input.csv', index=True)
)

In [14]:
# Rebind df to a one-column frame holding only the validation inputs.
# NOTE: from here on df no longer refers to the full dataset frame.
df = (
    pd.DataFrame(X_val)
    .rename(columns={0: 'input'})
)

In [15]:
# Show the validation inputs (the rendering below is truncated by pandas).
df

Unnamed: 0,input
0,[NumberingSystem]unknown\n\n[FormInstructionsU...
1,[NumberingSystem]unknown\n\n[FormInstructionsU...
2,[NumberingSystem]unknown\n\n[FormInstructionsU...
3,[NumberingSystem]FDI\n\n[FormInstructionsUpper...
4,[NumberingSystem]unknown\n\n[FormInstructionsU...
...,...
1086,[NumberingSystem]FDI\n\n[FormInstructionsUpper...
1087,[NumberingSystem]FDI\n\n[FormInstructionsUpper...
1088,[NumberingSystem]unknown\n\n[FormInstructionsU...
1089,[NumberingSystem]FDI\n\n[FormInstructionsUpper...


In [18]:
# Load the prompt text from ppp.txt into p_str.
with open('ppp.txt') as prompt_file:
    p_str = prompt_file.read()



In [19]:
# Display the prompt text that was read from ppp.txt.
p_str

'You are an AI assistant that converts natural language text into a structured JSON format.\n\nThis structured JSON format have a name SPICE-L (SPecial Instruction Conversion Language).\nSPICE-L is designed for use with invisalign treatment planning engine to generate orthodontic treatment plans automatically according to doctor instructions.\n\nYou need to transform free form text instructions of a Doctor into json object in SPICE-L format.\n\n# Categories of parameters\nSPICE-L organizes an aspects of instructions into several categories:\n- passive_aligners\n  - add\n  - forbid\n- overcorrection_aligners\n  - movement\n  - space_closure\n  - forbid\n- dental_features\n  - add\n  - forbid_placement\n  - remove_existing\n  - keep_existing\n  - apply_on_stage\n  - remove_from_stage\n- teeth_movements\n  - unmovable_teeth\n  - other\n- spaces\n  - close\n  - other\n- skip_active_treatment\n- polite_expressions\n- non_clinical_reason_for_new_order\n- tracking\n- alignment\n- leveling\n- 

In [43]:
from openai import OpenAI
# Shared OpenAI client used by predict(); constructed without arguments.
# NOTE(review): presumably picks up its credentials from the environment — confirm.
client = OpenAI()

def predict(row):
  """Ask gpt-4o-mini to convert one instruction text into JSON.

  The module-level prompt ``p_str`` is sent as the system message and ``row``
  as the user message, using the module-level ``client``.

  Args:
    row: The text sent as the user message.

  Returns:
    The value decoded from the model's reply after stripping Markdown code
    fences, or the string 'PARSE FAILED' when the reply is missing or is not
    valid JSON. Note that this sentinel is spelled with a space, unlike the
    'PARSE_FAILED' marker used to filter the dataset.

  Raises:
    Whatever ``client.chat.completions.create`` raises; API errors are not
    caught here.
  """
  response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
      {
        "role": "system",
        "content": [
          {
            "type": "text",
            "text": p_str
          }
        ]
      },
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": row
          }
        ]
      }
    ],
    temperature=0,
    max_tokens=750,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    response_format={
      "type": "text"
    }
  )
  try:
    reply = response.choices[0].message.content
    # Remove the Markdown fences the model may wrap around its JSON.
    return json.loads(reply.replace("```json\n", "").replace("```", ""))
  except (ValueError, AttributeError, IndexError):
    # ValueError: invalid JSON (json.JSONDecodeError is a subclass);
    # AttributeError: reply is None; IndexError: no choices returned.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    return 'PARSE FAILED'

In [44]:
# Run the predictor on the first three rows only and store each result in a
# new 'predict' column of an independent frame.
ttt = df.head(3).assign(
    predict=lambda sample: sample['input'].apply(predict)
)

In [45]:
# Inspect the three sample rows together with their parsed predictions.
ttt

Unnamed: 0,input,predict
0,[NumberingSystem]unknown\n\n[FormInstructionsU...,"{'passive_aligners': {}, 'overcorrection_align..."
1,[NumberingSystem]unknown\n\n[FormInstructionsU...,"{'passive_aligners': {}, 'overcorrection_align..."
2,[NumberingSystem]unknown\n\n[FormInstructionsU...,"{'passive_aligners': {'add': [{'jaw': 'upper',..."


In [46]:
def convert_to_vector(d, category_names=None):
    """Encode a parsed output as a 0/1 vector, one entry per category.

    Args:
        d: Parsed model/reference output, normally a dict mapping category
            name to a sized value (dict, list, str, ...).
        category_names: Ordered category names to encode. Defaults to the
            module-level ``categories`` list.

    Returns:
        A list of ints aligned with the category order: 1 when the category
        is present with a non-empty value, 0 when it is empty or missing.
        Anything that is not a dict (for example the 'PARSE FAILED' string
        returned by ``predict``) yields an all-zero vector.
    """
    names = categories if category_names is None else category_names
    if not isinstance(d, dict):
        # A failed parse carries no categories at all.
        return [0] * len(names)
    # The default must be empty: a non-empty default such as [0] would mark
    # every missing category as present.
    return [int(len(d.get(name, ())) > 0) for name in names]

In [None]:
# Turn each parsed prediction into a 0/1 vector with one entry per name in
# `categories` (see convert_to_vector).
ttt['predict_vector'] = ttt['predict'].apply(convert_to_vector)

In [50]:
from tqdm import tqdm
# Register tqdm's pandas integration, which provides `progress_apply`.
tqdm.pandas()

# Convert each parsed prediction into its 0/1 category vector while showing
# a progress bar.
ttt['predict_vector'] = ttt['predict'].progress_apply(convert_to_vector)

100%|██████████| 3/3 [00:00<00:00, 1512.91it/s]


In [51]:
# Inspect the sample rows again, now including the predict_vector column.
ttt

Unnamed: 0,input,predict,predict_vector
0,[NumberingSystem]unknown\n\n[FormInstructionsU...,"{'passive_aligners': {}, 'overcorrection_align...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ..."
1,[NumberingSystem]unknown\n\n[FormInstructionsU...,"{'passive_aligners': {}, 'overcorrection_align...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,[NumberingSystem]unknown\n\n[FormInstructionsU...,"{'passive_aligners': {'add': [{'jaw': 'upper',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ..."
