In [1]:
import transformers
import pandas as pd

from datasets import load_dataset
from app.model import IntentClassifier


# Load the dataset identified by "tuetschek/atis"; a later cell iterates over
# its "test" split, whose rows carry "id", "intent", "text" and "slots" fields.
dataset = load_dataset("tuetschek/atis")

In [2]:
# Project-level mapping from raw ATIS intent keys (e.g. 'flight') to
# human-readable intent names, aliased to a shorter name for this notebook.
from app.atis.utils import ATIS_INTENT_MAPPING as intent_mapping
# Bare expression so the notebook renders the mapping for inspection.
intent_mapping

{'abbreviation': 'Abbreviation and Fare Code Meaning Inquiry',
 'aircraft': 'Aircraft Type Inquiry',
 'airfare': 'Airfare Information Requests',
 'airline': 'Airline Information Request',
 'airport': 'Airport Information and Queries',
 'capacity': 'Aircraft Seating Capacity Inquiry',
 'cheapest': 'Cheapest Fare Inquiry',
 'city': 'Airport Location Inquiry',
 'distance': 'Airport Distance Inquiry',
 'flight': 'Flight Booking Request',
 'flight_no': 'Flight Number Inquiry',
 'flight_time': 'Flight Schedule Inquiry',
 'ground_fare': 'Ground Transportation Cost Inquiry',
 'ground_service': 'Ground Transportation Inquiry',
 'ground_service+ground_fare': 'Airport Ground Transportation and Cost Query',
 'meal': 'Inquiry about In-flight Meals',
 'quantity': 'Flight Quantity Inquiry',
 'restriction': 'Flight Restriction Inquiry'}

In [35]:
from tqdm import tqdm
import datasets

# Accumulators: `test_set` keeps the untouched rows, `clean_test_set` holds the
# {"label", "text"} records that are turned into the evaluation dataset.
results = []
test_set = []
clean_test_set = []

# Human-readable intent names; a label's integer id is its position in `labels`.
labels = list(intent_mapping.values())
label_to_id = {name: idx for idx, name in enumerate(labels)}

for row in tqdm(dataset["test"]):
    raw_intent = row["intent"]
    # Only rows whose intent has an entry in the mapping are kept.
    if raw_intent in intent_mapping:
        test_set.append(row)
        readable_name = intent_mapping[raw_intent]
        clean_test_set.append(
            {"label": label_to_id[readable_name], "text": row["text"]}
        )

test_dataset = datasets.Dataset.from_pandas(pd.DataFrame(data=clean_test_set))
# Show the first kept raw row as a quick sanity check.
test_set[0]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 893/893 [00:00<00:00, 48588.76it/s]


{'id': 0,
 'intent': 'flight',
 'text': 'i would like to find a flight from charlotte to las vegas that makes a stop in st. louis',
 'slots': 'O O O O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name O O O O O B-stoploc.city_name I-stoploc.city_name'}

## Train SetFit zero-shot


In [36]:
from setfit import get_templated_dataset

# Show the candidate label names that the synthetic training set is built from.
print(labels)
# Build a training set from the label names alone (no real utterances);
# sample_size=8 is passed per call, and the recorded run maps 144 examples
# for the 18 labels.
train_dataset = get_templated_dataset(candidate_labels=labels, sample_size=8)
# Inspect one generated example; the recorded output has "text" and "label" keys.
print(train_dataset[0])



['Abbreviation and Fare Code Meaning Inquiry', 'Aircraft Type Inquiry', 'Airfare Information Requests', 'Airline Information Request', 'Airport Information and Queries', 'Aircraft Seating Capacity Inquiry', 'Cheapest Fare Inquiry', 'Airport Location Inquiry', 'Airport Distance Inquiry', 'Flight Booking Request', 'Flight Number Inquiry', 'Flight Schedule Inquiry', 'Ground Transportation Cost Inquiry', 'Ground Transportation Inquiry', 'Airport Ground Transportation and Cost Query', 'Inquiry about In-flight Meals', 'Flight Quantity Inquiry', 'Flight Restriction Inquiry']
{'text': 'This sentence is Abbreviation and Fare Code Meaning Inquiry', 'label': 0}


In [37]:
from setfit import SetFitModel, Trainer, TrainingArguments

# Wrap the "BAAI/bge-small-en-v1.5" checkpoint as a SetFit model.
model = SetFitModel.from_pretrained("BAAI/bge-small-en-v1.5")

# One epoch with batches of 32; everything else is left at library defaults.
args = TrainingArguments(batch_size=32, num_epochs=1)

# Train on the templated label sentences and evaluate on the ATIS test records.
trainer = Trainer(
    model=model, args=args,
    train_dataset=train_dataset, eval_dataset=test_dataset,
)
trainer.train()


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Map:   0%|          | 0/144 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 19584
  Batch size = 32
  Num epochs = 1
  Total optimization steps = 612


Step,Training Loss


In [38]:
# Evaluate the trained model through the trainer (which was given
# eval_dataset=test_dataset); the recorded run reports an 'accuracy' entry.
metrics = trainer.evaluate()
print(metrics)

***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 0.2408675799086758}


In [39]:
# Spot-check a single utterance. The result is a label id; in the recorded run
# it is 9, which is the position of 'Flight Booking Request' in `labels`.
model.predict("Hey, how can I get a flight?")

tensor(9)

## Let's try a different sentence-transformers model

In [40]:
from setfit import SetFitModel, Trainer, TrainingArguments

# Repeat the experiment with the "sentence-transformers/paraphrase-mpnet-base-v2"
# checkpoint; note that `model`, `args` and `trainer` are rebound here.
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

# Same training configuration as the previous run: one epoch, batches of 32.
args = TrainingArguments(batch_size=32, num_epochs=1)

# Train on the templated label sentences and evaluate on the ATIS test records.
trainer = Trainer(
    model=model, args=args,
    train_dataset=train_dataset, eval_dataset=test_dataset,
)
trainer.train()



config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Map:   0%|          | 0/144 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 19584
  Batch size = 32
  Num epochs = 1
  Total optimization steps = 612


Step,Training Loss


In [41]:
# Evaluate the most recently trained model through its trainer (which was
# given eval_dataset=test_dataset); the recorded run reports an 'accuracy' entry.
metrics = trainer.evaluate()
print(metrics)

***** Running evaluation *****


{'accuracy': 0.1415525114155251}
