# Content: 
## Load dataset
## Load T5 Model
## Run predictions
### Classification report

## Load Dataset

In [2]:
import transformers
import pandas as pd

from datasets import load_dataset
from app.model import IntentClassifier


dataset = load_dataset("tuetschek/atis")

In [3]:

# dataset["intent"].value_counts()
dataset["train"]

Dataset({
    features: ['id', 'intent', 'text', 'slots'],
    num_rows: 4978
})

## Examples
flight-Flight search: what flights are available from pittsburgh to baltimore on thursday morning
flight_time-Flight time questions: what is the arrival time in san francisco for the 755 am flight leaving washington
airfare-Check costs: show me the first class fares from boston to denver
ground_service-


In [4]:
dataset["train"][0]

{'id': 0,
 'intent': 'flight',
 'text': 'i want to fly from boston at 838 am and arrive in denver at 1110 in the morning',
 'slots': 'O O O O O B-fromloc.city_name O B-depart_time.time I-depart_time.time O O O B-toloc.city_name O B-arrive_time.time O O B-arrive_time.period_of_day'}

In [5]:
intents = set([row["intent"] for row in dataset["train"]])
intents

{'abbreviation',
 'aircraft',
 'aircraft+flight+flight_no',
 'airfare',
 'airfare+flight_time',
 'airline',
 'airline+flight_no',
 'airport',
 'capacity',
 'cheapest',
 'city',
 'distance',
 'flight',
 'flight+airfare',
 'flight_no',
 'flight_time',
 'ground_fare',
 'ground_service',
 'ground_service+ground_fare',
 'meal',
 'quantity',
 'restriction'}

In [6]:
from app.atis.utils import ATIS_INTENT_MAPPING as intent_mapping
intent_mapping

{'abbreviation': 'Abbreviation and Fare Code Meaning Inquiry',
 'aircraft': 'Aircraft Type Inquiry',
 'airfare': 'Airfare Information Requests',
 'airline': 'Airline Information Request',
 'airport': 'Airport Information and Queries',
 'capacity': 'Aircraft Seating Capacity Inquiry',
 'cheapest': 'Cheapest Fare Inquiry',
 'city': 'Airport Location Inquiry',
 'distance': 'Airport Distance Inquiry',
 'flight': 'Flight Booking Request',
 'flight_no': 'Flight Number Inquiry',
 'flight_time': 'Flight Schedule Inquiry',
 'ground_fare': 'Ground Transportation Cost Inquiry',
 'ground_service': 'Ground Transportation Inquiry',
 'ground_service+ground_fare': 'Airport Ground Transportation and Cost Query',
 'meal': 'Inquiry about In-flight Meals',
 'quantity': 'Flight Quantity Inquiry',
 'restriction': 'Flight Restriction Inquiry'}

In [7]:
# get 10 samples from each intent
intent_samples = {intent: [] for intent in intents}

# Iterate through the dataset
for row in dataset["train"]:
    intent = row["intent"]
    # If we haven't collected 10 samples for this intent yet, add the current row
    if len(intent_samples[intent]) < 5:
        intent_samples[intent].append(row["text"])
    # If we have collected 10 samples for this intent, move on to the next intent
    else:
        continue

In [8]:
intent_samples

{'meal': ['show me all meals on flights from atlanta to washington',
  'is there a meal on delta flight 852 from san francisco to dallas fort worth',
  'what are all the available meals',
  'what are my meal options from boston to denver',
  'do i get a meal on the atlanta to bwi flight eastern 210'],
 'distance': ['how far is it from orlando airport to orlando',
  'what is the distance from los angeles international airport to los angeles',
  'tell me distance from orlando airport to the city',
  'how far is downtown from the airport in dallas',
  'how long does it take to get from atlanta airport into the city of atlanta'],
 'airfare+flight_time': ['show me the costs and times for flights from san francisco to atlanta'],
 'airline+flight_no': ['airline and flight number from columbus to minneapolis',
  'may i please see airlines and flight numbers from new york to toronto on the same date june seventeenth also arriving in toronto before noon thank you'],
 'airline': ['which airline s

# Load model and run one prediction

In [9]:
model = IntentClassifier(model_name="Serj/intent-classifier")

main Serj/intent-classifier


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
prompt_options = "OPTIONS\n"
index = 1
for intent in intents:
    if intent not in intent_mapping:
        continue
        
    mapping = intent_mapping[intent]
    prompt_options += f" {index}. {mapping} "
    index += 1
prompt_options

'OPTIONS\n 1. Inquiry about In-flight Meals  2. Airport Distance Inquiry  3. Airline Information Request  4. Airfare Information Requests  5. Flight Restriction Inquiry  6. Aircraft Type Inquiry  7. Flight Number Inquiry  8. Ground Transportation Inquiry  9. Abbreviation and Fare Code Meaning Inquiry  10. Flight Booking Request  11. Airport Information and Queries  12. Flight Schedule Inquiry  13. Aircraft Seating Capacity Inquiry  14. Airport Location Inquiry  15. Flight Quantity Inquiry  16. Cheapest Fare Inquiry  17. Airport Ground Transportation and Cost Query  18. Ground Transportation Cost Inquiry '

In [11]:
(dataset["train"][0]["text"], dataset["train"][0]["intent"], intent_mapping[dataset["train"][0]["intent"]])

('i want to fly from boston at 838 am and arrive in denver at 1110 in the morning',
 'flight',
 'Flight Booking Request')

In [12]:
company_name = "Atis Airlines"
company_specific = "An Airline company"
customer_text = dataset["train"][0]["text"]
print(customer_text)
model.predict(customer_text, prompt_options, company_name, company_specific)

i want to fly from boston at 838 am and arrive in denver at 1110 in the morning




'Flight Booking Request'

In [13]:
company_name = "Atis Airlines"
company_specific = "An Airline company"
model.predict(dataset["train"][700]["text"], prompt_options, company_name, company_specific)

'Flight Booking Request'

In [14]:
model.raw_predict(f"All of the verbs: {customer_text}")

'arrive, morning, fly'

## Train set

In [16]:
# from tqdm import tqdm
# results = []
# for row in tqdm(dataset["train"]):
#     intent = row["intent"] 
#     if intent not in intent_mapping:
#         continue 
    
#     prediction = model.predict(row["text"], prompt_options, company_name, company_specific)
#     keywords = model.raw_predict(f"All of the verbs: {row['text']}")
#     results.append({"prediction": prediction, "y": intent_mapping[intent], "keywords": keywords, "text": row["text"]})

In [None]:
# results

In [None]:
# from sklearn.metrics import classification_report
# y = [r["y"] for r in results]
# predictions = [r["prediction"].replace("Class name: ","") for r in results]
# pd.DataFrame(classification_report(y, predictions, output_dict=True)).T

In [None]:
# import pandas as pd
# df = pd.DataFrame(results)
# df["prediction"].value_counts()

## Test set

In [None]:
from tqdm import tqdm
results = []
for row in tqdm(dataset["test"]):
    intent = row["intent"] 
    if intent not in intent_mapping:
        continue 
    
    prediction = model.predict(row["text"], prompt_options, company_name, company_specific)
    keywords = model.raw_predict(f"All of the verbs: {row['text']}")
    results.append({"prediction": prediction, "y": intent_mapping[intent]})

 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 786/893 [00:46<00:07, 14.09it/s]

In [None]:
from sklearn.metrics import classification_report
y = [r["y"] for r in results]
predictions = [r["prediction"].replace("Class name: ","") for r in results]
pd.DataFrame(classification_report(y, predictions, output_dict=True)).T