In [1]:
import transformers
import pandas as pd

from datasets import load_dataset


# Load the "tuetschek/atis" dataset via `datasets.load_dataset`; later cells read
# its "train" split, whose rows expose "id", "intent" and "text".
dataset = load_dataset("tuetschek/atis")

In [2]:
# Distinct raw intent labels that occur in the training split.
intents = {example["intent"] for example in dataset["train"]}
intents


{'abbreviation',
 'aircraft',
 'aircraft+flight+flight_no',
 'airfare',
 'airfare+flight_time',
 'airline',
 'airline+flight_no',
 'airport',
 'capacity',
 'cheapest',
 'city',
 'distance',
 'flight',
 'flight+airfare',
 'flight_no',
 'flight_time',
 'ground_fare',
 'ground_service',
 'ground_service+ground_fare',
 'meal',
 'quantity',
 'restriction'}

In [9]:
# Project-local mapping from raw ATIS intent labels to human-readable class names.
# Displayed here so the label set can be compared against the raw intents above.
from app.atis.utils import ATIS_INTENT_MAPPING as intent_mapping
intent_mapping

{'abbreviation': 'Abbreviation and Fare Code Meaning Inquiry',
 'aircraft': 'Aircraft Type Inquiry',
 'airfare': 'Airfare Information Requests',
 'airline': 'Airline Information Request',
 'airport': 'Airport Information and Queries',
 'capacity': 'Aircraft Seating Capacity Inquiry',
 'cheapest': 'Cheapest Fare Inquiry',
 'city': 'Airport Location Inquiry',
 'distance': 'Airport Distance Inquiry',
 'flight': 'Flight Booking Request',
 'flight_no': 'Flight Number Inquiry',
 'flight_time': 'Flight Schedule Inquiry',
 'ground_fare': 'Ground Transportation Cost Inquiry',
 'ground_service': 'Ground Transportation Inquiry',
 'ground_service+ground_fare': 'Airport Ground Transportation and Cost Query',
 'meal': 'Inquiry about In-flight Meals',
 'quantity': 'Flight Quantity Inquiry',
 'restriction': 'Flight Restriction Inquiry'}

In [10]:
# Keep only rows whose raw intent has a readable name in `intent_mapping`,
# replacing the raw label with that name; unmapped intents are dropped.
results = [
    {
        "id": example["id"],
        "intent": intent_mapping[example["intent"]],
        "text": example["text"],
    }
    for example in dataset["train"]
    if example["intent"] in intent_mapping
]

In [11]:
# One row per mapped utterance; show how many utterances each class has.
df = pd.DataFrame(results)
intent_counts = df["intent"].value_counts()
intent_counts

intent
Flight Booking Request                          3666
Airfare Information Requests                     423
Ground Transportation Inquiry                    255
Airline Information Request                      157
Abbreviation and Fare Code Meaning Inquiry       147
Aircraft Type Inquiry                             81
Flight Schedule Inquiry                           54
Flight Quantity Inquiry                           51
Airport Information and Queries                   20
Airport Distance Inquiry                          20
Airport Location Inquiry                          19
Ground Transportation Cost Inquiry                18
Aircraft Seating Capacity Inquiry                 16
Flight Number Inquiry                             12
Inquiry about In-flight Meals                      6
Flight Restriction Inquiry                         6
Airport Ground Transportation and Cost Query       1
Cheapest Fare Inquiry                              1
Name: count, dtype: int64

In [36]:
# Draw at most `sample_size` utterances per intent: rare intents keep every
# example they have, frequent ones are capped.
SEED = 42  # fixed seed so a fresh "Restart & Run All" reproduces the same sample
sample_size = 15
intents = df["intent"].unique().tolist()  # classes in first-appearance order

sampled_frames = []
for intent in intents:
    intent_df = df[df["intent"] == intent]
    # Never request more rows than the class has (sample() would raise otherwise).
    curr_sample_size = min(len(intent_df), sample_size)
    sampled_frames.append(intent_df.sample(n=curr_sample_size, random_state=SEED))

# Concatenate once: avoids the quadratic grow-in-a-loop pattern and the deprecated
# concat with an empty placeholder frame. pd.concat rejects an empty list, so fall
# back to an empty frame when there are no classes.
all_classes_df = pd.concat(sampled_frames) if sampled_frames else pd.DataFrame()

all_classes_df.shape[0]

221

In [16]:
# Peek at the first sampled row to confirm the record layout (id, intent, text).
all_classes_df.iloc[0]

id                                                      449
intent                               Flight Booking Request
text      show me flights from denver to atlanta on june...
Name: 449, dtype: object

In [30]:
# Assign each class name a 1-based integer id, following the sorted order of the labels.
sorted_list = sorted(all_classes_df["intent"].unique().tolist())
label_to_int = {label: number for number, label in enumerate(sorted_list, start=1)}
label_to_int

{'Abbreviation and Fare Code Meaning Inquiry': 1,
 'Aircraft Seating Capacity Inquiry': 2,
 'Aircraft Type Inquiry': 3,
 'Airfare Information Requests': 4,
 'Airline Information Request': 5,
 'Airport Distance Inquiry': 6,
 'Airport Ground Transportation and Cost Query': 7,
 'Airport Information and Queries': 8,
 'Airport Location Inquiry': 9,
 'Cheapest Fare Inquiry': 10,
 'Flight Booking Request': 11,
 'Flight Number Inquiry': 12,
 'Flight Quantity Inquiry': 13,
 'Flight Restriction Inquiry': 14,
 'Flight Schedule Inquiry': 15,
 'Ground Transportation Cost Inquiry': 16,
 'Ground Transportation Inquiry': 17,
 'Inquiry about In-flight Meals': 18}

In [43]:
# Assemble the JSON-serialisable payload: company metadata plus one example record
# per sampled utterance. "Taxonomy" is left as an empty dict in this cell.
# The company name previously read "Atis Arilines" (typo) and is corrected here.
intent_classification_dataset = {"Company Name": "Atis Airlines", "Taxonomy": {}}
print(f"all classes df size: {all_classes_df.shape}")

# Examples are grouped by class in `label_to_int` order; within a class they keep
# the row order of `all_classes_df`. Only the "text" column is needed, so select it
# directly instead of materialising every row with iterrows().
examples = [
    {
        "sample_text": text,
        "class": class_name,
        "class number": class_index,
    }
    for class_name, class_index in label_to_int.items()
    for text in all_classes_df.loc[all_classes_df["intent"] == class_name, "text"]
]
intent_classification_dataset["Examples"] = examples
len(examples)

all classes df size: (221, 3)


221

In [44]:
import json
from pathlib import Path

# Persist the sampled dataset as JSON. The path is relative, so it resolves
# against the kernel's current working directory.
output_path = Path('data/atis_train_sample_dataset.json')
# Create the target directory if needed so a fresh checkout does not fail
# with FileNotFoundError.
output_path.parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding keeps the file identical regardless of the platform default.
with output_path.open('w', encoding='utf-8') as fp:
    json.dump(intent_classification_dataset, fp)