## Importing libraries

In [94]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping

In [64]:
# Download the stopwords from NLTK
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [65]:

#pd.read_csv("data/atis_intents_train.csv",header=None)

In [66]:
#pd.read_csv("data/atis_intents_test.csv",header=None)

In [67]:
columns = ['intent','text']

In [68]:
df = pd.read_csv("data/atis_intents.csv",header=None,names=columns)

In [69]:
df

Unnamed: 0,intent,text
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...
...,...,...
4973,atis_airfare,what is the airfare for flights from denver t...
4974,atis_flight,do you have any flights from denver to baltim...
4975,atis_airline,which airlines fly into and out of denver
4976,atis_flight,does continental fly from boston to san franc...


In [70]:
for i in df['text']:
    print(i)

 i want to fly from boston at 838 am and arrive in denver at 1110 in the morning
 what flights are available from pittsburgh to baltimore on thursday morning
 what is the arrival time in san francisco for the 755 am flight leaving washington
 cheapest airfare from tacoma to orlando
 round trip fares from pittsburgh to philadelphia under 1000 dollars
 i need a flight tomorrow from columbus to minneapolis
 what kind of aircraft is used on a flight from cleveland to dallas
 show me the flights from pittsburgh to los angeles on thursday
 all flights from boston to washington
 what kind of ground transportation is available in denver
 show me the flights from dallas to san francisco
 show me the flights from san diego to newark by way of houston
 what's the airport at orlando
 what is the cheapest flight from boston to bwi
 all flights to baltimore after 6 pm
 show me the first class fares from boston to denver
 show me the ground transportation in denver
 all flights from denver to pittsbu

## Information about the data

In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4978 entries, 0 to 4977
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   intent  4978 non-null   object
 1   text    4978 non-null   object
dtypes: object(2)
memory usage: 77.9+ KB


In [72]:
df.shape

(4978, 2)

In [73]:
df.describe()

Unnamed: 0,intent,text
count,4978,4978
unique,22,4634
top,atis_flight,what is fare code h
freq,3666,8


In [74]:
df['intent'].value_counts()

intent
atis_flight                                 3666
atis_airfare                                 423
atis_ground_service                          255
atis_airline                                 157
atis_abbreviation                            147
atis_aircraft                                 81
atis_flight_time                              54
atis_quantity                                 51
atis_flight#atis_airfare                      21
atis_airport                                  20
atis_distance                                 20
atis_city                                     19
atis_ground_fare                              18
atis_capacity                                 16
atis_flight_no                                12
atis_meal                                      6
atis_restriction                               6
atis_airline#atis_flight_no                    2
atis_ground_service#atis_ground_fare           1
atis_airfare#atis_flight_time                  1
atis_cheapest

##### Missing Values checking

In [75]:
df.isnull().sum()

intent    0
text      0
dtype: int64

In [76]:
df[df.duplicated()]

Unnamed: 0,intent,text
261,atis_abbreviation,what is fare code h
278,atis_flight,pittsburgh to denver
283,atis_flight,flights from boston to pittsburgh
285,atis_airfare,show me the fares from dallas to san francisco
542,atis_flight,show me flights from pittsburgh to philadelphia
...,...,...
4936,atis_flight,newark to cleveland
4939,atis_flight,show me flights from denver to philadelphia
4940,atis_ground_service,show me ground transportation in denver
4949,atis_flight,show me the flights from baltimore to oakland


In [77]:
(
    df
    .loc[df.duplicated(keep=False)]
    .sort_values(['text'])
)

Unnamed: 0,intent,text
3795,atis_airport,airports in new york
2412,atis_airport,airports in new york
2874,atis_flight#atis_airfare,all flights and fares from atlanta to dallas ...
602,atis_flight#atis_airfare,all flights and fares from atlanta to dallas ...
859,atis_flight#atis_airfare,all flights and fares from atlanta to dallas ...
...,...,...
64,atis_airline,which airlines have first class flights today
4329,atis_flight,which flights are between boston and baltimor...
1481,atis_flight,which flights are between boston and baltimor...
2012,atis_flight,which united airlines flights go through denver


#### Dropping the Duplicated data

In [82]:
df = df.drop_duplicates()

In [84]:
df.duplicated().sum()

0

## Data Preprocessing

In [79]:
# Define a function for text cleaning
def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

df['text'] = df['text'].apply(clean_text)

# Encode labels
label_encoder = LabelEncoder()
df['intent'] = label_encoder.fit_transform(df['intent'])

# Display the first few rows to verify the changes
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['intent'] = label_encoder.fit_transform(df['intent'])


Unnamed: 0,intent,text
0,12,want fly boston 838 arrive denver 1110 morning
1,12,flights available pittsburgh baltimore thursda...
2,15,arrival time san francisco 755 flight leaving ...
3,3,cheapest airfare tacoma orlando
4,3,round trip fares pittsburgh philadelphia 1000 ...


In [86]:
pd.DataFrame(df['intent'].value_counts())

Unnamed: 0_level_0,count
intent,Unnamed: 1_level_1
12,3289
3,400
17,208
5,138
0,99
1,77
15,51
20,47
11,20
10,18


In [88]:

tfidf_vector = TfidfVectorizer(stop_words='english', max_features=5000)

X = tfidf_vector.fit_transform(df['text'])

print(f"TF-IDF matrix shape: {X.shape}")


df.head()

TF-IDF matrix shape: (4443, 712)


Unnamed: 0,intent,text
0,12,want fly boston 838 arrive denver 1110 morning
1,12,flights available pittsburgh baltimore thursda...
2,15,arrival time san francisco 755 flight leaving ...
3,3,cheapest airfare tacoma orlando
4,3,round trip fares pittsburgh philadelphia 1000 ...


## Model Training 

In [95]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, df['intent'], test_size=0.2, random_state=42)

# Initialize the models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM": SVC(kernel='linear', random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Train and evaluate the models
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    results[model_name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

# Display the results
results


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'Logistic Regression': {'Accuracy': 0.8942632170978627,
  'Precision': 0.8750141339330848,
  'Recall': 0.8942632170978627,
  'F1 Score': 0.8703390441643475},
 'SVM': {'Accuracy': 0.9358830146231721,
  'Precision': 0.9310614282162608,
  'Recall': 0.9358830146231721,
  'F1 Score': 0.9270842144272501},
 'Random Forest': {'Accuracy': 0.9268841394825647,
  'Precision': 0.9182659366572552,
  'Recall': 0.9268841394825647,
  'F1 Score': 0.9182571705087855}}

In [101]:
pd.DataFrame(results)

Unnamed: 0,Logistic Regression,SVM,Random Forest
Accuracy,0.894263,0.935883,0.926884
Precision,0.875014,0.931061,0.918266
Recall,0.894263,0.935883,0.926884
F1 Score,0.870339,0.927084,0.918257


In [92]:
# Define the maximum number of words and the maximum sequence length
max_words = 5000
max_len = 100

# Initialize the Tokenizer
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df['text'])

# Convert the text to sequences and pad them
sequences = tokenizer.texts_to_sequences(df['text'])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Split the data into training and testing sets
X_train_seq, X_test_seq, y_train, y_test = train_test_split(padded_sequences, df['intent'], test_size=0.2, random_state=42)

# Build the LSTM model
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(22, activation='softmax'))  # 22 is the number of unique intents

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model with 10 epochs and used early_stopping 
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train_seq, y_train, epochs=10, batch_size=64, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model
y_pred_seq = model.predict(X_test_seq)
y_pred_classes = y_pred_seq.argmax(axis=-1)

accuracy = accuracy_score(y_test, y_pred_classes)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_classes, average='weighted')

# Display the evaluation metrics
lstm_results = {
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1 Score": f1
}

lstm_results

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


  _warn_prf(average, modifier, msg_start, len(result))


{'Accuracy': 0.7469066366704162,
 'Precision': 0.5578695239023131,
 'Recall': 0.7469066366704162,
 'F1 Score': 0.6386941490652368}

## Observations

* The SVM model is the best among the four models including LSTM based on the provided metrics. 
* It has the highest accuracy, precision, recall, and F1 score, making it the most effective model for intent classification on this dataset.