## Importing libraries

In [136]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV
import tensorflow as tf
import joblib
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping

In [137]:
# Download the stopwords from NLTK
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [138]:

#pd.read_csv("data/atis_intents_train.csv",header=None)

In [139]:
#pd.read_csv("data/atis_intents_test.csv",header=None)

In [140]:
columns = ['intent','text']

In [141]:
df = pd.read_csv("data/atis_intents.csv",header=None,names=columns)

In [142]:
df

Unnamed: 0,intent,text
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...
...,...,...
4973,atis_airfare,what is the airfare for flights from denver t...
4974,atis_flight,do you have any flights from denver to baltim...
4975,atis_airline,which airlines fly into and out of denver
4976,atis_flight,does continental fly from boston to san franc...


In [163]:
df.loc[2]

intent                                                   15
text      arrival time san francisco 755 flight leaving ...
Name: 2, dtype: object

In [143]:
df['text'].sample(15)


1058     i'd like a ticket from denver to atlanta with...
2297                                   what airline is hp
1199     what kind of aircraft does delta fly before 8...
3998     are there any flights from new york to montre...
309             show me all the flights leaving baltimore
2954     again i will repeat i want to make a one way ...
1293     first class american flight from philadelphia...
2430     show me flights from pittsburgh to san franci...
383      show me times for flights from san francisco ...
2373     now i'd like to see flights from detroit to s...
2027     i'd like information on the least expensive a...
2631      what are the flights from pittsburgh to oakland
1212     find a flight from san francisco to boston on...
2655     now i want to see return flights from miami t...
3905             what airlines fly from burbank to denver
Name: text, dtype: object

## Information about the data

In [144]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4978 entries, 0 to 4977
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   intent  4978 non-null   object
 1   text    4978 non-null   object
dtypes: object(2)
memory usage: 77.9+ KB


In [145]:
df.shape

(4978, 2)

In [146]:
df.describe()

Unnamed: 0,intent,text
count,4978,4978
unique,22,4634
top,atis_flight,what is fare code h
freq,3666,8


In [147]:
df['intent'].value_counts()

intent
atis_flight                                 3666
atis_airfare                                 423
atis_ground_service                          255
atis_airline                                 157
atis_abbreviation                            147
atis_aircraft                                 81
atis_flight_time                              54
atis_quantity                                 51
atis_flight#atis_airfare                      21
atis_airport                                  20
atis_distance                                 20
atis_city                                     19
atis_ground_fare                              18
atis_capacity                                 16
atis_flight_no                                12
atis_meal                                      6
atis_restriction                               6
atis_airline#atis_flight_no                    2
atis_ground_service#atis_ground_fare           1
atis_airfare#atis_flight_time                  1
atis_cheapest

##### Missing Values checking

In [148]:
df.isnull().sum()

intent    0
text      0
dtype: int64

In [149]:
df[df.duplicated()]

Unnamed: 0,intent,text
261,atis_abbreviation,what is fare code h
278,atis_flight,pittsburgh to denver
283,atis_flight,flights from boston to pittsburgh
285,atis_airfare,show me the fares from dallas to san francisco
542,atis_flight,show me flights from pittsburgh to philadelphia
...,...,...
4936,atis_flight,newark to cleveland
4939,atis_flight,show me flights from denver to philadelphia
4940,atis_ground_service,show me ground transportation in denver
4949,atis_flight,show me the flights from baltimore to oakland


In [150]:
(
    df
    .loc[df.duplicated(keep=False)]
    .sort_values(['text'])
)

Unnamed: 0,intent,text
3795,atis_airport,airports in new york
2412,atis_airport,airports in new york
2874,atis_flight#atis_airfare,all flights and fares from atlanta to dallas ...
602,atis_flight#atis_airfare,all flights and fares from atlanta to dallas ...
859,atis_flight#atis_airfare,all flights and fares from atlanta to dallas ...
...,...,...
64,atis_airline,which airlines have first class flights today
4329,atis_flight,which flights are between boston and baltimor...
1481,atis_flight,which flights are between boston and baltimor...
2012,atis_flight,which united airlines flights go through denver


#### Dropping the Duplicated data

In [151]:
df = df.drop_duplicates()

In [152]:
df.duplicated().sum()

0

## Data Preprocessing

In [153]:
# Define a function for text cleaning
def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

df['text'] = df['text'].apply(clean_text)

# Encode labels
label_encoder = LabelEncoder()
df['intent'] = label_encoder.fit_transform(df['intent'])


label_encoder_filename = 'label_encoder.joblib'
joblib.dump(label_encoder, label_encoder_filename)

# Display the first few rows to verify the changes
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['intent'] = label_encoder.fit_transform(df['intent'])


Unnamed: 0,intent,text
0,12,want fly boston 838 arrive denver 1110 morning
1,12,flights available pittsburgh baltimore thursda...
2,15,arrival time san francisco 755 flight leaving ...
3,3,cheapest airfare tacoma orlando
4,3,round trip fares pittsburgh philadelphia 1000 ...


In [154]:
pd.DataFrame(df['intent'].value_counts())

Unnamed: 0_level_0,count
intent,Unnamed: 1_level_1
12,3426
3,403
17,235
5,148
0,108
1,78
15,52
20,49
11,20
10,18


In [155]:

tfidf_vector = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 3))

X = tfidf_vector.fit_transform(df['text'])

print(f"TF-IDF matrix shape: {X.shape}")

# Save the TF-IDF vectorizer to a file
vectorizer_filename = 'tfidf_vectorizer_ngram.joblib'
joblib.dump(tfidf_vector, vectorizer_filename)
print(f"TF-IDF vectorizer saved to {vectorizer_filename}")


df.head()

TF-IDF matrix shape: (4634, 5000)
TF-IDF vectorizer saved to tfidf_vectorizer_ngram.joblib


Unnamed: 0,intent,text
0,12,want fly boston 838 arrive denver 1110 morning
1,12,flights available pittsburgh baltimore thursda...
2,15,arrival time san francisco 755 flight leaving ...
3,3,cheapest airfare tacoma orlando
4,3,round trip fares pittsburgh philadelphia 1000 ...


## Model Training 

In [156]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, df['intent'], test_size=0.2, random_state=42)

# Initialize the models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM": SVC(kernel='linear', random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Train and evaluate the models
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    results[model_name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

# Display the results
results


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'Logistic Regression': {'Accuracy': 0.8813376483279396,
  'Precision': 0.8603983633080451,
  'Recall': 0.8813376483279396,
  'F1 Score': 0.8534389519874469},
 'SVM': {'Accuracy': 0.9277238403451996,
  'Precision': 0.9221302387845821,
  'Recall': 0.9277238403451996,
  'F1 Score': 0.9159594778355201},
 'Random Forest': {'Accuracy': 0.9223300970873787,
  'Precision': 0.9191844355562371,
  'Recall': 0.9223300970873787,
  'F1 Score': 0.9133288989694733}}

In [157]:
pd.DataFrame(results)

Unnamed: 0,Logistic Regression,SVM,Random Forest
Accuracy,0.881338,0.927724,0.92233
Precision,0.860398,0.92213,0.919184
Recall,0.881338,0.927724,0.92233
F1 Score,0.853439,0.915959,0.913329


In [158]:
# Define the maximum number of words and the maximum sequence length
max_words = 5000
max_len = 100

# Initialize the Tokenizer
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df['text'])

# Convert the text to sequences and pad them
sequences = tokenizer.texts_to_sequences(df['text'])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Split the data into training and testing sets
X_train_seq, X_test_seq, y_train, y_test = train_test_split(padded_sequences, df['intent'], test_size=0.2, random_state=42)

# Build the LSTM model
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(22, activation='softmax'))  # 22 is the number of unique intents

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model with 10 epochs and used early_stopping 
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train_seq, y_train, epochs=10, batch_size=64, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model
y_pred_seq = model.predict(X_test_seq)
y_pred_classes = y_pred_seq.argmax(axis=-1)

accuracy = accuracy_score(y_test, y_pred_classes)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_classes, average='weighted')

# Display the evaluation metrics
lstm_results = {
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1 Score": f1
}

lstm_results

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


  _warn_prf(average, modifier, msg_start, len(result))


{'Accuracy': 0.7443365695792881,
 'Precision': 0.5540369288130623,
 'Recall': 0.7443365695792881,
 'F1 Score': 0.6352408571548654}

## Observations

* The SVM model is the best among the four models including LSTM based on the provided metrics. 
* It has the highest accuracy, precision, recall, and F1 score, making it the most effective model for intent classification on this dataset.

## Hyper Parameter Tuning

In [159]:
# Define the parameter grid for Grid Search
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['linear', 'rbf']
}

# Initialize the SVM model
svm = SVC(random_state=42)

# Initialize Grid Search with Cross-Validation
grid_search = GridSearchCV(svm, param_grid, refit=True, verbose=2, cv=5, n_jobs=-1)

# Perform Grid Search
grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Score: {best_score}")

# Train the best model on the entire training set
best_svm = grid_search.best_estimator_
best_svm.fit(X_train, y_train)

# Predict on the test set
y_pred_best = best_svm.predict(X_test)

# Calculate evaluation metrics
best_accuracy = accuracy_score(y_test, y_pred_best)
best_precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_best, average='weighted')

# Display the evaluation metrics
print(f"Accuracy: {best_accuracy}")
print(f"Precision: {best_precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Save the model to a file
model_filename = 'best_svm_model_ngram.joblib'
joblib.dump(best_svm, model_filename)

print(f"Model saved to {model_filename}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits




Best Parameters: {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
Best Cross-Validation Score: 0.9425410405549431
Accuracy: 0.9320388349514563
Precision: 0.9311815542677491
Recall: 0.9320388349514563
F1 Score: 0.9290386721077591
Model saved to best_svm_model_ngram.joblib


  _warn_prf(average, modifier, msg_start, len(result))


### Obervation after the Hyper parameter Tuning

* The SVM model with tuned hyperparameters is performing excellently, with high accuracy, precision, recall, and F1 score. 
* The slight imbalance in class predictions will be done by SMOTE but which isnt required almost model works well without it

In [160]:
# import joblib
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import SVC
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# from sklearn.model_selection import GridSearchCV
# from imblearn.over_sampling import SMOTE
# from collections import Counter
# import pandas as pd
# import nltk
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# import string

# # Load the dataset
# file_path = '/mnt/data/atis_intents.csv'
# atis_data = pd.read_csv(file_path)

# # Rename columns
# atis_data.columns = ['intent', 'text']

# # Check for missing values
# missing_values = atis_data.isnull().sum()
# print(f"Missing values:\n{missing_values}")

# # Remove duplicate rows
# atis_data = atis_data.drop_duplicates()

# # Verify removal of duplicates
# remaining_duplicates = atis_data.duplicated().sum()
# print(f"Remaining duplicates: {remaining_duplicates}")

# # Download the stopwords from NLTK
# nltk.download('punkt')
# nltk.download('stopwords')

# # Define a function for text cleaning
# def clean_text(text):
#     # Convert text to lowercase
#     text = text.lower()
#     # Tokenize the text
#     tokens = word_tokenize(text)
#     # Remove punctuation
#     tokens = [word for word in tokens if word.isalnum()]
#     # Remove stop words
#     tokens = [word for word in tokens if word not in stopwords.words('english')]
#     return ' '.join(tokens)

# # Apply the text cleaning function to the 'text' column
# atis_data['text'] = atis_data['text'].apply(clean_text)

# # Encode labels
# label_encoder = LabelEncoder()
# atis_data['intent'] = label_encoder.fit_transform(atis_data['intent'])

# # Initialize the TF-IDF Vectorizer with n-grams (unigrams, bigrams, trigrams)
# tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 3))

# # Fit and transform the text data
# X = tfidf_vectorizer.fit_transform(atis_data['text'])

# # Save the TF-IDF vectorizer to a file
# vectorizer_filename = 'tfidf_vectorizer_ngram.joblib'
# joblib.dump(tfidf_vectorizer, vectorizer_filename)
# print(f"TF-IDF vectorizer saved to {vectorizer_filename}")

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, atis_data['intent'], test_size=0.2, random_state=42)

# # Initialize SMOTE
# smote = SMOTE(random_state=42)

# # Apply SMOTE to the training data
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# # Check the class distribution after resampling
# print(f"Class distribution after SMOTE: {Counter(y_train_resampled)}")

# # Define the parameter grid for Grid Search
# param_grid = {
#     'C': [0.1, 1, 10, 100, 1000],
#     'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#     'kernel': ['linear', 'rbf']
# }

# # Initialize the SVM model
# svm = SVC(random_state=42)

# # Initialize Grid Search with Cross-Validation
# grid_search = GridSearchCV(svm, param_grid, refit=True, verbose=2, cv=5, n_jobs=-1)

# # Perform Grid Search
# grid_search.fit(X_train_resampled, y_train_resampled)

# # Get the best parameters and the best score
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# print(f"Best Parameters: {best_params}")
# print(f"Best Cross-Validation Score: {best_score}")

# # Train the best model on the entire training set
# best_svm = grid_search.best_estimator_
# best_svm.fit(X_train_resampled, y_train_resampled)

# # Predict on the test set
# y_pred_best = best_svm.predict(X_test)

# # Calculate evaluation metrics
# best_accuracy = accuracy_score(y_test, y_pred_best)
# best_precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_best, average='weighted', zero_division=1)

# # Display the evaluation metrics
# print(f"Accuracy: {best_accuracy}")
# print(f"Precision: {best_precision}")
# print(f"Recall: {recall}")
# print(f"F1 Score: {f1}")

# # Save the model to a file
# model_filename = 'best_svm_model_ngram.joblib'
# joblib.dump(best_svm, model_filename)
# print(f"Model saved to {model_filename}")
