In [None]:
import numpy as np 
import pandas as pd

In [None]:
# Read the labelled intent dataset from disk and display summary statistics.
DATASET_PATH = "intent_dataset_NEW.csv"
df = pd.read_csv(DATASET_PATH)
df.describe()

In [None]:
# Number of rows per intent label, as loaded (before any cleaning).
df["intent"].value_counts()

In [None]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical




# Keep only rows that have both a sentence and an intent, and whose sentence
# is not blank after stripping whitespace; renumber the surviving rows from 0.
df = (
    df.dropna(subset=["sentence", "intent"])
      .loc[lambda frame: frame["sentence"].str.strip() != ""]
      .reset_index(drop=True)
)

print("Intent Distribution:\n")
print(df['intent'].value_counts())

# Features are the raw sentences; targets are the intent labels.
X, y = df["sentence"], df["intent"]

# Encode each intent label as an integer id.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)

print("\nClasses:", label_encoder.classes_)



# Hold out 20% of the rows for testing, stratified on the encoded intent;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# Renumber both sentence Series from 0 so they can be indexed by position.
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)



max_words = 20000   # vocabulary size
max_len = 30        # max sentence length

# Fit the vocabulary on the training sentences only; words not kept in the
# vocabulary are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Sentences -> lists of integer token ids.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Token-id lists -> fixed-width arrays of max_len columns, padded at the end.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len, padding='post')
    for seqs in (X_train_seq, X_test_seq)
)

print("\nTrain shape:", X_train_pad.shape)
print("Test shape:", X_test_pad.shape)

# One-hot encode the integer class ids for categorical cross-entropy.
y_train_cat, y_test_cat = (
    to_categorical(labels, num_classes) for labels in (y_train, y_test)
)



# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation and dropout masks are repeatable on "Restart & Run All".
# The value matches the random_state used for the train/test split.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Declare the input shape explicitly. The Embedding `input_length`
    # argument is deprecated in Keras 3, and an explicit Input keeps the model
    # built so that model.summary() can report output shapes and parameters.
    tf.keras.Input(shape=(max_len,), dtype="int32"),

    # Token ids (< max_words) -> 128-dimensional vectors. mask_zero=True marks
    # id 0 (the value pad_sequences fills with) as padding so the LSTM skips
    # those timesteps instead of ending its forward pass on padding.
    Embedding(input_dim=max_words,
              output_dim=128,
              mask_zero=True),

    # Read the sentence in both directions and keep only the final states.
    Bidirectional(LSTM(64, return_sequences=False,
                       dropout=0.3,
                       recurrent_dropout=0.3)),

    Dense(64, activation='relu'),
    Dropout(0.5),

    # One probability per intent class.
    Dense(num_classes, activation='softmax')
])

# Targets are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("\nModel Summary:\n")
model.summary()



# Train for a fixed number of epochs, reserving 10% of the training data for
# validation; the per-epoch metrics are kept in `history`.
fit_options = {
    "epochs": 10,
    "batch_size": 32,
    "validation_split": 0.1,
    "verbose": 1,
}
history = model.fit(x=X_train_pad, y=y_train_cat, **fit_options)



# Loss and accuracy on the held-out test split.
loss, accuracy = model.evaluate(X_test_pad, y_test_cat, verbose=0)
print("\nTest Accuracy:", round(accuracy, 4))

# Class probabilities -> predicted integer class ids.
y_pred_probs = model.predict(X_test_pad)
y_pred = np.argmax(y_pred_probs, axis=1)

# Pin the full list of class ids. Without `labels`, classification_report
# raises ValueError when a class is missing from both y_test and y_pred
# (its length no longer matches target_names), and the confusion matrix
# would silently change shape.
class_ids = np.arange(num_classes)

print("\nClassification Report:\n")
print(classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    # target_names must be strings; coerce in case the intents are not.
    target_names=[str(name) for name in label_encoder.classes_],
    # Classes with no predicted/true samples score 0 without a warning.
    zero_division=0
))

# Rows are true classes, columns are predicted classes, in class_ids order.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
print("\nConfusion Matrix:\n", cm)


def inspect_sample(i):
    """Print test sentence ``i`` with its true and predicted intent labels.

    Reads the notebook-level ``X_test``, ``y_test``, ``y_pred`` and
    ``label_encoder``; ``i`` is a positional index into the test split.
    """
    def decode(class_id):
        # Map one encoded class id back to its original intent label.
        return label_encoder.inverse_transform([class_id])[0]

    print("\nSentence:\n", X_test.iloc[i])
    print("True Label:", decode(y_test[i]))
    print("Predicted:", decode(y_pred[i]))

# Spot-check the first test example: prints its sentence, true label and predicted label.
inspect_sample(0)


In [None]:
import pickle

# Save the trained LSTM model to a .keras file.
model.save("intent_lstm_model.keras")

# Pickle the two preprocessing objects inference depends on: the tokenizer
# (word index mapping) and the label encoder (intent mapping).
for path, artifact in (
    ("tokenizer.pkl", tokenizer),
    ("label_encoder.pkl", label_encoder),
):
    with open(path, "wb") as f:
        pickle.dump(artifact, f)

print("All LSTM components saved successfully!")

In [None]:
max_len = 30  # padding length; same value used in training


def predict_intent(text):
    """Return the intent label the trained model assigns to ``text``.

    The text is tokenised with the notebook-level ``tokenizer``, padded at the
    end to ``max_len`` ids, scored by ``model``, and the highest-scoring class
    index is mapped back to its label through ``label_encoder``.
    """
    token_ids = tokenizer.texts_to_sequences([text])
    model_input = pad_sequences(token_ids, maxlen=max_len, padding='post')

    scores = model.predict(model_input)
    best_class = np.argmax(scores)

    decoded = label_encoder.inverse_transform([best_class])
    return decoded[0]


# Smoke-test the inference helper on one sample request and print the predicted intent.
print(predict_intent("Label encode gender column"))
