In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# load the data

In [4]:
import pandas as pd

# Load the summarized CSV file
df = pd.read_csv('home_appliance_summary.csv')

# Check available columns before proceeding
print("Available columns:", df.columns)

# Ensure 'intent' column exists before mapping
if "intent" not in df.columns:
    raise KeyError("Column 'intent' not found in CSV. Check the file structure.")

# Map 'intent' values to numerical labels
df['label'] = df['intent'].map({'troubleshooting': 0, 'general_inquiry': 1, 'repair_request': 2})

# Save the updated CSV file
df.to_csv('home_appliance_summary_labeled.csv', index=False)

print("✅ Labeling complete! Output saved as 'home_appliance_summary_labeled.csv'")


Available columns: Index(['appliance', 'user_query', 'ai_response', 'intent', 'query_response',
       'summary'],
      dtype='object')
✅ Labeling complete! Output saved as 'home_appliance_summary_labeled.csv'


In [5]:
df.info()
# no null values in the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   appliance       500 non-null    object
 1   user_query      500 non-null    object
 2   ai_response     500 non-null    object
 3   intent          500 non-null    object
 4   query_response  500 non-null    object
 5   summary         500 non-null    object
 6   label           500 non-null    int64 
dtypes: int64(1), object(6)
memory usage: 27.5+ KB


In [6]:
df.head()

Unnamed: 0,appliance,user_query,ai_response,intent,query_response,summary,label
0,refrigerator,My refrigerator isn't cooling.,Check if the temperature settings are correct ...,troubleshooting,My refrigerator isn't cooling. Check if the te...,Check if the temperature settings are correct ...,0
1,refrigerator,How often should I replace my refrigerator’s w...,"Typically, every six months, but refer to your...",general_inquiry,How often should I replace my refrigerator’s w...,How often should I replace my refrigerator’s w...,1
2,dishwasher,I need to schedule a repair for my dishwasher.,Please provide your model number and issue det...,repair_request,I need to schedule a repair for my dishwasher....,I need to schedule a repair for my dishwasher....,2
3,air_conditioner,How often should I clean my AC filter?,"For optimal efficiency, clean the filter once ...",general_inquiry,How often should I clean my AC filter? For opt...,"For optimal efficiency, clean the filter once ...",1
4,refrigerator,The fridge is making a loud buzzing sound.,It might be due to the compressor or condenser...,troubleshooting,The fridge is making a loud buzzing sound. It ...,The fridge is making a loud buzzing sound. It ...,0


# Data Pre-Processing

In [7]:
df['intent'].value_counts()

intent
troubleshooting    308
general_inquiry    108
repair_request      84
Name: count, dtype: int64

In [8]:
X = df['summary']
y = df['intent']

In [9]:
le = LabelEncoder()
y = le.fit_transform(y)

In [10]:


tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

In [11]:
padded_sequences = pad_sequences(sequences, maxlen=50, padding='post')
X = padded_sequences

In [12]:
# X.__len__()
y.__len__()

500

# split data for training, cross eval and testing

In [13]:
X_train, X_, Y_train, Y_  = train_test_split(X, y, test_size=0.3, random_state=1)
X_test, X_val, Y_test, Y_val = train_test_split(X_, Y_, test_size=0.7, random_state=1)
 

In [14]:


# Define the model
model = Sequential()

vocab_size = 10000  
embedding_dim = 16  
max_length = 50     

model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(GlobalAveragePooling1D())  

# Fully connected layers
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))

# Output layer (3 classes for intent)
model.add(Dense(3, activation='softmax'))




In [15]:
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [16]:
# Summary of the model
model.summary()

In [17]:
# training the model
model.fit(X_train, Y_train, epochs=50, batch_size=10, validation_data=(X_val, Y_val))

Epoch 1/50
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.4595 - loss: 1.0920 - val_accuracy: 0.6952 - val_loss: 1.0508
Epoch 2/50
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5952 - loss: 1.0455 - val_accuracy: 0.6952 - val_loss: 0.9733
Epoch 3/50
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5915 - loss: 0.9756 - val_accuracy: 0.6952 - val_loss: 0.8678
Epoch 4/50
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5682 - loss: 0.9164 - val_accuracy: 0.6952 - val_loss: 0.7721
Epoch 5/50
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5707 - loss: 0.8447 - val_accuracy: 0.6952 - val_loss: 0.6659
Epoch 6/50
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5746 - loss: 0.7381 - val_accuracy: 0.8381 - val_loss: 0.5297
Epoch 7/50
[1m35/35[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x2b29a423590>

In [18]:
predictions = model.predict(X_test)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


In [19]:
predictions = np.argmax(predictions, axis=1)

In [20]:
predictions

array([0, 2, 2, 2, 1, 0, 0, 2, 2, 0, 2, 1, 1, 0, 2, 2, 2, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 1, 2,
       2], dtype=int64)

In [21]:
accuracy_score(Y_test, predictions)

1.0