In [1]:
import json
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,GRU, Embedding,Dropout, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the JSON dataset

with open('/content/drive/MyDrive/train.json') as file:
    data = json.load(file)

In [3]:


training_sentences = []  # Agent 1' and 2 messages
training_labels = []     # Sentiments
responses = []           # Agent 2's responses
labels = []              # Unique sentiment labels

for message_id, message_data in data.items():
    for content in message_data['content']:
        agent_message = content['message']
        sentiment = content['sentiment']
        agent = content['agent']
        training_sentences.append(agent_message)
        training_labels.append(sentiment)

        if agent == 'agent_2':
            responses.append(agent_message)

        # Check for unique labels
        if sentiment not in labels:
            labels.append(sentiment)
num_classes = len(labels)

# Now you have training sentences (Agent 1's messages), labels (sentiments), responses (Agent 2's messages), and unique labels in the 'labels' list


In [4]:


training_sentences = []  # Agent 1' and 2 messages
training_labels = []     # Sentiments
responses = []           # Agent 2's responses
labels = []              # Unique sentiment labels

for message_id, message_data in data.items():
    for content in message_data['content']:
        agent_message = content['message']
        sentiment = content['sentiment']
        agent = content['agent']
        training_sentences.append(agent_message)
        training_labels.append(sentiment)

        if agent == 'agent_2':
            responses.append(agent_message)

        # Check for unique labels
        if sentiment not in labels:
            labels.append(sentiment)
num_classes = len(labels)

# Now you have training sentences (Agent 1's messages), labels (sentiments), responses (Agent 2's messages), and unique labels in the 'labels' list


In [5]:
print(len(labels))
print(len(training_sentences))
print(len(training_labels))
print(len(responses))

8
188378
188378
91174


In [6]:
training_sentences=training_sentences[:60000]
training_labels=training_labels[:60000]
responses=responses[:60000]

In [7]:
lbl_encoder = LabelEncoder()
lbl_encoder.fit(training_labels)
training_labels = lbl_encoder.transform(training_labels)

In [8]:
vocab_size = 40000
embedding_dim = 200
max_len = 512
oov_token = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded_sequences = pad_sequences(sequences, truncating='post', maxlen=max_len)

In [9]:
from sklearn.model_selection import train_test_split

# Split the data into training and temporary data (combining validation and testing)
training_sentences, temp_sentences, training_labels, temp_labels = train_test_split(
    padded_sequences, training_labels, test_size=0.3, random_state=42)

# Split the temporary data into validation and testing
validation_sentences, test_sentences, validation_labels, test_labels = train_test_split(
    temp_sentences, temp_labels, test_size=0.1, random_state=42)




In [10]:
model = Sequential()
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(LSTM(128, return_sequences=True))  # You can adjust the number of LSTM units as needed
model.add(Dropout(0.3))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(512))
model.add(Dense(256, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 512, 200)          8000000   
                                                                 
 lstm (LSTM)                 (None, 512, 128)          168448    
                                                                 
 dropout (Dropout)           (None, 512, 128)          0         
                                                                 
 lstm_1 (LSTM)               (None, 512, 256)          394240    
                                                                 
 dropout_1 (Dropout)         (None, 512, 256)          0         
                                                                 
 lstm_2 (LSTM)               (None, 512)               1574912   
                                                                 
 dense (Dense)               (None, 256)              

In [11]:
history = model.fit(
    training_sentences, training_labels,
    epochs=10,  # Adjust the number of epochs as needed
    validation_data=(validation_sentences, validation_labels),

)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
# Evaluate the model on the test data when you're ready
test_loss, test_accuracy = model.evaluate(test_sentences, test_labels)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Test Accuracy: 42.50%


In [15]:
from sklearn.metrics import classification_report

# Make predictions on the training data
train_predictions = model.predict(training_sentences)

# Convert the predictions from one-hot encoding to class labels
train_predicted_labels = np.argmax(train_predictions, axis=1)

# Generate a classification report for training data
train_class_report = classification_report(training_labels, train_predicted_labels, target_names=labels)

# Print the training classification report
print("Training Classification Report:\n", train_class_report)


Training Classification Report:
                         precision    recall  f1-score   support

Curious to dive deeper       0.79      0.68      0.73       168
                 Happy       0.89      0.95      0.92     17780
               Neutral       0.77      0.80      0.79       276
             Surprised       0.74      0.39      0.51       231
             Disgusted       0.88      0.82      0.85      6630
                   Sad       0.89      0.89      0.89      9394
               Fearful       0.73      0.93      0.82       614
                 Angry       0.95      0.84      0.90      6907

              accuracy                           0.89     42000
             macro avg       0.83      0.79      0.80     42000
          weighted avg       0.90      0.89      0.89     42000



In [14]:
from sklearn.metrics import classification_report

# Make predictions on the test data
predictions = model.predict(test_sentences)

# Convert the predictions from one-hot encoding to class labels
predicted_labels = np.argmax(predictions, axis=1)

# Generate a classification report
class_report = classification_report(test_labels, predicted_labels, target_names=labels)

# Print the classification report
print("Classification Report:\n", class_report)


Classification Report:
                         precision    recall  f1-score   support

Curious to dive deeper       0.00      0.00      0.00         9
                 Happy       0.57      0.59      0.58       752
               Neutral       0.00      0.00      0.00        11
             Surprised       0.00      0.00      0.00        16
             Disgusted       0.28      0.28      0.28       275
                   Sad       0.36      0.39      0.37       419
               Fearful       0.09      0.12      0.11        24
                 Angry       0.33      0.26      0.29       294

              accuracy                           0.42      1800
             macro avg       0.20      0.21      0.20      1800
          weighted avg       0.42      0.42      0.42      1800



In [13]:

# to save the trained model
model.save("chat_model")

import pickle

# to save the fitted tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# to save the fitted label encoder
with open('label_encoder.pickle', 'wb') as ecn_file:
    pickle.dump(lbl_encoder, ecn_file, protocol=pickle.HIGHEST_PROTOCOL)