In [None]:
import json
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,GRU, Embedding,Dropout, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the JSON dataset

with open('/content/drive/MyDrive/train.json') as file:
    data = json.load(file)

In [None]:


training_sentences = []  # Agent 1' and 2 messages
training_labels = []     # Sentiments
responses = []           # Agent 2's responses
labels = []              # Unique sentiment labels

for message_id, message_data in data.items():
    for content in message_data['content']:
        agent_message = content['message']
        sentiment = content['sentiment']
        agent = content['agent']
        training_sentences.append(agent_message)
        training_labels.append(sentiment)

        if agent == 'agent_2':
            responses.append(agent_message)

        # Check for unique labels
        if sentiment not in labels:
            labels.append(sentiment)
num_classes = len(labels)

# Now you have training sentences (Agent 1's messages), labels (sentiments), responses (Agent 2's messages), and unique labels in the 'labels' list


In [None]:
print(len(labels))
print(len(training_sentences))
print(len(training_labels))
print(len(responses))

8
188378
188378
91174


In [None]:
training_sentences=training_sentences[:60000]
training_labels=training_labels[:60000]
responses=responses[:60000]

In [None]:
lbl_encoder = LabelEncoder()
lbl_encoder.fit(training_labels)
training_labels = lbl_encoder.transform(training_labels)

In [None]:
vocab_size = 40000
embedding_dim = 200
max_len = 512
oov_token = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded_sequences = pad_sequences(sequences, truncating='post', maxlen=max_len)

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and temporary data (combining validation and testing)
training_sentences, temp_sentences, training_labels, temp_labels = train_test_split(
    padded_sequences, training_labels, test_size=0.3, random_state=42)

# Split the temporary data into validation and testing
validation_sentences, test_sentences, validation_labels, test_labels = train_test_split(
    temp_sentences, temp_labels, test_size=0.1, random_state=42)




In [None]:
model = Sequential()
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(LSTM(128, return_sequences=True))  # You can adjust the number of LSTM units as needed
model.add(Dropout(0.3))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(512))
model.add(Dense(256, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 512, 200)          8000000   
                                                                 
 lstm (LSTM)                 (None, 512, 128)          168448    
                                                                 
 dropout (Dropout)           (None, 512, 128)          0         
                                                                 
 lstm_1 (LSTM)               (None, 512, 256)          394240    
                                                                 
 dropout_1 (Dropout)         (None, 512, 256)          0         
                                                                 
 lstm_2 (LSTM)               (None, 512)               1574912   
                                                                 
 dense (Dense)               (None, 256)              

In [None]:
history = model.fit(
    training_sentences, training_labels,
    epochs=5,  # Adjust the number of epochs as needed
    validation_data=(validation_sentences, validation_labels),

)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluate the model on the test data when you're ready
test_loss, test_accuracy = model.evaluate(test_sentences, test_labels)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Test Accuracy: 45.89%


In [None]:
from sklearn.metrics import classification_report

# Make predictions on the training data
train_predictions = model.predict(training_sentences)

# Convert the predictions from one-hot encoding to class labels
train_predicted_labels = np.argmax(train_predictions, axis=1)

# Generate a classification report for training data
train_class_report = classification_report(training_labels, train_predicted_labels, target_names=labels)

# Print the training classification report
print("Training Classification Report:\n", train_class_report)


Training Classification Report:
                         precision    recall  f1-score   support

Curious to dive deeper       0.00      0.00      0.00       168
                 Happy       0.75      0.93      0.83     17780
               Neutral       0.50      0.00      0.01       276
             Surprised       0.00      0.00      0.00       231
             Disgusted       0.75      0.67      0.71      6630
                   Sad       0.86      0.68      0.76      9394
               Fearful       0.48      0.65      0.55       614
                 Angry       0.82      0.70      0.75      6907

              accuracy                           0.77     42000
             macro avg       0.52      0.45      0.45     42000
          weighted avg       0.77      0.77      0.76     42000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.metrics import classification_report

# Make predictions on the test data
predictions = model.predict(test_sentences)

# Convert the predictions from one-hot encoding to class labels
predicted_labels = np.argmax(predictions, axis=1)

# Generate a classification report
class_report = classification_report(test_labels, predicted_labels, target_names=labels)

# Print the classification report
print("Classification Report:\n", class_report)


Classification Report:
                         precision    recall  f1-score   support

Curious to dive deeper       0.00      0.00      0.00         9
                 Happy       0.53      0.72      0.61       752
               Neutral       0.00      0.00      0.00        11
             Surprised       0.00      0.00      0.00        16
             Disgusted       0.33      0.30      0.31       275
                   Sad       0.42      0.31      0.36       419
               Fearful       0.20      0.12      0.15        24
                 Angry       0.32      0.24      0.28       294

              accuracy                           0.46      1800
             macro avg       0.23      0.21      0.21      1800
          weighted avg       0.43      0.46      0.43      1800



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:

# to save the trained model
model.save("chat_model")

import pickle

# to save the fitted tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# to save the fitted label encoder
with open('label_encoder.pickle', 'wb') as ecn_file:
    pickle.dump(lbl_encoder, ecn_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
pip install colorama

Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.6


In [33]:
import json
import numpy as np
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder

import colorama
colorama.init()
from colorama import Fore, Style, Back

import random
import pickle



def chat():
    # load trained model
    model = keras.models.load_model('chat_model')

    # load tokenizer object
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)

    # load label encoder object
    with open('label_encoder.pickle', 'rb') as enc:
        lbl_encoder = pickle.load(enc)

    # parameters
    max_len =512

    while True:
        print(Fore.LIGHTBLUE_EX + "User: " + Style.RESET_ALL, end="")
        inp = input()
        if inp.lower() == "quit":
            break

        result = model.predict(keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences([inp]),
                                             truncating='post', maxlen=max_len))
        sentiment = lbl_encoder.inverse_transform([np.argmax(result)])

        # Initialize a variable to track whether a response has been generated
        response_generated = False

        for id, message_data in data.items():
            for i in message_data['content']:
                if i['sentiment'] == sentiment:
                    print(Fore.GREEN + "ChatBot:" + Style.RESET_ALL, np.random.choice(responses))
                    response_generated = True
                    break  # Exit the inner loop once a response is generated

            if response_generated:
                break  # Exit the outer loop if a response is generated

          # print(Fore.GREEN + "ChatBot:" + Style.RESET_ALL,random.choice(responses))

print(Fore.YELLOW + "Start messaging with the bot (type quit to stop)!" + Style.RESET_ALL)
chat()

Start messaging with the bot (type quit to stop)!
User: hello
ChatBot: Have a good one!
User: i am huge fun of
ChatBot: yup, the plot is dumb and they ruined luke as a character, I also don't know why they translated star wars into navajo in 2013 that seems like a limited audience haha
User: mircrosoft
ChatBot: That was kind of them.  Have you followed Pokemon?  They have enough episodes to last for a couple years if you watch an episode every day.  They have 750 in fact!
User: how are you
ChatBot: She was not at fault.  In general I would not want to buy my kids fancy shoes or cars that would attract attention from bad people.
User: She was not at fault
ChatBot: Nokie started out playing a Fender Telecaster then switched to a Mosrite.  Fender is a prominent guitar maker.  I have not heard of Mosrite.  Have you?
User: quit
