In [None]:
!pip install nltk                                

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk

# nltk.word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look
# it up under the 'punkt_tab' resource while older ones use 'punkt'; because
# the install above is not pinned to a version, fetch both.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
!mkdir chatbot

In [None]:
%%writefile chatbot/intents.json

{
  "intents": [
    {
      "tag": "salutations", 
      "patterns": [
        "Salut",
        "Hé",
        "Comment allez-vous",
        "Est-ce que quelqu'un est là?",
        "Bonjour",
        "Bonsoir",
        "Bonne journée"
      ],
      "responses": [
        "Hé !",
        "Bonjour, merci de votre visite",
        "Bonjour, que puis-je faire pour vous ?",
        "Bonjour, comment puis-je vous aider ? "
      ]
    },
    {
      "tag": "Au revoir",   
      "patterns": ["Bye", "A plus tard", "Au revoir"],
      "responses": [
       "A plus tard, merci de votre visite",
      " Passez une bonne journée",
      " Au revoir ! Revenez bientôt."
      ]
    },
    {
      "tag": "Merci", 
      "patterns": ["Merci","C'est utile", "Merci beaucoup !"],
      "responses": ["Heureux de vous aider !", "De rien à la prochaine!", "C'est avec plaisir"]
    },
    {
      "tag": "Baccalauréat", 
      "patterns": [
        "Je viens d'obtenir mon baccalauréat",
        "J'ai le bac cette année",
        "Je suis nouveau bachelier"
      ],
      "responses": [ "Toutes mes félicitations ! Vous avez obtenu quel bac (scientifique, littéraire ou technique)"]
    },
    {
      "tag": "Bac scientifique", 
      "patterns": [
        "J'ai le baccalauréat scientifique?",
        "J'ai le bac S?",
        "J'ai le bac S1",
        "J'ai le bac S2"
      ],
      "responses": [
        "Très bien. Puis-je connaître dans quelle matière vous avez eu votre meilleure note ?"
      ]
    },
    {
      "tag": "Bac scientifique suite1",
      "patterns": [
        "J'ai une bonne note en Math",
        "J'ai une bonne note en Mathématique",
        "J'ai eu ma meilleure note en Math",
        "J'ai eu ma meilleure note en Physique"
      ],
      "responses": ["Voici les formations post bac qui collent le mieux à votre profil:\nMathématique et Informatique,\nGénie électromécanique,\nGénie civil,\nLes métiers du pétrole et du gaz,\nSystèmes, Réseaux et Télécoms,\nCyber-Sécurité et cryptographie,\nInformatique - Développement d’applications (web, mobile, gaming, etc.)\nL'admission aux grandes écoles militaires,\nL'admission à l'école nationale de l'aviation civile ...\nVous pourriez néanmoins consulter le site de Campusen(https://orientation.campusen.sn/formations/demandeurs),\npour plus d'information sur les universités et les filières disponibles dans ces dernières."]
    },
    {
      "tag": "Bac scientifique suite2",
      "patterns": [
        "J'ai une bonne note en Science de la vie et de la terre",
        "J'ai eu ma meilleure note en SVT",
        "J'ai une bonne note en chimie",
        "J'ai une bonne note en Physique chimie",
        "J'ai une bonne note en PC et SVT"
      ],
      "responses": ["Voici les formations post bac qui collent le mieux à votre profil:\nMédecine,\nPhysique chimie,\nAgriculture,\nBiologie,\nEcologie\nHydraulique\n\nAgroforesterie,\nAgronomie et Agroalimentaire\nAquaculture\nGénie électromécanique,\nGénie civil,\nEconomie et Gestion\nLes métiers du pétrole et du gaz, ...\nVous pourriez néanmoins consulter le site de Campusen(https://orientation.campusen.sn/formations/demandeurs),\npour plus d'information sur les universités et les filières disponibles dans ces dernières."]
    }

  ]
}


Writing chatbot/intents.json


In [None]:
%%writefile chatbot/nltk_utils.py

import numpy as np
import nltk
# nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenize(sentence):
    """
    Break a sentence into its individual tokens.

    The work is delegated to ``nltk.word_tokenize``; words, numbers and
    punctuation characters each come out as their own token.

    Args:
        sentence: the text to split.

    Returns:
        The tokens produced by ``nltk.word_tokenize`` for ``sentence``.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens


def stem(word):
    """
    Reduce a word to its stem (root form), ignoring case.

    The word is lower-cased first and then handed to the module-level
    ``stemmer``.

    Example:
        words = ["organize", "organizes", "organizing"]
        [stem(w) for w in words]  ->  ["organ", "organ", "organ"]

    NOTE(review): the module-level stemmer is NLTK's PorterStemmer, which is
    designed for English - confirm it is adequate for non-English input.
    """
    lowered = word.lower()
    return stemmer.stem(lowered)


def bag_of_words(tokenized_sentence, words):
    """
    Encode a tokenized sentence as a bag-of-words vector over ``words``.

    Each token of the sentence is stemmed with ``stem``; position ``i`` of the
    result is 1 when ``words[i]`` is one of those stems and 0 otherwise.

    Example:
        sentence = ["hello", "how", "are", "you"]
        words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
        bag   = [  0 ,    1 ,    0 ,   1 ,    0 ,    0 ,      0]

    Args:
        tokenized_sentence: iterable of string tokens (not yet stemmed).
        words: sequence of known vocabulary entries; they are compared as-is
            against the stemmed tokens, so they should already be stemmed.

    Returns:
        A 1-D ``numpy`` array of dtype ``float32`` and length ``len(words)``.
    """
    # A set gives O(1) membership tests, so the whole encoding is linear in
    # len(tokenized_sentence) + len(words) instead of their product.
    sentence_stems = {stem(token) for token in tokenized_sentence}
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, known_word in enumerate(words):
        if known_word in sentence_stems:
            bag[idx] = 1

    return bag

Writing chatbot/nltk_utils.py


In [None]:
%%writefile chatbot/model.py

import torch
import torch.nn as nn


class NeuralNet(nn.Module):
    """
    Small feed-forward network: three linear layers, with a ReLU after the
    first and the second one.

    Args:
        input_size: number of features in each input vector.
        hidden_size: width of both hidden layers.
        num_classes: number of output scores produced per input.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # The attribute names (l1, l2, l3) become the state_dict keys.
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw output of the last linear layer for ``x``."""
        hidden = self.relu(self.l1(x))
        hidden = self.relu(self.l2(hidden))
        # The last layer's output is returned as-is: no activation, no softmax.
        return self.l3(hidden)

Writing chatbot/model.py


In [None]:
%%writefile chatbot/train.py

import numpy as np
import random
import json

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from nltk_utils import bag_of_words, tokenize, stem
from model import NeuralNet

# Load the intent definitions. intents.json contains accented (non-ASCII)
# text, so the encoding is stated explicitly instead of being left to the
# platform default.
with open('chatbot/intents.json', 'r', encoding='utf-8') as f:
    intents = json.load(f)

all_words = []  # every token seen in any pattern (stemmed and de-duplicated below)
tags = []       # one entry per intent
xy = []         # (tokenized pattern, tag) pairs
# loop through each sentence in our intents patterns
for intent in intents['intents']:
    tag = intent['tag']
    # add to tag list
    tags.append(tag)
    for pattern in intent['patterns']:
        # tokenize each word in the sentence
        w = tokenize(pattern)
        # add to our words list
        all_words.extend(w)
        # add to xy pair
        xy.append((w, tag))

# Build the vocabulary: drop the punctuation tokens listed in ignore_words,
# then stem each remaining token (stem() also lower-cases it).
ignore_words = ['?', '.', '!']
all_words = [stem(w) for w in all_words if w not in ignore_words]
# remove duplicates and sort
all_words = sorted(set(all_words))
tags = sorted(set(tags))

print(len(xy), "patterns")
print(len(tags), "tags:", tags)
print(len(all_words), "unique stemmed words:", all_words)

# create training data
X_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    # X: bag of words for each pattern_sentence
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    # y: PyTorch CrossEntropyLoss needs only class labels, not one-hot,
    # so the label is the tag's index in the sorted tag list
    label = tags.index(tag)
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)

# Hyper-parameters
num_epochs = 1000
batch_size = 8
learning_rate = 0.001
input_size = len(X_train[0])   # one input feature per vocabulary entry
hidden_size = 8
output_size = len(tags)        # one output score per tag
print(input_size, output_size)

class ChatDataset(Dataset):
    """
    Dataset of (features, label) pairs for the intent classifier.

    Args:
        x_data: indexable collection of feature vectors. When omitted, the
            module-level ``X_train`` is used.
        y_data: indexable collection of labels, aligned with ``x_data``. When
            omitted, the module-level ``y_train`` is used.

    Raises:
        ValueError: if the features and labels differ in length.
    """

    def __init__(self, x_data=None, y_data=None):
        # Fall back to the module-level training arrays so that the existing
        # no-argument construction keeps working.
        self.x_data = X_train if x_data is None else x_data
        self.y_data = y_train if y_data is None else y_data
        if len(self.x_data) != len(self.y_data):
            raise ValueError(
                f"x_data and y_data must have the same length, "
                f"got {len(self.x_data)} and {len(self.y_data)}"
            )
        self.n_samples = len(self.x_data)

    # support indexing such that dataset[i] can be used to get i-th sample
    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    # we can call len(dataset) to return the size
    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.n_samples

# Seed the random number generators before the model is created and before
# the shuffling DataLoader is iterated, so that repeated runs start from the
# same weights and see the batches in the same order.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

dataset = ChatDataset()
train_loader = DataLoader(dataset=dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=0)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = NeuralNet(input_size, hidden_size, output_size).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
for epoch in range(num_epochs):
    for (words, labels) in train_loader:
        words = words.to(device)
        # CrossEntropyLoss expects class indices as int64 (long)
        labels = labels.to(dtype=torch.long).to(device)

        # Forward pass
        outputs = model(words)
        # if y would be one-hot, we must apply
        # labels = torch.max(labels, 1)[1]
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # `loss` here is the loss of the last batch of the epoch, not an average
    if (epoch+1) % 100 == 0:
        print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


print(f'final loss: {loss.item():.4f}')

# Everything needed to rebuild the model and encode new sentences later:
# the weights, the layer sizes, the vocabulary and the tag list.
data = {
"model_state": model.state_dict(),
"input_size": input_size,
"hidden_size": hidden_size,
"output_size": output_size,
"all_words": all_words,
"tags": tags
}

# Relative path: the checkpoint is written to the current working directory
FILE = "data.pth"
torch.save(data, FILE)

print(f'training complete. file saved to {FILE}')

Writing chatbot/train.py


In [None]:
%%writefile chatbot/chat.py

import random
import json

import torch

from model import NeuralNet
from nltk_utils import bag_of_words, tokenize

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# intents.json contains accented (non-ASCII) text, so the encoding is stated
# explicitly instead of being left to the platform default.
with open('chatbot/intents.json', 'r', encoding='utf-8') as json_data:
    intents = json.load(json_data)

FILE = "data.pth"
# map_location remaps the stored tensors onto the device available here, so a
# checkpoint saved on a GPU machine still loads on a CPU-only one.
# NOTE(review): torch.load unpickles the file - only load checkpoints that
# come from a trusted source.
data = torch.load(FILE, map_location=device)

input_size = data["input_size"]
hidden_size = data["hidden_size"]
output_size = data["output_size"]
all_words = data['all_words']
tags = data['tags']
model_state = data["model_state"]

# Rebuild the network with the saved layer sizes and restore its weights
model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()

bot_name = "Chat"
print("Let's chat! (type 'quit' to exit)")
while True:
    sentence = input("You: ")
    if sentence == "quit":
        break

    # Encode the input the same way as the training patterns: tokenize, then
    # bag-of-words over the saved vocabulary, shaped as a batch of one.
    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        output = model(X)
    _, predicted = torch.max(output, dim=1)

    tag = tags[predicted.item()]

    # Turn the raw scores into probabilities and answer only when the
    # predicted tag's probability exceeds 0.75
    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]
    if prob.item() > 0.75:
        for intent in intents['intents']:
            if tag == intent["tag"]:
                print(f"{bot_name}: {random.choice(intent['responses'])}")
    else:
        print(f"{bot_name}: I do not understand...")

Writing chatbot/chat.py


In [None]:
# Train the model. Run this from the directory that contains chatbot/:
# train.py opens 'chatbot/intents.json' relative to the working directory
# and saves the checkpoint there as data.pth.
!python chatbot/train.py

29 patterns
7 tags: ['Au revoir', 'Bac scientifique', 'Bac scientifique suite1', 'Bac scientifique suite2', 'Baccalauréat', 'Merci', 'salutations']
58 unique stemmed words: ['a', 'allez-v', 'anné', 'au', 'bac', 'baccalauréat', 'bacheli', 'beaucoup', 'bonjour', 'bonn', 'bonsoir', 'bye', "c'est", 'cett', 'chimi', 'comment', "d'obtenir", 'de', 'en', 'est', 'est-c', 'et', 'eu', 'hé', "j'ai", 'je', 'journé', 'la', 'le', 'là', 'ma', 'math', 'mathématiqu', 'meilleur', 'merci', 'mon', 'note', 'nouveau', 'pc', 'physiqu', 'plu', 'que', "quelqu'un", 'revoir', 's', 's1', 's2', 'salut', 'scienc', 'scientifiqu', 'sui', 'svt', 'tard', 'terr', 'une', 'util', 'vie', 'vien']
58 7
Epoch [100/1000], Loss: 0.9135
Epoch [200/1000], Loss: 0.1330
Epoch [300/1000], Loss: 0.0172
Epoch [400/1000], Loss: 0.0082
Epoch [500/1000], Loss: 0.0046
Epoch [600/1000], Loss: 0.0031
Epoch [700/1000], Loss: 0.0024
Epoch [800/1000], Loss: 0.0004
Epoch [900/1000], Loss: 0.0015
Epoch [1000/1000], Loss: 0.0005
final loss: 0.0005

In [None]:
# Start the interactive chat loop (type 'quit' to leave). chat.py loads
# data.pth and 'chatbot/intents.json' relative to the working directory.
!python chatbot/chat.py

Let's chat! (type 'quit' to exit)
You: quit
