In [1]:
# ChatBot
from nltk.tokenize import word_tokenize

# Step 1: Define intents and responses
# Keywords that trigger each intent.
intents = dict(
    greeting=["hello", "hi", "hey"],
    opening_hours=["open", "opening", "hours"],
    pricing=["price", "cost", "how much"],
)

# Canned reply for each intent, plus a "default" reply for unmatched input.
responses = dict(
    greeting="Hello! How can I help you today?",
    opening_hours="We are open from 9 AM to 6 PM, Monday to Friday.",
    pricing="Our pricing starts at $10 per month.",
    default="Sorry, I didn't understand that. Can you please rephrase?",
)

# Step 2: Build the chatbot
def _contains_phrase(tokens, phrase):
    """Return True if ``phrase`` (a list of words) occurs as a contiguous run in ``tokens``."""
    span = len(phrase)
    if span == 0:
        # An empty keyword must never match (it would otherwise match everything).
        return False
    return any(tokens[i:i + span] == phrase for i in range(len(tokens) - span + 1))


def rule_based_chatbot(user_input):
    """Return the canned response for the first intent whose keyword appears in the input.

    The input is lower-cased and tokenized with ``word_tokenize``. Intents are
    checked in the iteration order of ``intents``; the first match wins.

    Keywords are matched on whole tokens. A multi-word keyword such as
    "how much" is split on whitespace and must appear as consecutive tokens;
    a plain ``keyword in tokens`` test can never match it because the token
    list only contains single words.

    Args:
        user_input: Raw text typed by the user.

    Returns:
        ``responses[intent]`` for the first matching intent, otherwise
        ``responses["default"]``.
    """
    tokens = word_tokenize(user_input.lower())

    for intent, keywords in intents.items():
        for keyword in keywords:
            if _contains_phrase(tokens, keyword.split()):
                return responses[intent]

    return responses["default"]

# Step 3: Test the chatbot on an hours question, a pricing question and an unrelated request
for question in (
    "What are your opening hours?",
    "How much does it cost?",
    "Tell me a joke",
):
    print(rule_based_chatbot(question))

We are open from 9 AM to 6 PM, Monday to Friday.
Our pricing starts at $10 per month.
Sorry, I didn't understand that. Can you please rephrase?


In [2]:
# Classification model - LinearSVC
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# 1. Training data: (sentence, intent) pairs, grouped here by intent
training_data = [
    (sentence, intent)
    for intent, sentences in (
        ("greeting", ("hello", "hi", "hey", "good morning")),
        (
            "opening_hours",
            (
                "what are your opening hours?",
                "when do you open?",
                "what time do you close?",
            ),
        ),
        (
            "pricing",
            (
                "how much does it cost?",
                "what's the price?",
                "how expensive is it?",
                "give me the pricing details",
            ),
        ),
    )
    for sentence in sentences
]

# Split the pairs into parallel tuples of sentences and labels
texts = tuple(sentence for sentence, _ in training_data)
labels = tuple(intent for _, intent in training_data)

# 2. Vectorize the sentences with TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# 3. Train the LinearSVC model
# random_state is pinned so that a fresh "Restart & Run All" reproduces the same
# fitted model (per the scikit-learn docs it seeds the data shuffling used by the
# dual solver and has no effect when the primal solver is selected).
classifier = LinearSVC(random_state=42)
classifier.fit(X, labels)

# 4. Responses associated with each intent
responses = {
    "greeting": "Hello! How can I help you today?",
    "opening_hours": "We are open from 9 AM to 6 PM, Monday to Friday.",
    "pricing": "Our pricing starts at $10 per month.",
    "default": "Sorry, I didn't understand that. Can you please rephrase?"
}

# 5. Prediction and response
def chatbot_response(user_input, min_score=None):
    """Classify ``user_input`` with the fitted LinearSVC and return the matching reply.

    ``classifier.predict`` always returns one of the training labels, so the
    "default" reply can only be produced by the two guards below:

    * if the vectorized input has no non-zero feature (none of its words are in
      the TF-IDF vocabulary), the prediction would rest on the intercepts alone,
      so the default reply is returned instead;
    * if ``min_score`` is given and the best decision score is below it, the
      default reply is returned.

    Args:
        user_input: Raw text typed by the user.
        min_score: Optional threshold on the decision-function score of the
            best class. ``None`` (the default) disables the threshold.

    Returns:
        The response string for the predicted intent, or ``responses["default"]``.
    """
    user_vector = vectorizer.transform([user_input])

    # No known word at all: nothing to base a prediction on.
    if user_vector.nnz == 0:
        return responses["default"]

    if min_score is not None:
        scores = classifier.decision_function(user_vector)
        # Binary models return one signed score per sample; multi-class models
        # return one score per class.
        best_score = abs(scores).max() if scores.ndim == 1 else scores.max()
        if best_score < min_score:
            return responses["default"]

    predicted_intent = classifier.predict(user_vector)[0]
    return responses.get(predicted_intent, responses["default"])

# 6. Try the classifier-backed chatbot on three sample questions
for question in (
    "What are your opening hours?",
    "How much does it cost?",
    "Tell me a joke",
):
    print(chatbot_response(question))


We are open from 9 AM to 6 PM, Monday to Friday.
Our pricing starts at $10 per month.
Our pricing starts at $10 per month.


In [None]:
# Transformer (+utiliser le modèle qui dèjà existe)
!pip install transformers datasets torch nltk scikit-learn

import torch
import nltk
from nltk.tokenize import word_tokenize
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.preprocessing import LabelEncoder

# 1. Training data: (sentence, intent) pairs, grouped here by intent
training_data = [
    (sentence, intent)
    for intent, sentences in (
        ("greeting", ("hello", "hi", "hey", "good morning")),
        (
            "opening_hours",
            (
                "what are your opening hours?",
                "when do you open?",
                "what time do you close?",
            ),
        ),
        (
            "pricing",
            (
                "how much does it cost?",
                "what's the price?",
                "how expensive is it?",
                "give me the pricing details",
            ),
        ),
    )
    for sentence in sentences
]

# Split the pairs into parallel tuples of sentences and labels
texts = tuple(sentence for sentence, _ in training_data)
labels = tuple(intent for _, intent in training_data)

# Encode the string labels as integer class ids
label_encoder = LabelEncoder().fit(labels)
labels_encoded = label_encoder.transform(labels)
num_labels = len(label_encoder.classes_)  # number of distinct intents

# 2. Load the DistilBERT tokenizer
PRETRAINED_CHECKPOINT = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

# 3. Custom PyTorch dataset
class IntentDataset(Dataset):
    """Dataset of tokenized sentences paired with integer intent labels.

    All sentences are tokenized once, at construction time, with the
    module-level ``tokenizer``; each item is a dict holding that sentence's
    slice of every encoding tensor plus a ``"labels"`` entry.
    """

    def __init__(self, texts, labels):
        sentences = list(texts)
        self.encodings = tokenizer(
            sentences,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoding tensors and the label for example ``idx``."""
        sample = {}
        for name, tensor in self.encodings.items():
            sample[name] = tensor[idx]
        sample["labels"] = self.labels[idx]
        return sample

# Tokenize the training sentences and pair them with their encoded labels
dataset = IntentDataset(texts, labels_encoded)

# 4. Load DistilBERT with a classification head sized to the number of intents
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
)

# 5. Fine-tune the model with the Hugging Face Trainer
training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Training schedule
    num_train_epochs=5,
    per_device_train_batch_size=4,
    # Logging / checkpointing
    logging_steps=10,
    save_strategy="no",
)

trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

trainer.train()

# 6. Prediction and response
responses = {
    "greeting": "Hello! How can I help you today?",
    "opening_hours": "We are open from 9 AM to 6 PM, Monday to Friday.",
    "pricing": "Our pricing starts at $10 per month.",
    "default": "Sorry, I didn't understand that. Can you please rephrase?"
}

def chatbot_response(user_input, min_confidence=0.0):
    """Classify ``user_input`` with the fine-tuned model and return the matching reply.

    Args:
        user_input: Raw text typed by the user.
        min_confidence: Minimum softmax probability the best class must reach;
            below it the default reply is returned. The default of 0.0 never
            rejects a prediction.

    Returns:
        The response string for the predicted intent, or ``responses["default"]``
        for blank input, low-confidence predictions, or intents without a reply.
    """
    # Blank input carries no information to classify.
    if isinstance(user_input, str) and not user_input.strip():
        return responses["default"]

    # Inference mode: turns off dropout so that predictions are deterministic.
    model.eval()

    inputs = tokenizer(user_input, return_tensors="pt", padding=True, truncation=True)
    # Keep the inputs on the same device as the model's weights.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    # Class probabilities of the first (only) sequence in the batch.
    probabilities = torch.softmax(logits, dim=-1)[0]
    confidence, predicted_class = torch.max(probabilities, dim=-1)
    if confidence.item() < min_confidence:
        return responses["default"]

    predicted_intent = label_encoder.inverse_transform([predicted_class.item()])[0]

    return responses.get(predicted_intent, responses["default"])

# 7. Try the transformer-backed chatbot on three sample questions
for question in (
    "What are your opening hours?",
    "How much does it cost?",
    "Tell me a joke",
):
    print(chatbot_response(question))

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl.

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [4]:
# Markov chain 
from collections import defaultdict, Counter
import random

# Step 1: Split the text into words and build the transition table
def build_markov_chain(text, n=2):
    """Build an order-``n`` Markov chain with transition probabilities.

    Args:
        text: Source text; it is split on whitespace.
        n: Number of consecutive words that make up a state.

    Returns:
        A ``defaultdict`` mapping each state (tuple of ``n`` words) to a
        ``Counter`` whose values are the relative frequencies of the words
        that follow that state.
    """
    tokens = text.split()
    transitions = defaultdict(Counter)

    # Count, for every window of n words, the word that comes right after it.
    for start in range(len(tokens) - n):
        stop = start + n
        transitions[tuple(tokens[start:stop])][tokens[stop]] += 1

    # Normalize each state's counts so that they sum to 1.
    for followers in transitions.values():
        denominator = sum(followers.values())
        for word, count in list(followers.items()):
            followers[word] = count / denominator

    return transitions

In [6]:
# Step 1: Build the Markov Chain
def build_markov_chain(text, n=2):
    """Build an order-``n`` Markov chain holding raw follower counts.

    Args:
        text: Source text; it is split on whitespace.
        n: Number of consecutive words that make up a state.

    Returns:
        A nested ``defaultdict``: state (tuple of ``n`` words) -> following
        word -> number of times that word came right after the state.
    """
    tokens = text.split()
    chain = defaultdict(lambda: defaultdict(int))

    for start in range(len(tokens) - n):
        stop = start + n
        followers = chain[tuple(tokens[start:stop])]
        followers[tokens[stop]] += 1

    return chain

# Step 2: Generate text using the Markov Chain
def generate_text(markov_chain, start_state, length=10, n=2, rng=None):
    """Generate text by walking the Markov chain from ``start_state``.

    Args:
        markov_chain: Mapping of state (tuple of ``n`` words) to a mapping of
            following word -> weight (a count or a probability).
        start_state: Sequence of words to start from; they open the output.
        length: Target number of words, start words included.
        n: Number of words per state; must match the order the chain was
            built with.
        rng: Optional ``random.Random``-like object providing ``choices``.
            Pass a seeded instance for reproducible output; the module-level
            ``random`` functions are used when omitted.

    Returns:
        The generated words joined by single spaces. Generation stops early
        when the current state is unknown or has no recorded follower.
    """
    chooser = random if rng is None else rng
    # States are dictionary keys, so a list start state is converted to a tuple.
    current_state = tuple(start_state)
    text = list(current_state)

    for _ in range(length - n):
        if current_state not in markov_chain:
            break
        followers = markov_chain[current_state]
        # A defaultdict chain can hold states that were only looked up and
        # therefore have no follower; sampling from them would raise.
        if not followers:
            break
        next_word = chooser.choices(
            list(followers.keys()),
            weights=list(followers.values()),
        )[0]
        text.append(next_word)
        # text[-0:] would be the whole list, so order 0 needs the empty state.
        current_state = tuple(text[-n:]) if n > 0 else ()

    return ' '.join(text)

# Example usage: build an order-2 chain from a tiny corpus and generate up to 10 words
text = "The cat sat on the mat. The cat was happy. The dog was jealous."
start_state = ("The", "cat")
# The chain order has to equal the number of words in the start state.
markov_chain = build_markov_chain(text, n=len(start_state))
generated_text = generate_text(markov_chain, start_state, length=10, n=len(start_state))
print(generated_text)

The cat was happy. The dog was jealous.
