In [1]:
!pip install -U sentence-transformers
!pip install datasets scikit-learn



In [4]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import torch
from datasets import Dataset

class EnhancedSBERT:
    """Sentence-embedding helper built around a SentenceTransformer model.

    Bundles fine-tuning, K-means clustering, cosine-similarity semantic
    search and a small console chatbot backed by an in-memory list of
    sentences.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the sentence-transformer model called *model_name*."""
        self.model = SentenceTransformer(model_name)
        # Fitted KMeans instance; stays None until cluster_sentences() runs.
        self.kmeans = None
        # Sentences the chatbot searches over.
        self.knowledge_base = []

    def fine_tune(self, train_samples, epochs=1, batch_size=16, warmup_steps=100):
        """Fine-tune the model on custom sentence pairs.

        Args:
            train_samples: Training examples (the file passes a list of
                ``InputExample`` pairs with a float similarity label).
            epochs: Number of passes over ``train_samples``.
            batch_size: Batch size used by the DataLoader.
            warmup_steps: Learning-rate warmup steps handed to ``fit``.
                Defaults to the previously hard-coded 100; lower it for
                small datasets whose total step count is below 100.
        """
        train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size)
        train_loss = losses.CosineSimilarityLoss(self.model)

        self.model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=epochs,
            warmup_steps=warmup_steps,
        )

    def encode(self, sentences):
        """Return the model's embeddings for *sentences* as a NumPy array."""
        return self.model.encode(sentences, convert_to_numpy=True)

    @staticmethod
    def _l2_normalize(vectors):
        """Scale each row of a 2-D array to unit Euclidean length."""
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        # Clamp the divisor so an all-zero row stays zero instead of NaN.
        return vectors / np.maximum(norms, 1e-12)

    def find_optimal_clusters(self, embeddings, max_clusters=10):
        """Pick the cluster count with the best silhouette score.

        Candidate counts run from 2 up to ``min(max_clusters,
        len(embeddings) - 1)``; on ties the smallest count wins.

        Returns:
            The chosen number of clusters, or 1 when no candidate count can
            be scored (fewer than three embeddings, or ``max_clusters < 2``).

        Raises:
            ValueError: If ``embeddings`` is empty.
        """
        n_samples = len(embeddings)
        if n_samples == 0:
            raise ValueError("Cannot choose a cluster count for zero embeddings")

        # The silhouette score needs at least 2 and at most n_samples - 1 labels.
        largest_k = min(max_clusters, n_samples - 1)
        if largest_k < 2:
            return 1

        best_k = 2
        best_score = float("-inf")
        for n_clusters in range(2, largest_k + 1):
            kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
            cluster_labels = kmeans.fit_predict(embeddings)
            score = silhouette_score(embeddings, cluster_labels)
            # Strict comparison keeps the first (smallest) k on ties.
            if score > best_score:
                best_k, best_score = n_clusters, score
        return best_k

    def cluster_sentences(self, sentences, n_clusters=None):
        """Cluster *sentences* with K-means and return their cluster labels.

        When ``n_clusters`` is None the count is chosen by
        ``find_optimal_clusters``. The fitted model is kept on ``self.kmeans``
        so new sentences can be classified afterwards.
        """
        embeddings = self.encode(sentences)
        if n_clusters is None:
            n_clusters = self.find_optimal_clusters(embeddings)

        self.kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        return self.kmeans.fit_predict(embeddings)

    def classify_new_sentence(self, sentence):
        """Return the index of the existing cluster closest to *sentence*.

        Raises:
            ValueError: If ``cluster_sentences`` has not been called yet.
        """
        if self.kmeans is None:
            raise ValueError("Must cluster sentences before classification")

        embedding = self.encode([sentence])
        return self.kmeans.predict(embedding)[0]

    def semantic_search(self, query, corpus, top_k=5, corpus_embeddings=None):
        """Rank *corpus* entries by cosine similarity to *query*.

        Args:
            query: Text to search for.
            corpus: Sequence of candidate sentences.
            top_k: Maximum number of hits to return.
            corpus_embeddings: Optional precomputed embeddings for ``corpus``
                (one row per entry, same order). When omitted the corpus is
                encoded on every call.

        Returns:
            Up to ``top_k`` ``(sentence, score)`` tuples, best match first.
            An empty corpus yields an empty list.
        """
        if len(corpus) == 0:
            return []

        query_embedding = self.encode([query])
        if corpus_embeddings is None:
            corpus_embeddings = self.encode(corpus)

        # Normalise both sides so the dot product really is cosine similarity,
        # whether or not the underlying model emits unit-length vectors.
        query_unit = self._l2_normalize(np.asarray(query_embedding))
        corpus_unit = self._l2_normalize(np.asarray(corpus_embeddings))

        cos_scores = np.dot(query_unit, corpus_unit.T)[0]
        top_results = np.argsort(cos_scores)[::-1][:top_k]

        return [(corpus[idx], float(cos_scores[idx])) for idx in top_results]

    def add_to_knowledge_base(self, sentences):
        """Append every item of *sentences* to the knowledge base."""
        self.knowledge_base.extend(sentences)

    def chatbot(self, min_score=0.3):
        """Run an interactive console chatbot over the knowledge base.

        Reads lines until the user types ``quit`` (case-insensitive) or input
        ends. Each question is answered with the two closest knowledge-base
        sentences unless the best similarity is below ``min_score``.
        """
        print("Chatbot: Hello! I'm here to help. Ask me anything or type 'quit' to exit.")
        kb_embeddings = None
        while True:
            try:
                user_input = input("You: ").strip()
            except EOFError:
                # Input stream closed (e.g. piped stdin): leave cleanly.
                print("Chatbot: Goodbye!")
                break

            if user_input.lower() == 'quit':
                print("Chatbot: Goodbye!")
                break

            if not self.knowledge_base:
                print("Chatbot: I'm sorry, but my knowledge base is empty. Please add some information first.")
                continue

            # Encode the knowledge base once and reuse it across questions;
            # re-encode only if its size no longer matches the cache.
            if kb_embeddings is None or len(kb_embeddings) != len(self.knowledge_base):
                kb_embeddings = self.encode(self.knowledge_base)

            results = self.semantic_search(
                user_input,
                self.knowledge_base,
                top_k=2,
                corpus_embeddings=kb_embeddings,
            )

            if results[0][1] < min_score:  # If the best match has low similarity
                print("Chatbot: I'm not sure about that. Can you please rephrase or ask something else?")
            else:
                print("Chatbot: Based on what I know:")
                for sentence, score in results:
                    print(f"- {sentence} (Confidence: {score:.2f})")

# Example usage: fine-tune, cluster, classify, search, then chat.
# All demo data is declared first; the calls that use it follow below.

# Sentences grouped by the clustering demo.
sentences = [
    "I love machine learning",
    "The cat sits on the mat",
    "Python is a great programming language",
    "Deep learning is fascinating",
    "Dogs are loyal animals",
]

# Sentence assigned to one of the fitted clusters.
new_sentence = "AI is revolutionizing industries"

# Candidate sentences and query for the semantic search demo.
corpus = [
    "Machine learning is a subset of AI",
    "Neural networks are inspired by the human brain",
    "Data science involves analyzing large datasets",
    "Natural language processing deals with text data",
]
query = "What is artificial intelligence?"

# Facts the chatbot answers from.
knowledge = [
    "Machine learning is a subset of artificial intelligence.",
    "Python is a popular programming language for data science.",
    "Neural networks are inspired by the human brain.",
    "Deep learning is a subset of machine learning.",
    "Natural language processing deals with the interaction between computers and human language.",
    "Reinforcement learning is learning through interaction with an environment.",
    "Computer vision is a field of AI that trains computers to interpret visual world.",
    "The Turing test is a test of a machine's ability to exhibit intelligent behavior.",
    "Big data refers to extremely large datasets that may be analyzed computationally.",
    "Supervised learning is the machine learning task of learning a function that maps an input to an output based on example input-output pairs.",
]

enhanced_sbert = EnhancedSBERT()

# Fine-tuning: each entry is (sentence pair, target similarity).
train_examples = [
    InputExample(texts=list(pair), label=similarity)
    for pair, similarity in (
        (('The cat sits on the mat', 'There is a cat on the mat'), 0.8),
        (('I love machine learning', 'I enjoy artificial intelligence'), 0.7),
        # Add more examples...
    )
]
enhanced_sbert.fine_tune(train_examples)

# Clustering
clusters = enhanced_sbert.cluster_sentences(sentences)
print("Clusters:", clusters)

# Classification of an unseen sentence against the fitted clusters
cluster = enhanced_sbert.classify_new_sentence(new_sentence)
print(f"New sentence '{new_sentence}' belongs to cluster {cluster}")

# Semantic search
results = enhanced_sbert.semantic_search(query, corpus)
print("Semantic search results:")
for sentence, score in results:
    print(f"{sentence}: {score:.4f}")

# Load the chatbot's knowledge and start the interactive loop
enhanced_sbert.add_to_knowledge_base(knowledge)
enhanced_sbert.chatbot()

Step,Training Loss


Clusters: [1 0 1 1 0]
New sentence 'AI is revolutionizing industries' belongs to cluster 1
Semantic search results:
Machine learning is a subset of AI: 0.5993
Neural networks are inspired by the human brain: 0.4028
Data science involves analyzing large datasets: 0.2497
Natural language processing deals with text data: 0.2215
Chatbot: Hello! I'm here to help. Ask me anything or type 'quit' to exit.
You: tell me about artificial intelligence
Chatbot: Based on what I know:
- Machine learning is a subset of artificial intelligence. (Confidence: 0.61)
- Computer vision is a field of AI that trains computers to interpret visual world. (Confidence: 0.56)
You: what do you know about reinforcement learning
Chatbot: Based on what I know:
- Reinforcement learning is learning through interaction with an environment. (Confidence: 0.68)
- Machine learning is a subset of artificial intelligence. (Confidence: 0.41)
You: quit
Chatbot: Goodbye!
