In [1]:
!pip install -U sentence-transformers
!pip install datasets scikit-learn

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [5]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import torch
from datasets import Dataset

class EnhancedSBERT:
    """Wrapper around one SentenceTransformer offering fine-tuning, K-means
    clustering, semantic search and a small retrieval-style console chatbot.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the SentenceTransformer identified by ``model_name``."""
        # Embedding model used by every other method.
        self.model = SentenceTransformer(model_name)
        # KMeans fitted by the most recent cluster_sentences() call, else None.
        self.kmeans = None
        # Sentences the chatbot searches over.
        self.knowledge_base = []

    def fine_tune(self, train_samples, epochs=1, batch_size=16, warmup_steps=100):
        """Fine-tune the model on custom data using ``CosineSimilarityLoss``.

        Args:
            train_samples: Training examples wrapped in a shuffled ``DataLoader``
                (this notebook passes ``InputExample`` pairs with float labels).
            epochs: Number of epochs forwarded to ``model.fit``.
            batch_size: Batch size of the ``DataLoader``.
            warmup_steps: Warm-up steps forwarded to ``model.fit``. Defaults to
                the previously hard-coded 100; pass a smaller value for small
                training sets.

        Note:
            ``self.kmeans`` is not reset here. A clustering fitted before
            fine-tuning was fitted on embeddings from the old weights, so call
            ``cluster_sentences`` again before classifying.
        """
        train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size)
        train_loss = losses.CosineSimilarityLoss(self.model)

        self.model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=epochs,
            warmup_steps=warmup_steps,
        )

    def encode(self, sentences):
        """Return the model's embeddings for ``sentences`` as a NumPy array."""
        return self.model.encode(sentences, convert_to_numpy=True)

    def find_optimal_clusters(self, embeddings, max_clusters=10):
        """Find the number of clusters with the highest silhouette score.

        Candidates are 2 .. min(max_clusters, len(embeddings) - 1). The first
        candidate reaching the best score wins.

        Returns:
            The best-scoring cluster count, or 1 when no candidate can be
            evaluated (fewer than 3 embeddings, or ``max_clusters`` below 2).
        """
        upper_bound = min(max_clusters + 1, len(embeddings))
        best_k = 1  # fallback when the candidate range is empty
        best_score = None
        for n_clusters in range(2, upper_bound):
            kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
            cluster_labels = kmeans.fit_predict(embeddings)
            score = silhouette_score(embeddings, cluster_labels)
            # Strict '>' keeps the smallest k among equally good candidates.
            if best_score is None or score > best_score:
                best_k, best_score = n_clusters, score
        return best_k

    def cluster_sentences(self, sentences, n_clusters=None):
        """Cluster sentences using K-means and remember the fitted model.

        Args:
            sentences: Sentences to embed and cluster.
            n_clusters: Number of clusters; chosen by ``find_optimal_clusters``
                when ``None``.

        Returns:
            The cluster label of each sentence, in input order.
        """
        embeddings = self.encode(sentences)
        if n_clusters is None:
            n_clusters = self.find_optimal_clusters(embeddings)

        self.kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        return self.kmeans.fit_predict(embeddings)

    def classify_new_sentence(self, sentence):
        """Classify a new sentence into the existing clusters.

        Raises:
            ValueError: If ``cluster_sentences`` has not been called yet.
        """
        if self.kmeans is None:
            raise ValueError("Must cluster sentences before classification")

        embedding = self.encode([sentence])
        return self.kmeans.predict(embedding)[0]

    @staticmethod
    def _unit_rows(matrix):
        """Scale each row of a 2-D array to unit L2 norm (zero rows stay zero)."""
        matrix = np.asarray(matrix)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        # Avoid division by zero for all-zero rows.
        return matrix / np.where(norms == 0, 1, norms)

    def semantic_search(self, query, corpus, top_k=5, corpus_embeddings=None):
        """Rank ``corpus`` entries by cosine similarity to ``query``.

        Embeddings are L2-normalised before the dot product, so the scores are
        cosine similarities whether or not the model emits unit-length vectors.

        Args:
            query: Query sentence.
            corpus: Sequence of candidate sentences.
            top_k: Maximum number of results.
            corpus_embeddings: Optional precomputed embeddings of ``corpus``
                (one row per entry, same order). When given, the corpus is not
                re-encoded.

        Returns:
            List of ``(sentence, score)`` tuples, best match first; empty when
            ``corpus`` is empty.
        """
        if len(corpus) == 0:
            return []

        query_embedding = self._unit_rows(self.encode([query]))
        if corpus_embeddings is None:
            corpus_embeddings = self.encode(corpus)
        corpus_matrix = self._unit_rows(corpus_embeddings)

        cos_scores = np.dot(query_embedding, corpus_matrix.T)[0]
        top_results = np.argsort(cos_scores)[::-1][:top_k]

        return [(corpus[idx], cos_scores[idx]) for idx in top_results]

    def add_to_knowledge_base(self, sentences):
        """Append ``sentences`` to the knowledge base."""
        self.knowledge_base.extend(sentences)

    def load_knowledge_from_file(self, file_path):
        """Load knowledge from a UTF-8 text file, one sentence per line.

        Lines are stripped of surrounding whitespace; blank lines are skipped.
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            stripped_lines = [line.strip() for line in file]
        self.add_to_knowledge_base([line for line in stripped_lines if line])

    def chatbot(self):
        """Interactive console chatbot answering from the knowledge base.

        Reads lines with ``input`` until the user types 'quit' (any case) or
        stdin is closed. Each question is answered with the two best matches,
        unless the best score is below 0.3.
        """
        print("Chatbot: Hello! I'm here to help. Ask me anything or type 'quit' to exit.")

        # Nothing inside the loop modifies the knowledge base, so encode it once
        # instead of once per question.
        kb_embeddings = self.encode(self.knowledge_base) if self.knowledge_base else None

        while True:
            try:
                user_input = input("You: ").strip()
            except EOFError:
                # stdin closed: leave the loop as if the user had typed 'quit'.
                print("Chatbot: Goodbye!")
                break

            if user_input.lower() == 'quit':
                print("Chatbot: Goodbye!")
                break

            if not self.knowledge_base:
                print("Chatbot: I'm sorry, but my knowledge base is empty. Please add some information first.")
                continue

            results = self.semantic_search(
                user_input, self.knowledge_base, top_k=2, corpus_embeddings=kb_embeddings
            )

            if results[0][1] < 0.3:  # If the best match has low similarity
                print("Chatbot: I'm not sure about that. Can you please rephrase or ask something else?")
            else:
                print("Chatbot: Based on what I know:")
                for sentence, score in results:
                    print(f"- {sentence} (Confidence: {score:.2f})")
# Demo: fine-tune, cluster, classify, search, then start the chatbot.
if __name__ == "__main__":
    enhanced_sbert = EnhancedSBERT()

    # Fine-tuning example: sentence pairs labelled with a target similarity.
    train_examples = [
        InputExample(texts=['The cat sits on the mat', 'There is a cat on the mat'], label=0.8),
        InputExample(texts=['I love machine learning', 'I enjoy artificial intelligence'], label=0.7),
        InputExample(texts=['Python is great for data science', 'R is also used in data analysis'], label=0.6),
        InputExample(texts=['Neural networks are complex', 'Deep learning models can be intricate'], label=0.9),
        InputExample(texts=['Climate change is a global issue', 'Environmental protection is crucial'], label=0.8),
    ]
    enhanced_sbert.fine_tune(train_examples)

    # Expanded clustering example
    sentences = [
        "I love machine learning and artificial intelligence",
        "The cat sits on the mat while the dog sleeps",
        "Python is a great programming language for data science",
        "Deep learning is fascinating and has many applications",
        "Dogs are loyal animals and make great pets",
        "Climate change is affecting global weather patterns",
        "Renewable energy sources are becoming more popular",
        "The stock market fluctuates based on various factors",
        "Healthy eating habits contribute to overall well-being",
        "Space exploration has led to many technological advancements",
        "Virtual reality is changing the gaming industry",
        "Cybersecurity is crucial in the digital age",
        "Artificial intelligence is used in autonomous vehicles",
        "Quantum computing could revolutionize cryptography",
        "Blockchain technology has applications beyond cryptocurrency"
    ]
    clusters = enhanced_sbert.cluster_sentences(sentences)
    print("Clusters:", clusters)

    # Print sentences grouped by cluster (groups appear in first-seen order).
    cluster_dict = {}
    for sentence, cluster in zip(sentences, clusters):
        cluster_dict.setdefault(cluster, []).append(sentence)

    # Distinct loop names so the `sentences` list above is not overwritten.
    for cluster_id, members in cluster_dict.items():
        print(f"\nCluster {cluster_id}:")
        for member in members:
            print(f"- {member}")

    # Classify new sentences
    new_sentences = [
        "AI is revolutionizing industries across the globe",
        "Sustainable practices are essential for environmental conservation",
        "Big data analytics helps companies make informed decisions"
    ]
    for sentence in new_sentences:
        cluster = enhanced_sbert.classify_new_sentence(sentence)
        print(f"\nNew sentence '{sentence}' belongs to cluster {cluster}")

    # Expanded semantic search example
    corpus = [
        "Machine learning is a subset of artificial intelligence",
        "Neural networks are inspired by the human brain's structure",
        "Data science involves analyzing large datasets to extract insights",
        "Natural language processing deals with the interaction between computers and human language",
        "Computer vision enables machines to interpret and understand visual information from the world",
        "Reinforcement learning is a type of machine learning where agents learn to make decisions",
        "Deep learning uses multiple layers of neural networks to model complex patterns",
        "Artificial intelligence aims to create systems that can perform tasks requiring human intelligence",
        "Supervised learning uses labeled data to train models for prediction or classification",
        "Unsupervised learning finds patterns in data without predetermined labels",
        "Transfer learning applies knowledge from one task to improve performance on a related task",
        "Generative AI can create new content, such as images or text, based on training data",
        "Explainable AI focuses on making machine learning models more interpretable and transparent",
        "Edge computing processes data near the source, reducing latency and bandwidth usage",
        "Quantum machine learning combines quantum computing with machine learning algorithms"
    ]

    queries = [
        "What is artificial intelligence?",
        "How do neural networks work?",
        "What are the applications of machine learning?",
        "Explain the concept of deep learning",
        "What is the difference between supervised and unsupervised learning?"
    ]

    for query in queries:
        print(f"\nQuery: {query}")
        results = enhanced_sbert.semantic_search(query, corpus, top_k=3)
        print("Semantic search results:")
        for sentence, score in results:
            print(f"- {sentence} (Score: {score:.4f})")

    # Load knowledge from file; a missing file should not abort the whole cell.
    try:
        enhanced_sbert.load_knowledge_from_file('knowledge.txt')
    except FileNotFoundError:
        print("knowledge.txt not found; using the demo corpus as the knowledge base instead.")
        enhanced_sbert.add_to_knowledge_base(corpus)

    # Run the chatbot
    enhanced_sbert.chatbot()

Step,Training Loss


Clusters: [1 2 1 1 2 0 0 0 0 1 0 1 1 1 1]

Cluster 1:
- I love machine learning and artificial intelligence
- Python is a great programming language for data science
- Deep learning is fascinating and has many applications
- Space exploration has led to many technological advancements
- Cybersecurity is crucial in the digital age
- Artificial intelligence is used in autonomous vehicles
- Quantum computing could revolutionize cryptography
- Blockchain technology has applications beyond cryptocurrency

Cluster 2:
- The cat sits on the mat while the dog sleeps
- Dogs are loyal animals and make great pets

Cluster 0:
- Climate change is affecting global weather patterns
- Renewable energy sources are becoming more popular
- The stock market fluctuates based on various factors
- Healthy eating habits contribute to overall well-being
- Virtual reality is changing the gaming industry

New sentence 'AI is revolutionizing industries across the globe' belongs to cluster 1

New sentence 'Sustaina