# In [None]:
import json
import os
import re
from typing import Dict, List  # Ensure this is imported

import hdbscan
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
from transformers import AutoTokenizer, AutoModelForCausalLM


# Step 1: Load the Qwen tokenizer and causal LM (both come from the same checkpoint)
_QWEN_CHECKPOINT = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(_QWEN_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(_QWEN_CHECKPOINT)

# Step 2: Load and Chunk Dataset
def load_and_chunk_datasets(folder_path: str) -> pd.DataFrame:
    """
    Load every ``.json`` file in a folder and split each one into text chunks.

    Each file is parsed as JSON and the keys ``name``, ``uses``,
    ``side_effects`` and ``dosage`` are read from it; a missing key is treated
    as an empty string. Every file contributes exactly three chunks:
    name + uses, side effects, and dosage.

    Args:
        folder_path: Directory to scan (non-recursive). Files whose name does
            not end in ``.json`` are ignored.

    Returns:
        A DataFrame with one row per chunk and the columns ``text`` (the chunk
        text) and ``source`` (the originating file name). Both columns are
        present even when the folder contains no JSON files.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    rows = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and everything derived from it) reproducible between runs.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.json'):
            continue

        # Decode explicitly as UTF-8 rather than relying on the locale default.
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as f:
            content = json.load(f)

        name = content.get("name", "")
        uses = content.get("uses", "")
        side_effects = content.get("side_effects", "")
        dosage = content.get("dosage", "")

        # Chunk into smaller pieces
        rows.extend([
            {"text": f"Name: {name}. Uses: {uses}", "source": file_name},
            {"text": f"Side Effects: {side_effects}", "source": file_name},
            {"text": f"Dosage: {dosage}", "source": file_name},
        ])

    # Passing the columns explicitly guarantees the schema for an empty result,
    # so downstream column access does not fail with KeyError.
    return pd.DataFrame(rows, columns=["text", "source"])

# Load and chunk dataset
dataset_path = '/kaggle/input/random/datasets/microlabs_usa'
pharma_data = load_and_chunk_datasets(dataset_path)

# Step 3: Generate BERT Embeddings
# All chunks are encoded in a single batched call instead of one model call per
# row; the resulting 2-D array is split back into one vector per row.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
pharma_data['embedding'] = list(bert_model.encode(pharma_data['text'].tolist()))

# Step 4: Clustering with HDBSCAN
embeddings = list(pharma_data['embedding'])
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean')
pharma_data['cluster'] = clusterer.fit_predict(embeddings)

# HDBSCAN labels points it could not assign to any cluster as -1 (noise).
# Those rows are excluded from both the centroids and the neighbor indexes.
clustered_data = pharma_data[pharma_data['cluster'] != -1]  # Exclude noise

# Step 5: Calculate Cluster Centroids
# Centroids are computed from the noise-free rows only, so every centroid key
# has a matching entry in nn_models below. Including the -1 "cluster" here
# would let a query select a cluster that has no neighbor index (KeyError).
cluster_centroids = (
    clustered_data.groupby('cluster')['embedding']
    .apply(lambda vectors: sum(vectors) / len(vectors))
    .to_dict()
)

# Step 6: Nearest Neighbor Search
# Fit one NearestNeighbors index per cluster; each entry keeps the index
# together with the rows it was fitted on so neighbor positions can be mapped
# back to chunks.
nn_models = {}
for cluster_id in clustered_data['cluster'].unique():
    cluster_subset = clustered_data[clustered_data['cluster'] == cluster_id]
    nn = NearestNeighbors(n_neighbors=5, metric='euclidean')
    nn.fit(list(cluster_subset['embedding']))
    nn_models[cluster_id] = (nn, cluster_subset)

# Step 7: RAG-Powered Assistant
class PharmaKnowledgeAssistant:
    """
    Retrieval-augmented assistant over chunked medication text.

    General questions are answered by embedding the query, picking the closest
    cluster centroid, fetching the nearest chunks inside that cluster and
    passing them as context to a causal language model. Queries written as
    ``symptom: ... condition: ...`` bypass the model and are answered with a
    plain substring filter over the chunk text.
    """

    # Labels of the structured query form, matched case-insensitively.
    _SYMPTOM_LABEL = re.compile(r"symptom:", re.IGNORECASE)
    _CONDITION_LABEL = re.compile(r"condition:", re.IGNORECASE)

    def __init__(self, data: pd.DataFrame, nn_models: Dict, bert_model, cluster_centroids, tokenizer, model):
        """
        Store the collaborators used for retrieval and generation.

        Args:
            data: Chunk table; the methods read its ``text`` and ``source`` columns.
            nn_models: Maps a cluster id to ``(fitted neighbor index, rows of that cluster)``.
            bert_model: Object whose ``encode(text)`` returns the query embedding.
            cluster_centroids: Maps a cluster id to its centroid vector.
            tokenizer: Tokenizer paired with ``model``.
            model: Language model exposing ``generate``.
        """
        self.data = data
        self.nn_models = nn_models
        self.bert_model = bert_model
        self.cluster_centroids = cluster_centroids
        self.tokenizer = tokenizer
        self.model = model

    def retrieve_relevant_chunks(self, query: str) -> List[Dict]:
        """
        Retrieve relevant chunks using BERT embeddings and nearest neighbor search.

        Args:
            query: Free-text user question.

        Returns:
            The rows (as dicts) of the nearest neighbors inside the cluster
            whose centroid is closest to the query, or an empty list when no
            cluster has a neighbor index.
        """
        query_embedding = self.bert_model.encode(query)

        # Squared Euclidean distance to each centroid. Only clusters that have
        # a neighbor index are candidates: a centroid without one (e.g. the
        # noise label) cannot be searched and would raise KeyError below.
        cluster_distances = {
            cluster_id: float(((query_embedding - centroid) ** 2).sum())
            for cluster_id, centroid in self.cluster_centroids.items()
            if cluster_id in self.nn_models
        }
        if not cluster_distances:
            return []
        best_cluster = min(cluster_distances, key=cluster_distances.get)

        # Retrieve nearest neighbors within the best cluster; the returned
        # positions index into the subset the model was fitted on.
        nn, cluster_subset = self.nn_models[best_cluster]
        _, indices = nn.kneighbors([query_embedding])
        return cluster_subset.iloc[indices[0]].to_dict(orient='records')

    def generate_response(self, query: str, context_chunks: List[Dict]) -> str:
        """
        Generate a response using the Qwen 2.5-1.5B-Instruct model, augmented with retrieved context.

        Args:
            query: The user's question.
            context_chunks: Retrieved chunks; only each chunk's ``text`` is used.

        Returns:
            The newly generated text (the prompt itself is not echoed back), or
            an ``"Error generating response: ..."`` message if tokenization or
            generation fails.
        """
        context_text = "\n".join([chunk['text'] for chunk in context_chunks])
        prompt = (
            f"Context:\n{context_text}\n\n"
            f"User Query:\n{query}\n\n"
            "Answer:"
        )
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt")
            # max_new_tokens budgets the answer only. max_length would also
            # count the prompt, which with several context chunks can leave
            # little or no room for generated tokens. Unpacking the encoding
            # forwards the attention mask together with the input ids.
            outputs = self.model.generate(**inputs, max_new_tokens=200, num_return_sequences=1)
            # The output sequence starts with the prompt tokens; decode only
            # what was generated after them.
            prompt_length = inputs["input_ids"].shape[-1]
            return self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        except Exception as e:
            # Best effort: surface the failure as text so the chat loop keeps running.
            return f"Error generating response: {str(e)}"

    def recommend_safe_medication(self, symptom: str, condition: str) -> str:
        """
        Generate medication recommendations based on symptoms and conditions dynamically.

        Args:
            symptom: Text that must occur in a chunk (case-insensitive, literal).
            condition: Text that must also occur in the same chunk.

        Returns:
            One ``"Product from <source>: <text>"`` line per matching chunk, or
            a fixed "not found" message when nothing matches.
        """
        # regex=False matches the user's words literally; as regular
        # expressions, input such as "(" would raise and "." would over-match.
        # na=False treats a missing text as a non-match.
        chunk_text = self.data['text'].str
        relevant_data = self.data[
            chunk_text.contains(symptom, case=False, regex=False, na=False) &
            chunk_text.contains(condition, case=False, regex=False, na=False)
        ]

        if relevant_data.empty:
            return "No suitable medication found for the given symptoms and conditions."

        # If relevant data is found, generate a list of medications
        return "\n".join(
            f"Product from {source}: {text}"
            for text, source in zip(relevant_data['text'], relevant_data['source'])
        )

    def answer_query(self, query: str) -> str:
        """
        Answer the user's query using RAG or external generation.

        A query containing both a ``symptom:`` and a ``condition:`` label (in
        any letter case) is routed to the keyword-based recommendation; any
        other query goes through retrieval followed by generation.

        Args:
            query: Raw user input.

        Returns:
            The recommendation text or the generated answer.
        """
        if self._SYMPTOM_LABEL.search(query) and self._CONDITION_LABEL.search(query):
            # Handle symptom-condition recommendation. Splitting uses the same
            # case-insensitive patterns as the detection above, so input such
            # as "Symptom: ... Condition: ..." no longer fails on parts[1].
            parts = self._CONDITION_LABEL.split(query)
            symptom = self._SYMPTOM_LABEL.sub("", parts[0]).strip()
            condition = parts[1].strip()
            return self.recommend_safe_medication(symptom, condition)

        # Retrieve relevant chunks for general queries
        relevant_chunks = self.retrieve_relevant_chunks(query)
        return self.generate_response(query, relevant_chunks)

# Step 8: Interactive Assistant
def interactive_assistant():
    """
    Main interactive loop for the assistant.

    Builds a PharmaKnowledgeAssistant from the module-level data, indexes and
    models, then repeatedly reads a query from standard input and prints the
    answer. The loop ends when the user types ``exit`` (any letter case,
    surrounding whitespace ignored), or when input ends or is interrupted.
    """
    assistant = PharmaKnowledgeAssistant(pharma_data, nn_models, bert_model, cluster_centroids, tokenizer, model)
    print("Welcome to the Pharma Knowledge Assistant! Type 'exit' to quit.")

    while True:
        try:
            query = input("Your Query: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C: leave the loop cleanly instead of
            # terminating with a traceback.
            print("\nGoodbye!")
            break

        if not query:
            # Nothing was typed; prompt again rather than querying the model
            # with an empty string.
            continue
        if query.lower() == 'exit':
            print("Goodbye!")
            break

        response = assistant.answer_query(query)
        print("\nAssistant Response:")
        print(response)

# Run the assistant: starts the interactive prompt loop, which blocks on
# input() until the user types 'exit'.
interactive_assistant()