In [None]:
!pip install cassandra-driver


Combining Data for Preprocessing

In [None]:
pip install ace-tools

In [None]:
import os

import pandas as pd

# Directory that holds the three source CSVs and receives the unified output.
# Set the RAG_DATA_DIR environment variable to point at your own copy of the
# data; the fallback is the original hard-coded location.
base_path = os.environ.get("RAG_DATA_DIR", "/Users/bhavikpatel/Desktop/RAG/RAG_Data/")

# Load the individual CSV files
instructions_df = pd.read_csv(os.path.join(base_path, "instructions.csv"))
html_components_df = pd.read_csv(os.path.join(base_path, "html_components.csv"))
css_styles_df = pd.read_csv(os.path.join(base_path, "css_styles.csv"))

# Every source row becomes one record with the same four fields
# (ID, Source_Type, Source_ID, Text_Representation) so the three tables can be
# embedded and searched as a single corpus.
unified_data = []

# Instructions: the instruction text itself is the text to embed.
unified_data.extend(
    {
        "ID": f"Instruction-{row['instruction_id']}",
        "Source_Type": "Instruction",
        "Source_ID": row["instruction_id"],
        "Text_Representation": row["text"],
    }
    for _, row in instructions_df.iterrows()
)

# HTML components: name, attributes and context are flattened into one sentence.
unified_data.extend(
    {
        "ID": f"HTML-{row['component_id']}",
        "Source_Type": "HTML_Component",
        "Source_ID": row["component_id"],
        "Text_Representation": (
            f"Name: {row['name']}, Attributes: {row['attributes']}, Context: {row['component_context']}."
        ),
    }
    for _, row in html_components_df.iterrows()
)

# CSS styles: selector, properties and description are flattened the same way.
unified_data.extend(
    {
        "ID": f"CSS-{row['style_id']}",
        "Source_Type": "CSS_Style",
        "Source_ID": row["style_id"],
        "Text_Representation": (
            f"Selector: {row['selector']}, Properties: {row['properties']}, "
            f"Context: {row['description']}."
        ),
    }
    for _, row in css_styles_df.iterrows()
)

# Convert the unified data to a DataFrame
unified_df = pd.DataFrame(unified_data)

# Save the unified DataFrame next to the source files
output_path = os.path.join(base_path, "unified_data.csv")
unified_df.to_csv(output_path, index=False)

# Display the first few rows of the dataframe to verify the content
print("Unified Data for Embedding:")
print(unified_df.head())

# Confirm the path of the saved file
print(f"Data saved to {output_path}")


In [None]:
import pandas as pd

# Re-read the unified CSV from disk and show a short preview plus its columns.
# Any failure while reading or displaying is reported instead of raised.
file_path = "/Users/bhavikpatel/Desktop/RAG/RAG_Data/unified_data.csv"  # Update this path to the actual location of the file
try:
    unified_df = pd.read_csv(file_path)
    print("Unified CSV loaded successfully. Here's a preview:")
    display(unified_df.head())
    print("\nColumn Names:")
    print(list(unified_df.columns))
except Exception as err:
    print(f"Error loading file: {err}")


Creating Embedding

In [None]:
pip install sentence-transformers pandas


In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np


In [None]:
# Location of the unified CSV written by the combining step
file_path = "/Users/bhavikpatel/Desktop/RAG/RAG_Data/unified_data.csv"

# Read it into the DataFrame that the embedding cells below operate on
data = pd.read_csv(filepath_or_buffer=file_path)

# Print the leading rows as a quick sanity check
print(data.head())


Load SBERT model

In [None]:
# Instantiate the 'all-MiniLM-L6-v2' SentenceTransformer used to embed the texts
model = SentenceTransformer("all-MiniLM-L6-v2")

# Reaching this line means the constructor returned without raising
print("Model loaded successfully!")


Generating Embeddings

In [None]:
# Generate embeddings for the 'Text_Representation' column.
# All texts are passed to a single encode() call so the model can batch them,
# instead of invoking the model once per row through Series.apply.
# str() is kept from the original so missing values are embedded as "nan".
# NOTE(review): batched inference may differ from per-row inference by
# floating-point noise — confirm that is acceptable for your use.
texts = [str(text) for text in data['Text_Representation']]
embedding_matrix = model.encode(texts, show_progress_bar=True)

# Store one plain Python list per row (same cell type as before) and align on
# the existing index so the assignment cannot be reordered.
data['Embedding'] = pd.Series(embedding_matrix.tolist(), index=data.index, dtype=object)

# Confirm embeddings are generated
print("Embeddings generated successfully!")


Save the file

In [None]:
# Destination for the DataFrame that now carries the 'Embedding' column
output_path = "/Users/bhavikpatel/Desktop/RAG/RAG_Data/unified_data_with_embeddings.csv"

# Write without the index column
data.to_csv(path_or_buf=output_path, index=False)

print(f"Data with embeddings saved to: {output_path}")


Test the embedding

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
import ast

# Path to the CSV file with embeddings
file_path = "/Users/bhavikpatel/Desktop/RAG/RAG_Data/unified_data_with_embeddings.csv"

# Load the data
data = pd.read_csv(file_path)

# The CSV stores each embedding as the text of a Python list. Parse it with
# ast.literal_eval, which accepts only literals, instead of eval, which would
# execute any expression that happens to be in the file.
data['Embedding'] = data['Embedding'].apply(lambda x: np.array(ast.literal_eval(x)))

# Display a snippet of the data
print(data.head())

# Load the SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define a sample query
query = "ecommerce Generate a responsive navigation bar with call to action button"

# Generate embedding for the query; reshape to (1, dim) because
# cosine_similarity expects 2-D inputs
query_embedding = model.encode(query).reshape(1, -1)

# Confirm the query embedding
print("Query embedding generated successfully!")

# Stack the per-row vectors into one 2-D matrix. A real list is passed because
# np.vstack is documented to take a sequence of arrays.
embeddings = np.vstack(data['Embedding'].tolist())

# Compute cosine similarities between the query and all embeddings
similarities = cosine_similarity(query_embedding, embeddings).flatten()

# Add the similarity scores to the DataFrame
data['Similarity'] = similarities

# Display the top 5 similar records
top_matches = data.sort_values(by='Similarity', ascending=False).head(5)

print("Top Matches:")
print(top_matches[['ID', 'Text_Representation', 'Similarity']])


# Creating connection and loading files in chunks for vectorization in AstraDB

In [None]:
!pip install cassandra-driver



In [None]:
pip install --upgrade astrapy

#Supply your Astra DB application token to the client in the cell below

In [None]:
import os

from astrapy import DataAPIClient

# Initialize the client.
# The application token and API endpoint are read from the environment so the
# secret never has to be pasted into (and saved with) the notebook. The
# fallbacks are the values this cell originally hard-coded.
client = DataAPIClient(os.environ.get("ASTRA_DB_APPLICATION_TOKEN", ""))
db = client.get_database_by_api_endpoint(
    os.environ.get(
        "ASTRA_DB_API_ENDPOINT",
        "https://0b0963cb-6b3a-4145-86c2-be7762e6cd9c-westus3.apps.astra.datastax.com",
    )
)

print(f"Connected to Astra DB: {db.list_collection_names()}")

### DELETE previous Chunks

In [None]:
from astrapy import DataAPIClient

# Initialize the DataAPIClient.
# The application token and API endpoint are read from the environment so the
# secret never has to be pasted into the notebook; the fallbacks are the values
# this cell originally hard-coded.
import os

client = DataAPIClient(os.environ.get("ASTRA_DB_APPLICATION_TOKEN", "your client"))
db = client.get_database_by_api_endpoint(
    os.environ.get(
        "ASTRA_DB_API_ENDPOINT",
        "https://0b0963cb-6b3a-4145-86c2-be7762e6cd9c-westus3.apps.astra.datastax.com",
    )
)
collection_name = "ssai_vectordb"

# Retrieve the collection
collection = db.get_collection(collection_name)

# Helper that empties a collection
def delete_all_documents(collection):
    """Remove every document from ``collection`` and print the outcome.

    ``delete_many`` is called with an empty filter. Any exception raised while
    deleting or reporting is caught and printed; nothing is returned.
    """
    match_everything = {}
    try:
        outcome = collection.delete_many(match_everything)
        print(f"All documents deleted successfully! Deleted count: {outcome.deleted_count}")
    except Exception as err:
        print(f"Error while deleting documents: {err}")

# Delete all documents
# WARNING: destructive. delete_all_documents issues delete_many({}) against
# `collection`, so every execution of this cell (including "Run All") empties it.
delete_all_documents(collection)


### Upload the data chunk by chunk

In [None]:
import pandas as pd
import time
from astrapy import DataAPIClient

# Initialize the DataAPIClient.
# The application token and API endpoint are read from the environment so the
# secret never has to be pasted into the notebook; the fallbacks are the values
# this cell originally hard-coded.
import os

client = DataAPIClient(os.environ.get("ASTRA_DB_APPLICATION_TOKEN", "your client"))
db = client.get_database_by_api_endpoint(
    os.environ.get(
        "ASTRA_DB_API_ENDPOINT",
        "https://0b0963cb-6b3a-4145-86c2-be7762e6cd9c-westus3.apps.astra.datastax.com",
    )
)

# Define your collection name and file path
collection_name = "ssai_vectordb"
file_path = "/Users/bhavikpatel/Desktop/RAG/RAG_Data/unified_data_with_embeddings.csv"
chunk_size = 500  # Reduce the number of rows per chunk to avoid server overload

# Retrieve the collection object
collection = db.get_collection(collection_name)

# Function to upload a chunk to Astra DB with retry logic
def upload_chunk_to_astra(chunk, collection):
    """Convert the rows of ``chunk`` to documents and insert them in bulk.

    Parameters:
        chunk: DataFrame with the columns ``ID``, ``Source_Type``,
            ``Source_ID``, ``Text_Representation`` and ``Embedding``
            (the embedding stored as the text of a Python list).
        collection: object exposing ``insert_many(documents)``.

    Returns:
        True when the documents were inserted, False when no row could be
        converted or every insert attempt failed. Earlier versions returned
        None; callers that ignore the result are unaffected.

    Rows that cannot be converted are reported and skipped. The bulk insert is
    attempted up to three times with a two-second pause between attempts.
    """
    import ast  # function-scope import keeps this cell self-contained

    documents = []
    for _, row in chunk.iterrows():
        try:
            # Prepare the document to upload
            document = {
                "id": row["ID"],  # Primary key
                "source_type": row["Source_Type"],
                "source_id": row["Source_ID"],
                "text_representation": row["Text_Representation"],
                # literal_eval only parses literals; unlike eval it cannot run
                # code embedded in the CSV.
                "embedding": ast.literal_eval(row["Embedding"]),
            }
            documents.append(document)
        except Exception as e:
            # .get() so that a missing ID column cannot raise from the handler
            print(f"Error processing row ID {row.get('ID', '<unknown>')}: {e}")

    if not documents:
        # Nothing usable in this chunk; skip the insert and the retry loop.
        return False

    # Insert documents in bulk with retry logic
    # NOTE(review): if insert_many fails after writing part of the batch, a
    # retry re-sends the whole batch — confirm that cannot create duplicates.
    max_retries = 3
    for attempt in range(max_retries):
        try:
            collection.insert_many(documents)
            print(f"Uploaded {len(documents)} documents successfully.")
            return True
        except Exception as e:
            print(f"Error during upload attempt {attempt + 1}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2)  # Wait before retrying
            else:
                print("Max retries reached. Skipping this chunk.")
    return False

# Read the CSV in chunks and upload
for chunk_number, chunk in enumerate(pd.read_csv(file_path, chunksize=chunk_size), start=1):
    print(f"Uploading Chunk {chunk_number}...")
    upload_chunk_to_astra(chunk, collection)
    # upload_chunk_to_astra prints its own success or failure, so this line
    # only reports that the chunk was handled; it must not claim success.
    print(f"Chunk {chunk_number} processed. Waiting before next chunk...")
    time.sleep(1)  # Add delay between chunks


### Access the AstraDB data in VS Code

In [None]:
pip install astrapy pandas


In [None]:
from astrapy import DataAPIClient

# Initialize the DataAPIClient.
# The application token and API endpoint are read from the environment so the
# secret never has to be pasted into the notebook; the fallbacks are the values
# this cell originally hard-coded.
import os

client = DataAPIClient(os.environ.get("ASTRA_DB_APPLICATION_TOKEN", "your client"))
db = client.get_database_by_api_endpoint(
    os.environ.get(
        "ASTRA_DB_API_ENDPOINT",
        "https://0b0963cb-6b3a-4145-86c2-be7762e6cd9c-westus3.apps.astra.datastax.com",
    )
)

# Retrieve the collection
collection_name = "ssai_vectordb"
collection = db.get_collection(collection_name)


In [None]:
# Page through a collection and accumulate every document
def fetch_all_documents_in_chunks(collection, chunk_size=100):
    """Collect all documents from ``collection``, ``chunk_size`` at a time.

    Each page is requested as ``find({}).limit(chunk_size).skip(position)``;
    paging stops at the first empty page. Returns the accumulated list, or an
    empty list if any request raises (the error is printed).

    NOTE(review): the Data API documentation describes ``skip`` as usable only
    together with an explicit sort — confirm this paging behaves as intended
    with the installed astrapy version.
    """
    gathered = []
    position = 0
    try:
        page = list(collection.find({}).limit(chunk_size).skip(position))
        while page:
            gathered.extend(page)
            position += chunk_size
            page = list(collection.find({}).limit(chunk_size).skip(position))
        print(f"Retrieved {len(gathered)} documents successfully!")
        return gathered
    except Exception as err:
        print(f"Error retrieving documents: {err}")
        return []

# Pull every document from the collection into the notebook-level list
documents = fetch_all_documents_in_chunks(collection)

# Display the first document (for testing)
if documents:
    print("Sample Document:", documents[0])

# Report every document whose "embedding" field is not a list
for record in documents:
    if isinstance(record.get("embedding"), list):
        continue
    print(f"Invalid embedding format for document ID {record['id']}")




### Test the data by retrieving documents based on similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np

# Import DataAPIClient
from astrapy import DataAPIClient

# Initialize the DataAPIClient.
# The application token and API endpoint are read from the environment so the
# secret never has to be pasted into the notebook; the fallbacks are the values
# this cell originally hard-coded.
import os

client = DataAPIClient(os.environ.get("ASTRA_DB_APPLICATION_TOKEN", "your client"))
db = client.get_database_by_api_endpoint(
    os.environ.get(
        "ASTRA_DB_API_ENDPOINT",
        "https://0b0963cb-6b3a-4145-86c2-be7762e6cd9c-westus3.apps.astra.datastax.com",
    )
)

# Define your collection name
collection_name = "ssai_vectordb"
collection = db.get_collection(collection_name)

# Load SBERT model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')  # Use a lightweight SBERT model

# Function to retrieve similar documents
def retrieve_similar_documents(query_text, documents, top_k=3):
    """Rank ``documents`` by cosine similarity to ``query_text``.

    Parameters:
        query_text: text embedded with the notebook-level ``sbert_model``.
        documents: sequence of dicts carrying ``id``, ``embedding`` and
            ``text_representation``.
        top_k: maximum number of results; values <= 0 yield an empty list.

    Returns:
        Up to ``top_k`` dicts (``id``, ``similarity``, ``text_representation``)
        ordered from most to least similar. Any exception is printed and an
        empty list is returned instead.
    """
    try:
        # Without this guard top_k == 0 would slice [-0:], i.e. the whole
        # array, and return every document instead of none; negative values
        # would slice from an arbitrary position.
        if top_k <= 0:
            return []

        # Embed the query
        # NOTE(review): confirm the installed sentence-transformers accepts the
        # clean_up_tokenization_spaces keyword — a TypeError here would be
        # swallowed below and show up only as an empty result.
        query_embedding = sbert_model.encode([query_text], clean_up_tokenization_spaces=False)[0]

        # Extract embeddings and document IDs
        embeddings = np.array([doc["embedding"] for doc in documents])
        ids = [doc["id"] for doc in documents]
        texts = [doc["text_representation"] for doc in documents]

        # Compute cosine similarity
        similarities = cosine_similarity([query_embedding], embeddings).flatten()

        # argsort is ascending: take the last top_k indices and reverse them
        top_indices = similarities.argsort()[-top_k:][::-1]
        results = [{"id": ids[i], "similarity": similarities[i], "text_representation": texts[i]} for i in top_indices]

        return results
    except Exception as e:
        print(f"Error during retrieval: {e}")
        return []

# Example query used to exercise the retrieval helper
query_text = "Design a modern header for a travel website with a minimalist layout."

# Ask for the five closest documents
results = retrieve_similar_documents(query_text, documents, top_k=5)

# Print one three-line entry per hit, similarity rounded to two decimals
print("Top Similar Documents:")
for hit in results:
    print(f"\nID: {hit['id']}")
    print(f"Similarity: {hit['similarity']:.2f}")
    print(f"Text Representation: {hit['text_representation']}")

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from astrapy import DataAPIClient

# Initialize the DataAPIClient.
# The application token and API endpoint are read from the environment so the
# secret never has to be pasted into the notebook; the fallbacks are the values
# this cell originally hard-coded.
import os

client = DataAPIClient(os.environ.get("ASTRA_DB_APPLICATION_TOKEN", "your client"))
db = client.get_database_by_api_endpoint(
    os.environ.get(
        "ASTRA_DB_API_ENDPOINT",
        "https://0b0963cb-6b3a-4145-86c2-be7762e6cd9c-westus3.apps.astra.datastax.com",
    )
)

# Define your collection name
collection_name = "ssai_vectordb"
collection = db.get_collection(collection_name)

# Load SBERT model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')  # Use a lightweight SBERT model

# Read an entire collection into a list
def fetch_all_documents(collection):
    """Return every document yielded by ``collection.find({})`` as a list.

    The number retrieved is printed. If anything raises, the error is printed
    and an empty list is returned instead.
    """
    try:
        fetched = [item for item in collection.find({})]
        print(f"Retrieved {len(fetched)} documents successfully!")
    except Exception as err:
        print(f"Error retrieving documents: {err}")
        return []
    return fetched

# Fetch all documents from the collection
# Binds the notebook-level `documents` list that the retrieval call further
# down in this cell passes to retrieve_similar_documents_with_code.
documents = fetch_all_documents(collection)

# Look up the HTML and CSS documents that share a source id
def fetch_related_html_css(instruction_id, documents):
    """Return ``(html_components, css_styles)`` matching ``instruction_id``.

    A document is selected when its ``source_type`` is ``"HTML_Component"``
    (first list) or ``"CSS_Style"`` (second list) and its ``source_id`` equals
    ``instruction_id``. Input order is preserved in both lists.
    """
    def _of_kind(kind):
        # One pass over the documents per requested source type
        return [
            entry
            for entry in documents
            if entry["source_type"] == kind and entry["source_id"] == instruction_id
        ]

    return _of_kind("HTML_Component"), _of_kind("CSS_Style")

# Function to retrieve similar documents along with HTML and CSS
def retrieve_similar_documents_with_code(query_text, documents, top_k=5):
    """Rank ``documents`` against ``query_text`` and attach related HTML/CSS.

    Parameters:
        query_text: text embedded with the notebook-level ``sbert_model``.
        documents: list of dicts carrying ``id``, ``source_id``,
            ``embedding`` (a list, or the text of a list) and
            ``text_representation``.
        top_k: maximum number of results; values <= 0 yield an empty list.

    Returns:
        Up to ``top_k`` dicts with ``id``, ``similarity``,
        ``text_representation``, ``html_components`` and ``css_styles``,
        ordered from most to least similar. Any exception is printed and an
        empty list is returned instead.
    """
    import ast  # function-scope import keeps this cell self-contained

    try:
        # Without this guard top_k == 0 would slice [-0:], i.e. the whole
        # array, and return every document instead of none.
        if top_k <= 0:
            return []

        # Embed the query
        # NOTE(review): confirm the installed sentence-transformers accepts the
        # clean_up_tokenization_spaces keyword — a TypeError here would be
        # swallowed below and show up only as an empty result.
        query_embedding = sbert_model.encode([query_text], clean_up_tokenization_spaces=False)[0]

        # Embeddings stored as text are parsed with literal_eval, which only
        # accepts literals; eval would execute whatever the database returned.
        embeddings = np.array([
            ast.literal_eval(doc["embedding"]) if isinstance(doc["embedding"], str) else doc["embedding"]
            for doc in documents
        ])
        ids = [doc["id"] for doc in documents]
        texts = [doc["text_representation"] for doc in documents]

        # Compute cosine similarity
        similarities = cosine_similarity([query_embedding], embeddings).flatten()

        # argsort is ascending: take the last top_k indices and reverse them
        top_indices = similarities.argsort()[-top_k:][::-1]
        results = []
        for i in top_indices:
            # fetch_related_html_css compares against each document's
            # "source_id", so pass this document's source_id. The composite
            # "id" (e.g. "Instruction-5") can never equal a source_id, which
            # left both lists permanently empty.
            # NOTE(review): this assumes HTML/CSS rows share the instruction's
            # source id — verify against how the source CSVs are keyed.
            related_source_id = documents[i].get("source_id")
            html_components, css_styles = fetch_related_html_css(related_source_id, documents)
            results.append({
                "id": ids[i],
                "similarity": similarities[i],
                "text_representation": texts[i],
                "html_components": html_components,
                "css_styles": css_styles
            })

        return results
    except Exception as e:
        print(f"Error during retrieval: {e}")
        return []

# Function to create a heatmap for cosine similarity scores
import matplotlib.pyplot as plt
import seaborn as sns

def create_similarity_heatmap(results):
    """Draw a one-row heatmap of the similarity score of each result.

    ``results`` is a sequence of dicts with ``id`` and ``similarity`` keys; the
    ids label the x axis. Any exception is printed rather than raised.
    """
    try:
        labels = [entry["id"] for entry in results]
        scores = [entry["similarity"] for entry in results]

        # A single-row matrix: one cell per retrieved document
        plt.figure(figsize=(10, 6))
        sns.heatmap(
            [scores],
            annot=True,
            cmap="coolwarm",
            xticklabels=labels,
            yticklabels=["Similarity"],
        )
        plt.title("Cosine Similarity Heatmap for Retrieved Documents")
        plt.xlabel("Document IDs")
        plt.ylabel("Similarity")
        plt.show()
    except Exception as err:
        print(f"Error generating heatmap: {err}")

# Example query used to exercise retrieval with attached code
query_text = "Design a modern header for a travel website with a minimalist layout."

# Ask for the five closest documents together with their HTML and CSS
results = retrieve_similar_documents_with_code(query_text, documents, top_k=5)

# Print each hit followed by the text of its related components and styles
print("Top Similar Documents with Associated Code:")
for hit in results:
    print(f"\nID: {hit['id']}")
    print(f"Similarity: {hit['similarity']:.2f}")
    print(f"Text Representation: {hit['text_representation']}")
    print("HTML Components:")
    for component in hit["html_components"]:
        print(f"  - {component['text_representation']}")
    print("CSS Styles:")
    for style in hit["css_styles"]:
        print(f"  - {style['text_representation']}")

# Plot the similarity scores of the same hits
create_similarity_heatmap(results)
