In [5]:
import os
import re

def load_codebase(directory_path, extensions=('.rb', '.js', '.erb', '.html', '.css')):
    """
    Recursively read source files under a directory into memory.

    Args:
        directory_path: Root directory to walk.
        extensions: Filename suffixes to include. Defaults to the Ruby/web
            file types this notebook indexes.

    Returns:
        dict mapping each matching file's path (joined from the walked
        directory and the filename) to its full text content.
    """
    # str.endswith only accepts a str or a tuple, so normalise lists/sets.
    if not isinstance(extensions, str):
        extensions = tuple(extensions)

    code_files = {}
    for root, _, files in os.walk(directory_path):
        for file in files:
            if not file.endswith(extensions):
                continue
            file_path = os.path.join(root, file)
            # errors='replace' keeps one file with stray non-UTF-8 bytes from
            # raising UnicodeDecodeError and aborting the whole load; the bad
            # bytes become U+FFFD in the returned text.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                code_files[file_path] = f.read()
    return code_files

def preprocess_code(file_content):
    """
    Break a file's text into chunks separated by blank lines.

    A separator is a newline, optional whitespace, then another newline.
    Each chunk is stripped of surrounding whitespace, and chunks that are
    empty after stripping are dropped.

    Returns:
        list of non-empty chunk strings in their original order.
    """
    pieces = []
    for raw_piece in re.split(r'\n\s*\n', file_content):
        trimmed = raw_piece.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces

# Example usage: read the repository from disk, then split every file into chunks
codebase = load_codebase('/kaggle/input/webspire-github-repository')  # Provide the codebase directory path
preprocessed_code = dict(
    (path, preprocess_code(source)) for path, source in codebase.items()
)


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load a pre-trained embedding model once at module level. Both
# create_embeddings and retrieve_relevant_chunks read this global, so code
# chunks and queries are encoded by the same model.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Create embeddings for each chunk
def create_embeddings(preprocessed_code):
    """
    Encode every file's chunks with the module-level `embedding_model`.

    Args:
        preprocessed_code: dict mapping a file path to its list of chunk
            strings.

    Returns:
        dict mapping each file path that has at least one chunk to
        {"chunks": <list of chunk strings>, "embeddings": <encoder output for
        those chunks, requested as a tensor>}. Files with no chunks are
        omitted.
    """
    embeddings = {}
    for file, chunks in preprocessed_code.items():
        # An empty or whitespace-only file yields no chunks. There is nothing
        # to retrieve from it, and an empty embedding result cannot be scored
        # against a query later, so leave it out of the index.
        if not chunks:
            continue
        embeddings[file] = {
            "chunks": chunks,
            "embeddings": embedding_model.encode(chunks, convert_to_tensor=True)
        }
    return embeddings

# Build the per-file embedding index; it is passed to
# retrieve_relevant_chunks when answering a query.
embeddings = create_embeddings(preprocessed_code)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def _embeddings_to_numpy(vectors):
    """Return `vectors` as a NumPy array, moving tensor-like input to CPU first."""
    # Tensor-like objects are detected by duck typing so no extra import is
    # needed; anything else is handed to NumPy unchanged.
    if hasattr(vectors, "detach"):
        vectors = vectors.detach().cpu().numpy()
    return np.asarray(vectors)


def retrieve_relevant_chunks(query, embeddings, top_k=3):
    """
    Retrieves the top-k relevant code chunks for a given query.

    Args:
        query: Natural-language question to search for.
        embeddings: dict mapping a file path to
            {"chunks": [...], "embeddings": <one vector per chunk>}.
        top_k: Maximum number of results to return.

    Returns:
        list of (file, chunk, similarity) tuples sorted by cosine similarity
        in descending order, at most `top_k` long. Empty when `top_k` is not
        positive or when no file has any chunks.
    """
    # A non-positive top_k would make the [-top_k:] slice below return the
    # wrong elements, so answer "nothing requested" up front.
    if top_k <= 0:
        return []

    # Entries without chunks have no rows to score; skipping them avoids
    # passing an empty matrix to cosine_similarity.
    searchable = [
        (file, data) for file, data in embeddings.items() if len(data["chunks"]) > 0
    ]
    if not searchable:
        return []

    # scikit-learn works on NumPy arrays, so convert the query and the stored
    # embeddings explicitly (they may be tensors living on an accelerator).
    query_embedding = _embeddings_to_numpy(
        embedding_model.encode(query, convert_to_tensor=True)
    ).reshape(1, -1)

    results = []
    for file, data in searchable:
        chunk_matrix = _embeddings_to_numpy(data["embeddings"])
        similarities = cosine_similarity(query_embedding, chunk_matrix).flatten()
        # Best top_k chunks of this file, highest similarity first.
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        for idx in top_indices:
            results.append((file, data["chunks"][idx], similarities[idx]))

    # Sort overall results by similarity and keep the global top_k.
    results.sort(key=lambda x: x[2], reverse=True)
    return results[:top_k]

# Example usage: `query` and `retrieved_chunks` are also consumed by the
# answer-generation example further down, so they are kept as globals.
query = "Which library is used for infinite scrolling?"
retrieved_chunks = retrieve_relevant_chunks(query, embeddings)
# Show each hit with its source file and similarity score for inspection.
for file, chunk, similarity in retrieved_chunks:
    print(f"File: {file}\nSimilarity: {similarity}\nChunk:\n{chunk}\n")


In [None]:
from transformers import pipeline

# Initialize the LLM pipeline once at module level; generate_answer calls
# this global `llm` for every question.
# NOTE(review): trust_remote_code=True permits running model code shipped
# with the checkpoint repository — confirm this is intended, and consider
# pinning a revision.
llm = pipeline(
    "text-generation",
    model="tiiuae/falcon-7b-instruct",
    trust_remote_code=True,
    device_map="auto",
)

def generate_answer(query, retrieved_chunks):
    """
    Uses the retrieved chunks as context to generate an answer.

    Args:
        query: The user's question.
        retrieved_chunks: Iterable of (file, chunk, similarity) tuples; only
            the chunk text is placed in the prompt.

    Returns:
        The model's completion as a string, without the prompt and with
        surrounding whitespace removed.
    """
    context = "\n\n".join(chunk for _, chunk, _ in retrieved_chunks)
    prompt = (
        f"Context:\n{context}\n\n"
        f"Question: {query}\n"
        f"Answer:"
    )
    response = llm(
        prompt,
        max_new_tokens=100,
        num_return_sequences=1,
        do_sample=False,
        # Without this the pipeline returns prompt + completion, so the
        # "answer" would repeat the whole context and question.
        return_full_text=False,
    )
    return response[0]['generated_text'].strip()

# Example usage: reuses `query` and `retrieved_chunks` from the retrieval
# example above, so that code must have been run first.
answer = generate_answer(query, retrieved_chunks)
print("Generated Answer:\n", answer)
