In [None]:
import os
import re

def load_codebase(directory_path, verbose=True):
    """
    Recursively reads the source files below ``directory_path``.

    A file is included when its name ends with one of '.rb', '.js', '.erb',
    '.html' or '.css', or when its name contains 'Gemfile' or 'package.json'
    (substring match, so e.g. 'Gemfile.lock' is included as well).

    Args:
        directory_path: Root directory to walk.
        verbose: When True (the default), print each path as it is loaded.

    Returns:
        dict mapping each file path (root joined with the file name) to the
        file's text. Directories and files are visited in sorted order so the
        dictionary order is the same on every run. A directory that does not
        exist yields an empty dict, because ``os.walk`` ignores the error.
    """
    source_suffixes = ('.rb', '.js', '.erb', '.html', '.css')
    manifest_markers = ('Gemfile', 'package.json')

    code_files = {}
    for root, dirs, files in os.walk(directory_path):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file in sorted(files):
            wanted = file.endswith(source_suffixes) or any(
                marker in file for marker in manifest_markers
            )
            if not wanted:
                continue
            file_path = os.path.join(root, file)
            if verbose:
                print(file_path)
            # errors='replace' keeps one badly encoded file from aborting the
            # whole load; undecodable bytes become U+FFFD in the returned text.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                code_files[file_path] = f.read()
    return code_files

def preprocess_code(file_content):
    """
    Splits file text into chunks separated by blank lines.

    A separator is a newline, optional whitespace, then another newline. Each
    chunk is stripped of surrounding whitespace and chunks that end up empty
    are dropped, so an empty or whitespace-only input gives an empty list.
    """
    pieces = []
    for raw_piece in re.split(r'\n\s*\n', file_content):
        trimmed = raw_piece.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces

# Example usage: read the repository snapshot, then split each file's text into chunks.
codebase = load_codebase('/kaggle/input/webspire-github-repository')  # Provide the codebase directory path
preprocessed_code = dict(
    (path, preprocess_code(text)) for path, text in codebase.items()
)


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load a pre-trained embedding model once at module level. The same instance is
# used by create_embeddings() for the code chunks and by
# retrieve_relevant_chunks() for the query, so both are embedded by one model.
# NOTE(review): presumably the weights are fetched on first use — confirm the
# runtime has network access or a local model cache.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Create embeddings for each chunk
def create_embeddings(preprocessed_code):
    """
    Builds the per-file embedding index used for semantic search.

    ``preprocessed_code`` maps a file name to its list of chunks. Files with
    no chunks are left out of the result. Every other file maps to a dict
    with "chunks" (the list as given) and "embeddings" (a 2-D NumPy array
    produced by the module-level ``embedding_model``).
    """
    index = {}
    for path, pieces in preprocessed_code.items():
        if not pieces:
            continue
        vectors = np.array(embedding_model.encode(pieces, convert_to_tensor=False))
        # Guard against a flat vector so callers can always rely on 2-D input.
        if vectors.ndim == 1:
            vectors = vectors.reshape(1, -1)
        index[path] = {"chunks": pieces, "embeddings": vectors}
    return index

# Embed every chunk of every loaded file; `embeddings` is the in-memory index
# that the retrieval cell below searches.
embeddings = create_embeddings(preprocessed_code)


In [None]:
# Debug: Check the structure of the `embeddings` dictionary
# Prints the number of indexed files, then one line per file with its chunk
# count and the shape of its embedding array.
print(f"Number of files processed: {len(embeddings)}")
for file, data in embeddings.items():
    print(f"File: {file}, Number of chunks: {len(data['chunks'])}, Embedding Shape: {data['embeddings'].shape}")


In [None]:
from transformers import pipeline

# Initialize the LLM pipeline
# `llm` is the module-level text-generation callable used by generate_answer().
# NOTE(review): trust_remote_code=True permits code shipped with the model
# repository to run locally (see the transformers documentation) — confirm it
# is required for this checkpoint and consider pinning a model revision.
# NOTE(review): device_map="auto" presumably relies on the `accelerate`
# package for device placement — confirm it is installed in the runtime.
llm = pipeline(
    "text-generation",
    model="tiiuae/falcon-7b-instruct",
    trust_remote_code=True,
    device_map="auto",
)

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_relevant_chunks(query, embeddings, top_k=3):
    """
    Finds the code chunks most similar to ``query``.

    The query is embedded with the module-level ``embedding_model`` and
    compared by cosine similarity against every file's chunk embeddings. Up
    to ``top_k`` candidates are taken from each file, then all candidates are
    ranked together and the best ``top_k`` are returned.

    Returns a list of ``(file, chunk, similarity)`` tuples, highest
    similarity first.
    """
    # cosine_similarity expects 2-D input, so the query becomes a single row.
    query_vec = embedding_model.encode(query, convert_to_tensor=False).reshape(1, -1)

    candidates = []
    for path, entry in embeddings.items():
        matrix = np.array(entry["embeddings"])
        if matrix.ndim == 1:
            # A lone vector is treated as a one-row matrix.
            matrix = matrix.reshape(1, -1)

        scores = cosine_similarity(query_vec, matrix).flatten()

        # argsort is ascending: keep the last top_k positions, best first.
        best_first = np.argsort(scores)[-top_k:][::-1]
        candidates.extend((path, entry["chunks"][i], scores[i]) for i in best_first)

    # Stable sort, so equal scores keep their insertion order.
    candidates.sort(key=lambda hit: hit[2], reverse=True)
    return candidates[:top_k]


# Example usage
# `query` and `retrieved_chunks` are reused by the answer-generation cell, so
# this cell must run before it.
query = "How the images & videos uploaded to a post are stored ?"
retrieved_chunks = retrieve_relevant_chunks(query, embeddings)
# Show each hit with its source file and similarity score for a manual check.
for file, chunk, similarity in retrieved_chunks:
    print(f"File: {file}\nSimilarity: {similarity}\nChunk:\n{chunk}\n")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

File: /kaggle/input/webspire-github-repository/config/environments/production.rb
Similarity: 0.408914715051651
Chunk:
# Store uploaded files on the local file system (see config/storage.yml for options).
  config.active_storage.service = :local

File: /kaggle/input/webspire-github-repository/config/environments/development.rb
Similarity: 0.4089146852493286
Chunk:
# Store uploaded files on the local file system (see config/storage.yml for options).
  config.active_storage.service = :local

File: /kaggle/input/webspire-github-repository/app/models/post.rb
Similarity: 0.3798864483833313
Chunk:
validates :posts, presence: true, blob: { content_type: ['image/png', 'image/jpg', 'image/jpeg', 'video/mp4'] }



In [None]:

def generate_answer(query, retrieved_chunks):
    """
    Uses the retrieved chunks as context to generate an answer.

    Args:
        query: The user's question.
        retrieved_chunks: Iterable of ``(file, chunk, similarity)`` tuples;
            only the chunk text is placed in the prompt.

    Returns:
        The newly generated answer text only, with surrounding whitespace
        removed. The prompt (context and question) is not part of the return
        value.
    """
    context = "\n\n".join(chunk for _, chunk, _ in retrieved_chunks)
    prompt = (
        f"Context:\n{context}\n\n"
        f"Question: {query}\n"
        f"Answer:"
    )
    response = llm(
        prompt,
        max_new_tokens=100,
        num_return_sequences=1,
        do_sample=False,
        # By default the pipeline returns prompt + continuation; ask for the
        # continuation only so the caller gets just the answer.
        return_full_text=False,
    )
    generated = response[0]['generated_text']
    # Defensive: drop the prompt if the backend echoed it anyway.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    return generated.strip()

# Example usage
# Relies on `query` and `retrieved_chunks` from the retrieval cell and on the
# `llm` pipeline initialised earlier.
answer = generate_answer(query, retrieved_chunks)
print("Generated Answer:\n", answer)


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
