In [16]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_from_disk
# import streamlit as st
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_from_disk
import nltk
from nltk.tokenize import word_tokenize
import os

# Add ~/nltk (expanded to the current user's home directory) to the list of
# directories NLTK searches when looking up data packages.
nltk.data.path.append(os.path.expanduser('~/nltk'))

In [3]:
def embed_query(query, model, tokenizer):
    """
    Generate an embedding for a query using a pre-trained model.

    The query is tokenized (padded and truncated), run through the model without
    gradient tracking, and the last hidden states are mean-pooled over the token
    axis. When the tokenizer supplies an ``attention_mask`` the mean is taken over
    real tokens only, so padding added for a batch of queries of different lengths
    does not dilute the shorter ones. For a single query nothing is padded and the
    result is the plain mean over all tokens.

    Args:
        query: Text to embed (a string, or a list of strings for a batch).
        model: Callable accepting the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable returning a mapping of PyTorch tensors.

    Returns:
        A NumPy array with one pooled row per input text.
    """
    inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
        mask = inputs.get("attention_mask")
        if mask is None:
            # No mask available: fall back to averaging every position.
            pooled = hidden.mean(dim=1)
        else:
            # Zero out padded positions, then divide by the number of real tokens.
            weights = mask.unsqueeze(-1).to(hidden.dtype)
            token_counts = weights.sum(dim=1).clamp(min=1.0)  # guard against an all-zero mask
            pooled = (hidden * weights).sum(dim=1) / token_counts
    return pooled.numpy()



In [4]:
def semantic_search(query, code_embeddings, model, tokenizer, dataset, top_k=5):
    """
    Rank the stored code embeddings against a query by cosine similarity.

    The query is embedded with ``embed_query`` and compared with every row of
    ``code_embeddings``. ``dataset`` is accepted but not used here.

    Returns:
        A pair ``(indices, scores)``: the positions of the ``top_k`` best matches,
        best first, and their similarity values in the same order.
    """
    query_vector = embed_query(query, model, tokenizer)
    # Only the first row of the similarity matrix is used.
    row = cosine_similarity(query_vector, code_embeddings)[0]

    # argsort is ascending, so flip it to put the best match first.
    ranking = np.flip(np.argsort(row))
    best = ranking[:top_k]
    return best, row[best]


In [5]:
# Interactive semantic search cell: load the encoder, the precomputed embeddings and the
# dataset, then answer queries typed by the user until they enter 'exit'.
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Load the code embeddings
# NOTE(review): presumably row i of this array is the embedding of dataset[i]; the loop below
# relies on that alignment when it indexes the dataset with the returned positions - confirm.
code_embeddings = np.load('code_embeddings.npy')

# Both paths are relative, so they resolve against the kernel's working directory.
dataset = load_from_disk('processed_dataset')
while True:
    query = input("Enter your search query (or type 'exit' to quit): ")
    # Case-insensitive exit check; surrounding whitespace is not stripped.
    if query.lower() == 'exit':
        break

    # Perform semantic search
    top_indices, scores = semantic_search(query, code_embeddings, model, tokenizer, dataset)
    print(f"\nTop {len(top_indices)} results for query: '{query}'\n")
    for idx, score in zip(top_indices, scores):
        idx = int(idx)  # Convert numpy.int64 to Python int
        print(f"Index: {idx}, Similarity Score: {score:.4f}")
        print(f"Code Snippet:\n{dataset[idx]['code']}\n")
        print(f"Description:\n{dataset[idx]['description']}\n")
        print("-" * 80)
    





Top 5 results for query: 'Transpose the columns into rows, remove all of the rows that are empty after the first cell, then     transpose back. '

Index: 990, Similarity Score: 0.9802
Code Snippet:
def drop_empty(rows):
    """Transpose the columns into rows, remove all of the rows that are empty after the first cell, then
    transpose back. The result is that columns that have a header but no data in the body are removed, assuming
    the header is the first row. """
    return zip(*[col for col in zip(*rows) if bool(filter(bool, col[1:]))])

Description:
Transpose the columns into rows, remove all of the rows that are empty after the first cell, then
    transpose back. The result is that columns that have a header but no data in the body are removed, assuming
    the header is the first row.

--------------------------------------------------------------------------------
Index: 779, Similarity Score: 0.9687
Code Snippet:
def get_naive(dt):
  """Gets a naive datetime from a dateti

In [None]:
import zipfile
import gzip
import json 
import os
def _parse_jsonl(text):
    """Decode JSON Lines text into a list of objects, one per non-blank line.

    Lines are split on ``\\n`` only. ``str.splitlines()`` would also break on
    characters such as U+2028, U+2029 and U+0085, which may legally appear
    unescaped inside a JSON string and would cut a record in half.
    """
    # A trailing '\r' from CRLF files is JSON whitespace, so json.loads accepts it.
    return [json.loads(line) for line in text.split('\n') if line.strip()]


def read_zip_file(zip_file_path):
    """Read every member of a zip archive into a dict keyed by member name.

    Args:
        zip_file_path: Path to the ``.zip`` archive.

    Returns:
        dict mapping each member's filename to its content:
          * ``*.jsonl.gz`` - gunzipped, UTF-8 decoded, parsed into a list of objects;
          * ``*.jsonl``    - UTF-8 decoded, parsed into a list of objects;
          * anything else  - text decoded as UTF-8 with undecodable bytes dropped.
        Directory entries are skipped.

    Raises:
        json.JSONDecodeError: if a non-blank JSONL line is not valid JSON.
        UnicodeDecodeError: if a JSONL member is not valid UTF-8.
    """
    zip_content = {}
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            name = file_info.filename
            if name.endswith('/'):
                # Skip directory entries without opening them.
                continue
            with zip_ref.open(file_info) as file:
                raw = file.read()
            # '.jsonl.gz' must be tested first; the '.jsonl' test alone would miss it.
            if name.endswith('.jsonl.gz'):
                zip_content[name] = _parse_jsonl(gzip.decompress(raw).decode('utf-8'))
            elif name.endswith('.jsonl'):
                zip_content[name] = _parse_jsonl(raw.decode('utf-8'))
            else:
                # For other file types, store as plain text
                zip_content[name] = raw.decode('utf-8', errors='ignore')
    return zip_content

def read_all_files(folder_path, max_files=1):
    """Load the contents of ``.zip`` archives found directly inside a folder.

    Each directory entry's name is printed as it is visited. Entries that are not
    ``.zip`` archives are skipped; archives are read with ``read_zip_file`` and
    their member dicts merged into one result (later archives overwrite earlier
    ones on identical member names).

    Args:
        folder_path: Directory to scan (not recursive).
        max_files: Maximum number of archives to read. Defaults to 1, which keeps
            the original behaviour of stopping after a single archive; pass
            ``None`` to read every archive in the folder.

    Returns:
        dict mapping archive member names to their decoded contents; empty when
        the folder contains no ``.zip`` archive.
    """
    all_content = {}
    archives_read = 0
    for filename in os.listdir(folder_path):
        print(filename)
        if not filename.endswith('.zip'):
            # Previously a non-zip first entry crashed on an unbound variable.
            continue
        all_content.update(read_zip_file(os.path.join(folder_path, filename)))
        archives_read += 1
        if max_files is not None and archives_read >= max_files:
            break
    return all_content

# NOTE(review): absolute, machine-specific dataset path - adjust before running elsewhere.
# The returned dict is the cell's last expression, so the notebook will echo it as output.
read_all_files('/home/admin/huggingFace_dataset/datasets/python_data/python/final/jsonl/data')


In [12]:
# Fetch NLTK's 'punkt' tokenizer package (reported as already up to date in the recorded run).
nltk.download('punkt')



[nltk_data] Downloading package punkt to /home/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
def load_resources():
    """Load everything the search functions need.

    Ensures the NLTK 'punkt' package is available, loads the CodeBERT tokenizer
    and model, reads the precomputed code embeddings and the processed dataset
    from the working directory, and builds a BM25 index over the dataset's
    ``code`` field (lower-cased and split on whitespace).

    Returns:
        tuple: ``(tokenizer, model, code_embeddings, dataset, bm25)``.
    """
    # Download punkt quietly, but only when the local lookup fails.
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt', quiet=True)

    # Encoder used for query embeddings.
    checkpoint = "microsoft/codebert-base"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)

    # Precomputed artefacts, resolved relative to the current working directory.
    code_embeddings = np.load('code_embeddings.npy')
    dataset = load_from_disk('processed_dataset')

    # BM25 corpus: plain whitespace tokenisation of the lower-cased code
    # (word_tokenize is not used here).
    corpus_tokens = []
    for record in dataset:
        corpus_tokens.append(record['code'].lower().split())
    bm25 = BM25Okapi(corpus_tokens)

    return tokenizer, model, code_embeddings, dataset, bm25

def embed_query(query, model, tokenizer):
    """Embed a query by mean-pooling the model's last hidden states.

    The query is tokenized (padded and truncated) and run through the model
    without gradient tracking. When the tokenizer supplies an ``attention_mask``
    the mean is taken over real tokens only, so padding added for a batch of
    queries of different lengths does not dilute the shorter ones. For a single
    query nothing is padded and the result is the plain mean over all tokens.

    Args:
        query: Text to embed (a string, or a list of strings for a batch).
        model: Callable accepting the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable returning a mapping of PyTorch tensors.

    Returns:
        A NumPy array with one pooled row per input text.
    """
    inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
        mask = inputs.get("attention_mask")
        if mask is None:
            # No mask available: fall back to averaging every position.
            pooled = hidden.mean(dim=1)
        else:
            # Zero out padded positions, then divide by the number of real tokens.
            weights = mask.unsqueeze(-1).to(hidden.dtype)
            token_counts = weights.sum(dim=1).clamp(min=1.0)  # guard against an all-zero mask
            pooled = (hidden * weights).sum(dim=1) / token_counts
    return pooled.numpy()

def semantic_search(query, code_embeddings, model, tokenizer, dataset, top_k=5):
    """Return the ``top_k`` embeddings most similar to the query.

    The query is embedded with ``embed_query`` and scored against every row of
    ``code_embeddings`` by cosine similarity. ``dataset`` is accepted but not
    used here.

    Returns:
        ``(indices, scores)`` - positions of the best matches, best first, and
        their similarity values in the same order.
    """
    similarity_matrix = cosine_similarity(embed_query(query, model, tokenizer), code_embeddings)
    per_snippet = similarity_matrix[0]  # only the first row is used
    # Ascending argsort, flipped so the highest similarity comes first.
    descending = np.flip(np.argsort(per_snippet))
    winners = descending[:top_k]
    return winners, per_snippet[winners]

def bm25_search(query, bm25, top_k=5):
    """Rank the BM25 corpus against a query.

    The query is lower-cased and split on whitespace before being scored by
    ``bm25.get_scores``.

    Returns:
        ``(indices, scores)`` - positions of the ``top_k`` highest-scoring
        documents, best first, and their BM25 scores in the same order.
    """
    terms = query.lower().split()
    all_scores = bm25.get_scores(terms)
    # Ascending argsort, flipped so the highest score comes first.
    best = np.flip(np.argsort(all_scores))[:top_k]
    return best, all_scores[best]


def hybrid_search(query, bm25, code_embeddings, model, tokenizer, dataset, top_k=5, alpha=0.5):
    """Blend BM25 and embedding similarity into a single ranking.

    Both score vectors are min-max rescaled (with a small epsilon in the
    denominator to avoid division by zero) and mixed as
    ``alpha * bm25 + (1 - alpha) * semantic``. ``dataset`` is accepted but not
    used here.

    Returns:
        ``(indices, scores)`` - positions of the ``top_k`` best combined scores,
        best first, and the combined scores in the same order.
    """
    def rescale(values):
        # Min-max normalisation; the epsilon keeps a constant vector from dividing by zero.
        return (values - values.min()) / (values.max() - values.min() + 1e-10)

    # Lexical signal: BM25 over the whitespace-tokenised, lower-cased query.
    lexical = bm25.get_scores(query.lower().split())

    # Semantic signal: cosine similarity between the query embedding and every snippet.
    query_vector = embed_query(query, model, tokenizer)
    semantic = cosine_similarity(query_vector, code_embeddings).flatten()

    blended = alpha * rescale(lexical) + (1 - alpha) * rescale(semantic)
    best = np.flip(np.argsort(blended))[:top_k]
    return best, blended[best]




In [24]:
import streamlit as st


In [25]:
def _render_hits(indices, scores, score_label, dataset):
    """Write one entry per search hit (index, score, code, description) to the Streamlit page."""
    for idx, score in zip(indices, scores):
        idx = int(idx)  # numpy integer -> plain int before indexing the dataset
        st.write(f"**Index:** {idx} | **{score_label}:** {score:.4f}")
        st.code(dataset[idx]['code'], language='python')  # Adjust language as needed
        st.write(f"**Description:** {dataset[idx]['description']}")
        st.markdown("---")


def main():
    """Streamlit page showing semantic and BM25 results for one query.

    Streamlit re-executes the script on every widget interaction, so the heavy
    resources (model, embeddings, dataset, BM25 index) are loaded through
    ``st.cache_resource`` when the installed Streamlit provides it. On versions
    without it the loader is called directly, as before.
    """
    st.title("🔍 Semantic and BM25 Code Search")

    # Load resources once and reuse them across reruns where caching is available.
    cache_resource = getattr(st, "cache_resource", None)
    loader = cache_resource(load_resources) if cache_resource is not None else load_resources
    tokenizer, model, code_embeddings, dataset, bm25 = loader()

    # User input
    query = st.text_input("Enter your search query:", "")

    if not st.button("Search"):
        return
    if not query:
        st.warning("Please enter a search query.")
        return

    st.markdown("### 🔹 Semantic Search Results:")
    top_indices_sem, scores_sem = semantic_search(query, code_embeddings, model, tokenizer, dataset)
    _render_hits(top_indices_sem, scores_sem, "Similarity Score", dataset)

    st.markdown("### 🔸 BM25 Search Results:")
    top_indices_bm25, scores_bm25 = bm25_search(query, bm25)
    _render_hits(top_indices_bm25, scores_bm25, "BM25 Score", dataset)

# Run the Streamlit page when this code is executed as the main module.
# NOTE(review): the Streamlit messages recorded for this cell indicate it was run from the
# notebook kernel rather than via `streamlit run`; the UI only works under `streamlit run`.
if __name__ == "__main__":
    main()

2024-10-20 19:13:06.842 
  command:

    streamlit run /home/admin/.local/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]
2024-10-20 19:13:08.022 Session state does not function when running a script without `streamlit run`


In [22]:
# One-shot hybrid search: load all resources, read a single query, print the top matches.
tokenizer, model, code_embeddings, dataset, bm25 = load_resources()
# The prompt mentions 'exit', but this cell does not check for it: whatever is typed is searched.
query = input("Enter your search query (or type 'exit' to quit): ")

# alpha=0.5 weights the normalised BM25 and semantic scores equally.
top_indices, scores = hybrid_search(query, bm25, code_embeddings, model, tokenizer, dataset, top_k=5, alpha=0.5)

for idx, score in zip(top_indices, scores):
    idx = int(idx)  # Convert numpy.int64 to Python int
    # The value printed as "Similarity Score" is the combined score returned by hybrid_search.
    print(f"Index: {idx}, Similarity Score: {score:.4f}")
    print(f"Code Snippet:\n{dataset[idx]['code']}\n")
    print(f"Description:\n{dataset[idx]['description']}\n")
    print("-" * 80)
    

Index: 990, Similarity Score: 1.0000
Code Snippet:
def drop_empty(rows):
    """Transpose the columns into rows, remove all of the rows that are empty after the first cell, then
    transpose back. The result is that columns that have a header but no data in the body are removed, assuming
    the header is the first row. """
    return zip(*[col for col in zip(*rows) if bool(filter(bool, col[1:]))])

Description:
Transpose the columns into rows, remove all of the rows that are empty after the first cell, then
    transpose back. The result is that columns that have a header but no data in the body are removed, assuming
    the header is the first row.

--------------------------------------------------------------------------------
Index: 290, Similarity Score: 0.5500
Code Snippet:
def _index_document(self, document, force=False):
        """ Adds dataset document to the index. """
        query = text("""
            INSERT INTO dataset_index(vid, title, keywords, doc)
            VAL

In [15]:
# Debug aid: show the directories NLTK searches for data packages.
print(nltk.data.path)

['/home/admin/nltk_data', '/usr/nltk_data', '/usr/share/nltk_data', '/usr/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']
