In [None]:
pip --version

In [None]:
!pip install torch transformers pyserini numpy pandas matplotlib faiss-cpu

In [None]:
!pip install datasets
from datasets import load_dataset

dataset_python = load_dataset("code_search_net", "python", trust_remote_code=True)
dataset_java = load_dataset("code_search_net", "java", trust_remote_code=True)

In [None]:
import pandas as pd

# Project every record of the Python "train" split onto the columns used by
# the rest of the notebook; "id" is the record's position within the split.
filtered_data_python = [
    dict(
        id=position,
        language=record["language"],
        function_name=record["func_name"],
        code=record["func_code_string"],
        docstring=record["func_documentation_string"],
        file_path=record["func_path_in_repository"],
    )
    for position, record in enumerate(dataset_python["train"])
]

# One DataFrame row per extracted function
df_python = pd.DataFrame(filtered_data_python)

print(df_python.head())

In [None]:
import pandas as pd

# Project every record of the Java "train" split onto the columns used by
# the rest of the notebook; "id" is the record's position within the split.
filtered_data_java = [
    dict(
        id=position,
        language=record["language"],
        function_name=record["func_name"],
        code=record["func_code_string"],
        docstring=record["func_documentation_string"],
        file_path=record["func_path_in_repository"],
    )
    for position, record in enumerate(dataset_java["train"])
]

# One DataFrame row per extracted function
df_java = pd.DataFrame(filtered_data_java)

print(df_java.head())

In [None]:
import re

# Matches either a Python string literal (captured in group 1) or a `#`
# comment. String alternatives come first, so a `#` that sits inside a string
# is consumed as part of the literal and never reaches the comment branch.
_PY_STRING_OR_COMMENT = re.compile(
    r'("""[\s\S]*?"""'               # triple-double-quoted string / docstring
    r"|'''[\s\S]*?'''"               # triple-single-quoted string / docstring
    r'|"(?:\\[\s\S]|[^"\\\n])*"'     # double-quoted string
    r"|'(?:\\[\s\S]|[^'\\\n])*')"    # single-quoted string
    r'|#[^\n]*'                      # comment (dropped)
)


def clean_python_code(code):
    """Strip ``#`` comments from Python source and collapse whitespace.

    String literals, including triple-quoted docstrings, are copied through
    unchanged. A ``#`` inside a string (URL fragment, colour code, format
    text) is therefore not mistaken for the start of a comment, which the
    previous single-regex approach did whenever the ``#`` was not directly
    preceded by a quote character.

    Args:
        code: Python source text. Any non-string value yields ``""``.

    Returns:
        The source with comments removed and every run of whitespace replaced
        by a single space, with leading/trailing whitespace stripped. Because
        newlines and indentation are collapsed, the result is meant for text
        indexing, not for execution.
    """
    if not isinstance(code, str):
        return ""
    # Keep string literals (group 1); a comment match has no group 1 -> "".
    code = _PY_STRING_OR_COMMENT.sub(lambda match: match.group(1) or '', code)
    # Normalize whitespace but don't remove it completely
    code = re.sub(r'\s+', ' ', code).strip()
    return code

# Matches a Javadoc block, string literal or char literal (all captured in
# group 1 and kept), or a line / plain block comment (dropped). Literals are
# tried first so `//` or `/*` inside a string is never treated as a comment.
_JAVA_LITERAL_OR_COMMENT = re.compile(
    r'(/\*\*(?!/)[\s\S]*?\*/'        # Javadoc block /** ... */ (kept)
    r'|"(?:\\.|[^"\\\n])*"'          # string literal (kept)
    r"|'(?:\\.|[^'\\\n])*')"         # char literal (kept)
    r'|//[^\n]*'                     # single-line comment (dropped)
    r'|/\*[\s\S]*?\*/'               # plain block comment, incl. /**/ (dropped)
)


def clean_java_code(code):
    """Strip non-Javadoc comments from Java source and collapse whitespace.

    ``//`` line comments and ``/* ... */`` block comments are removed, while
    ``/** ... */`` Javadoc blocks are preserved. String and character literals
    are copied through unchanged, so text such as ``"http://host"`` is no
    longer cut off at the ``//``; Javadoc blocks are likewise kept intact even
    when they contain ``//``.

    Args:
        code: Java source text. Any non-string value yields ``""``.

    Returns:
        The source with those comments removed and every run of whitespace
        replaced by a single space, with leading/trailing whitespace stripped.
    """
    if not isinstance(code, str):
        return ""
    # Keep Javadoc and literals (group 1); a dropped comment has no group 1.
    code = _JAVA_LITERAL_OR_COMMENT.sub(lambda match: match.group(1) or '', code)
    # Normalize whitespace
    code = re.sub(r'\s+', ' ', code).strip()
    return code

# Derive the "clean_code" column for each language by running the matching
# cleaner over every entry of the raw "code" column.
df_python["clean_code"] = df_python["code"].map(clean_python_code)
df_java["clean_code"] = df_java["code"].map(clean_java_code)

In [None]:
def tokenize_code_identifiers(text):
    """Break an identifier into space-separated words.

    A space is inserted at every lowercase-or-digit -> uppercase boundary
    (camelCase), and every underscore is replaced by a space (snake_case).
    Letter case is left untouched and whitespace is not trimmed.

    Args:
        text: Identifier text. Any non-string value yields ``""``.

    Returns:
        The identifier with word boundaries turned into spaces.
    """
    if isinstance(text, str):
        # Applied in order: camelCase boundaries first, then underscores.
        rewrite_rules = (
            (r'([a-z0-9])([A-Z])', r'\1 \2'),
            (r'_', ' '),
        )
        words = text
        for pattern, replacement in rewrite_rules:
            words = re.sub(pattern, replacement, words)
        return words
    return ""

# Store the word-split form of each function name in a new
# "tokenized_function" column (same transformation for both languages).
df_python['tokenized_function'] = df_python['function_name'].map(tokenize_code_identifiers)
df_java['tokenized_function'] = df_java['function_name'].map(tokenize_code_identifiers)

In [None]:
# Create a combined text field with appropriate weighting
def create_indexed_content(row, name_weight=3):
    """Build the text to index for one function record.

    The function name (word-split form followed by the raw form) is repeated
    ``name_weight`` times so it contributes more term occurrences than the
    docstring and code that follow it. Repetitions are joined with a space;
    plain string multiplication would glue the last token of one copy onto
    the first token of the next (``"foo_barfoo bar"``), producing tokens that
    match nothing.

    Args:
        row: Record supporting ``row[key]`` lookup (DataFrame row or dict)
            with keys ``'tokenized_function'``, ``'function_name'``,
            ``'docstring'`` and ``'code'``. Non-string values are treated as
            empty text.
        name_weight: How many times the name tokens are repeated. Defaults
            to 3; values <= 0 omit the name entirely.

    Returns:
        ``"<weighted name> <docstring> <code>"`` as a single string.
    """
    def _as_text(value):
        # Missing values (None / NaN) must not break string concatenation.
        return value if isinstance(value, str) else ""

    name_tokens = (
        _as_text(row['tokenized_function']) + " " + _as_text(row['function_name'])
    ).strip()
    # Repeat for higher weight, keeping a separator between the copies.
    function_name = " ".join([name_tokens] * name_weight) if name_tokens else ""

    docstring = _as_text(row['docstring'])
    # NOTE(review): this indexes the raw 'code' column, not 'clean_code' --
    # confirm that indexing comments is intended.
    code = _as_text(row['code'])

    # Combine with appropriate structure
    return f"{function_name} {docstring} {code}"

# Build the "indexed_content" text row by row (axis="columns" hands each row
# to the function) for both language frames.
df_python['indexed_content'] = df_python.apply(func=create_indexed_content, axis="columns")
df_java['indexed_content'] = df_java.apply(func=create_indexed_content, axis="columns")

In [None]:
# Add language-specific prefixes to help with language filtering.
# The prefix is only added to entries that do not already start with it, so
# re-running this cell does not stack a second copy of the marker.
def _ensure_prefix(contents, prefix):
    """Return `contents` with `prefix` prepended to every entry lacking it."""
    already_tagged = contents.str.startswith(prefix)
    # Keep entries that are already tagged; prepend the prefix to the rest.
    return contents.where(already_tagged, prefix + contents)


df_python['indexed_content'] = _ensure_prefix(df_python['indexed_content'], "python_language ")
df_java['indexed_content'] = _ensure_prefix(df_java['indexed_content'], "java_language ")

In [None]:
def preprocess_query(query):
    """Normalize a free-text search query for the code index.

    Steps, in order:
      1. Detect a language preference (``python`` checked before ``java``).
         The name must not be embedded in a longer alphabetic word, so
         "javascript" does not select Java while "python3" or "(java)" still
         select their language.
      2. Split camelCase identifiers while the original casing is still
         available, then lowercase. Lowercasing first would remove every
         uppercase letter and the split could never match.
      3. Replace underscores with spaces (snake_case).
      4. Drop characters other than word characters, whitespace and
         ``. ( ) [ ]``.

    Args:
        query: The user's query. Any non-string value yields ``""``.

    Returns:
        The normalized query, prefixed with ``"python_language "`` or
        ``"java_language "`` when a language preference was detected.
    """
    if not isinstance(query, str):
        return ""
    query = query.strip()

    # Extract language preference (whole-name match on the lowercased text)
    lowered = query.lower()
    language_prefix = ""
    if re.search(r'(?<![a-z])python(?![a-z])', lowered):
        language_prefix = "python_language "
    elif re.search(r'(?<![a-z])java(?![a-z])', lowered):
        language_prefix = "java_language "

    # Handle camelCase and snake_case in programming identifiers
    # This helps match functions like "bubbleSort" or "binary_search"
    query = re.sub(r'([a-z0-9])([A-Z])', r'\1 \2', query)  # Split camelCase
    query = query.lower()  # Lowercase only after the case-based split
    query = re.sub(r'_', ' ', query)  # Split snake_case

    # Remove special characters but preserve important coding symbols
    query = re.sub(r'[^\w\s\.\(\)\[\]_]', '', query)

    return language_prefix + query

sample_query = "Binary search in Python"
print(preprocess_query(sample_query)) # output: "python_language binary search in python"

In [None]:
# Save processed data
def _write_records(frame, path):
    """Write `frame` to `path` as an indented JSON array with one object per row."""
    frame.to_json(path, orient="records", indent=2)


_write_records(df_python, "codesearchnet_python.json")
_write_records(df_java, "codesearchnet_java.json")