In [1]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1


In [2]:
import faiss
import numpy as np
from transformers import RobertaTokenizer, RobertaModel
import torch
from tqdm import tqdm

# Load CodeBERT model
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Batched embedding function
def get_embeddings_batch(code_snippets, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(code_snippets), batch_size)):
        batch = code_snippets[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=256)
        inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to GPU
        with torch.no_grad():
            outputs = model(**inputs)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
            embeddings.append(batch_embeddings.cpu().numpy())
    return np.vstack(embeddings)

# Load dataset
with open("/content/train.java-cs.txt.java", "r", encoding="utf-8") as f_java, \
     open("/content/train.java-cs.txt.cs", "r", encoding="utf-8") as f_cs:
    java_snippets = [line.strip() for line in f_java]
    cs_snippets = [line.strip() for line in f_cs]

# Get embeddings (batched)
print("Encoding Java snippets...")
java_vectors = get_embeddings_batch(java_snippets)
print("Encoding C# snippets...")
cs_vectors = get_embeddings_batch(cs_snippets)

# Convert to float32
java_vectors_np = java_vectors.astype("float32")
cs_vectors_np = cs_vectors.astype("float32")

# Build FAISS indices
dimension = 768
java_index = faiss.IndexFlatL2(dimension)
cs_index = faiss.IndexFlatL2(dimension)
java_index.add(java_vectors_np)
cs_index.add(cs_vectors_np)

print("FAISS indices built successfully!")
print(f"Java snippets: {len(java_snippets)}")
print(f"C# snippets: {len(cs_snippets)}")

# Save mapping
import pickle
with open("retrieval_mapping.pkl", "wb") as f:
    pickle.dump({"java": java_snippets, "cs": cs_snippets}, f)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Encoding Java snippets...



  return forward_call(*args, **kwargs)

  0%|          | 1/322 [00:01<09:55,  1.85s/it][A
  1%|          | 2/322 [00:02<05:15,  1.02it/s][A
  1%|          | 3/322 [00:02<04:07,  1.29it/s][A
  1%|          | 4/322 [00:03<03:08,  1.69it/s][A
  2%|▏         | 5/322 [00:03<02:46,  1.90it/s][A
  2%|▏         | 6/322 [00:03<02:43,  1.93it/s][A
  2%|▏         | 7/322 [00:04<02:43,  1.92it/s][A
  2%|▏         | 8/322 [00:04<02:14,  2.34it/s][A
  3%|▎         | 9/322 [00:05<02:17,  2.28it/s][A
  3%|▎         | 10/322 [00:05<02:03,  2.53it/s][A
  3%|▎         | 11/322 [00:05<01:58,  2.63it/s][A
  4%|▎         | 12/322 [00:06<02:15,  2.29it/s][A
  4%|▍         | 13/322 [00:06<02:25,  2.13it/s][A
  4%|▍         | 14/322 [00:07<02:31,  2.04it/s][A
  5%|▍         | 15/322 [00:07<02:18,  2.22it/s][A
  5%|▍         | 16/322 [00:08<02:33,  2.00it/s][A
  5%|▌         | 17/322 [00:08<02:12,  2.30it/s][A
  6%|▌         | 18/322 [00:09<02:15,  2.24it/s][A
  6%|▌         | 19/322 [00:09<0

Encoding C# snippets...


100%|██████████| 322/322 [01:52<00:00,  2.87it/s]


FAISS indices built successfully!
Java snippets: 10295
C# snippets: 10295


In [3]:
np.save("java_vectors.npy", java_vectors)
np.save("cs_vectors.npy", cs_vectors)

In [4]:
def get_embedding(code_snippet):
    inputs = tokenizer(code_snippet, return_tensors="pt", truncation=True, max_length=256)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1)
        return embedding.squeeze().cpu().numpy()

# Example query
query = "public void hello() { System.out.println(\"Hello World\"); }"
query_embedding = get_embedding(query).astype("float32").reshape(1, -1)

# Retrieve top 3 similar
k = 3
D, I = cs_index.search(query_embedding, k)

with open("retrieval_mapping.pkl", "rb") as f:
    mapping = pickle.load(f)

top_matches = [mapping["cs"][i] for i in I[0]]
print("Top translations:\n", top_matches)


Top translations:
 ['public override String ToString(){StringBuilder buffer = new StringBuilder();buffer.Append("[BottomMargin]\\n");buffer.Append("    .margin               = ").Append(" (").Append(Margin).Append(" )\\n");buffer.Append("[/BottomMargin]\\n");return buffer.ToString();}', 'public override String ToString(){StringBuilder buffer = new StringBuilder();buffer.Append("[USESELFS]\\n");buffer.Append("    .flag            = ").Append(HexDump.ShortToHex(_options)).Append("\\n");buffer.Append("[/USESELFS]\\n");return buffer.ToString();}', 'public override String ToString(){StringBuilder buffer = new StringBuilder();buffer.Append("[PRECISION]\\n");buffer.Append("    .precision       = ").Append(FullPrecision).Append("\\n");buffer.Append("[/PRECISION]\\n");return buffer.ToString();}']


In [5]:
for translation in top_matches:
    print(translation)

public override String ToString(){StringBuilder buffer = new StringBuilder();buffer.Append("[BottomMargin]\n");buffer.Append("    .margin               = ").Append(" (").Append(Margin).Append(" )\n");buffer.Append("[/BottomMargin]\n");return buffer.ToString();}
public override String ToString(){StringBuilder buffer = new StringBuilder();buffer.Append("[USESELFS]\n");buffer.Append("    .flag            = ").Append(HexDump.ShortToHex(_options)).Append("\n");buffer.Append("[/USESELFS]\n");return buffer.ToString();}
public override String ToString(){StringBuilder buffer = new StringBuilder();buffer.Append("[PRECISION]\n");buffer.Append("    .precision       = ").Append(FullPrecision).Append("\n");buffer.Append("[/PRECISION]\n");return buffer.ToString();}


In [43]:
# Choose direction
direction = "cs-to-java"  # or "cs-to-java"

In [44]:
if direction == "java-to-cs":
    index = faiss.IndexFlatL2(dimension)
    cs_vectors = np.load("cs_vectors.npy")
    index.add(cs_vectors)
    prompt_template = """You are a code translation assistant. Your job is to translate Java code into equivalent, idiomatic C# code.
    Return ONLY the raw C# code, without any formatting, comments, or explanations.
### Context Examples:
{top_matches}

### Java Code:
{query}

### C# Translation:"""
    context_src = java_snippets
    context_tgt = cs_snippets
elif direction == "cs-to-java":
    index = faiss.IndexFlatL2(dimension)
    java_vectors = np.load("java_vectors.npy")
    index.add(java_vectors)
    prompt_template = """You are a code translation assistant. Your job is to translate C# code into equivalent, idiomatic Java code.
    Return ONLY the raw Java code, without any formatting, comments, or explanations.
### Context Examples:
{top_matches}

### C# Code:
{query}

### Java Translation:"""
    context_src = cs_snippets
    context_tgt = java_snippets

In [45]:
prompt_template

'You are a code translation assistant. Your job is to translate C# code into equivalent, idiomatic Java code. \n    Return ONLY the raw Java code, without any formatting, comments, or explanations.\n### Context Examples:\n{top_matches}\n\n### C# Code:\n{query}\n\n### Java Translation:'

In [46]:
# === Input Query ===
query = "public void hello() { System.out.println(\"Hello World\"); }" if direction == "java-to-cs" else \
        "public void hello() { Console.WriteLine(\"Hello World\"); }"

query_embedding = get_embedding(query).astype("float32").reshape(1, -1)
D, I = index.search(query_embedding, k=3)

In [47]:
# Get top matches from the search results
top_indices = I[0]  # Get the indices of top matches
top_matches = "\n".join([
    f"{'Java' if direction == 'java-to-cs' else 'C#'}:\n{context_src[i]}\n" +
    f"{'C#' if direction == 'java-to-cs' else 'Java'}:\n{context_tgt[i]}\n"
    for i in top_indices
])

In [48]:
# Now format the prompt
final_prompt = prompt_template.format(top_matches=top_matches, query=query)

In [49]:
final_prompt

'You are a code translation assistant. Your job is to translate C# code into equivalent, idiomatic Java code. \n    Return ONLY the raw Java code, without any formatting, comments, or explanations.\n### Context Examples:\nC#:\npublic virtual E next(){if (this.expectedModCount == this._enclosing.modCount){try{E result = this._enclosing.get(this.pos + 1);this.lastPosition = ++this.pos;return result;}catch (System.IndexOutOfRangeException){throw new java.util.NoSuchElementException();}}throw new java.util.ConcurrentModificationException();}\nJava:\npublic E next() {if (expectedModCount == modCount) {try {E result = get(pos + 1);lastPosition = ++pos;return result;} catch (IndexOutOfBoundsException e) {throw new NoSuchElementException();}}throw new ConcurrentModificationException();}\n\nC#:\npublic PatternReplaceFilterFactory(IDictionary<string, string> args) : base(args){pattern = GetPattern(args, "pattern");replacement = Get(args, "replacement");replaceAll = "all".Equals(Get(args, "replac

In [50]:
from openai import OpenAI

client = OpenAI(
    api_key="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",
    base_url="https://api.groq.com/openai/v1"  # Groq's OpenAI-compatible endpoint
)

def call_deepseek(prompt, model="llama-3.3-70b-versatile", max_tokens=300):
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that translates code from one language to another."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.5,
        top_p=0.95,
        max_tokens=max_tokens
    )
    return response.choices[0].message.content.strip()

In [51]:
translated_java = call_deepseek(final_prompt).split("### Java Translation:")[-1].strip()
translated_java

'public void hello() { System.out.println("Hello World"); }'