In [None]:
!pip install faiss-cpu
!pip install sentence-transformers

In [None]:
import pandas as pd
import faiss
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

In [None]:
# Path of the preprocessed-solutions CSV (a Kaggle input location).
csv_file_path = '/kaggle/input/ipc-preprocessed-solutions/preprocessed_solutions_v1.csv'
# Load the whole table; later cells read its `preprocessed_solution` column and
# look rows up by position when printing search results.
df = pd.read_csv(csv_file_path)

In [None]:
# Flatten every solution onto a single line: line feeds and carriage returns
# each become one space.
df['preprocessed_solution'] = (
    df['preprocessed_solution']
    .str.replace('\n', ' ')
    .str.replace('\r', ' ')
)
# Drop rows whose flattened solution repeats an earlier row.
df = df.drop_duplicates(subset='preprocessed_solution')

In [None]:
# Plain Python list of the flattened solutions, in df's row order. The search
# classes rely on that order: position i in the FAISS index is looked up with
# df.iloc[i] when results are printed.
code_snippets = df['preprocessed_solution'].tolist()

In [None]:
class DocumentSearch:
    """Semantic search over documents using a SentenceTransformer and a FAISS index.

    The constructor loads a previously saved model and a previously saved FAISS
    index from their default locations. Call ``do_embedding()`` to (re)build the
    index from ``docs_text`` instead.

    Parameters
    ----------
    model_name : str
        Name of the embedding model. It is only recorded on the instance; the
        model itself is loaded from disk by ``load_model()``.
    docs_text : list of str
        Documents to embed. Position ``i`` in this list must correspond to
        ``dataframe.iloc[i]`` because search results are looked up by position.
    dataframe : pandas.DataFrame
        Frame whose ``problem_statement`` and ``problem_link`` columns are
        printed by ``query``.
    """

    def __init__(self, model_name, docs_text, dataframe):
        self.model_name = model_name
        self.docs_text = docs_text
        self.df = dataframe
        self.embedding_model = None
        self.index = None
        self.docs_embedding = None  # filled by do_embedding()
        self.dim = 0
        self.load_model()
        self.load_faiss_index()

    @staticmethod
    def _unit_rows(vectors):
        """Return ``vectors`` as a C-contiguous float32 2-D array with unit-length rows."""
        matrix = np.ascontiguousarray(np.atleast_2d(vectors), dtype=np.float32)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # keep all-zero rows as zeros instead of dividing by zero
        return matrix / norms

    def cosine_similarity(self, text1, text2):
        """Return the cosine similarity between the embeddings of two texts."""
        embeddings = self._unit_rows(
            self.embedding_model.encode([text1, text2], show_progress_bar=False)
        )
        return np.dot(embeddings[0], embeddings[1])

    def do_embedding(self):
        """Embed ``docs_text`` and build a fresh inner-product index over the result.

        Raises
        ------
        ValueError
            If ``docs_text`` is empty.
        """
        if len(self.docs_text) == 0:
            raise ValueError("docs_text is empty; nothing to index")
        raw_embeddings = self.embedding_model.encode(
            self.docs_text, show_progress_bar=True, convert_to_numpy=True
        )
        # IndexFlatIP ranks by raw inner product, which equals cosine similarity
        # only for unit-length vectors, so normalize before indexing.
        self.docs_embedding = self._unit_rows(raw_embeddings)
        self.dim = self.docs_embedding.shape[1]  # Dimension of the embeddings
        print(f"Dimensions = {self.dim}")
        self.index = faiss.IndexFlatIP(self.dim)
        self.index.add(self.docs_embedding)

    def save_model(self, directory="model_directory"):
        """Save the embedding model to ``directory``."""
        self.embedding_model.save(directory)

    def save_faiss_index(self, file_path="faiss_index.bin"):
        """Write the FAISS index to ``file_path``."""
        faiss.write_index(self.index, file_path)

    def load_model(self, directory="/kaggle/working/model_directory"):
        """Load the embedding model from ``directory``."""
        self.embedding_model = SentenceTransformer(directory)

    def load_faiss_index(self, file_path="/kaggle/working/faiss_index.bin"):
        """Read the FAISS index from ``file_path``.

        NOTE(review): if that index was built from unnormalized embeddings the
        printed scores are scaled inner products rather than true cosines;
        the ranking for a given query is unaffected by normalizing the query.
        """
        self.index = faiss.read_index(file_path)

    def query(self, query_text, k=10):
        """Print up to ``k`` indexed documents most similar to ``query_text``.

        Each line shows the rank, the ``problem_statement`` and ``problem_link``
        of the matching row, and the similarity score returned by the index.
        """
        query_embedding = self._unit_rows(
            self.embedding_model.encode([query_text], convert_to_numpy=True)
        )
        D, I = self.index.search(query_embedding, k)  # D: scores, I: row positions
        print("Top similar problems:")
        for i, idx in enumerate(I[0]):
            if idx < 0:
                # FAISS pads with -1 when fewer than k vectors are available;
                # iloc[-1] would silently print the last row instead.
                continue
            print(f"{i+1}: {self.df['problem_statement'].iloc[idx]} {self.df['problem_link'].iloc[idx]} (Similarity: {D[0][i]})")

In [None]:
!pip install voyageai

In [None]:
import os
import voyageai


# Respect a key that is already configured in the environment (for example one
# injected as a secret) instead of overwriting it. Only fall back to the
# placeholder, which must be replaced before the client can authenticate.
# Never commit a real key in this cell.
if not os.environ.get("VOYAGE_API_KEY"):
    os.environ['VOYAGE_API_KEY'] = "YOUR_API_KEY"
vo = voyageai.Client(api_key=os.environ.get("VOYAGE_API_KEY"))

In [None]:
class CodeSearch:
    """Code-similarity search: Voyage AI embeddings stored in a FAISS inner-product index.

    Constructing an instance immediately embeds every snippet and builds the
    index, so it issues embedding API requests.

    Parameters
    ----------
    model_name : str
        Recorded on the instance only; embeddings are always requested with
        the model named by ``VOYAGE_MODEL``.
    code_snippets : list of str
        Snippets to index. Position ``i`` must correspond to
        ``dataframe.iloc[i]`` because results are looked up by position.
    dataframe : pandas.DataFrame
        Frame whose ``solution`` and ``problem_link`` columns are printed by
        ``query``.
    client : object, optional
        Object exposing ``embed(texts, model=..., input_type=...)`` whose result
        has an ``embeddings`` attribute. Defaults to the notebook-level ``vo``
        client.
    """

    VOYAGE_MODEL = "voyage-code-2"

    def __init__(self, model_name, code_snippets, dataframe, client=None):
        self.model_name = model_name
        self.code_snippets = code_snippets
        self.df = dataframe
        self.client = client if client is not None else vo
        self.embedding_model = None  # no local model: embeddings come from the API
        self.index = None
        self.codes_embedding = None
        self.dim = 0
        self.do_embedding()

    def _embed(self, texts, input_type, batch_size=128):
        """Embed ``texts`` in batches and return a float32 array, one row per text."""
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        vectors = []
        # The API caps how many texts one request may carry, so send the
        # corpus in slices rather than in a single call.
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            response = self.client.embed(batch, model=self.VOYAGE_MODEL, input_type=input_type)
            vectors.extend(response.embeddings)
        # The client returns plain Python lists; FAISS needs a float32 ndarray.
        return np.asarray(vectors, dtype=np.float32)

    def do_embedding(self, batch_size=128):
        """Embed ``self.code_snippets`` and build the inner-product index.

        Parameters
        ----------
        batch_size : int, default 128
            Number of snippets sent per embedding request.

        Raises
        ------
        ValueError
            If there are no snippets to embed.
        """
        if len(self.code_snippets) == 0:
            raise ValueError("code_snippets is empty; nothing to index")
        self.codes_embedding = self._embed(
            self.code_snippets, input_type="document", batch_size=batch_size
        )
        self.dim = self.codes_embedding.shape[1]  # Dimension of the embeddings
        print(f"Dimensions = {self.dim}")
        # Inner product equals cosine similarity only for unit-length vectors.
        # NOTE(review): Voyage documents its embeddings as normalized - confirm
        # for the model in use.
        self.index = faiss.IndexFlatIP(self.dim)
        self.index.add(self.codes_embedding)

    def save_model(self, directory="model_directory"):
        """Save the local embedding model to ``directory``.

        Raises
        ------
        RuntimeError
            If no local model is loaded, which is the case while embeddings
            are produced by the remote API.
        """
        if self.embedding_model is None:
            raise RuntimeError(
                "No local embedding model to save; embeddings are produced by the remote API."
            )
        self.embedding_model.save(directory)

    def save_faiss_index(self, file_path="faiss_index.bin"):
        """Write the FAISS index to ``file_path``."""
        faiss.write_index(self.index, file_path)

    def query(self, query_code, k=10):
        """Print up to ``k`` indexed snippets most similar to ``query_code``.

        Each line shows the rank, the ``solution`` and ``problem_link`` of the
        matching row, and the similarity score returned by the index.
        """
        # _embed returns shape (1, dim), the 2-D float32 layout index.search expects.
        query_embedding = self._embed([query_code], input_type="query")
        D, I = self.index.search(query_embedding, k)  # D: scores, I: row positions
        print("Top similar problems:")
        for i, idx in enumerate(I[0]):
            if idx < 0:
                # FAISS pads with -1 when fewer than k vectors are available;
                # iloc[-1] would silently print the last row instead.
                continue
            print(f"{i+1}: {self.df['solution'].iloc[idx]} {self.df['problem_link'].iloc[idx]} (Similarity: {D[0][i]})")

    def save_embeddings_to_csv(self, filepath):
        """Write the snippet embeddings to ``filepath`` as CSV, one row per snippet."""
        embeddings_df = pd.DataFrame(self.codes_embedding)
        embeddings_df.to_csv(filepath, index=False)
        print(f"Embeddings saved to {filepath}")

In [None]:
# Candidate embedding checkpoints; exactly one assignment stays uncommented.
# NOTE(review): CodeSearch only stores this name. Its embeddings are requested
# with a hard-coded "voyage-code-2" model, so changing the value below does not
# change the vectors that class produces.
# model_name = "sentence-transformers/all-MiniLM-L6-v2"
# model_name = "thenlper/gte-large"
# model_name = "BAAI/bge-large-en-v1.5"
model_name = "sentence-transformers/all-mpnet-base-v2"
# model_name = "jinaai/jina-embeddings-v2-base-en"
# model_name = "jinaai/jina-embeddings-v2-small-en"

In [None]:
# Constructing CodeSearch embeds every snippet and builds the FAISS index right
# away (its __init__ calls do_embedding), so running this cell issues embedding
# API requests for the whole corpus.
search_engine = CodeSearch(model_name, code_snippets, df)
# search_engine.save_embeddings_to_csv("st-all-mpnet-base-v2-embeddings.csv")

In [None]:
import re

def remove_comments(code):
    """Strip C/C++ style comments from ``code``.

    Removes ``//`` comments up to the end of their line and ``/* ... */``
    comments, including ones that span several lines. Each block comment ends
    at its own closing marker, so code between two block comments is kept.

    The match is purely textual: comment markers that appear inside string
    literals are treated as comments too.

    Parameters
    ----------
    code : str
        Source text to clean.

    Returns
    -------
    str
        ``code`` with the comments replaced by nothing.
    """
    # Raw string avoids invalid-escape warnings. `[\s\S]*?` is non-greedy so a
    # block comment stops at the first `*/` rather than the last one in the text.
    regex = r'//.*|/\*[\s\S]*?\*/'
    return re.sub(regex, '', code)

def remove_directives_and_namespace(code):
    """Delete ``#include`` directives and ``using namespace`` statements.

    Each match runs from the keyword to the end of its line; the line break
    itself is left in place.
    """
    # Applied in this order: include directives first, then namespace imports.
    for pattern in (r'#include.*', r'using namespace.*'):
        code = re.sub(pattern, '', code)
    return code

def remove_non_ascii(code):
    """Return ``code`` with every non-ASCII character dropped."""
    # Encoding with errors='ignore' discards anything ASCII cannot represent.
    ascii_bytes = code.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

def clean_code(code):
    """Flatten ``code`` onto one line by turning line breaks into spaces.

    Parameters
    ----------
    code : str or None
        Text to flatten.

    Returns
    -------
    str or None
        ``code`` with every line feed and carriage return replaced by a space.
        Empty input is returned unchanged, so ``""`` stays ``""`` and ``None``
        stays ``None``.
    """
    if not code:
        # Hand empty input back as-is; falling through would turn "" into None.
        return code
    return code.replace('\n', ' ').replace('\r', ' ')

# Preprocess query
def preprocess_query(code):
    """Run a raw query snippet through the cleaning steps, in order.

    Comments are stripped first, then non-ASCII characters, then ``#include``
    and ``using namespace`` lines, and finally line breaks are flattened into
    spaces. Returns whatever the last step returns.
    """
    pipeline = (
        remove_comments,
        remove_non_ascii,
        remove_directives_and_namespace,
        clean_code,
    )
    for step in pipeline:
        code = step(code)
    return code

In [None]:
# search_engine.save_model()

In [None]:
# search_engine.save_faiss_index()

In [None]:
# Sample query: the raw source of a C++ solution, kept exactly as written
# (includes, namespace line and a `//` comment are all present) so that the
# preprocessing cell has something to strip before the text is embedded.
query = """
#include <bits/stdc++.h>
using namespace std;
typedef long long ll;
const int mod = 1e9 + 7;
const int N = 1e5 + 5;

void testCase()
{
    int n, u, v, root = 0;
    ll ans = LLONG_MAX;

    cin >> n;

    vector<int> color(n);
    vector<vector<int>> cost(3, vector<int>(n));
    vector<vector<int>> adj(n);

    for (int i = 0; i < 3; ++i)
    {
        for (auto &c : cost[i])
            cin >> c;
    }

    for (int i = 0; i < n - 1; ++i)
    {
        cin >> u >> v;
        u--, v--;
        adj[u].push_back(v);
        adj[v].push_back(u);
    }

    for (int i = 0; i < n; ++i)
    {
        if ((int)adj[i].size() > 2)
            return void(cout << "-1");
        if (adj[i].size() == 1)
            root = i;
    }

    vector<int> perm = {0, 1, 2};

    function<ll(int, int, int)> dfs = [&](int u, int par, int ind)
    {
        ll ans = cost[perm[ind]][u];
        for (auto v : adj[u])
        {
            if (v != par)
                ans += dfs(v, u, (ind + 1) % 3);
        }
        return ans;
    };

    function<void(int, int, int)> dfs_ans = [&](int u, int par, int ind)
    {
        color[u] = perm[ind];
        for (auto v : adj[u])
        {
            if (v != par)
                dfs_ans(v, u, (ind + 1) % 3);
        }
    };
    do
    {
        ll ret = dfs(root, root, 0);
        if (ret < ans)
        {
            ans = ret;
            dfs_ans(root, root, 0);
        }
    } while (next_permutation(perm.begin(), perm.end()));

    cout << ans << '\n';
    for (int i = 0; i < n; ++i)
        cout << color[i] + 1 << ' ';
}

signed main()
{
    ios_base::sync_with_stdio(false);
    cin.tie(NULL);

    int testCases = 1;

    // cin >> testCases;

    while (testCases--)
        testCase();

    return 0;
}
"""

In [None]:
# Clean the raw query with preprocess_query (comments, non-ASCII characters,
# include/namespace lines and line breaks removed) before it is embedded.
preprocessed_query = preprocess_query(query)

In [None]:
# Embeds the cleaned query and prints the matches; query() prints its results
# rather than returning them, so this cell has no output value of its own.
search_engine.query(preprocessed_query, k=5)  # Retrieve top 5 similar problems