## Prepare Environment

In [None]:
!pip install faiss-cpu
!pip install sentence-transformers

In [None]:
import pandas as pd
import faiss
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

In [None]:
# Location of the preprocessed solutions dataset.
csv_file_path = '/kaggle/input/ipc-preprocessed-solutions/preprocessed_solutions_v1.csv'
# Load the dataset; later cells read its 'preprocessed_solution', 'solution',
# 'problem_link' and 'online_judge' columns.
df = pd.read_csv(csv_file_path)

In [None]:
# Quick sanity check of the loaded DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# One snippet per DataFrame row, in row order, so list position i matches df.iloc[i].
# NOTE(review): assumes the column has no missing values — confirm before embedding.
code_snippets = df['preprocessed_solution'].tolist()

## Embeddings and Vector Databases

In [None]:
import torch

# Report whether a CUDA device is visible to PyTorch (and which one).
print(
    f"GPU is available: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "GPU is not available."
)


In [None]:
class CodeSearch:
    """Semantic search over code snippets backed by an exact FAISS index.

    Constructing an instance embeds every snippet with a SentenceTransformer
    model and stores the L2-normalised vectors in a flat inner-product index,
    so the index score is the cosine similarity.

    Args:
        model_name: Model name or path handed to ``SentenceTransformer``.
        code_snippets: List of code strings to index. Position ``i`` must
            describe the same solution as positional row ``i`` of
            ``dataframe``, because search hits are resolved with ``iloc``.
        dataframe: DataFrame with a ``problem_link`` column.
    """

    def __init__(self, model_name, code_snippets, dataframe):
        self.model_name = model_name
        self.code_snippets = code_snippets
        self.df = dataframe
        self.embedding_model = None  # set by do_embedding()
        self.index = None  # FAISS index, set by do_embedding()
        self.codes_embedding = None  # raw (un-normalised) snippet embeddings
        self.dim = 0  # embedding dimensionality
        self.do_embedding()

    @staticmethod
    def _l2_normalize(vectors):
        """Return a float32, C-contiguous copy of *vectors* with unit-length rows."""
        vectors = np.asarray(vectors, dtype=np.float32)
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # leave all-zero rows untouched instead of dividing by 0
        return np.ascontiguousarray(vectors / norms)

    def do_embedding(self):
        """Load the model, embed every snippet and build the FAISS index."""
        self.embedding_model = SentenceTransformer(self.model_name, trust_remote_code=True)
        self.codes_embedding = self.embedding_model.encode(
            self.code_snippets, show_progress_bar=True, convert_to_numpy=True
        )
        self.dim = self.codes_embedding.shape[1]  # Dimension of the embeddings
        print(f"Dimensions = {self.dim}")
        # Inner product equals cosine similarity only for unit-length vectors,
        # so a normalised copy is indexed; self.codes_embedding keeps the raw
        # model output for other consumers.
        self.index = faiss.IndexFlatIP(self.dim)
        self.index.add(self._l2_normalize(self.codes_embedding))

    def query(self, query_code, k=10):
        """Return the problem links of the ``k`` snippets most similar to *query_code*.

        Args:
            query_code: Code string to search for.
            k: Maximum number of neighbours to return.

        Returns:
            A list of ``{"problem_link": ...}`` dicts, best match first. It can
            be shorter than ``k`` when the index holds fewer than ``k`` vectors.
        """
        query_embedding = self.embedding_model.encode([query_code], convert_to_numpy=True)
        query_embedding = self._l2_normalize(query_embedding)
        _, neighbor_ids = self.index.search(query_embedding, k)
        # neighbor_ids has one row per query; only one query was sent.
        # FAISS pads missing neighbours with -1, which iloc would silently
        # interpret as "last row", so those ids are dropped.
        return [
            {"problem_link": self.df.iloc[idx]['problem_link']}
            for idx in neighbor_ids[0]
            if idx >= 0
        ]

    def save_embeddings_to_csv(self, filepath):
        """Write the raw snippet embeddings to *filepath* as CSV (one row per snippet)."""
        embeddings_df = pd.DataFrame(self.codes_embedding)
        embeddings_df.to_csv(filepath, index=False)
        print(f"Embeddings saved to {filepath}")

In [None]:
# Embedding model name handed to CodeSearch. Exactly one assignment should be
# active; the other candidates are kept commented out for easy switching.
# model_name = "sentence-transformers/all-MiniLM-L6-v2"
# model_name = "thenlper/gte-large"
# model_name = "BAAI/bge-large-en-v1.5"
model_name = "sentence-transformers/all-mpnet-base-v2"
# model_name = "jinaai/jina-embeddings-v2-base-en"
# model_name = "jinaai/jina-embeddings-v2-small-en"

In [None]:
# Constructing CodeSearch embeds every snippet and builds the FAISS index.
search_engine = CodeSearch(model_name, code_snippets, df)
# Optional: persist the computed embeddings to CSV.
# search_engine.save_embeddings_to_csv("st-all-mpnet-base-v2-embeddings.csv")

### Preprocessing

In [None]:
import re

def remove_comments(code):
    """Strip ``//`` line comments and ``/* ... */`` block comments from *code*.

    Each block comment is matched on its own (non-greedy), so code located
    between two block comments is preserved. The match is purely textual:
    comment markers that appear inside string literals are removed as well.

    Args:
        code: Source text.

    Returns:
        The text with comments deleted; line breaks outside comments are kept.
    """
    # `[\s\S]*?` spans newlines but stops at the first closing `*/`.
    regex = r'//.*|/\*[\s\S]*?\*/'
    return re.sub(regex, '', code)

def remove_directives_and_namespace(code):
    """Delete ``#include`` directives and ``using namespace`` statements.

    Each match runs from the keyword to the end of its line; the line break
    itself is left in place.
    """
    # Applied in order: include directives first, then using-namespace lines.
    for pattern in (r'#include.*', r'using namespace.*'):
        code = re.sub(pattern, '', code)
    return code

def remove_non_ascii(code):
    """Return *code* with every character outside the ASCII range dropped."""
    # errors='ignore' silently discards anything that cannot be encoded as ASCII.
    ascii_bytes = code.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

def clean_code(code):
    """Flatten *code* onto a single line.

    Every newline and carriage-return character is replaced by one space.

    Args:
        code: Source text; may be empty or ``None``.

    Returns:
        The flattened text. Empty or ``None`` input yields ``''`` so callers
        always receive a string.
    """
    if not code:
        return ''
    return code.replace('\n', ' ').replace('\r', ' ')

def preprocess_query(code):
    """Clean a raw query snippet before it is embedded.

    The steps run in this order: comment removal, non-ASCII removal,
    ``#include`` / ``using namespace`` removal, newline flattening.
    """
    pipeline = (
        remove_comments,
        remove_non_ascii,
        remove_directives_and_namespace,
        clean_code,
    )
    for step in pipeline:
        code = step(code)
    return code

In [None]:
# Example query: a small C++ program used to try out the search.
query = """
int main(){
    int n;
    cin >> n;
    for (int i = 0; i < n; i++)
        cout << "Hello" << endl;
    return 0;
}
"""

# Apply the query-side cleaning steps before embedding the snippet.
preprocessed_query = preprocess_query(query)

In [None]:
# Look up the problem links of the 5 indexed solutions most similar to the query.
search_engine.query(preprocessed_query, k=5)

In [None]:
def calculate_recall_at_ks_solutions(solutions, ks, engine=None):
    """Measure how often a solution retrieves another solution of its own problem.

    Each row's ``solution`` is preprocessed and used as a query. For a cutoff
    ``k`` the query counts as a hit when at least two of its top-``k`` results
    carry the row's ``problem_link``. Two matches are required because the
    query is presumably already in the index and retrieves itself — confirm
    this holds for the data being evaluated.

    Args:
        solutions: DataFrame with ``solution`` and ``problem_link`` columns.
        ks: Iterable of cutoffs to evaluate.
        engine: Object with a ``query(code, k)`` method returning dicts that
            have a ``problem_link`` key. Defaults to the notebook-level
            ``search_engine``.

    Returns:
        Dict mapping each ``k`` to the fraction of rows that were hits. All
        values are ``0.0`` when ``solutions`` is empty.
    """
    ks = list(ks)
    total = len(solutions)
    if total == 0 or not ks:
        # Nothing to evaluate; avoids dividing by zero and max() of an empty list.
        return {k: 0.0 for k in ks}
    if engine is None:
        engine = search_engine

    hits = dict.fromkeys(ks, 0)
    max_k = max(ks)  # a single search per row serves every cutoff
    for _, row in solutions.iterrows():
        retrieved = engine.query(preprocess_query(row['solution']), max_k)
        target = row['problem_link']
        for k in ks:
            same_problem = sum(1 for sol in retrieved[:k] if sol['problem_link'] == target)
            if same_problem >= 2:
                hits[k] += 1

    return {k: hits[k] / total for k in ks}

In [None]:
# Evaluate on AtCoder solutions only.
filtered_solutions = df[df['online_judge'] == 'AtCoder']
# Cap the sample size at the number of available rows: sampling without
# replacement raises ValueError when n exceeds the population size.
sampled_solutions = filtered_solutions.sample(
    n=min(1000, len(filtered_solutions)), random_state=42
)

In [None]:
# Each cutoff is one larger than the reported one: the top-k list is expected
# to contain the query's own solution, which is not counted as a retrieval.
ks = [2, 4, 6, 11]  # reported below as Recall@1, @3, @5 and @10
recall_results = calculate_recall_at_ks_solutions(sampled_solutions, ks)
for k, recall in recall_results.items():
    # k - 1 discounts the slot taken by the query's own solution.
    print(f"Recall@{k-1}: {recall:.2%}")

In [None]:
# Print the recall table again without recomputing it (same output as the
# loop at the end of the previous cell).
for k, recall in recall_results.items():
    print(f"Recall@{k-1}: {recall:.2%}")

## Clustering

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, AgglomerativeClustering
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# Seed NumPy's global random number generator so repeated runs are comparable.
np.random.seed(42)  # For reproducibility

In [None]:
def calculate_clustering_metrics(embeddings, cluster_labels):
    """Score a clustering with three internal validity indices.

    Args:
        embeddings: Feature matrix the clustering was computed on.
        cluster_labels: Cluster label assigned to each row of ``embeddings``.

    Returns:
        Dict with keys ``silhouette_score``, ``davies_bouldin_score`` and
        ``calinski_harabasz_score``.
    """
    scorers = {
        'silhouette_score': silhouette_score,
        'davies_bouldin_score': davies_bouldin_score,
        'calinski_harabasz_score': calinski_harabasz_score,
    }
    return {name: scorer(embeddings, cluster_labels) for name, scorer in scorers.items()}

In [None]:
def run_kmeans_clustering(embedding, n_clusters, random_state=42, n_init=10):
    """Cluster *embedding* with K-Means and score the result.

    Args:
        embedding: Feature matrix, one row per sample.
        n_clusters: Number of clusters to form.
        random_state: Seed for centroid initialisation. Fixed by default so
            the result does not depend on the global NumPy RNG state.
        n_init: Number of initialisations to try. Set explicitly because the
            library default differs between scikit-learn versions.

    Returns:
        Tuple ``(labels, metrics)``: the per-row cluster labels and the dict
        produced by ``calculate_clustering_metrics``.
    """
    kmeans = KMeans(n_clusters=n_clusters, n_init=n_init, random_state=random_state)
    labels = kmeans.fit_predict(embedding)
    metrics = calculate_clustering_metrics(embedding, labels)
    return labels, metrics

def run_agglomerative_clustering(embedding, n_clusters):
    """Cluster *embedding* with agglomerative clustering and score the result.

    Returns:
        Tuple ``(labels, metrics)``: the per-row cluster labels and the dict
        produced by ``calculate_clustering_metrics``.
    """
    model = AgglomerativeClustering(n_clusters=n_clusters)
    cluster_ids = model.fit_predict(embedding)
    return cluster_ids, calculate_clustering_metrics(embedding, cluster_ids)

def run_dbscan_clustering(embeddings, eps=0.5, min_samples=5):
    """Cluster *embeddings* with DBSCAN and score the result.

    The labels returned by DBSCAN are passed to the metric functions
    unchanged.

    Returns:
        Tuple ``(labels, metrics)``: the per-row cluster labels and the dict
        produced by ``calculate_clustering_metrics``.
    """
    model = DBSCAN(eps=eps, min_samples=min_samples)
    cluster_ids = model.fit_predict(embeddings)
    return cluster_ids, calculate_clustering_metrics(embeddings, cluster_ids)

def run_spectral_clustering(embeddings, n_clusters, random_state=42):
    """Cluster *embeddings* with spectral clustering and score the result.

    The affinity matrix is built from a nearest-neighbours graph.

    Args:
        embeddings: Feature matrix, one row per sample.
        n_clusters: Number of clusters to form.
        random_state: Seed for the randomised parts of the algorithm. Fixed by
            default so the result does not depend on the global NumPy RNG
            state.

    Returns:
        Tuple ``(labels, metrics)``: the per-row cluster labels and the dict
        produced by ``calculate_clustering_metrics``.
    """
    spectral = SpectralClustering(
        n_clusters=n_clusters,
        affinity='nearest_neighbors',
        random_state=random_state,
    )
    labels = spectral.fit_predict(embeddings)
    metrics = calculate_clustering_metrics(embeddings, labels)
    return labels, metrics

def run_mean_shift_clustering(embeddings, bandwidth=None):
    """Cluster *embeddings* with Mean Shift and score the result.

    Args:
        embeddings: Feature matrix, one row per sample.
        bandwidth: Passed straight to ``MeanShift``; ``None`` leaves the
            choice to the library.

    Returns:
        Tuple ``(labels, metrics)``: the per-row cluster labels and the dict
        produced by ``calculate_clustering_metrics``.
    """
    # Imported here because the notebook's sklearn.cluster import cell does
    # not include MeanShift; without it this function raises NameError.
    from sklearn.cluster import MeanShift

    mean_shift = MeanShift(bandwidth=bandwidth)
    labels = mean_shift.fit_predict(embeddings)
    metrics = calculate_clustering_metrics(embeddings, labels)
    return labels, metrics

In [None]:
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share (here 95%) of the variance.
pca = PCA(n_components=0.95)
reduced_embeddings = pca.fit_transform(search_engine.codes_embedding)
print(f"Reduced to {reduced_embeddings.shape[1]} dimensions.")
# Scale every row to unit L2 norm so that Euclidean distance between rows
# tracks their cosine similarity.
# NOTE(review): a row with zero norm would turn into NaNs here — not guarded.
normalized_embeddings = reduced_embeddings / np.linalg.norm(reduced_embeddings, axis=1, keepdims=True)

In [None]:
# # Use the Elbow Method to find the optimal number of clusters
# wcss = []
# for i in range(1, 2000, 5):
#     kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=42)
#     kmeans.fit(normalized_embeddings)
#     wcss.append(kmeans.inertia_)

# # Plotting the Elbow Curve
# plt.figure(figsize=(10, 5))
# plt.plot(range(1, 2000, 5), wcss, marker='o', linestyle='--')  # Adjusted step size to 5
# plt.title('Elbow Method for Optimal Clusters')
# plt.xlabel('Number of clusters')
# plt.ylabel('WCSS')
# plt.show()

In [None]:
# K-Means with 23 clusters on the PCA-reduced, unit-norm embeddings.
# `labels` and `metrics` are reused (overwritten) by the other clustering cells.
labels, metrics = run_kmeans_clustering(normalized_embeddings, n_clusters=23)
print("K-Means Metrics:", metrics)

In [None]:
# DBSCAN on the PCA-reduced, unit-norm embeddings (eps and min_samples given explicitly).
labels, metrics = run_dbscan_clustering(normalized_embeddings, eps=0.5, min_samples=5)
print("DBSCAN Metrics:", metrics)

In [None]:
# Spectral clustering with 23 clusters on the PCA-reduced, unit-norm embeddings.
labels, metrics = run_spectral_clustering(normalized_embeddings, n_clusters=23)
print("Spectral Clustering Metrics:", metrics)

In [None]:
# Agglomerative clustering with 23 clusters on the PCA-reduced, unit-norm embeddings.
labels, metrics = run_agglomerative_clustering(normalized_embeddings, n_clusters=23)
print("Agglomerative Clustering Metrics:", metrics)

In [None]:
# Mean Shift on the PCA-reduced, unit-norm embeddings; no bandwidth is passed,
# so the function's default (None) is used.
labels, metrics = run_mean_shift_clustering(normalized_embeddings)
print('Mean Shift Clustering Metrics:', metrics)

## Jina

https://huggingface.co/jinaai/jina-embeddings-v2-base-en