In [1]:
from google.colab import files
files.upload()

ModuleNotFoundError: No module named 'google.colab'

In [None]:
import torch
print("CUDA available:", torch.cuda.is_available())

CUDA available: False


In [None]:
!pip -q install transformers tqdm scikit-learn pandas numpy

In [None]:
!mkdir -p csv_data
!mv sample_train.csv csv_data/

In [5]:
# import necessary libraries
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# function to compute cosine similarity between two code snippets
def compute_cosine_similarity(code1, code2) -> float:

    # Remove comments
    code1 = re.sub(r'//.*?$|/\*.*?\*/', '', code1, flags=re.DOTALL | re.MULTILINE)
    code2 = re.sub(r'//.*?$|/\*.*?\*/', '', code2, flags=re.DOTALL | re.MULTILINE)

    # Remove whitespace
    code1 = re.sub(r'\s+', ' ', code1).strip()
    code2 = re.sub(r'\s+', ' ', code2).strip()

    # Define custom token pattern for code
    tokenPattern = ""
    tokenPattern += r"[A-Za-z_][A-Za-z0-9_]*"  # Identifiers
    tokenPattern += r"|\d+"                     # Numbers
    tokenPattern += r"|==|!=|<=|>=|\+=|-=|\*=|/=|&&|\|\|"  # Multi-char operators
    tokenPattern += r"|\".*?\"|\'.*?\'"  # String literals
    tokenPattern += r"|[{}()\[\];=+\-*/<>!&|]"  # Single-char operators and punctuation

    # Define stop words common in code
    stopWords = ["include", "namespace", "using", "std", "return", "cin", "cout", "int", "float", "double", "string", "bool", "endl"]

    # Compute TF-IDF vectors
    vectorizer = TfidfVectorizer(token_pattern=tokenPattern, ngram_range=(1, 2), stop_words=stopWords, norm='l2', sublinear_tf=True)
    tfidfMatrix = vectorizer.fit_transform([code1, code2])

    # Compute cosine similarity
    similarityMatrix = cosine_similarity(tfidfMatrix[0:1], tfidfMatrix[1:2])
    similarityValue = similarityMatrix[0][0]

    # Return similarity value
    return similarityValue

# function to process training data and compute similarity for each pair
def process_token_similarity(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df['token_similarity'] = df.apply(lambda row: compute_cosine_similarity(row['code1'], row['code2']), axis=1)
    return df

if __name__ == "__main__":

    df = pd.read_csv("csv_data/sample_train.csv")
    df_processed = process_token_similarity(df[['code1', 'code2', 'similar']])
    print(df_processed.head(10))

FileNotFoundError: [Errno 2] No such file or directory: 'csv_data/sample_train.csv'

In [4]:
import subprocess
import tempfile
import os
import pandas as pd
import numpy as np
import json
import ast
import math
import time
from typing import Any
from difflib import SequenceMatcher
from tqdm import tqdm

# Inputs
inputs = [
    "hello\nolleh\n",
    "0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n89\n45\n109\n420\n39\n1\n0\n60\n",
    "100\n99\n98\n97\n96\n95\n94\n93\n92\n91\n90\n",
    "4\nword1\nword2\nword3\nword4\n",
    "cat\n5\n10\n8\n4\n",
    "0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n",
    "3\n5 6 7\n1 2 3\n",
    "7\n100 150 200\n10 20 30 40 50 60 70\n",
    "4.5\n10.9\n2.3\n8.9\n18.49\n29.82\n22.22\n14.00\n6.89\n",
    "-8\n-12\n-5\n-11\n-4\n-2\n-32\n-450\n-20\n-89\n"
    "apple\nbanana\ncat\ndog\nzebra\nalpha\nomega\ntest\nhello\nworld\n"
]
inputs = [x.encode('utf-8') for x in inputs]

# Compare String
def _string_similarity(a: str, b: str) -> float:
    return SequenceMatcher(a=a, b=b).ratio()

# Check if number
def is_number(x):
    try:
        result = float(x)

        return True
    except:
        return False

# Determine the similarity of 2 numbers
def _numeric_similarity(out1: float, out2: float) -> float:
    if out1 == out2:
        return 1.0
    # NaN type
    elif math.isnan(out1) or math.isnan(out2):
        if (math.isnan(out1) and not math.isnan(out2)) or (not math.isnan(out1) and math.isnan(out2)):
            return 0.0
        else:
            return 1.0
    # Large / infinite numbers
    elif math.isinf(out1) or math.isinf(out2):
        if (math.isinf(out1) and not math.isinf(out2)) or (not math.isinf(out1) and math.isinf(out2)):
            return 0.0
        else:
            return 1.0


    # Find absolute distance between highest and smallest number
    out1 = abs(out1)
    out2 = abs(out2)
    highest = max(out1, out2)

    # Avoid division by 0
    if highest == 0:
        highest = 0.1
    smallest = min(out1, out2)
    distance = abs(highest - smallest)

    # Normalize the distance the numbers to a range of [0-1]
    normalizedDistance = np.round(1 - (distance / abs(highest)), decimals=3)
    # print(f"Normalized dist {out1} {out2} {normalizedDistance}")
    return normalizedDistance

# Parse into json, list, or hash/dict
def try_parse(s):
    # Try JSON
    try:
        return json.loads(s)
    except:
        pass

    # Try Python literal (e.g. "[1,2,3]" or "{'a': 2}")
    try:
        return ast.literal_eval(s)
    except:
        pass

    # Try to break into list if separated by spaces
    try:
        if " " in s:
            return s.split()
    except:
        pass

    return s

# Compare lists
def listSimilarity(output1: list, output2: list) -> float:
    try:
        if not isinstance(output1, list) or not isinstance(output2, list):
            return 0.0
        listLength = min(len(output1), len(output2))
        if listLength <= 0:
            return 0.0

        outputScore = 0
        for i in range(0, listLength):

            o1 = try_parse(output1[i])
            o2 = try_parse(output2[i])

            if o1 is None or o2 is None:
                outputScore += noneSim(o1, o2)

            elif isinstance(o1, list) or isinstance(o2, list):
                outputScore += listSimilarity(o1, o2)

            elif isinstance(o1, dict) or isinstance(o2, dict):
                outputScore += 1.0 if o1 == o2 else 0.0

            elif is_number(o1) and is_number(o2):
                outputScore += _numeric_similarity(float(o1), float(o2))

            # Nan Character
            elif '\ufffd' in text1 or  '\ufffd' in text2:
                if ('\ufffd' in text1 and '\ufffd' not in text2) or ('\ufffd' not in text1 and '\ufffd' in text2) :
                    outputScore += 0.0
                else:
                    outputScore += 1.0

            else:
                outputScore += _string_similarity(o1, o2)

        return outputScore / listLength
    except:
        return 0.0


# Compare None types
def noneSim(out1: Any, out2: Any)-> float:

    if (out1 == 0 and out2 is None) or (out2 == 0 and out1 is None):
        return 0.9

    elif(out1 is None and out2 == '') or (out2 is None and out1 == ''):
        return 0.9

    elif out1 is None and out2 is None:
        return 1.0

    else:
        return 0.0

# Compare logic
def compare(raw1: bytes, text1: str, raw2: bytes, text2: str) -> float:
    try:

        # Compare raw bytes
        if raw1 == raw2:
            return 1.0

        # try to parse into a dict or list
        text1 = try_parse(text1)
        text2 = try_parse(text2)

        # None type
        if text1 is None or text2 is None:
            return noneSim(text1, text2)

        # List
        elif isinstance(text1, list) or isinstance(text2, list):
            return listSimilarity(text1, text2)

        # Dict
        elif isinstance(text1, dict) or isinstance(text2, dict):
            return 1.0 if text1 == text2 else 0.0

        # Numbers
        elif is_number(text1) and is_number(text2):
            return _numeric_similarity(float(text1), float(text2))

            # Nan Character
        elif '\ufffd' in text1 or  '\ufffd' in text2:
            if ('\ufffd' in text1 and '\ufffd' not in text2) or ('\ufffd' not in text1 and '\ufffd' in text2) :
                return 0.0
            else:
                return 1.0
        # String
        else:
            return _string_similarity(text1, text2)
    except:
        return 0

# Compile and run the cpp program pair
def run (fName1: str, fName2: str, code1: str, code2: str, tmpdir) -> list:

    try:
        # --- Write C++ file ---
        with open(fName1, "w") as f:
            f.write(code1)

        with open(fName2, "w") as f:
            f.write(code2)

        exe1 = fName1.replace(".cpp", "")
        exe2 = fName2.replace(".cpp", "")

        # --- Compile ---
        compile_proc1 = subprocess.run(
            ["g++", fName1, "-o", exe1],
            text=False,
            capture_output=False,
            stdin=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False
        )
        # --- Compile ---
        compile_proc2 = subprocess.run(
            ["g++", fName2, "-o", exe2],
            text=False,
            capture_output=False,
            stdin=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False
        )

        # Compilation failed
        if compile_proc1.returncode != 0 and compile_proc2.returncode !=0:
            return 1
        elif (compile_proc1.returncode != 0 and compile_proc2.returncode == 0) or ( compile_proc2.returncode != 0 and compile_proc1.returncode == 0):
            return 0

        outputs = 0
        # --- Run with series of inputs---
        for input in inputs:
            # Run code 1
            try:
                out1 = subprocess.run(
                    [exe1],
                    input=input,
                    capture_output=True,
                    text=False,
                    check=True,
                    timeout=0.5,
                    cwd=tmpdir
                )
                raw1 = out1.stdout
                text1 = raw1.decode("utf-8", errors="replace").replace("\n", " ")
            # Catch error in program
            except subprocess.CalledProcessError as e:
                raw1 = None
                text1 = None
            # Program was not given enough inputs
            except subprocess.TimeoutExpired as e:
                raw1 = None
                text1 = None
            # Run code2
            try:
                out2 = subprocess.run(
                    [exe2],
                    input=input,
                    capture_output=True,
                    text=False,
                    check=True,
                    timeout=0.5,
                    cwd=tmpdir
                )
                raw2 = out2.stdout
                text2 = raw2.decode("utf-8", errors="replace").replace("\n", " ")
            # Catch error in program
            except subprocess.CalledProcessError as e:
                raw2 = None
                text2 = None
            # Program was not given enough inputs
            except subprocess.TimeoutExpired as e:
                raw2 = None
                text2 = None

            outputs += compare(raw1, text1, raw2, text2)

        score = round((outputs / len(inputs)), 2)
        return score

    except Exception as err:
        print(f"Error in file running {err}")
        return 0

# Process each code pair in a pandas dataframe
def process(df: pd.DataFrame) -> pd.DataFrame:
    n = len(df)

    if n == 0:
        df["output_similarity"] = []
        return df

    results = np.zeros(n, dtype=float)

    # Run files in a sandboxed environment
    with tempfile.TemporaryDirectory() as tmpdir:
        for i in tqdm(range(0, n), desc="output similarity"):
            fName1 = os.path.join(tmpdir, f"prog_1.cpp")
            fName2 = os.path.join(tmpdir, f"prog_2.cpp")
            code1 = df['code1'].iloc[i]
            code2 =   df['code2'].iloc[i]

            outputSim = run (fName1, fName2, code1, code2, tmpdir)
            results[i] = outputSim

    df["output_similarity"] = results
    return df



In [3]:
# import necessary libraries
from transformers import RobertaModel, RobertaTokenizer
from tqdm.auto import tqdm
import torch
import numpy as np
import re
import pandas as pd
import time
from typing import Iterable, Optional

# set device and configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
use_fp16 = torch.cuda.is_available() and device.type == 'cuda'
last_n_layers_default = 4
kw_default = 0.3
default_layer_pooling = False
combine_method_default = 'prod'

# Load pre-trained model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained('microsoft/graphcodebert-base')
# load model with hidden states enabled for optional layer pooling
model = RobertaModel.from_pretrained('microsoft/graphcodebert-base', output_hidden_states=True)
model.to(device)
model.eval()

# function to compute keyword overlap
def keyword_overlap(code1, code2):
    keywords = [
        "for", "while", "if", "else", "return", "int", "float", "double",
        "string", "bool", "class", "def", "import", "include", "namespace",
        "using", "public", "private", "protected", "void", "static", "try",
        "catch", "switch", "case", "break", "continue"
    ]

    k1 = set([k for k in keywords if k in code1])   # Extract keywords from code1
    k2 = set([k for k in keywords if k in code2])   # Extract keywords from code2

    # Avoid division by zero
    if not k1 or not k2:
        return kw_default

    # Compute Jaccard similarity
    return len(k1.intersection(k2)) / max(len(k1), len(k2))

# function to normalize code snippets
def perform_normalization(code: str) -> str:
    # Remove comments and collapse whitespace
    code = re.sub(r'//.*?$|/\*.*?\*/', '', code, flags=re.DOTALL | re.MULTILINE)

    # remove common preprocessor/import lines
    lines = code.splitlines()
    kept = []

    for line in lines:
        stripped = line.strip()
        if stripped.startswith('#include'):
            continue
        if stripped.startswith('using namespace'):
            continue
        if stripped.startswith('typedef'):
            continue
        if stripped.startswith('#define'):
            continue

        if not stripped:
            continue
        kept.append(stripped)

    code = "\n".join(kept)

    # compress whitespace
    code = re.sub(r'\s+', ' ', code).strip()

    return code


# function for mean pooling
def mean_pooling(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    summed = (last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts

# function for layer-averaged pooling
def layer_average_pooling(hidden_states, attention_mask, last_n_layers):
    layers = hidden_states[-last_n_layers:]
    pooled_layers = []
    for layer in layers:
        pooled_layer = mean_pooling(layer, attention_mask)
        pooled_layers.append(pooled_layer)
    return torch.stack(pooled_layers, dim=0).mean(dim=0)

# function to compute code embeddings
def compute_embedding(codes: Iterable[str], batch_size: int = 32, max_length: int = 64, normalize: bool = True,
                      layer_pooling: Optional[bool] = None, last_n_layers: int = 4):
    """Compute embeddings for `codes` with optional layer-averaged pooling.

    Args:
        codes: iterable of strings (or a single string will be supported by caller).
        batch_size: tokenization / model batch size.
        max_length: tokenizer max length.
        layer_pooling: if True, average the last `last_n_layers` hidden states (mean pooling per layer then average).
        last_n_layers: number of last layers to average when `layer_pooling` is True.
    Returns:
        Torch tensor of shape (N, hidden_size) on CPU (L2-normalized rows).
    """
    # allow single string too
    single = False
    single_input = isinstance(codes, str)
    if single_input:
        single = True
        codes = [codes]

    all_embs = []

    if layer_pooling is None:
        layer_pooling = default_layer_pooling

    for i in range(0, len(codes), batch_size):
        batch_texts = [perform_normalization(c) for c in codes[i:i + batch_size]]
        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=max_length)

        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            if use_fp16:
                with torch.cuda.amp.autocast():
                    out = model(**inputs)
            else:
                out = model(**inputs)


        if layer_pooling:
            hidden_states = out.hidden_states
            embeddings = layer_average_pooling(hidden_states, inputs['attention_mask'], last_n_layers)
        else:
            embeddings = mean_pooling(out.last_hidden_state, inputs['attention_mask'])


        if normalize:
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        all_embs.append(embeddings.cpu())


    if all_embs:
        result = torch.cat(all_embs, dim=0).numpy()
    else:
        result = np.zeros((0, model.config.hidden_size), dtype=np.float32)

    if single:
        return embeddings[0]
    return result

# function to compute semantic similarity between two code embeddings
def compute_semantic_similarity(vec1, vec2):

    # for 1d scalars
    if vec1.ndim == 1 and vec2.dim == 1:
        v1 = vec1.astype(np.float32)
        v2 = vec2.astype(np.float32)

        denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
        if denominator == 0:
            return 0.0
        return float(np.dot(v1, v2) / denominator)


    # for 2d arrays
    if vec1.ndim == 1:
        vec1 = vec1[None, :]
    if vec2.ndim == 1:
        vec2 = vec2[None, :]


    v1 = vec1.astype(np.float32)
    v2 = vec2.astype(np.float32)

    v1_norms = np.linalg.norm(v1, axis=1, keepdims=True).clip(min=1e-9)
    v2_norms = np.linalg.norm(v2, axis=1, keepdims=True).clip(min=1e-9)

    v1 = v1 / v1_norms
    v2 = v2 / v2_norms

    # general case: (N, d) x (M, d) -> (N, M) matrix
    return np.matmul(v1, v2.T)

# function to process training data and compute similarity for each pair
def process_semantic_similarity(df, pair_batch_size: int = 128, embed_batch_size: int = 32, combine_method: Optional[str] = None, alpha: float = 0.85, rescale: bool = False):
    """Compute semantic similarity for each row in `df` by processing pairs in chunks.

    Args:
        df: DataFrame with columns `code1` and `code2`.
        pair_batch_size: number of pairs to process in each chunk (controls memory footprint).
        embed_batch_size: batch size passed to `compute_embedding` for tokenization/model batching.

    Returns:
        Copy of `df` with a new column `semantic_similarity`.
    """

    df = df.copy()
    n = len(df)
    if n == 0:
        df['semantic_similarity'] = []
        return df

    results = np.zeros(n, dtype=float)

    effective_combine = combine_method if combine_method is not None else combine_method_default

    # iterate over pairs in chunks
    for start in tqdm(range(0, n, pair_batch_size), desc="pair-chunks"):
        end = min(start + pair_batch_size, n)

        # normalize code snippets in the chunk
        codes1_chunk = df['code1'].iloc[start:end].astype(str).tolist()
        codes2_chunk = df['code2'].iloc[start:end].astype(str).tolist()

        # embed each chunk (embedding function itself can batch internally)
        # enable layer pooling and match optimized defaults for parity
        emb1 = compute_embedding(codes1_chunk, batch_size=embed_batch_size, layer_pooling=True, last_n_layers=4, max_length=64)
        emb2 = compute_embedding(codes2_chunk, batch_size=embed_batch_size, layer_pooling=True, last_n_layers=4, max_length=64)

        # compute semantic similarities for the chunk
        sims = compute_semantic_similarity(emb1, emb2)

        if sims.shape[0] == sims.shape[1]:
            sims = np.diagonal(sims)
        else:
            # If pair counts match, prefer diagonal; else use row-wise max
            if sims.shape[0] == (end - start) and sims.shape[1] == (end - start):
                sims = np.diag(sims)
            else:
                sims = np.max(sims, axis=1)

        kw = np.array([keyword_overlap(a, b) for a, b in zip(codes1_chunk, codes2_chunk)], dtype=float)

        print(f"Semantic sims (first 10): {sims[:10]}")
        print(f"Keyword overlaps (first 10): {kw[:10]}")
        if effective_combine == 'prod':
            combined = sims * kw
        elif effective_combine == 'avg':
            combined = 0.5 * (sims + kw)
        elif effective_combine == 'weighted':
            combined = alpha * sims + (1 - alpha) * kw
        else:
            raise ValueError(f'Unknown combine method: {effective_combine}')

        if rescale:
            combined = np.clip(combined, -1.0, 1.0)
            combined = (combined + 1.0) / 2.0

        results[start:end] = combined

    df['semantic_similarity'] = results
    return df



Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##Ensemble Layer

In [2]:
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
import torch.optim as optim
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
add_token_sim = process_token_similarity
add_semantic_sim = process_semantic_similarity
add_output_sim = process

# MLP
class EnsembleMLP(nn.Module):
    def __init__(self, inputSize = 3, hiddenSize = 16, outputSize = 1):
        super(EnsembleMLP, self).__init__()
        self.network = nn.Sequential(nn.Linear(inputSize,hiddenSize), nn.ReLU(), nn.Linear(hiddenSize,hiddenSize), nn.ReLU(), nn.Linear(hiddenSize,outputSize))

    def forward(self, x):
        return self.network(x)

    def predictProba(self,x):
        self.eval()
        with torch.no_grad():
            logits = self.forward(x)
            return torch.sigmoid(logits)

def buildXAndY_real(sample_train_path="sample_train.csv", limit=None):
    df = pd.read_csv(sample_train_path)

    if limit is not None:
        df = df.head(limit)

    df = df[["code1", "code2", "similar"]].copy()

    df = add_token_sim(df)
    df = add_semantic_sim(df)
    df = add_output_sim(df)

    print("Columns after similarity scripts:", df.columns.tolist())

    X = df[["token_similarity", "semantic_similarity", "output_similarity"]].astype(np.float32).to_numpy()
    y = df["similar"].astype(np.float32).to_numpy()
    return X, y, df

def trainEnsemble(X, y):
  X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

  X_train = torch.FloatTensor(X_train)
  y_train = torch.FloatTensor(y_train)
  X_val = torch.FloatTensor(X_val)
  y_val = torch.FloatTensor(y_val)

  model = EnsembleMLP()
  criterion = nn.BCEWithLogitsLoss()
  optimizer = optim.Adam(model.parameters(), lr=1e-3)

  train_losses = []
  val_losses = []
  bestValLoss = float('inf')
  patience = 10
  patienceCount = 0
  for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    logits = model(X_train)
    loss = criterion(logits.squeeze(), y_train)
    loss.backward()
    optimizer.step()
    train_losses.append(loss.item())

    model.eval()

    with torch.no_grad():
      val_logits = model(X_val)
      val_loss = criterion(val_logits.squeeze(), y_val)
      val_losses.append(val_loss.item())

    if (val_loss < bestValLoss):
      bestValLoss = val_loss
      patienceCount = 0
      torch.save(model.state_dict(), "best_model.pth")
    else:
      patienceCount += 1
    if patienceCount >= patience:
      print(f"Early stopping at {epoch}")
      break
    if epoch % 10 == 0:
      print(f'Epoch {epoch}: Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')
  model.load_state_dict(torch.load("best_model.pth"))
  return model, train_losses, val_losses

# Inference
def predict(model, X, threshold=0.7):
  model.eval()
  X = torch.FloatTensor(X)  # convert if numpy
  with torch.no_grad():
    probabilities = model.predictProba(X).squeeze()
    predictions = (probabilities >= threshold).float()
    return probabilities.numpy(), predictions.numpy()
def evaluate(y_true, y_pred):
  print("\n--- TEST RESULTS (threshold=0.7) ---")
  print("Accuracy :", accuracy_score(y_true, y_pred))
  print("Precision:", precision_score(y_true, y_pred, zero_division=0))
  print("Recall   :", recall_score(y_true, y_pred, zero_division=0))
  print("F1       :", f1_score(y_true, y_pred, zero_division=0))
  print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))

NameError: name 'process_token_similarity' is not defined

In [None]:
X, y, df_feat = buildXAndY_real("csv_data/sample_train.csv", limit=200)

# train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# train
model, train_losses, val_losses = trainEnsemble(X_train, y_train)

# test with threshold 0.7
probs, preds = predict(model, X_test, threshold=0.7)
evaluate(y_test, preds)

pair-chunks:   0%|          | 0/2 [00:00<?, ?it/s]

Semantic sims (first 10): [0.9557072  0.9744534  0.9320595  0.95392996 0.9230163  0.9276461
 0.9539391  0.9318732  0.9069919  0.8885174 ]
Keyword overlaps (first 10): [0.75       0.88888889 0.66666667 0.45454545 0.72727273 0.5
 0.8        0.7        0.75       0.58333333]
Semantic sims (first 10): [0.9507477  0.9095962  0.9240067  0.87895566 0.94044447 0.9493807
 0.95329225 0.9216672  0.93005013 0.9818524 ]
Keyword overlaps (first 10): [0.77777778 0.8        0.66666667 0.7        0.7        0.57142857
 0.77777778 0.90909091 0.88888889 1.        ]


output similarity:   0%|          | 0/200 [00:00<?, ?it/s]

Columns after similarity scripts: ['code1', 'code2', 'similar', 'token_similarity', 'semantic_similarity', 'output_similarity']
Epoch 0: Train Loss: 0.6741, Val Loss: 0.6636
Epoch 10: Train Loss: 0.6615, Val Loss: 0.6456
Epoch 20: Train Loss: 0.6490, Val Loss: 0.6277
Epoch 30: Train Loss: 0.6356, Val Loss: 0.6084
Epoch 40: Train Loss: 0.6211, Val Loss: 0.5872
Epoch 50: Train Loss: 0.6050, Val Loss: 0.5639
Epoch 60: Train Loss: 0.5871, Val Loss: 0.5380
Epoch 70: Train Loss: 0.5666, Val Loss: 0.5082
Epoch 80: Train Loss: 0.5438, Val Loss: 0.4762
Epoch 90: Train Loss: 0.5196, Val Loss: 0.4405

--- TEST RESULTS (threshold=0.7) ---
Accuracy : 0.8
Precision: 1.0
Recall   : 0.5555555555555556
F1       : 0.7142857142857143
Confusion matrix:
 [[22  0]
 [ 8 10]]
