In [None]:
!pip install transformers
!pip install git+https://github.com/ericsuh/dirichlet.git # dirichlet mle

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
Successfully installed gdown-4.7.1
Downloading...
From: https://drive.google.com/uc?id=1IW114GjRUEdnawID8HuNtVuxd5cYGf1q
To: /kaggle/working/dataset_metrics_v2.csv
100%|██████████████████████████████████████| 49.6M/49.6M [00:04<00:00, 11.1MB/s]
Collecting git+https://github.com/ericsuh/dirichlet.git
  Cloning https://github.com/ericsuh/dirichlet.git to /tmp/pip-req-build-rw1s0aq1
  Running command git clone --filter=blob:none --quiet https://github.com/ericsuh/dirichlet.git /tmp/pip-req-build-rw1s0aq1
  Resolved https://github.com/ericsuh/dirichlet.git to commit 8e832cc55ced9150e30ea3a7402f594896c5a527
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: dirichlet
  Building wheel for dirichlet (setup.py) ... [?25ldone
[?25h  Created wheel for dirichlet: filename=dirichlet-0.9-py3-none-any.whl size=7352 sha256=049d4effd9b0614f3d20a16592b4dcecc49f1

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import dirichlet
import re
from html.parser import HTMLParser
import torch
import torch.nn as nn
import torch.nn.functional as f
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [None]:
# GPT-Neo is an architecture implemented inside transformers itself, so no code from the
# model repository has to be executed; trust_remote_code is therefore left at its default
# (False) instead of opting in to running repository-supplied Python.
MODEL_NAME = "EleutherAI/gpt-neo-2.7B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# output_hidden_states=True makes every forward pass return the per-layer hidden states,
# which the feature-extraction loop reads via ["hidden_states"].
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME,
                                             output_hidden_states=True,
                                             revision="main").to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading model.safetensors:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

In [None]:
# Load the question/answer table; later cells read its `Question` and `Answer` columns.
ds = pd.read_csv(filepath_or_buffer="dataset.csv")

In [None]:
class HTMLtoPromptsConverter(HTMLParser):
    """
    Collect the text content of an HTML fragment as plain text.

    Text nodes are appended to the buffer ``self.s``; a closing ``</p>`` or ``</pre>``
    terminates the current line. Intended use for each document::

        converter.refresh_status()
        converter.feed(html)
        text = converter.get_status()
    """

    def __init__(self):
        self.s = ""
        super().__init__()

    def refresh_status(self):
        """Start a new document: drop unprocessed parser input and clear the collected text."""
        # reset() discards input the parser is still holding (e.g. an unterminated tag),
        # so leftovers of one document cannot be prepended to the next one.
        self.reset()
        self.s = ""

    def get_status(self):
        """
        Return the text collected so far.

        :return: string with the accumulated plain text
        """
        # The parser holds back trailing text that might be a character reference cut in
        # half (e.g. "AT&T" at the very end of the input); close() forces it through
        # handle_data so it is part of this document's result.
        self.close()
        return self.s

    def handle_endtag(self, tag):
        """End a paragraph / preformatted block with exactly one newline."""
        if tag in ('p', 'pre') and self.s and not self.s.endswith('\n'):
            self.s += "\n"

    def handle_data(self, data):
        """Append a text node, collapsing any run of trailing newlines to a single one."""
        # endswith() is also safe for an empty string, unlike data[-1].
        self.s += data.rstrip('\n') + ('\n' if data.endswith('\n') else '')

In [None]:
converter = HTMLtoPromptsConverter()


def _html_to_text(markup):
    """Run one HTML string through the shared converter and return the collected text."""
    converter.refresh_status()
    converter.feed(markup)
    return converter.get_status()


# One {"Question", "Answer"} record per dataset row, in row order; the question is
# converted before the answer, exactly one after the other.
data = [
    {"Question": _html_to_text(question_html), "Answer": _html_to_text(answer_html)}
    for question_html, answer_html in tqdm(zip(ds["Question"], ds["Answer"]), total=ds.shape[0])
]

100%|██████████| 27705/27705 [00:19<00:00, 1455.33it/s]


In [None]:
def convert_spherical(input):
    """
    convert_spherical maps row vectors from Cartesian to hyperspherical coordinates
    :input: torch.tensor of size (N, M), one Cartesian vector per row
    :return: torch.tensor of size (N, M); column 0 is the Euclidean norm of the row,
             columns 1..M-1 are the angles (the last one lies in [0, 2*pi), the others in [0, pi])
    """
    squares = input**2
    r2 = squares.sum(dim=1)
    # tail_norm[:, k] = sqrt(x_k^2 + ... + x_{M-1}^2) for k = 0..M-2
    tail_norm = torch.flip(torch.sqrt(
                    torch.cumsum(
                        torch.flip(squares, dims=(1,)),
                        dim=1)[:, 1:]
                ), dims=(1,))
    # When x_k and every later coordinate are zero the angle is undefined (0/0);
    # use cos = 1, i.e. angle 0, instead of letting NaN through.
    cosines = torch.where(tail_norm == 0,
                          torch.ones_like(tail_norm),
                          input[:, :-1] / tail_norm)
    angles = torch.acos(torch.clip(cosines, min=-1, max=1))
    # Only the LAST angle covers the full circle: reflect it when the last coordinate is
    # negative. The remaining angles stay in [0, pi] and must not be touched.
    if angles.size(dim=1) > 0:
        negative_last = input[:, -1] < 0
        angles[negative_last, -1] = 2 * np.pi - angles[negative_last, -1]
    return torch.concat((torch.sqrt(r2).unsqueeze(dim=1), angles), dim=1)

def convert_cartesian(input):
    """
    convert_cartesian maps row vectors from hyperspherical back to Cartesian coordinates
    :input: torch.tensor of size (N, M); column 0 is the radius, columns 1..M-1 are angles
    :return: torch.tensor of size (N, M) with the Cartesian coordinates of every row
    """
    # Placeholder leading angle of 2*pi (cos = 1), created on the same device as the input
    # so that the function also works for tensors that do not live on the CPU.
    lead = torch.full((input.size()[0], 1), 2 * np.pi, device=input.device)
    a = torch.concat((lead, input[:, 1:]), dim=1)
    # Running products of sines: [1, sin(phi_1), sin(phi_1)sin(phi_2), ...]
    sin = torch.sin(a)
    sin[:, 0] = 1
    sin = torch.cumprod(sin, dim=1)
    # Shift the cosines left within each row: [cos(phi_1), ..., cos(phi_{M-1}), 1].
    # dims=1 is explicit; without it torch.roll rolls the flattened tensor across rows.
    cos = torch.roll(torch.cos(a), shifts=-1, dims=1)
    return sin*cos*input[:, 0].unsqueeze(dim=1)

def get_dirichlet_distribution_features(q_emb, a_emb, prefix, normalization_function):
    """
    get_dirichlet_distribution_features fits Dirichlet parameters to the question and the
    answer embeddings (via dirichlet.mle) and compares the two fits
    :q_emb: torch.tensor with the question embeddings
    :a_emb: torch.tensor with the answer embeddings
    :prefix: string put in front of every feature name
    :normalization_function: torch.tensor of size (N, M) -> torch.tensor of size (N, M),
                             transform distribution from R^n to [-1, 1]^n
    :return: dictionary with the L1 distance and the sum of squared differences between the
             two parameter vectors, plus the KL divergence in both directions
    """
    def fit_parameters(emb):
        # Shift the normalized values from [-1, 1] to [0, 1] before the MLE fit.
        # NOTE(review): the rows are not rescaled to sum to 1 here -- confirm that
        # dirichlet.mle is meant to receive such data; the recorded run of this notebook
        # logs "Failed to converge" errors for a number of samples.
        unit_values = (normalization_function(emb).squeeze().numpy() + 1) / 2
        return torch.as_tensor(dirichlet.mle(unit_values))

    q_params = fit_parameters(q_emb)
    a_params = fit_parameters(a_emb)
    param_diff = q_params - a_params

    q_fit = torch.distributions.dirichlet.Dirichlet(q_params)
    a_fit = torch.distributions.dirichlet.Dirichlet(a_params)
    kl = torch.distributions.kl.kl_divergence

    return {
        prefix + "DirichletParamsL1Dist": torch.abs(param_diff).sum().item(),
        prefix + "DirichletParamsL2Dist": (param_diff**2).sum().item(),
        prefix + "DirichletQAKLDiveregence": kl(q_fit, a_fit).item(),
        prefix + "DirichletAQKLDiveregence": kl(a_fit, q_fit).item(),
    }

def get_norm_diag_distribution_features(q_emb, a_emb):
    """
    get_norm_diag_distribution_features compares diagonal-covariance Gaussians built from
    the per-dimension mean and variance (correction=1) of the question and answer embeddings
    :q_emb: torch.tensor of size (Nq, M)
    :a_emb: torch.tensor of size (Na, M)
    :return: dictionary with the KL divergence question -> answer and answer -> question
    """
    n_dims = q_emb.size(dim=1)
    q_mean, q_var = q_emb.mean(dim=0), q_emb.var(dim=0, correction=1)
    a_mean, a_var = a_emb.mean(dim=0), a_emb.var(dim=0, correction=1)

    def diag_kl(src_mean, src_var, dst_mean, dst_var):
        # KL(src || dst) for diagonal Gaussians: quadratic term + trace term
        # + log-determinant difference - number of dimensions, all halved.
        shift = dst_mean - src_mean
        inv_dst_var = torch.pow(dst_var, -1)
        return 0.5 * (
            (shift * inv_dst_var * shift).sum().item() +
            (inv_dst_var * src_var).sum().item() +
            torch.log(dst_var).sum().item() - torch.log(src_var).sum().item() - n_dims
        )

    return {
        "DiagNormQAKLDiveregence": diag_kl(q_mean, q_var, a_mean, a_var),
        "DiagNormAQKLDiveregence": diag_kl(a_mean, a_var, q_mean, q_var),
    }

def get_sample_features(q_emb, a_emb):
    """
    get_sample_features builds the relevance metrics for one question/answer pair
    :q_emb: torch.tensor of size (Nq, M) with the question embeddings
    :a_emb: torch.tensor of size (Na, M) with the answer embeddings
            (each input is reduced over its own rows, so Nq and Na may differ)
    :return: dictionary of metrics
    """
    def mean_cosine(q_rows, a_rows):
        # Cosine similarity between the row-averaged question and answer vectors.
        return f.cosine_similarity(torch.unsqueeze(q_rows.mean(dim=0), 0),
                                   torch.unsqueeze(a_rows.mean(dim=0), 0),
                                   dim=1,
                                   eps=1e-8).item()

    def spherical_mean(emb):
        # Sum the hyperspherical coordinates over the rows, turn the angle columns into
        # averages and map the result back to Cartesian coordinates.
        # NOTE(review): only the angle columns are divided by the row count; column 0
        # (the radius) stays a sum -- confirm this is intended.
        coordinates = convert_spherical(emb).sum(dim=0).unsqueeze(dim=0)
        coordinates[:, 1:] = coordinates[:, 1:] / emb.size(dim=0)
        return convert_cartesian(coordinates)

    def l2_normalize(x):
        return f.normalize(x, p=2, dim=1)

    def arctan_normalize(x):
        return torch.atan(x) / (np.pi/2 + 1e-8)

    sample_features = {
        "AvgCosineSimilarity": mean_cosine(q_emb, a_emb),
        "AvgL1NormCosineSimilarity": mean_cosine(f.normalize(q_emb, p=1, dim=1),
                                                 f.normalize(a_emb, p=1, dim=1)),
        "AvgL2NormCosineSimilarity": mean_cosine(f.normalize(q_emb, p=2, dim=1),
                                                 f.normalize(a_emb, p=2, dim=1)),
    }

    q_spherical_mean = spherical_mean(q_emb)
    a_spherical_mean = spherical_mean(a_emb)
    spherical_diff = q_spherical_mean - a_spherical_mean
    sample_features["SphericalAvgL1Dist"] = torch.abs(spherical_diff).sum().item()
    sample_features["SphericalAvgL2Dist"] = (spherical_diff**2).sum().item()
    sample_features["SphericalAvgCosineSimilarity"] = f.cosine_similarity(q_spherical_mean,
                                                                        a_spherical_mean,
                                                                        dim=1,
                                                                        eps=1e-8).item()

    # Dirichlet-based features, once per normalization scheme (the feature-name prefix
    # identifies the scheme).
    for prefix, normalizer in (("L2Norm", l2_normalize), ("ArctanNorm", arctan_normalize)):
        sample_features.update(get_dirichlet_distribution_features(q_emb, a_emb, prefix, normalizer))

    sample_features.update(get_norm_diag_distribution_features(q_emb, a_emb))
    return sample_features

In [None]:
def _last_layer_embeddings(text):
    """Tokenize `text` (truncated to 2048 tokens) and return the last hidden layer on the CPU."""
    ids = tokenizer(text, return_tensors="pt", max_length=2048, truncation=True).input_ids.to(device)
    # squeeze(0) removes only the leading batch axis. A bare squeeze() would also remove
    # the sequence axis when the text is a single token, turning the (tokens, features)
    # matrix expected by get_sample_features into a 1-D vector.
    return model(ids)["hidden_states"][-1].to("cpu").squeeze(0)


model.eval()
for t in tqdm(range(len(data))):
    with torch.no_grad():
        try:
            q_emb = _last_layer_embeddings(data[t]["Question"])
            a_emb = _last_layer_embeddings(data[t]["Answer"])
            # On success the text record is replaced by its feature dictionary.
            data[t] = get_sample_features(q_emb, a_emb)
        except Exception as error:
            # Best effort: the failure is reported and data[t] keeps its
            # {"Question", "Answer"} record, so the sample has no feature values.
            print(f"Error acquired in sample {t}: {error}")

  1%|          | 264/27705 [02:49<16:36:49,  2.18s/it]

Error acquired in sample 263: Failed to converge after 1000 iterations, s is 5995.86865234375


  4%|▍         | 1202/27705 [11:59<21:41:26,  2.95s/it]

Error acquired in sample 1201: Failed to converge after 1000 iterations, s is 7121.99462890625


  6%|▌         | 1709/27705 [17:17<16:30:45,  2.29s/it]

Error acquired in sample 1708: Failed to converge after 1000 iterations, s is 8238.1064453125


  7%|▋         | 2002/27705 [20:24<17:30:30,  2.45s/it]

Error acquired in sample 2001: Failed to converge after 1000 iterations, s is 7287.205078125


  7%|▋         | 2003/27705 [20:31<26:28:14,  3.71s/it]

Error acquired in sample 2002: Failed to converge after 1000 iterations, s is 7287.205078125


  7%|▋         | 2004/27705 [20:37<32:32:49,  4.56s/it]

Error acquired in sample 2003: Failed to converge after 1000 iterations, s is 7287.205078125


  9%|▉         | 2531/27705 [26:23<17:04:34,  2.44s/it]

Error acquired in sample 2530: Failed to converge after 1000 iterations, s is 11111.3486328125


  9%|▉         | 2532/27705 [26:29<25:16:38,  3.61s/it]

Error acquired in sample 2531: Failed to converge after 1000 iterations, s is 11111.3486328125


  9%|▉         | 2533/27705 [26:35<30:33:22,  4.37s/it]

Error acquired in sample 2532: Failed to converge after 1000 iterations, s is 11111.3486328125


  9%|▉         | 2534/27705 [26:42<34:38:27,  4.95s/it]

Error acquired in sample 2533: Failed to converge after 1000 iterations, s is 11111.3486328125


  9%|▉         | 2535/27705 [26:48<37:42:51,  5.39s/it]

Error acquired in sample 2534: Failed to converge after 1000 iterations, s is 11111.3486328125


  9%|▉         | 2536/27705 [26:55<41:04:56,  5.88s/it]

Error acquired in sample 2535: Failed to converge after 1000 iterations, s is 11111.3486328125


  9%|▉         | 2537/27705 [27:02<44:15:08,  6.33s/it]

Error acquired in sample 2536: Failed to converge after 1000 iterations, s is 11111.3486328125


  9%|▉         | 2538/27705 [27:09<44:14:12,  6.33s/it]

Error acquired in sample 2537: Failed to converge after 1000 iterations, s is 11111.3486328125


In [None]:
# Entries of `data` whose feature extraction succeeded were replaced by feature
# dictionaries, so the "Question"/"Answer" columns exist in this frame only when at least
# one sample failed. errors="ignore" keeps the drop from raising KeyError when every
# sample succeeded.
features = pd.DataFrame(data).drop(columns=["Question", "Answer"], errors="ignore")
ds = ds.join(features, how="inner")

In [None]:
# Display the column labels of `ds` after the feature columns were joined in.
ds.columns

Index(['QuestionId', 'AcceptedAnswerId', 'QuestionScore', 'Question', 'Tags',
       'AnswerId', 'AnswerScore', 'Answer', 'LogicalCodeLines',
       'SourceCodeLines', 'Multilines', 'NumberOfFunctions',
       'CyclomaticComplexity', 'DistinctOperators', 'DistinctOperands',
       'TotalOperators', 'TotalOperands', 'Vocabulary', 'LogicalLength',
       'CalculatedLength', 'Volume', 'Difficulty', 'Effort',
       'NumberOfSnippets', 'TotalCodeLines', 'AvgSnippetCodeLines',
       'MaintainabilityIndexCodeLinesAvg', 'MaintainabilityIndexUniformAvg',
       'AvgCosineSimilarity', 'AvgL1NormCosineSimilarity',
       'AvgL2NormCosineSimilarity', 'SphericalAvgL1Dist', 'SphericalAvgL2Dist',
       'SphericalAvgCosineSimilarity', 'L2NormDirichletParamsL1Dist',
       'L2NormDirichletParamsL2Dist', 'L2NormDirichletQAKLDiveregence',
       'L2NormDirichletAQKLDiveregence', 'ArctanNormDirichletParamsL1Dist',
       'ArctanNormDirichletParamsL2Dist', 'ArctanNormDirichletQAKLDiveregence',
       'A

In [None]:
# Write the table without the index column. NOTE: "dataset.csv" is the same path the
# notebook loads `ds` from, so this overwrites the input file.
ds.to_csv(path_or_buf="dataset.csv", index=False)