In [1]:
import pandas as pd
from tqdm.notebook import tqdm, trange
import numpy as np

In [2]:
import torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

# Prefer the GPU when torch can see one; otherwise everything stays on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer and encoder are loaded from the same CodeBERT checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")

# Inference only: switch to eval mode and place the weights on `device`.
# The expression evaluates to the module itself, so its repr is the cell output.
model.eval().to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [3]:
# Load the table of extracted methods; the first CSV column is used as the row index.
data = pd.read_csv('result.csv', index_col=0)
# Bare last expression so the notebook renders the frame as a table.
data

Unnamed: 0,filename,method,code_type
0,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def create_dataset_one_type(code_type):\n i...,PYTHON
1,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def create_dataset():\n dfs = []\n for c...,PYTHON
2,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_class_body(lines):\n in_class = Fal...,PYTHON
3,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_methods(lines):\n functions = []\n ...,PYTHON
4,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_all_data():\n files = get_files('.c...,PYTHON
5,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_methods(lines):\n functions = []\n ...,PYTHON
6,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_all_data():\n files = get_files('.p...,PYTHON
7,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,"def print_hello_world():\n print(""Hello wor...",PYTHON
8,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def print_hello_world_many_times(n):\n for ...,PYTHON
9,c:\Users\79138\Mynka\no-hw\voice helper\src\ut...,def get_files(format):\n files = []\n fo...,PYTHON


In [4]:
# Print one stored method in full (row label 7) so its line breaks are rendered.
sample_method = data.loc[7, 'method']
print(sample_method)

def print_hello_world():
    print("Hello world!")


In [5]:
# Token count per method, measured as the number of ids the tokenizer returns.
token_counts = []
for source in tqdm(data['method']):
    encoded = tokenizer(source)
    token_counts.append(len(encoded['input_ids']))
data['length'] = token_counts

# Keep only methods whose token count lies strictly between 3 and 512.
# NOTE(review): 512 is presumably the encoder's input limit - confirm.
long_enough = data['length'] > 3
short_enough = data['length'] < 512
data = data[long_enough & short_enough].reset_index(drop=True)
data

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,filename,method,code_type,length
0,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def create_dataset_one_type(code_type):\n i...,PYTHON,54
1,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def create_dataset():\n dfs = []\n for c...,PYTHON,109
2,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_class_body(lines):\n in_class = Fal...,PYTHON,169
3,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_methods(lines):\n functions = []\n ...,PYTHON,315
4,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_all_data():\n files = get_files('.c...,PYTHON,223
5,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_methods(lines):\n functions = []\n ...,PYTHON,216
6,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_all_data():\n files = get_files('.p...,PYTHON,197
7,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,"def print_hello_world():\n print(""Hello wor...",PYTHON,19
8,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def print_hello_world_many_times(n):\n for ...,PYTHON,39
9,c:\Users\79138\Mynka\no-hw\voice helper\src\ut...,def get_files(format):\n files = []\n fo...,PYTHON,208


In [6]:
# Embed every method with the encoder and store one vector per row in `data['emb']`.
# Gradients are not needed for inference, so the whole pass runs under no_grad.
with torch.no_grad():
    embeddings = []
    for source in tqdm(data['method']):
        # The model was moved to `device`, so the tokenized inputs must live there
        # too; otherwise the forward pass fails whenever CUDA is available.
        inputs = tokenizer(source, return_tensors='pt').to(device)
        pooled = model(**inputs)['pooler_output']
        # The batch holds a single sequence: take its vector and bring it back to
        # the CPU so it can be stored in the DataFrame.
        embeddings.append(pooled[0].detach().cpu())
data['emb'] = embeddings

  0%|          | 0/10 [00:00<?, ?it/s]

In [7]:
from torch.nn import CosineSimilarity
# Cosine similarity reduced over dim 0: the operands compared later are single
# embedding vectors (one `pooler_output` row each), not batches.
cos = CosineSimilarity(dim=0)

In [25]:
# Natural-language search query; kept under its own name so `query` only ever
# refers to the embedding tensor.
query_text = "stop spending my time"
with torch.no_grad():
    # Inputs must be on the same device as the model (see the setup cell).
    query_inputs = tokenizer(query_text, return_tensors='pt').to(device)
    # Take the single sequence's pooled vector and move it to the CPU, where the
    # stored method embeddings live, so the two can be compared directly.
    query = model(**query_inputs)['pooler_output'][0].detach().cpu()

In [26]:
# One cosine score per stored method embedding, in the same order as the rows of `data`.
sims = [float(cos(query, method_emb)) for method_emb in tqdm(data.emb)]

  0%|          | 0/10 [00:00<?, ?it/s]

In [27]:
# Row positions ordered from most to least similar, capped at the best 11.
ranking = np.argsort(sims)[::-1][:11]

# Reorder the methods by that ranking and attach the matching scores.
res = data.iloc[ranking].reset_index(drop=True)
res['sim'] = np.array(sims)[ranking]
res

Unnamed: 0,filename,method,code_type,length,emb,sim
0,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,"def print_hello_world():\n print(""Hello wor...",PYTHON,19,"[tensor(0.4202), tensor(-0.4082), tensor(-0.58...",0.990421
1,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def print_hello_world_many_times(n):\n for ...,PYTHON,39,"[tensor(0.5079), tensor(-0.4202), tensor(-0.59...",0.981166
2,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def create_dataset_one_type(code_type):\n i...,PYTHON,54,"[tensor(0.5141), tensor(-0.5052), tensor(-0.62...",0.971706
3,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def create_dataset():\n dfs = []\n for c...,PYTHON,109,"[tensor(0.5184), tensor(-0.5192), tensor(-0.64...",0.948649
4,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_class_body(lines):\n in_class = Fal...,PYTHON,169,"[tensor(0.5716), tensor(-0.5451), tensor(-0.66...",0.934011
5,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_all_data():\n files = get_files('.p...,PYTHON,197,"[tensor(0.5544), tensor(-0.5402), tensor(-0.69...",0.923922
6,c:\Users\79138\Mynka\no-hw\voice helper\src\ut...,def get_files(format):\n files = []\n fo...,PYTHON,208,"[tensor(0.5406), tensor(-0.5429), tensor(-0.66...",0.918905
7,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_methods(lines):\n functions = []\n ...,PYTHON,216,"[tensor(0.5321), tensor(-0.5386), tensor(-0.69...",0.915369
8,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_all_data():\n files = get_files('.c...,PYTHON,223,"[tensor(0.5559), tensor(-0.5357), tensor(-0.68...",0.914659
9,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_methods(lines):\n functions = []\n ...,PYTHON,315,"[tensor(0.5400), tensor(-0.5345), tensor(-0.70...",0.906168


In [29]:
# Print the source of the hit at position 5 of the ranked results.
sixth_hit = res.iloc[5]
print(sixth_hit['method'])

def get_all_data():
    files = get_files('.py')
    methods = []
    filenames = []
    for file in files:
        with file.open('r') as f:
            lines = f.read()
        lines = lines.split('\n')
        lines = [line for line in lines if len(line) > 0]
        plus = get_methods(lines)
        methods += plus
        filenames += [str(file)] * len(plus)
    return pd.DataFrame({'filename': filenames, 'method': methods})
