In [1]:
import pandas as pd
from tqdm.notebook import tqdm, trange
import numpy as np

In [2]:
import torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")
model.eval().to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [3]:
data = pd.read_csv('result.csv', index_col=0)
data

Unnamed: 0,filename,method,code_type
0,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def create_dataset_one_type(code_type):\n i...,PYTHON
1,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def create_dataset():\n dfs = []\n for c...,PYTHON
2,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_class_body(lines):\n in_class = Fal...,PYTHON
3,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_methods(lines):\n functions = []\n ...,PYTHON
4,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_all_data():\n files = get_files('.c...,PYTHON
5,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_methods(lines):\n functions = []\n ...,PYTHON
6,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_all_data():\n files = get_files('.p...,PYTHON
7,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,"def print_hello_world():\n print(""Hello wor...",PYTHON
8,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def print_hello_world_many_times(n):\n for ...,PYTHON
9,c:\Users\79138\Mynka\no-hw\voice helper\src\ut...,def get_files(format):\n files = []\n fo...,PYTHON


In [4]:
print(data.loc[7, 'method'])

def print_hello_world():
    print("Hello world!")


In [5]:
data['length'] = [len(tokenizer(elem)['input_ids']) for elem in tqdm(data['method'])]
data = data[(data.length > 3) & (data.length < 512)].reset_index(drop=True)
data

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,filename,method,code_type,length
0,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def create_dataset_one_type(code_type):\n i...,PYTHON,54
1,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def create_dataset():\n dfs = []\n for c...,PYTHON,109
2,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_class_body(lines):\n in_class = Fal...,PYTHON,169
3,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_methods(lines):\n functions = []\n ...,PYTHON,315
4,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_all_data():\n files = get_files('.c...,PYTHON,223
5,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_methods(lines):\n functions = []\n ...,PYTHON,216
6,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_all_data():\n files = get_files('.p...,PYTHON,197
7,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,"def print_hello_world():\n print(""Hello wor...",PYTHON,19
8,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def print_hello_world_many_times(n):\n for ...,PYTHON,39
9,c:\Users\79138\Mynka\no-hw\voice helper\src\ut...,def get_files(format):\n files = []\n fo...,PYTHON,208


In [6]:
with torch.no_grad():
    data['emb'] = [model(**tokenizer(elem, return_tensors='pt'))['pooler_output'].detach().cpu()[0] for elem in tqdm(data['method'])]

  0%|          | 0/10 [00:00<?, ?it/s]

In [7]:
from torch.nn import CosineSimilarity
cos = CosineSimilarity(dim=0)

In [25]:
query = "stop spending my time"
with torch.no_grad():
    query = model(**tokenizer(query, return_tensors='pt'))['pooler_output'].detach()[0]

In [26]:
sims = []
for i in tqdm(data.emb):
    sims.append(float(cos(query, i)))

  0%|          | 0/10 [00:00<?, ?it/s]

In [27]:
res = data.iloc[np.argsort(sims)[:-12:-1]].reset_index(drop=True)
res['sim'] = np.array(sims)[np.argsort(sims)[:-12:-1]]
res

Unnamed: 0,filename,method,code_type,length,emb,sim
0,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,"def print_hello_world():\n print(""Hello wor...",PYTHON,19,"[tensor(0.4202), tensor(-0.4082), tensor(-0.58...",0.990421
1,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def print_hello_world_many_times(n):\n for ...,PYTHON,39,"[tensor(0.5079), tensor(-0.4202), tensor(-0.59...",0.981166
2,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def create_dataset_one_type(code_type):\n i...,PYTHON,54,"[tensor(0.5141), tensor(-0.5052), tensor(-0.62...",0.971706
3,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def create_dataset():\n dfs = []\n for c...,PYTHON,109,"[tensor(0.5184), tensor(-0.5192), tensor(-0.64...",0.948649
4,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_class_body(lines):\n in_class = Fal...,PYTHON,169,"[tensor(0.5716), tensor(-0.5451), tensor(-0.66...",0.934011
5,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_all_data():\n files = get_files('.p...,PYTHON,197,"[tensor(0.5544), tensor(-0.5402), tensor(-0.69...",0.923922
6,c:\Users\79138\Mynka\no-hw\voice helper\src\ut...,def get_files(format):\n files = []\n fo...,PYTHON,208,"[tensor(0.5406), tensor(-0.5429), tensor(-0.66...",0.918905
7,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_methods(lines):\n functions = []\n ...,PYTHON,216,"[tensor(0.5321), tensor(-0.5386), tensor(-0.69...",0.915369
8,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_all_data():\n files = get_files('.c...,PYTHON,223,"[tensor(0.5559), tensor(-0.5357), tensor(-0.68...",0.914659
9,c:\Users\79138\Mynka\no-hw\voice helper\src\cr...,def get_methods(lines):\n functions = []\n ...,PYTHON,315,"[tensor(0.5400), tensor(-0.5345), tensor(-0.70...",0.906168


In [29]:
print(res.iloc[5]['method'])

def get_all_data():
    files = get_files('.py')
    methods = []
    filenames = []
    for file in files:
        with file.open('r') as f:
            lines = f.read()
        lines = lines.split('\n')
        lines = [line for line in lines if len(line) > 0]
        plus = get_methods(lines)
        methods += plus
        filenames += [str(file)] * len(plus)
    return pd.DataFrame({'filename': filenames, 'method': methods})


In [30]:
import torch
from transformers import CodeBERTForCodeGeneration, CodeBERTTokenizer

# Загрузка модели и токенайзера
model = CodeBERTForCodeGeneration.from_pretrained('huggingface/codebert-base-c')
tokenizer = CodeBERTTokenizer.from_pretrained('huggingface/codebert-base-c')

# Преобразование входных данных в тензоры
prompt = "print('Hello, world!')"
input_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)  # Добавление батч-размерности

# Генерация кода
output = model.generate(input_ids)
generated_code = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_code)

ImportError: cannot import name 'CodeBERTForCodeGeneration' from 'transformers' (c:\Users\79138\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\__init__.py)

In [31]:
import random

# Список слов, из которых будет формироваться стих
words = ['суши', 'роллы', 'нори', 'тунец', 'рис', 'вкусное', 'страсть', 'любовь', 'сытый', 'счастливый']

# Формируем стих, состоящий из трех строк
verse = []
for i in range(3):
    # Формируем строку, состоящую из 4 случайных слов
    line = []
    for j in range(4):
        line.append(random.choice(words))
    # Объединяем слова в строку с помощью пробела и добавляем знак переноса строки
    verse.append(' '.join(line) + '\n')

# Выводим стих на экран
print(''.join(verse))

любовь сытый страсть суши
суши любовь роллы тунец
страсть суши рис страсть



In [32]:
import random

# Список слов, из которых будет формироваться стих
words = ['суши', 'роллы', 'нори', 'тунец', 'рис', 'вкусное', 'страсть', 'любовь', 'сытый', 'счастливый']

# Формируем стих, состоящий из случайного количества строк
verse = []
num_lines = random.randint(3, 6)  # Количество строк будет случайным в диапазоне от 3 до 6
for i in range(num_lines):
    # Формируем строку, состоящую из 4 случайных слов
    line = []
    for j in range(4):
        line.append(random.choice(words))
    # Объединяем слова в строку с помощью пробела и добавляем знак переноса строки
    verse.append(' '.join(line) + '\n')

# Разделяем стих на строфы
num_stanzas = random.randint(2, 4)  # Количество строф будет случайным в диапазоне от 2 до 4
stanza_length = num_lines // num_

NameError: name 'num_' is not defined

In [39]:
import random

# English words
english_adjectives = ['delicious', 'tasty', 'juicy', 'flavorful', 'aromatic', 'savory']
english_nouns = ['sushi', 'roll', 'nigiri', 'maki', 'temaki', 'udon']
english_verbs = ['enjoy', 'devour', 'savor', 'relish', 'feast on', 'indulge in']
english_adverbs = ['slowly', 'thoughtfully', 'delightfully', 'tenderly', 'eagerly']
english_prepositions = ['with', 'on', 'in', 'under', 'beside']
english_endings = ['dine', 'fine', 'nine', 'shine', 'mine', 'vine']

# Russian words
russian_adjectives = ['вкусный', 'ароматный', 'сочный', 'пикантный', 'восхитительный']
russian_nouns = ['суши', 'ролл', 'нигири', 'маки', 'темаки', 'удон']
russian_verbs = ['наслаждаться', 'полностью разжириться', 'смаковать', 'получать удовольствие', 'поесть до отвала', 'разгуляться']
russian_adverbs = ['медленно', 'задумчиво', 'радостно', 'нежно', 'с жадностью']
russian_prepositions = ['с', 'на', 'в', 'под', 'рядом с']
russian_endings = ['покушать', 'отлично', 'девять', 'светить', 'мое', 'виноградное']

def get_language(language):
  if language == 'english':
    return (english_adjectives, english_nouns, english_verbs, english_adverbs, english_prepositions, english_endings)
  elif language == 'russian':
    return (russian_adjectives, russian_nouns, russian_verbs, russian_adverbs, russian_prepositions, russian_endings)
  else:
    raise ValueError("Invalid language specified")

def get_word_order(language):
  (adjectives, nouns, verbs, adverbs, prepositions, endings) = get_language(language)
  word_orders = [
    (random.choice(adjectives), random.choice(nouns), random.choice(verbs), random.choice(nouns)),
    (random.choice(nouns), random.choice(verbs), random.choice(adverbs), random.choice(prepositions), random.choice(nouns)),
    (random.choice(verbs), random.choice(prepositions), random.choice(adjectives), random.choice(nouns)),
  ]
  return random.choice(word_orders)

def generate_line(language, is_last_line):
  (adjectives, nouns, verbs, adverbs, prepositions, endings) = get_language(language)
  words = get_word_order(language)
  ending = random.choice(endings)

  if len(words) == 4:
    if is_last_line:
      line = f"{words[0].capitalize()} {words[1]}, {words[2]} the {words[3]} so {ending}"
    else:
      line = f"{words[0].capitalize()} {words[1]}, {words[2]} the {words[3]} so {ending},"
  else:
    if is_last_line:
      line = f"{words[0].capitalize()} {words[1]}, {words[2]} {words[3]} {words[4]} so {ending}"
    else:
      line = f"{words[0].capitalize()} {words[1]}, {words[2]} {words[3]} {words[4]} so {ending},"

  return line

def generate_poem(language, num_lines=4):
  poem = ""
  for i in range(num_lines):
    is_last_line = (i == num_lines - 1)
    poem += generate_line(language, is_last_line) + "\n"
  return poem

# Generate a poem in English
print(generate_poem('english'))

# Generate a poem in Russian
print(generate_poem('russian'))

Relish with, tasty the udon so vine,
Flavorful nigiri, devour the udon so vine,
Relish with, aromatic the maki so nine,
Udon enjoy, eagerly beside maki so fine

Смаковать с, пикантный the удон so девять,
Удон получать удовольствие, задумчиво под суши so покушать,
Суши смаковать, с жадностью на суши so покушать,
Маки наслаждаться, задумчиво под удон so отлично



In [40]:
import transformers
import torch
from scipy.spatial.distance import cosine

# Load the CodeBERT model
model = transformers.BertModel.from_pretrained("microsoft/codebert-base")

# Define the natural language query
query = "How do I add two numbers in Python?"

# Tokenize the query using the CodeBERT tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained("microsoft/codebert-base")
query_tokens = tokenizer.tokenize(query)

# Convert the query tokens to token IDs and create a PyTorch tensor
query_ids = tokenizer.convert_tokens_to_ids(query_tokens)
query_tensor = torch.tensor([query_ids])

# Define a list of code samples
code_samples = [
    "def add(x, y): return x + y",
    "x + y",
    "sum([x, y])",
    "x.add(y)",
    "x, y = y, x"
]

# Tokenize and pad the code samples using the CodeBERT tokenizer
encoded_samples = tokenizer.encode(code_samples, padding=True, return_tensors='pt')

# Extract the token IDs and attention mask from the encoded sequences
code_tensor = encoded_samples['input_ids']
attention_mask = encoded_samples['attention_mask']

# Use the model to get the embeddings for the query and code samples
query_embedding = model(query_tensor, attention_mask=torch.ones_like(query_tensor))[0][:, 0, :]
code_embeddings = model(code_tensor, attention_mask=attention_mask)[0][:, 0, :]

# Compare the query embedding to the code sample embeddings using cosine similarity
similarities = []
for code_embedding in code_embeddings:
    similarity = 1 - cosine(query_embedding[0], code_embedding)
    similarities.append(similarity)

# Sort the code samples by similarity and return the top N most similar code samples
N = 3
top_samples = [code_samples[i] for i in sorted(range(len(similarities)), key=lambda k: similarities[k], reverse=True)[:N]]
print(top_samples)

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'BertTokenizer'.


TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType

In [11]:
import numpy as np


A = np.array([
    [16,  3,  2, 13],
    [ 5, 10, 11,  8],
    [ 9,  6,  7, 12],
    [ 4, 15, 14,  1]
])


A_cubed = np.linalg.matrix_power(A, 5)
print(A_cubed.sum(axis=1))
print(A_cubed.sum(axis=0))
print(sum([A_cubed[i, i] for i in range(A_cubed.shape[0])]))
print(sum([A_cubed[i, A_cubed.shape[0] - 1 - i] for i in range(A_cubed.shape[0])]))

[45435424 45435424 45435424 45435424]
[45435424 45435424 45435424 45435424]
45435424
45435424
