In [20]:
from transformers import RobertaTokenizer, RobertaModel
import torch
import pandas as pd
import numpy

In [21]:
def get_embedding_for_a_code_snippet(code_snippet) -> numpy.array:
    tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
    model = RobertaModel.from_pretrained("microsoft/codebert-base")

    # Tokenizează codul Python
    inputs = tokenizer(code_snippet, return_tensors="pt")

    # Obține embedding-uri folosind modelul CodeBERT
    with torch.no_grad():
        outputs = model(**inputs)

    # Extrage embedding-urile
    embedding = outputs.last_hidden_state.mean(dim=1).numpy()
    return embedding

In [22]:
# Exemplu de cod Python
code_snippet = """
def add(a, b):
    return a + b
"""

print(get_embedding_for_a_code_snippet(code_snippet))

[[-4.21587586e-01  5.36854155e-02  9.84746441e-02 -8.52071494e-03
  -1.69713825e-01 -7.04039395e-01  5.58284633e-02  3.74027133e-01
   4.20846671e-01  3.17283541e-01 -1.05989158e-01  6.57337427e-01
  -3.31454843e-01 -1.56138495e-01  8.09300601e-01  5.24319224e-02
   2.31235430e-01  2.34686777e-01  1.75770223e-02  3.27220261e-01
  -9.47753787e-02 -5.38964868e-01  7.52315879e-01 -2.91860908e-01
   5.93987286e-01  2.03578666e-01  5.09097464e-02  8.36718380e-01
  -2.63034582e-01  8.14038754e-01 -1.94777250e-01 -3.90525609e-02
   1.49823809e+00 -1.65551469e-01  3.89765590e-01 -1.86520472e-01
  -2.26509944e-01  1.93701431e-01  9.76766124e-02 -3.78127933e-01
  -1.76276729e-01  3.44369084e-01 -9.64827478e-01 -2.09607840e-01
   4.96156842e-01  2.64273882e-01  3.65866721e-01  3.10826749e-02
   3.52302071e-04  4.47413594e-01  3.80040616e-01  2.87721336e-01
  -7.49138713e-01 -2.20499009e-01  1.51437625e-01  3.16667199e-01
  -1.11746275e+00 -5.35684407e-01 -2.04062134e-01 -3.93559694e-01
  -3.93453

In [23]:
def citire_code_snippet_din_fisier(file_path) -> str:
    with open(file_path, 'r') as file:
        code_snippet = file.read()
    return code_snippet

In [24]:
def citire_coduri() -> pd.DataFrame:
    df = pd.read_csv('scop.csv')
    return df

In [27]:
from numpy.random import choice
from math import sqrt

class MyKMeans():
    def __init__(self, nr_centroizi) -> None:
        self.nr_centroizi = nr_centroizi
        self.centroizi = []

    def initialize_centroizi(self, input):
        index = [i for i in range(input.shape[0])]
        indexes = choice(index, self.nr_centroizi)
        self.centroizi = [input[i] for i in indexes]

    def distance(self, v1, v2):
        values = [(v1[0,i] - v2[0,i])*(v1[0,i] - v2[0,i])for i in range(v1.shape[1])]
        return sqrt(sum(values))

    def minimum_centroid_index(self, x):
        indice = 0
        minim_distance = self.distance(x,self.centroizi[indice])
        for i in range(len(self.centroizi)):
            distance = self.distance(x,self.centroizi[i])
            if distance < minim_distance:
                minim_distance = distance
                indice = i
        
        return indice
    
    def _upper_sum(self, input, c, j):
        return sum([input[i] for i in range(input.shape[0]) if c[i] == j])
    
    def _lowwer_sum(self, c, j):
        return sum([1 for i in range(len(c)) if c[i] == j])

    def train(self, training_input):
        self.initialize_centroizi(training_input)

        convergent = False

        while not convergent:
            c = []
            for i in range(training_input.shape[0]):
                x = training_input[i]
                indice = self.minimum_centroid_index(x)
                c.append(indice)

            max_change = -1
            for j in range(0, self.nr_centroizi):
                new_centroid = self._upper_sum(training_input,c,j) / self._lowwer_sum(c,j)
                distance = self.distance(self.centroizi[j],new_centroid)
                if distance > max_change:
                    max_change = distance
                self.centroizi[j]=new_centroid

            if max_change < 0.04:
                convergent = True
        
    def predict(self, input):
        indexes = [self.minimum_centroid_index(x) for x in input]
        return indexes

In [31]:
def get_clasificator():
    dataframe = citire_coduri()
    set_scopuri = set([scop for scop in dataframe['scop']])
    scopuri = [scop for scop in set_scopuri]

    embeddinguri = [get_embedding_for_a_code_snippet(citire_code_snippet_din_fisier(code_snippet_path)) for code_snippet_path in dataframe["code_snippet"]]

    clasificator = MyKMeans(nr_centroizi=len(scopuri))
    clasificator.train(numpy.array(embeddinguri))
    return clasificator
    

In [55]:
code_snip = """
def f(a):
    p = 1
    for i in range(1,a):
        p = p * i    

    return p
"""

In [36]:
clasificator = get_clasificator()

In [57]:

dataframe = citire_coduri()
set_scopuri = set([scop for scop in dataframe['scop']])
scopuri = [scop for scop in set_scopuri]
r = clasificator.predict(numpy.array([get_embedding_for_a_code_snippet(code_snip)]))[0]

print(scopuri[r])

Algoritmi pe cifrele unui numar


# PB 2

In [19]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

def generate_comments_with_model(code):
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    inputs = tokenizer.encode("For this python code, write only specifications\n" + code, return_tensors="pt", max_length=512, truncation=True)
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    outputs = model.generate(inputs, max_length=150, num_return_sequences=1, temperature=0.8, do_sample=True, top_k=50)
    generated_comment = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_comment

python_code_without_comments = """
def calculate_square(x):
    return x ** 2
"""
generated_comment = generate_comments_with_model(python_code_without_comments)
print(generated_comment)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


For this python code, write only specifications

def calculate_square(x):
    return x ** 2

""" compute square from the data. If it is 1, it is a 1.

If it is not, write it as a string. If x < 0, it is 0.

"""

def sum(x):

""" sum with the sum as the result. If x < 0, it is 1.

"""

def sum_of(x, x_1, x_2, x_3, x_4, x_5 = 0):

""" sum together all the variables in x_1_values with the sum as the result


# Pb 3

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Daoguang/PyCodeGPT")
model = AutoModelForCausalLM.from_pretrained("Daoguang/PyCodeGPT")
# Define the problem description
problem_description = """# Write a Python function that adds the first 10 numbers
"""

# Encode the input with attention mask
inputs = tokenizer(problem_description, return_tensors="pt", padding=True)

# Generate code
outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=100, num_return_sequences=1)

# Decode the generated code
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_code)


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


# Write a Python function that adds the first 10 numbers
def add(n1, n2):
    return n1 + n2

# Write a Python function that adds the first 10 numbers
def add2(n1, n2):
    return n1 + n2 + n1

# Write a Python function that adds the first 10 numbers
def add3(n1, n2):
    return n1 + n2 + n1 + n2

