In [None]:
# default_exp repr.i

# Data Representation

> This module is dedicated to representing software artifacts as suitable abstract structures, such as vectors or graphs. Possible subfolders:
>
> - Vectorization: Word2vec, BERT
> - Graph

In [None]:
# export
# Imports
import numpy as np

from abc import ABC, abstractmethod

from pathlib import Path

from transformers import pipeline

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# export
class Vectorizor(ABC):
    """
        Abstract base class for objects that wrap a vectorizing backend.

        Concrete subclasses implement `vectorize`; the backend handed to the
        constructor is kept on the instance as `self.vectorizor`.
    """

    def __init__(self, vectorizor):
        """Store the backend object `vectorizor` on the instance."""
        self.vectorizor = vectorizor
        super().__init__()

    @abstractmethod
    def vectorize(self, inpt):
        """Turn `inpt` into its vector representation (subclasses must override)."""

In [None]:
# export
class BertVectorizor(Vectorizor):
    """
        Vectorization subclass that handles vectorizing using BERT.

        The wrapped `self.vectorizor` must be callable; this notebook passes a
        Hugging Face `feature-extraction` pipeline.
    """
    def vectorize(self, inpt):
        """
            Vectorize `inpt` with the wrapped backend.

            Parameters
            ----------
            inpt
                Input forwarded unchanged to `self.vectorizor`
                (a source-code string in this notebook).

            Returns
            -------
            numpy.ndarray
                `np.array` built from whatever the backend returns.
        """
        # Vectorize the caller-supplied input rather than a fixed snippet.
        return np.array(self.vectorizor(inpt))

In [None]:
# Directory of the JavaBert-v1 model; absolute path, so it only resolves in an
# environment where /tf/data/models is available.
path = Path("/tf/data/models/JavaBert-v1")

In [None]:
# The same directory is used for both the model weights and the tokenizer.
model_dir = str(path)
vectorizor = BertVectorizor(
    pipeline("feature-extraction", model=model_dir, tokenizer=model_dir)
)

Model name '/tf/data/models/JavaBert-v1' was not found in model name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, bert-base-japanese, bert-base-japanese-whole-word-masking, bert-base-japanese-char, bert-base-japanese-char-whole-word-masking, bert-base-finnish-cased-v1, bert-base-finnish-uncased-v1, bert-base-dutch-cased, openai-gpt, transfo-xl-wt103, gpt2, gpt2-medium, gpt2-large, gpt2-xl, distilgpt2, ctrl, xlnet-base-cased, xlnet-large-cased, xlm-mlm-en-2048, xlm-mlm-ende-1024, xlm-mlm-enfr-1024, xlm-mlm-enro-1024, xlm-mlm-tlm-xnli15-1024, xlm-mlm-xnli15-1024, xlm-clm-enfr-1

In [None]:
# Try the vectorizer on a short Java snippet; the cell output is the array it returns.
vectorizor.vectorize("public static void main")

array([[[-0.06092769,  0.06083986,  0.00415461, ..., -0.20505397,
         -0.07404258,  0.00412495],
        [-0.14041282, -0.30901086,  0.15790603, ..., -0.81615841,
          0.20173268, -0.22748585],
        [-0.14098363, -0.10925935,  0.05055226, ..., -1.0249418 ,
          0.1391564 , -0.17706262],
        [ 0.03331072, -0.69691283,  0.06129989, ..., -0.69145656,
          0.21840306,  0.34364673],
        [ 0.08206836, -0.20463242,  0.11808557, ..., -0.54706663,
          0.1055802 ,  0.20152366],
        [-0.06292978,  0.05663186, -0.01715738, ..., -0.24839528,
         -0.07256251, -0.03093658]]])

In [None]:
# Shell out to the nbdev CLI; presumably regenerates the project docs from the notebooks (confirm against the installed nbdev version).
! nbdev_build_docs