# AI translation validation for Ubuntu - Model tests
Romain Darous <br><br>

## Importing relevant modules

In [11]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import random

## Loading the multilingual sentence embedding model

In [12]:
# Seed every random number generator in play (Python, NumPy, PyTorch) so that
# repeated runs of the notebook start from the same state.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    # Also seed the CUDA generators on all visible devices.
    torch.cuda.manual_seed_all(seed)

# Load the fine-tuned multilingual sentence-embedding model on the chosen device.
model = SentenceTransformer(
    'RomainDarous/two_epochs_finetuned_additive_generalized_model',
    device=device,
)

# Put the model in evaluation mode; as the last expression of the cell this
# also displays the module layout.
model.eval()

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): MultiHeadGeneralizedPooling(
    (P): ModuleList(
      (0-7): 8 x Linear(in_features=768, out_features=96, bias=True)
    )
    (W1): ModuleList(
      (0-7): 8 x Linear(in_features=96, out_features=384, bias=True)
    )
    (W2): ModuleList(
      (0-7): 8 x Linear(in_features=384, out_features=96, bias=True)
    )
  )
  (2): Dense({'in_features': 768, 'out_features': 512, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)

## Testing model accuracy
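The simplest way to compare two sentence embeddings is cosine similarity, $\cos(u, v) = \frac{u \cdot v}{\lVert u \rVert \, \lVert v \rVert}$. It ranges from $-1$ to $1$; the higher the value, the more similar the directions of the two vectors.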

In [13]:
import torch
def cosim(vec1, vec2) -> float:
    """Return the cosine similarity between two 1-D vectors.

    Args:
        vec1: First vector. Anything ``torch.as_tensor`` accepts (list,
            NumPy array, tensor); must be one-dimensional.
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)`` as a Python float.
        If the product of the norms is zero (e.g. an all-zero or empty
        vector) the similarity is undefined and ``0.0`` is returned
        instead of NaN.

    Raises:
        RuntimeError: Propagated from ``torch.dot`` when the inputs are not
            1-D or do not have the same number of elements.
    """
    # as_tensor reuses existing tensors / NumPy buffers instead of copying
    # them, and avoids the copy-construct warning torch.tensor() emits when
    # it is handed a tensor.
    a = torch.as_tensor(vec1)
    b = torch.as_tensor(vec2)

    # torch.dot requires both operands to share a dtype, and
    # torch.linalg.norm rejects integer tensors: promote to a common
    # floating-point (or complex) dtype first.
    dtype = torch.promote_types(a.dtype, b.dtype)
    if not (dtype.is_floating_point or dtype.is_complex):
        dtype = torch.get_default_dtype()
    a = a.to(dtype)
    b = b.to(dtype)

    dot_product = torch.dot(a, b)
    norm_product = torch.linalg.norm(a) * torch.linalg.norm(b)

    # Guard the 0/0 case so a degenerate vector does not produce NaN.
    if norm_product.item() == 0:
        return 0.0

    return (dot_product / norm_product).item()


In [14]:
# Hand-written sentence pairs used to probe the model. 'sentence1' is the
# English source text and 'sentence2' the candidate counterpart.
# NOTE(review): the pairs appear to mix faithful translations, unrelated
# sentences and deliberately inappropriate "translations" that the model is
# expected to score low - confirm the intent with the author.
# The strings are test data and must be kept verbatim.
df = pd.DataFrame({
    'sentence1': [
        'Hello',
        'This is a test',
        "I'm running the model",
        "Let us see if it works or not",
        "empty cannot be combined with an image name",
        "expanded cannot be used with a server",
        "instance only can't be passed when the source is a snapshot",
        "Preview is out of date",
        "Live View Image Zoom Ratio",
        "Unknown system error. Success.",
        "Success.",
        "Get values for device configuration keys",
        "Documentation for the GNU Scientific Library",
        "Advanced Gtk+ Sequencer library development files.",
        "Provides translations for the grisbi package",
        "The package contains the Python extension for the graphviz tools.",
        "The package contains the Python extension for the graphviz tools.",
        "The package contains the Python extension for the graphviz tools.",
        "Add to btrfs on.",
    ],
    'sentence2': [
        "Bonjour",
        "Es ist ein Test",
        "Welche Farbe hat mein Hemd ?",
        "Es wird nie funktionnieren",
        "empty ne peut être combiné avec le nom d'une image",
        "expanded ne peut être utilisé avec un serveur",
        "instance only ne peut être utilisée quand la valeur source est une pute",
        "Khung xem trước quá thời",
        "Tỷ lệ phóng to ảnh xem trực tiếp",
        "Lệnh để hiển thị trợ giúp chi tiết cho",
        "Succès.",
        "Profil",
        "HTML Dokumentation zur GNU C Bibliothek",
        "GNU Scientific Library",
        "soziale Ungleichheiten oft verdient sind",
        "The package contains the Python extension for the asshole of you.",
        "The package contains the Python extension for the puta madre.",
        "The best political ideology contains racist laws for the gaphviz tools",
        "еврей ты, герой отечественной экономики",
    ],
})

# Hand the encoder plain Python lists rather than pandas Series, so it never
# depends on the frame's index labels matching positions.
s1_embeddings = model.encode(df['sentence1'].tolist())
s2_embeddings = model.encode(df['sentence2'].tolist())

# One cosine-similarity score per (sentence1, sentence2) pair, in row order.
score = [
    cosim(s1_embedding, s2_embedding)
    for s1_embedding, s2_embedding in zip(s1_embeddings, s2_embeddings)
]

df['score'] = score

# Persist the scored pairs as a tab-separated file named 'model_test'.
df.to_csv('model_test', sep='\t')


In [15]:
# Sanity check: embed a single sentence and display the raw embedding vector
# returned by the model.
model.encode("Bonjour")

array([ 5.27609289e-02,  4.56643440e-02, -5.69895767e-02, -2.76274253e-02,
       -9.59386230e-02, -1.53194070e-01, -1.62561610e-02, -3.14026736e-02,
        3.28589790e-02,  3.48992124e-02,  6.07135370e-02,  3.82302664e-02,
        7.09592253e-02,  1.06886618e-01,  1.44894734e-01,  5.07209674e-02,
       -1.12308795e-03, -3.32430229e-02, -2.02447936e-01,  2.85894722e-02,
        2.47205794e-02,  9.79636982e-02,  8.45447090e-03,  2.28733979e-02,
       -8.46868195e-03,  9.07428749e-03,  1.45234158e-02,  8.80251303e-02,
       -8.60891417e-02,  1.39421523e-02,  1.55125037e-01, -1.57805474e-03,
       -1.89363927e-01, -4.67554992e-03,  8.87291133e-02,  8.47654119e-02,
        3.55066992e-02,  3.72422524e-02, -2.99963225e-02, -9.99274701e-02,
        2.45868955e-02, -6.71740919e-02,  5.34551814e-02, -7.07095787e-02,
        3.69791910e-02,  2.66699046e-02,  2.99047492e-02,  2.95992754e-02,
        6.35749847e-02, -7.04592541e-02,  7.69990459e-02,  7.20722741e-03,
        4.37408797e-02,  