# AI translation validation for Ubuntu - Model tests
Romain Darous <br><br>

## Importing relevant modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm





## Loading the multilingual sentence embedding model

In [2]:
# Identifier of the sentence-embedding checkpoint used by every cell below.
MODEL_NAME = 'RomainDarous/fine_tuned_generalized'
model = SentenceTransformer(MODEL_NAME)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


## Testing model accuracy
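The easiest way to compare two embedding vectors is cosine similarity, $\cos(u, v) = \frac{u \cdot v}{\lVert u \rVert \, \lVert v \rVert}$: values close to 1 mean the two embeddings point in nearly the same direction.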

In [12]:
import torch
def cosim(vec1, vec2) -> float:
    """Return the cosine similarity between two 1-D vectors.

    Args:
        vec1: First vector; anything ``torch.as_tensor`` accepts (list,
            NumPy array, tensor). Must be 1-D because ``torch.dot`` is used.
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)`` as a Python scalar.
        The result is NaN when either vector has zero norm (0 / 0).
    """
    # as_tensor avoids the copy (and the UserWarning) that torch.tensor()
    # produces when the input is already a tensor.
    first = torch.as_tensor(vec1)
    second = torch.as_tensor(vec2)

    # torch.dot needs matching dtypes and torch.linalg.norm rejects integer
    # tensors, so bring both operands to one common floating (or complex) dtype.
    common_dtype = torch.promote_types(first.dtype, second.dtype)
    if not (common_dtype.is_floating_point or common_dtype.is_complex):
        common_dtype = torch.get_default_dtype()
    first = first.to(common_dtype)
    second = second.to(common_dtype)

    norm_product = torch.linalg.norm(first) * torch.linalg.norm(second)
    similarity = torch.dot(first, second) / norm_product
    return similarity.item()


In [63]:
# Hand-written evaluation pairs: the string at position i of `source_sentences`
# is compared with the candidate at position i of `candidate_sentences`.
# NOTE(review): the mismatched / offensive candidates look like deliberate
# negative examples for the validator, so every string is kept verbatim —
# confirm with the author before editing any of them.
source_sentences = [
    'This is a test',
    "I'm running the model",
    "Let us see if it works or not",
    "empty cannot be combined with an image name",
    "expanded cannot be used with a server",
    "instance only can't be passed when the source is a snapshot",
    "Preview is out of date",
    "Live View Image Zoom Ratio",
    "Unknown system error. Success.",
    "Success.",
    "Get values for device configuration keys",
    "Documentation for the GNU Scientific Library",
    "Advanced Gtk+ Sequencer library development files.",
    "Provides translations for the grisbi package",
    "The package contains the Python extension for the graphviz tools.",
    "The package contains the Python extension for the graphviz tools.",
    "The package contains the Python extension for the graphviz tools.",
    "Add to btrfs on.",
]
candidate_sentences = [
    "Es ist ein Test",
    "Welche Farbe hat mein Hemd ?",
    "Es wird nie funktionnieren",
    "empty ne peut être combiné avec le nom d'une image",
    "expanded ne peut être utilisé avec un serveur",
    "instance only ne peut être utilisée quand la valeur source est une pute",
    "Khung xem trước quá thời",
    "Tỷ lệ phóng to ảnh xem trực tiếp",
    "Lệnh để hiển thị trợ giúp chi tiết cho",
    "Succès.",
    "Profil",
    "HTML Dokumentation zur GNU C Bibliothek",
    "GNU Scientific Library",
    "soziale Ungleichheiten oft verdient sind",
    "The package contains the Python extension for the asshole of you.",
    "The package contains the Python extension for the puta madre.",
    "The best political ideology contains racist laws for the gaphviz tools",
    "еврей ты, герой отечественной экономики",
]

# Build the frame in one step; the constructor raises if the two lists ever
# drift out of sync in length.
df = pd.DataFrame({'sentence1': source_sentences, 'sentence2': candidate_sentences})

# Pass plain lists to encode(): handing it a pandas Series only works while
# the Series keeps its default 0..n-1 index.
s1_embeddings = model.encode(source_sentences)
s2_embeddings = model.encode(candidate_sentences)

# One cosine-similarity score per (source, candidate) pair, in row order.
score = [
    cosim(s1_embedding, s2_embedding)
    for s1_embedding, s2_embedding in zip(s1_embeddings, s2_embeddings)
]

df['score'] = score
# Tab-separated dump of the pairs and their scores (the path has no extension).
df.to_csv('model_test', sep='\t')
