## **Installing modules and importing libraries**

In [None]:
!pip install -q transformers einops accelerate langchain bitsandbytes
!pip install -q huggingface_hub
!pip install -q sentencepiece


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.2/174.2 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import sentencepiece

## **Importing the English-Spanish dataset**

In [None]:


# Location of the English/Spanish sentence-pair CSV read by this notebook.
DATASET_PATH = '/content/english-spanish-dataset.csv'

# Load the raw sentence pairs into a DataFrame.
df = pd.read_csv(DATASET_PATH)


In [None]:
# Preview the loaded frame; head(100) requests its first 100 rows.
df.head(100)

Unnamed: 0.1,Unnamed: 0,english,spanish
0,0,go,ve
1,1,go,vete
2,2,go,vaya
3,3,go,vayase
4,4,hi,hola
...,...,...,...
95,95,get out,salte
96,96,get out,sal
97,97,get out,sali
98,98,get out,salid


**Randomizing and selecting dataset rows**

In [None]:
# Fixed seed so the shuffle (and therefore the selected subset and every
# downstream translation/embedding) is identical on each Restart & Run All.
SEED = 42

# Shuffle the entire dataset and renumber the rows
randomized_data = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Drop the leftover 'Unnamed: 0' index column and keep positions 1000..1200
# (inclusive) of the shuffled data, i.e. 201 rows
selected_randomized_data = randomized_data.drop(columns=['Unnamed: 0']).iloc[1000:1201]

# Renumber the selected rows from 0 so later cells can rely on a clean RangeIndex
df_new = selected_randomized_data.reset_index(drop=True)


In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
df_new.info()

# Show the first rows of the selected subset
df_new.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   english  201 non-null    object
 1   spanish  201 non-null    object
dtypes: object(2)
memory usage: 3.3+ KB
None


Unnamed: 0,english,spanish
0,shes an alcoholic,ella es una alcoholica
1,why did you invite tom to dinner,por que invitaron a tom a la cena
2,consider yourselves lucky,considerense afortunados
3,could i borrow a cup of sugar,me podrias prestar una taza de azucar
4,tom put down his pen,tom bajo su pluma


### **Sentence translation using MarianMT model**

In [None]:
from transformers import MarianMTModel, MarianTokenizer
import torch

# Pretrained MarianMT checkpoint used for English -> Spanish translation
model_name = 'Helsinki-NLP/opus-mt-en-es'

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the tokenizer and the model weights, then move the model onto the chosen device
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
model = model.to(device)



In [None]:
def translate_and_get_embeddings(sentence):
    """Translate an English sentence and return decoder embeddings for the translation.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        sentence: English text to translate. Only the first item of the
            tokenized batch is decoded and returned.

    Returns:
        A ``(translated, embeddings)`` tuple where ``translated`` is the decoded
        translation string and ``embeddings`` is a tensor of shape
        ``[num_decoder_tokens, hidden_size]`` holding the last decoder layer's
        hidden state at each position of the generated sequence.
    """
    # Tokenize the English sentence
    model_inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)

    with torch.no_grad():
        # Generate the translation; only the token ids are needed from this call
        generated = model.generate(**model_inputs, return_dict_in_generate=True)
        sequences = generated.sequences

        # The hidden states reported by generate() are organised per generation
        # step, then per layer, and are batched over beams, so indexing them with
        # [-1][0] does not yield "last layer, first sentence". Instead, run one
        # teacher-forced forward pass over the finished sequence: feeding every
        # generated token except the last one as decoder input gives one hidden
        # state per predicted token.
        outputs = model(
            **model_inputs,
            decoder_input_ids=sequences[:, :-1],
            output_hidden_states=True,
        )

    # Decode the translation of the first item in the batch
    translated = tokenizer.batch_decode(sequences, skip_special_tokens=True)[0]

    # decoder_hidden_states is a tuple over layers of (batch, tokens, hidden)
    # tensors: [-1] selects the last layer, [0] the first item in the batch.
    embeddings = outputs.decoder_hidden_states[-1][0]

    return translated, embeddings

# # Example usage
# sample_sentence = "get out of my house. You are a criminal"
# translation, embeddings = translate_and_get_embeddings(sample_sentence)
# print("Translation:", translation)
# print("Embeddings shape:", embeddings.shape)


In [None]:

# Translate every English sentence in 'df_new' with MarianMT.
# translate_and_get_embeddings() returns (translation, embeddings); only the
# translated text is kept here.
new_translations = [
    translate_and_get_embeddings(english_sentence)[0]
    for english_sentence in df_new['english']
]

# Create a new DataFrame with the English, original Spanish, and new Spanish columns
new_data = pd.DataFrame({
    'English': df_new['english'],
    'Original Spanish': df_new['spanish'],
    'Translated Spanish': new_translations
})

# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print() (which would add a stray "None" line to the output).
new_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   English             201 non-null    object
 1   Original Spanish    201 non-null    object
 2   Translated Spanish  201 non-null    object
dtypes: object(3)
memory usage: 4.8+ KB
None


 **Translated sentence dataframe**

In [None]:
# Preview the English / original Spanish / MarianMT translation table.
new_data.head()

Unnamed: 0,English,Original Spanish,Translated Spanish
0,youll be in charge of the women working in thi...,usted estara al cargo de la mujer que trabaja ...,Estarás a cargo de las mujeres que trabajan en...
1,now try to sleep,ahora intenta dormir,Ahora trata de dormir.
2,i didnt know you had children,no sabia que tenias ninos,No sabía que tenías hijos.
3,make it happen,haz que suceda,Hacer que suceda.
4,i want to go to the movies today,quiero ir al cine hoy,Quiero ir al cine hoy.


### **Getting Spanish embeddings for the sentence translation using MarianMT**

In [None]:
def get_spanish_embeddings_diff(sentence):
    """Return per-token encoder embeddings for a (Spanish) sentence.

    The sentence is tokenized with the module-level ``tokenizer`` and passed
    through the encoder of the module-level ``model`` only; no translation is
    generated.

    NOTE(review): the encoder belongs to the same model used for translation,
    so it is presumably the source-language (English) side being applied to
    Spanish text here; confirm this is the intended embedding space.

    Args:
        sentence: Text to embed. Only the first item of the tokenized batch is
            returned.

    Returns:
        Tensor of shape ``[num_tokens, embedding_size]`` taken from the encoder's
        last hidden state. It is computed without gradient tracking.
    """
    # Tokenize the sentence with the same tokenizer used for translation
    model_inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)

    # Inference only: without no_grad() every returned tensor would keep its
    # autograd graph alive for as long as callers store it in a list.
    with torch.no_grad():
        # Forward pass through the encoder only
        encoder_outputs = model.get_encoder()(
            input_ids=model_inputs['input_ids'],
            attention_mask=model_inputs['attention_mask'],
        )

    # Last encoder layer, first item in the batch
    embeddings_diff = encoder_outputs.last_hidden_state[0]

    return embeddings_diff

# # Example usage
# spanish_sentence = "Vete de mi casa, eres un criminal"
# embeddings_diff = get_spanish_embeddings_diff(spanish_sentence)
# print("Embeddings_diff shape:", embeddings_diff.shape)


In [None]:
# Restrict the comparison to the first 20 rows of 'new_data'
data = new_data.head(20)

# One encoder-embedding tensor per MarianMT translation, in row order.
# Only the 'Translated Spanish' column is needed, so iterate over it directly.
embeddings_list = [
    get_spanish_embeddings_diff(translated_spanish_sentence)
    for translated_spanish_sentence in data['Translated Spanish']
]

# Sanity check: shape of the embeddings for the first sentence
print("Embeddings_diff shape for the first sentence:", embeddings_list[0].shape)


Embeddings_diff shape for the first sentence: torch.Size([9, 512])


### **Sentence translation using GPT-3**

In [None]:
!pip install openai==0.28
import openai

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires tiktoken, which is not installed.[0m[31m
[0mSuccessfully installed openai-0.28.0


In [None]:
def translate_with_gpt3(text, source_language="English", target_language="Spanish"):
    """Translate ``text`` with the OpenAI completion API.

    Args:
        text: Sentence to translate.
        source_language: Name of the language ``text`` is written in.
        target_language: Name of the language to translate into.

    Returns:
        The first completion returned by the API, stripped of surrounding
        whitespace.

    Raises:
        RuntimeError: If no API key is available from the ``OPENAI_API_KEY``
            environment variable or from ``openai.api_key``.
    """
    import os  # local import so this cell does not depend on an earlier import cell

    # SECURITY: the API key must never be written into the notebook. It is read
    # from the environment instead; any key that was previously committed here
    # should be treated as leaked and revoked.
    api_key = os.environ.get("OPENAI_API_KEY") or openai.api_key
    if not api_key:
        raise RuntimeError(
            "No OpenAI API key found: set the OPENAI_API_KEY environment variable "
            "before calling translate_with_gpt3()."
        )
    openai.api_key = api_key

    response = openai.Completion.create(
        engine="text-davinci-002",  # or another GPT-3 model
        prompt=f"Translate this sentence from {source_language} to {target_language}: {text}",
        max_tokens=60
    )

    return response.choices[0].text.strip()

# Translate every English sentence with GPT-3 (one API call per row) and
# store the results next to the existing translations.
gpt3_translations = [translate_with_gpt3(english_sentence) for english_sentence in new_data['English']]
new_data['translated gpt-3'] = gpt3_translations

In [None]:
# Preview the table again, now including the 'translated gpt-3' column.
new_data.head()

Unnamed: 0,English,Original Spanish,Translated Spanish,translated gpt-3
0,shes an alcoholic,ella es una alcoholica,Es una alcohólica.,Ella es una alcohólica.
1,why did you invite tom to dinner,por que invitaron a tom a la cena,¿Por qué invitaste a Tom a cenar?,¿Por qué invitaste a Tom a cenar?
2,consider yourselves lucky,considerense afortunados,Considérense afortunados.,Considérase afortunados
3,could i borrow a cup of sugar,me podrias prestar una taza de azucar,¿Me prestas una taza de azúcar?,¿Podría tomar prestada una taza de azúcar?
4,tom put down his pen,tom bajo su pluma,Tom dejó su pluma.,Tom puso su pluma.


### **Spanish embeddings for the sentence translation (from OpenAI) using MarianMT**

In [None]:
# Restrict the comparison to the first 20 rows of 'new_data'
data = new_data.head(20)

# One encoder-embedding tensor per GPT-3 translation, in row order.
# Only the 'translated gpt-3' column is needed, so iterate over it directly.
embeddings_list_gpt = [
    get_spanish_embeddings_diff(translated_spanish_sentence_gpt)
    for translated_spanish_sentence_gpt in data['translated gpt-3']
]

# Sanity check: shape of the embeddings for the first sentence
print("Embeddings_diff_gpt shape for the first sentence:", embeddings_list_gpt[0].shape)



Embeddings_diff_gpt shape for the first sentence: torch.Size([11, 512])


**Cosine similarity score computation**

In [None]:
import pandas as pd
import torch
from torch.nn.functional import cosine_similarity

# Compare the two embedding lists pair by pair. Within a pair, token position i
# of one tensor is compared with token position i of the other.
data_for_dataframe = []

for idx, (emb1, emb2) in enumerate(zip(embeddings_list, embeddings_list_gpt)):
    # zip() over the two tensors walks their rows in step and stops at the
    # shorter one, so only the positions present in both are scored.
    cosine_similarities = tuple(
        cosine_similarity(row1.unsqueeze(0), row2.unsqueeze(0)).item()
        for row1, row2 in zip(emb1, emb2)
    )

    data_for_dataframe.append({
        'Pair_Index': idx,
        'Embedding_Indices': tuple(range(len(cosine_similarities))),
        'Cosine_Similarities': cosine_similarities,
    })

# One row per sentence pair
similarity_df = pd.DataFrame(data_for_dataframe)

# Show the table
similarity_df.head(30)


Unnamed: 0,Pair_Index,Embedding_Indices,Cosine_Similarities
0,0,"(0, 1, 2, 3, 4, 5, 6, 7, 8)","(0.3511978089809418, 0.47589144110679626, 0.22..."
1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","(0.9999999403953552, 0.9999999403953552, 1.000..."
2,2,"(0, 1, 2, 3, 4, 5, 6, 7, 8)","(0.9642438292503357, 0.9800065755844116, 0.909..."
3,3,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","(0.9784915447235107, 0.9784930348396301, 0.277..."
4,4,"(0, 1, 2, 3, 4, 5, 6, 7, 8)","(0.9639159440994263, 0.4244888424873352, 0.386..."
5,5,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","(0.4571623206138611, 0.42301177978515625, 0.24..."
6,6,"(0, 1, 2, 3, 4, 5, 6, 7, 8)","(1.0000001192092896, 1.0, 0.9999998807907104, ..."
7,7,"(0, 1, 2, 3, 4)","(0.5638176202774048, 0.42302021384239197, 0.41..."
8,8,"(0, 1, 2, 3, 4, 5, 6, 7, 8)","(0.2156154364347458, 0.24035999178886414, 0.26..."
9,9,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","(0.975132405757904, 0.9914792776107788, 0.9898..."


In [None]:
import pandas as pd
import torch
from torch.nn.functional import cosine_similarity

# Same position-by-position comparison as before, with the mean similarity of
# each sentence pair added as an 'Average' column.
data_for_dataframe = []

for idx, (emb1, emb2) in enumerate(zip(embeddings_list, embeddings_list_gpt)):
    # Only token positions that exist in both tensors can be compared
    min_size = min(emb1.size(0), emb2.size(0))

    cosine_similarities = [
        cosine_similarity(emb1[i].unsqueeze(0), emb2[i].unsqueeze(0)).item()
        for i in range(min_size)
    ]

    # Mean of the per-position similarities for this pair
    average_similarity = sum(cosine_similarities) / len(cosine_similarities)

    data_for_dataframe.append(dict(
        Pair_Index=idx,
        Cosine_Similarities=tuple(cosine_similarities),
        Average=average_similarity,
    ))

# One row per sentence pair
similarity_df = pd.DataFrame(data_for_dataframe)

# Show the table
similarity_df.head(20)


Unnamed: 0,Pair_Index,Cosine_Similarities,Average
0,0,"(0.3511978089809418, 0.47589144110679626, 0.22...",0.303432
1,1,"(0.9999999403953552, 0.9999999403953552, 1.000...",1.0
2,2,"(0.9642438292503357, 0.9800065755844116, 0.909...",0.820022
3,3,"(0.9784915447235107, 0.9784930348396301, 0.277...",0.364687
4,4,"(0.9639159440994263, 0.4244888424873352, 0.386...",0.790216
5,5,"(0.4571623206138611, 0.42301177978515625, 0.24...",0.382234
6,6,"(1.0000001192092896, 1.0, 0.9999998807907104, ...",1.0
7,7,"(0.5638176202774048, 0.42302021384239197, 0.41...",0.389018
8,8,"(0.2156154364347458, 0.24035999178886414, 0.26...",0.206104
9,9,"(0.975132405757904, 0.9914792776107788, 0.9898...",0.516757


In [None]:
# 'similarities' is not defined by any cell in this notebook, so this cell
# failed with NameError on a fresh kernel. It is rebuilt here from similarity_df.
# NOTE(review): presumably it was the flat list of every per-token cosine
# similarity across all sentence pairs; confirm against the original intent.
similarities = [
    score
    for pair_scores in similarity_df['Cosine_Similarities']
    for score in pair_scores
]

# Total number of token-level similarity scores
len(similarities)

223