# Cross Model Embeddings Training Data Generation

A version of this file that integrates cloud storage + GPU processing is available at: https://colab.research.google.com/drive/16YUuIfv4eZ9W0fc5NxJEDgyiaHwakmMi?usp=sharing

### Load each model: *all-mpnet-base-v2* ("MPNET") and *paraphrase-MiniLM-L6-v2* ("SBERT")

In [89]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

sentences = ["This is an example sentence"]
mpnetModel = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
mpnetEmbeddings = mpnetModel.encode(sentences)
print('MPNET Shape: ',mpnetEmbeddings.shape)

MPNET Shape:  (1, 768)


In [90]:
sbertModel = SentenceTransformer('paraphrase-MiniLM-L6-v2')
sbertEmbeddings = sbertModel.encode(sentences)
print('SBERT Shape: ',sbertEmbeddings.shape)

SBERT Shape:  (1, 384)


### Original data containing large repository of strings and their ada02 embeddings

In [91]:
df = pd.read_csv('./data/oawikishort.csv')
df['content_vector'] = df['content_vector'].apply(lambda x: np.fromstring(x[1:-1], sep=','))
df.head()

Unnamed: 0,text,content_vector
0,Adobe Illustrator is a computer program for ma...,"[-0.014151964336633682, -0.026334520429372787,..."
1,Andouille is a type of pork sausage. It is spi...,"[0.0009071908425539732, 0.012005319818854332, ..."
2,"In mathematics, arithmetic is the basic study ...","[0.01396116428077221, 0.0296261515468359, 0.02..."
3,"Aquaculture is the farming of fish, shrimp, ab...","[0.029741521924734116, -0.0063290782272815704,..."
4,An Abrahamic Religion is a religion whose foll...,"[-0.008373422548174858, -0.0059760394506156445..."


### Re-Embedding each of the strings in the new models, load them into a dataframe of the same format as the original

In [92]:
mpnetDf = pd.DataFrame(columns=['text', 'content_vector'])
for item in df['text'][0:1000]:
    mpnetDf.loc[len(mpnetDf)]=[item, mpnetModel.encode(item)] 
mpnetDf.head()

Unnamed: 0,text,content_vector
0,Adobe Illustrator is a computer program for ma...,"[0.008696059, -0.06535278, -0.02939201, 0.0274..."
1,Andouille is a type of pork sausage. It is spi...,"[-0.0032255095, -0.0033803084, 0.031124098, -0..."
2,"In mathematics, arithmetic is the basic study ...","[-0.019254666, 0.038105287, -0.0052210256, -0...."
3,"Aquaculture is the farming of fish, shrimp, ab...","[-0.012030668, 0.029283559, -0.04656314, -0.03..."
4,An Abrahamic Religion is a religion whose foll...,"[-0.009044365, 0.0052855993, -0.0021596916, -0..."


In [93]:
sbertDf = pd.DataFrame(columns=['text', 'content_vector'])
for item in df['text'][0:1000]:
    sbertDf.loc[len(sbertDf)]=[item, sbertModel.encode(item)] 
sbertDf.head()

Unnamed: 0,text,content_vector
0,Adobe Illustrator is a computer program for ma...,"[-0.09872252, -0.098462135, -0.09605899, 0.108..."
1,Andouille is a type of pork sausage. It is spi...,"[0.082766406, 0.18600921, -0.15227738, -0.0496..."
2,"In mathematics, arithmetic is the basic study ...","[-0.13644145, 0.025827754, -0.17222969, -0.175..."
3,"Aquaculture is the farming of fish, shrimp, ab...","[0.12441488, -0.17825788, 0.03768404, -0.01909..."
4,An Abrahamic Religion is a religion whose foll...,"[0.19308558, 0.44453397, -0.197952, 0.13355273..."


In [94]:
print("Pre-normalization:")
print([sum(x**2) for x in sbertDf.head()['content_vector']]) 
print([sum(x**2) for x in mpnetDf.head()['content_vector']]) 

#Normalize the vectors. I think MPNET is automatically normalized, but just for good measure.
sbertDf['content_vector'] = sbertDf['content_vector'].apply(lambda x: x/np.linalg.norm(x))
mpnetDf['content_vector'] = mpnetDf['content_vector'].apply(lambda x: x/np.linalg.norm(x))

print("\nPost-normalization:")
print([sum(x**2) for x in sbertDf.head()['content_vector']]) 
print([sum(x**2) for x in mpnetDf.head()['content_vector']])

Pre-normalization:
[23.709817233295155, 20.227527645884663, 19.912462634983058, 22.73885046923791, 20.649800833485827]
[0.9999999850087993, 1.0000000372329778, 0.9999998285552794, 1.000000099209858, 0.9999997983312348]

Post-normalization:
[1.000000024277126, 1.0000000314560813, 0.9999999167779237, 1.0000000144134287, 1.000000001586429]
[0.9999999850087993, 1.0000000372329778, 1.0000000516989802, 1.000000099209858, 1.0000000308881525]


### Save each df to a csv that looks like the original

In [95]:
#save sbertDf and mpnetDf to csv
sbertDf.to_csv('./data/sbertDf.csv', index=False)
mpnetDf.to_csv('./data/mpnetDf.csv', index=False)

### Test Load data to ensure data validity

In [96]:
df = pd.read_csv('./data/mpnetDf.csv')
df['content_vector'] = df['content_vector'].apply(lambda x: np.fromstring(x[1:-1], sep=' '))
df.head()

Unnamed: 0,text,content_vector
0,Adobe Illustrator is a computer program for ma...,"[0.00869605877, -0.0653527826, -0.0293920096, ..."
1,Andouille is a type of pork sausage. It is spi...,"[-0.00322550954, -0.00338030839, 0.0311240982,..."
2,"In mathematics, arithmetic is the basic study ...","[-0.0192546677, 0.0381052904, -0.00522102602, ..."
3,"Aquaculture is the farming of fish, shrimp, ab...","[-0.0120306676, 0.0292835589, -0.046563141, -0..."
4,An Abrahamic Religion is a religion whose foll...,"[-0.00904436596, 0.00528559973, -0.00215969188..."
