# Import Modules

In [1]:
import pandas as pd
import numpy as np
import os
import gensim
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec

# Load the model and dataframe

In [3]:
model = Doc2Vec.load("notebook-doc2vec-model-apr5.model")
df = pd.read_json("all-notebooks.json", orient="index")
df['filename'] = df['filename'].astype(str)
df['cell_num']=df.groupby(['filename']).cumcount()+1
# Generate the proper naming formats
df['filename_with_cellnum'] = df['filename'] + "_" + df['cell_num'].astype(str)
df = df[df.cell_type == 'code'] # Filter only code cells

# Compare vector similarities

In [25]:
queryA = "import numpy"
queryB = "plt.show()"
queryC = "import pandas"
vectorA = model.infer_vector(queryA.split(" "))
vectorB = model.infer_vector(queryB.split(" "))
vectorC = model.infer_vector(queryC.split(" "))

In [26]:
euclidean_dist1 = sum(((vectorA - vectorB)**2))
euclidean_dist2 = sum(((vectorA - vectorC)**2))

print(euclidean_dist1)
print(euclidean_dist2)

0.9353717947218461
0.43430551836616527


In [28]:
import scipy

cosine_dist1 = scipy.spatial.distance.cosine(vectorA, vectorB)
cosine_dist2 = scipy.spatial.distance.cosine(vectorA, vectorC)
print(cosine_dist1)
print(cosine_dist2)

0.9941924959421158
0.26803433895111084


## Old code for getting similar cells

In [29]:
query = "import numpy as np"
vector = model.infer_vector(query.split(" "))


sims = model.docvecs.most_similar([vector])
# print(sims)
print(sims)
# Lookup top 5 similar 
# Lookup the notebook filename in the dataframe
for result in sims[0:10]:
#     print(result[0])
    print(df[df.filename_with_cellnum == result[0]]['source'])

[('35945043_21', 0.8064147233963013), ('30346108_1', 0.8016058802604675), ('17672315_1', 0.8004535436630249), ('1462393_62', 0.7989287376403809), ('31684603_28', 0.7951673269271851), ('4645182_1', 0.7950820922851562), ('11183439_2', 0.7949899435043335), ('31407581_1', 0.7945380210876465), ('6268183_7', 0.7942268252372742), ('38889698_1', 0.7937500476837158)]
64044    preds = model.predict(X_val)
Name: source, dtype: object
Series([], Name: source, dtype: object)
Series([], Name: source, dtype: object)
Series([], Name: source, dtype: object)
Name: source, dtype: object
Series([], Name: source, dtype: object)
193823    import numpy as np\n import seaborn as sns\n i...
Name: source, dtype: object
642115    import pandas as pd\n import matplotlib.pyplot...
Name: source, dtype: object
1084386    import numpy as np
Name: source, dtype: object
527799    !pip install -q efficientnet
Name: source, dtype: object


`sims` prints out a list of tuples of the most similar notebooks and is of the form `sims = [(filename1, similarity1), (filename2, similarity2), ...]`

We can use the filename to lookup back into the dataset and get the code. 