# Convert the Dataset into Vectors

The goal of this notebook is to generate a doc2vec vector for every code cell of every notebook in the sliced-notebooks dataset.

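Dimensions of the resulting vector array: n notebooks × (number of code cells in that notebook) × 300, i.e. one 300-dimensional vector per cell. The middle dimension can differ from notebook to notebook.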

# Import modules

In [7]:
import pandas as pd
import numpy as np
import os
import gensim
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec
import torch
from tokenize_code import tokenize_code

# Import dataset and doc2vec model

In [3]:
# Restore the trained doc2vec model, then read the tokenized notebook cells into a
# DataFrame (orient='index': each top-level JSON key becomes one row).
model = Doc2Vec.load("../model/notebook-doc2vec-model-apr24.model")
df = pd.read_json("../data/all-notebooks-tokenized.json", orient='index')

In [4]:
# Keep only the code cells. The explicit .copy() makes the result an independent frame,
# so adding a column below does not trigger pandas' SettingWithCopyWarning.
df = df.loc[df["cell_type"] == "code"].copy()
# Number the cells 1..k within each notebook, where a notebook is identified by
# (competition, filename); numbering follows the current row order.
df["cell_num"] = df.groupby(["competition", "filename"]).cumcount() + 1

In [8]:
# Quick look at the filtered frame. Only the final expression of a cell is echoed,
# so the row count on the first line is evaluated but not shown.
df.shape[0]
df.columns

Index(['cell_type', 'source', 'filename', 'competition', 'tokenized_source',
       'cell_num'],
      dtype='object')

# Group the dataset by notebook and generate doc2vec vectors

In [6]:
# df_test is a second name for the same DataFrame (no copy is made); it is the frame
# that the vectorization loop iterates over.
df_test = df
df_test.columns

Index(['cell_type', 'source', 'filename', 'competition', 'tokenized_source',
       'cell_num'],
      dtype='object')

In [13]:
# allVectors[k]          -> list of doc2vec vectors for the code cells of notebook k
# allVectorsFilenames[k] -> matching "<competition>/<filename>_<cell_num>" labels
allVectors = []
allVectorsFilenames = []

# Group on (filename, competition) so the grouping agrees with the keys that were used
# to number the cells (cell_num). Listing filename first keeps the groups in the same
# sorted order as grouping on filename alone whenever filenames are unique.
# NOTE(review): gensim documents infer_vector as a randomized procedure, so repeated runs
# may produce slightly different vectors -- confirm whether a fixed seed is required.
for (filename, competition), notebook in df_test.groupby(["filename", "competition"]):
    # Every cell of this notebook shares the same label prefix; str() works for both
    # numeric and string filenames.
    name_prefix = competition + "/" + str(filename) + "_"

    # vectorSeq is a list of doc2vec vectors corresponding to [Cell0, Cell1, .... Celln]
    # each vectorSeq list corresponds to a single notebook
    vectorSeq = []
    vectorNameSeq = []
    # Read the two needed columns by name instead of by position, so the loop does not
    # depend on the column order of the DataFrame.
    for tokenized_source, cell_num in zip(notebook["tokenized_source"], notebook["cell_num"]):
        vectorSeq.append(model.infer_vector(tokenized_source))
        vectorNameSeq.append(name_prefix + str(cell_num))

    allVectors.append(vectorSeq)
    allVectorsFilenames.append(vectorNameSeq)
        

## Convert from lists of arrays to array of arrays

In [None]:
# Turn each notebook's list of per-cell vectors into a single array. The previous version
# of this cell looped over an undefined name `A` and raised NameError on a fresh kernel.
# np.asarray returns an existing array unchanged, so re-running this cell is harmless.
# Assuming every cell vector is 1-D with the same length, each result is a 2-D array
# with one row per cell.
allVectors = [np.asarray(vectorSeq) for vectorSeq in allVectors]

In [11]:
# Number of per-notebook vector sequences collected in allVectors.
print(len(allVectors))

38086


# Save the arrays

In [14]:
# Pack the per-notebook sequences into 1-D object arrays (one element per notebook).
# Filling a pre-allocated object array keeps the result 1-D even if every notebook happens
# to contain the same number of cells; np.array(..., dtype=object) would build a
# multi-dimensional array in that case, so the saved shape would depend on the data.
arr = np.empty(len(allVectors), dtype=object)
for idx, vectorSeq in enumerate(allVectors):
    arr[idx] = vectorSeq

arrNames = np.empty(len(allVectorsFilenames), dtype=object)
for idx, vectorNameSeq in enumerate(allVectorsFilenames):
    arrNames[idx] = vectorNameSeq

In [None]:
# Spot-check: show the entry stored at index 8 of the name array.
arrNames[8]

In [None]:
# Persist both arrays as .npy files. They have dtype=object, so NumPy stores them via
# pickle; reading them back requires np.load(..., allow_pickle=True).
np.save(file="../data/notebooks-doc2vec-vectors-apr24.npy", arr=arr, allow_pickle=True)
np.save(file="../data/notebooks-doc2vec-vectors-filenames-apr24.npy", arr=arrNames, allow_pickle=True)