# Convert the Dataset into Vectors

The goal of this notebook is to generate a vector for each cell for all notebooks in the sliced-notebooks dataset.

Dimensions of vector array: n * sequence count * 300

# Import modules

In [1]:
import pandas as pd
import numpy as np
import os
import gensim
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec
import torch

# Import dataset and doc2vec model

In [2]:
df = pd.read_csv("../data/smalldf-1000notebooks.csv")
model = Doc2Vec.load("../model/notebook-doc2vec-model-apr7-1000notebooks.model")

In [3]:
df = df[df.cell_type == "code"]
df['cell_num'] = df.groupby(['competition','filename']).cumcount()+1

In [4]:
len(df)

22303

# Group the dataset by notebook and generate doc2vec vectors

In [5]:
df_test = df
df.columns

Index(['Unnamed: 0', 'cell_type', 'source', 'filename', 'competition',
       'cell_num', 'filename_with_cellnum'],
      dtype='object')

In [6]:
allVectors = []
allVectorsFilenames = []
for i, notebook in df_test.groupby("filename"):
    
    vectorSeq = []
    vectorNameSeq = []
    # vectorSeq is a list of doc2vec vectors corresponding to [Cell0, Cell1, .... Celln]
    # each vectorSeq list corresponds to a single notebook
    for j, row in notebook.iterrows():
        
        competition = row[3]
        cell_num = row[4]
        kernel_id = row[2]
        
        source = row[1]
        vector = model.infer_vector(source.split(" "))
        vectorSeq.append(vector)
        vectorNameSeq.append(notebook.iloc[0]['competition'] + "/" + notebook.iloc[0]['filename'].astype(str) + "_" + str(cell_num))
    allVectors.append(vectorSeq)
    allVectorsFilenames.append(vectorNameSeq)
        

## Convert from lists of arrays to array of arrays

In [10]:
for i in range(0,A.shape[0]):
    A[i] = np.asarray(A[i])

NameError: name 'A' is not defined

In [11]:
print(len(allVectors))

38086


# Save the arrays

In [7]:
arr = np.array(allVectors,dtype=object)
arrNames = np.array(allVectorsFilenames, dtype=object)

In [8]:
arrNames[8]

['homesite-quote-conversion/153081_homesite-quote-conversion',
 'homesite-quote-conversion/153081_homesite-quote-conversion',
 'homesite-quote-conversion/153081_homesite-quote-conversion',
 'homesite-quote-conversion/153081_homesite-quote-conversion',
 'homesite-quote-conversion/153081_homesite-quote-conversion',
 'homesite-quote-conversion/153081_homesite-quote-conversion',
 'homesite-quote-conversion/153081_homesite-quote-conversion',
 'homesite-quote-conversion/153081_homesite-quote-conversion',
 'homesite-quote-conversion/153081_homesite-quote-conversion',
 'homesite-quote-conversion/153081_homesite-quote-conversion',
 'homesite-quote-conversion/153081_homesite-quote-conversion',
 'homesite-quote-conversion/153081_homesite-quote-conversion',
 'homesite-quote-conversion/153081_homesite-quote-conversion',
 'homesite-quote-conversion/153081_homesite-quote-conversion',
 'homesite-quote-conversion/153081_homesite-quote-conversion',
 'homesite-quote-conversion/153081_homesite-quote-conve

In [9]:
np.save("../data/notebooks-sliced-doc2vec-vectors-apr7-small.npy", arr)
np.save("../data/notebooks-sliced-doc2vec-vectors-filenames-apr7-small.npy", arrNames)