# Perform Doc2Vec

# Read data

This notebook trains a doc2vec model on the entire notebook dataset. 

# Import modules

In [1]:
import pandas as pd
import numpy as np
import os
import gensim
from gensim.models.doc2vec import TaggedDocument

# Read data

In [3]:
df = pd.read_json("../data/all-notebooks.json", orient="index")
df['filename'] = df['filename'].astype(str)

In [4]:
df.columns

Index(['cell_type', 'source', 'filename', 'competition'], dtype='object')

### Ignore markdown, only consider code for now

In [5]:
subdf = df
subdf = df[df.cell_type == "code"] 

### Calculate the line number for each notebook

In [18]:
subdf['cell_num']=subdf.groupby(['filename']).cumcount()+1
subdf['filename_with_cellnum'] = subdf['filename'] + "_" + subdf['cell_num'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf['cell_num']=subdf.groupby(['filename']).cumcount()+1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf['filename_with_cellnum'] = subdf['filename'] + "_" + subdf['cell_num'].astype(str)


# Preprocessing
## Generate input data from raw source

In [19]:
from gensim.utils import simple_preprocess

class MyDataframeCorpus(object):
    def __init__(self, source_df, text_col, tag_col):
        self.source_df = source_df
        self.text_col = text_col
        self.tag_col = tag_col

    def __iter__(self):
        for i, row in self.source_df.iterrows():
            yield TaggedDocument(words=simple_preprocess(row[self.text_col]), 
                                 tags=[row[self.tag_col]])


## Train the model

In [20]:
train_corpus = MyDataframeCorpus(subdf, 'source', 'filename_with_cellnum')

In [21]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=2, epochs=40)

In [22]:
model.build_vocab(train_corpus)

In [23]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

## Test inference

In [148]:
testinput = "import numpy as np"
vector = model.infer_vector(testinput.split(" "))
# print(vector)

In [149]:
sims = model.docvecs.most_similar([vector])
print(sims)

for result in sims[0:3]:
    display(df[df.filename == result[0]]['source'])

[('1789576', 0.8671780824661255), ('90182', 0.8659797310829163), ('111694', 0.8597860336303711), ('646182', 0.8538728952407837), ('1222899', 0.8537805080413818), ('240013', 0.8515975475311279), ('8573758', 0.8492058515548706), ('111718', 0.8480663299560547), ('728277', 0.8475252389907837), ('1035483', 0.8474348783493042)]


387339    # cdiscount-image-classification-challenge\n \...
387340                                   import numpy as np
Name: source, dtype: object

947002    import numpy as np
947003                      
Name: source, dtype: object

1144075    print("hello world")
Name: source, dtype: object

# Save the model

In [24]:
model.save("../model/notebook-doc2vec-model-mar24.model")