# Perform Doc2Vec

# Read data

This notebook trains a doc2vec model on the entire notebook dataset. 

# Import modules

In [5]:
import pandas as pd
import numpy as np
import os
import gensim
from gensim.models.doc2vec import TaggedDocument

# Read data

In [6]:
df = pd.read_json("../data/all-notebooks-tokenized.json", orient="index")
df['filename'] = df['filename'].astype(str)

In [7]:
df.columns

Index(['cell_type', 'source', 'filename', 'competition', 'tokenized_source'], dtype='object')

In [16]:
df.head()

Unnamed: 0,cell_type,source,filename,competition,tokenized_source
0,code,from subprocess import check_output\n import n...,1806927,favorita-grocery-sales-forecasting,"[from, subprocess, import, check_output, [NEWL..."
1,code,holiday = pd.read_csv('../input/holidays_event...,1806927,favorita-grocery-sales-forecasting,"[holiday, =, pd, ., read_csv, (, '../input/hol..."
2,code,# converting date into datetime format\n holid...,1806927,favorita-grocery-sales-forecasting,"[# converting date into datetime format, [NEWL..."
3,code,# all bridge-type has puente (puente actually ...,1806927,favorita-grocery-sales-forecasting,[# all bridge-type has puente (puente actually...
4,code,"# for Regional and Local holidays, \n # name l...",1806927,favorita-grocery-sales-forecasting,"[# for Regional and Local holidays, , [NEWLINE..."


### Ignore markdown, only consider code for now

In [9]:
subdf = df
subdf = df[df.cell_type == "code"] 

### Calculate the line number for each notebook

In [18]:
subdf['cell_num']=subdf.groupby(['filename']).cumcount()+1
subdf['filename_with_cellnum'] = subdf['filename'] + "_" + subdf['cell_num'].astype(str)

# Convert list of strings to string
subdf['tokenized_source_str'] = subdf['tokenized_source'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf['cell_num']=subdf.groupby(['filename']).cumcount()+1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf['filename_with_cellnum'] = subdf['filename'] + "_" + subdf['cell_num'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf['tokenized_source_str'] = subdf['tokenized_source']

In [19]:
subdf.head()

Unnamed: 0,cell_type,source,filename,competition,tokenized_source,cell_num,filename_with_cellnum,tokenized_source_str
0,code,from subprocess import check_output\n import n...,1806927,favorita-grocery-sales-forecasting,"[from, subprocess, import, check_output, [NEWL...",1,1806927_1,"['from', 'subprocess', 'import', 'check_output..."
1,code,holiday = pd.read_csv('../input/holidays_event...,1806927,favorita-grocery-sales-forecasting,"[holiday, =, pd, ., read_csv, (, '../input/hol...",2,1806927_2,"['holiday', '=', 'pd', '.', 'read_csv', '(', ""..."
2,code,# converting date into datetime format\n holid...,1806927,favorita-grocery-sales-forecasting,"[# converting date into datetime format, [NEWL...",3,1806927_3,"['# converting date into datetime format', '[N..."
3,code,# all bridge-type has puente (puente actually ...,1806927,favorita-grocery-sales-forecasting,[# all bridge-type has puente (puente actually...,4,1806927_4,['# all bridge-type has puente (puente actuall...
4,code,"# for Regional and Local holidays, \n # name l...",1806927,favorita-grocery-sales-forecasting,"[# for Regional and Local holidays, , [NEWLINE...",5,1806927_5,"['# for Regional and Local holidays, ', '[NEWL..."


# Preprocessing
## Generate input data from raw source

In [12]:
from gensim.utils import simple_preprocess

class MyDataframeCorpus(object):
    def __init__(self, source_df, text_col, tag_col):
        self.source_df = source_df
        self.text_col = text_col
        self.tag_col = tag_col

    def __iter__(self):
        for i, row in self.source_df.iterrows():
            yield TaggedDocument(words=simple_preprocess(row[self.text_col]), 
                                 tags=[row[self.tag_col]])


## Train the model

In [24]:
train_corpus = MyDataframeCorpus(subdf, 'tokenized_source_str', 'filename_with_cellnum')

In [25]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=768,  epochs=40)

In [26]:
model.build_vocab(train_corpus)

In [27]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

## Test inference

In [12]:
testinput = "import numpy as np"
vector = model.infer_vector(testinput.split(" "))
# print(vector)

In [17]:
sims = model.docvecs.most_similar([vector])
print(sims)

for result in sims[0:3]:
    display(df[df.filename_with_cellnum == result[0]]['source'])

[('10506138_2', 0.8787171840667725), ('31005248_62', 0.876400887966156), ('11065888_1', 0.8746524453163147), ('4384070_7', 0.8730512857437134), ('32082014_34', 0.872079610824585), ('2604429_7', 0.8716046214103699), ('236052_1', 0.8713623285293579), ('38355971_1', 0.8710976839065552), ('22398214_10', 0.870916485786438), ('36992845_6', 0.87025386095047)]


776094    plt.rcParams['figure.figsize'] = (12, 9)
Name: source, dtype: object

923774    from sklearn.ensemble import RandomForestClass...
Name: source, dtype: object

480993    import numpy as np\n import pandas as pd\n imp...
Name: source, dtype: object

# Save the model

In [28]:
model.save("../model/notebook-doc2vec-model-apr24.model")