In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored on Drive
# are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on Drive; the relative
# ./data/... paths used by later cells are resolved against it.
%cd /content/drive/MyDrive/Automate

/content/drive/MyDrive/Automate


In [None]:
!pip install nmslib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nmslib
  Downloading nmslib-2.1.1-cp38-cp38-manylinux2010_x86_64.whl (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 18.2 MB/s 
Collecting pybind11<2.6.2
  Downloading pybind11-2.6.1-py2.py3-none-any.whl (188 kB)
[K     |████████████████████████████████| 188 kB 61.9 MB/s 
[?25hInstalling collected packages: pybind11, nmslib
Successfully installed nmslib-2.1.1 pybind11-2.6.1


In [None]:
!pip install wget pathos

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Collecting pathos
  Downloading pathos-0.3.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 8.1 MB/s 
Collecting multiprocess>=0.70.14
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 73.7 MB/s 
[?25hCollecting pox>=0.3.2
  Downloading pox-0.3.2-py3-none-any.whl (29 kB)
Collecting ppft>=1.7.6.6
  Downloading ppft-1.7.6.6-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.7 MB/s 
[?25hBuilding wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9674 sha256=996cb5fdc404872ae7119c0ae7172e8952640721a78a18f084a7a8d1b9560d45
  Stored in directory: /root/.cache/pip/wheels/bd/a8/c3/3cf2c14a1837a4e04bd98631724e81f33f462d86a1d8

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import nmslib
from lang_model_utils import load_lm_vocab, Query2Emb
from general_utils import create_nmslib_search_index

# Project directories, relative to the notebook's working directory.
input_path = Path('./data/processed_data/')
code2emb_path = Path('./data/code2emb/')
output_path = Path('./data/search')
# parents=True: also create ./data when it does not exist yet; without it
# mkdir raises FileNotFoundError on a fresh checkout.
output_path.mkdir(parents=True, exist_ok=True)

## Read in Metadata

We need to organize the data that will be displayed with each search result:

1. The original code
2. A link to the original code

For convenience, we will collect this data into a pandas dataframe.

In [None]:
# URLs of the original functions, one per row. This reads the hosted copy of
# 'without_docstrings.lineage'; a local copy under `input_path` can be used
# instead.
url_df = pd.read_csv(
    "https://storage.googleapis.com/kubeflow-examples/code_search/data/without_docstrings.lineage",
    header=None,
    names=['url'],
)

# Source text of the original functions. This reads the hosted copy of
# 'without_docstrings_original_function.json.gz'.
code_df = pd.read_json(
    'https://storage.googleapis.com/kubeflow-examples/code_search/data/without_docstrings_original_function.json.gz'
)
code_df.columns = ['code']

print(code_df.shape)
print(url_df.shape)

# The two files are joined by row position, so their row counts must agree.
assert code_df.shape[0] == url_df.shape[0]

# Side-by-side frame holding the link and the code for every function.
ref_df = pd.concat([url_df, code_df], axis=1).reset_index(drop=True)
ref_df.head()

(4008718, 1)
(4008718, 1)


Unnamed: 0,url,code
0,https://github.com/fnl/libfnl/blob/master/src/...,"def __init__(self, *leafs, **edges):\n self..."
1,https://github.com/fnl/libfnl/blob/master/src/...,"def __eq__(self, other):\n if isinstance(ot..."
2,https://github.com/fnl/libfnl/blob/master/src/...,def __repr__(self):\n return 'Node<leafs={}...
3,https://github.com/fnl/libfnl/blob/master/src/...,@staticmethod\ndef _isCapitalized(token):\n ...
4,https://github.com/fnl/libfnl/blob/master/src/...,"@staticmethod\ndef _isCapitalizeD(last, token)..."


In [None]:
# NOTE(review): the recorded output of this cell reports 464558 rows, while the
# recorded output of the load cell reports 4008718 rows for both inputs.
# ref_df appears to have been subset in a step that is not part of this
# notebook -- confirm before relying on Restart & Run All.
print(ref_df.shape)

(464558, 2)


## Create Search Index For Vectorized Code

First read in the vectorized code

In [None]:
# Load the pre-computed code embeddings from disk.
nodoc_vecs = np.load(code2emb_path/'nodoc_vecs.npy')
# NOTE(review): an earlier revision truncated the array here (kept below,
# commented out); confirm how the embeddings and ref_df are meant to line up.
#nodoc_vecs=nodoc_vecs[:464558,:]
# Search hits are looked up in ref_df by row position, so both must have the
# same number of rows; fail loudly with the two counts if they do not.
assert nodoc_vecs.shape[0] == ref_df.shape[0], (
    f'embedding rows ({nodoc_vecs.shape[0]}) != metadata rows ({ref_df.shape[0]})'
)

In [None]:
# Show the dimensions of the embedding matrix (rows, columns).
print(nodoc_vecs.shape)

(464558, 400)


In [None]:
%%time
# Build the search index from the code embeddings with the project helper and
# persist it to disk so it can be reloaded later without rebuilding.
# The recorded run of this cell took about 18.5 minutes of wall time.
search_index = create_nmslib_search_index(nodoc_vecs)
search_index.saveIndex('./data/search/search_index.nmslib')

CPU times: user 35min 25s, sys: 4.92 s, total: 35min 30s
Wall time: 18min 29s


# Create A Minimal Search Engine

In [None]:
# Restore the trained language model with all tensors mapped onto the CPU.
# torch.load unpickles the file: only load checkpoints from a trusted source.
lang_model = torch.load(
    './data/lang_model/lang_model_cpu_v2.torch',
    map_location=torch.device('cpu'),
)

# Vocabulary that goes with the language model.
vocab = load_lm_vocab('./data/lang_model/vocab_v2.cls')

# Helper that turns a query string into an embedding vector.
q2emb = Query2Emb(lang_model=lang_model.cpu(), vocab=vocab)

# Reload the saved index from disk (replaces any in-memory `search_index`).
search_index = nmslib.init(method='hnsw', space='cosinesimil')
search_index.loadIndex('./data/search/search_index.nmslib')



`Query2Emb` is a helper class that will vectorize sentences using the language model trained in Part 3.  

In this case, we call the method `emb_mean` because we are taking the mean over the time steps of the hidden states in order to construct a sentence embedding for the query supplied by the user.  

In [None]:
# Smoke test: embed an arbitrary sentence and display the shape of the result.
test = q2emb.emb_mean('Hello World!  This is a test.')
test.shape



(400,)

### Create an object to make the process of showing search results easier

The class below ties all the pieces together so that searching the index and displaying the results takes a single method call.  

In [None]:
class search_engine:
    """Organizes all the necessary elements we need to make a search engine.

    Combines a nearest-neighbour index, a dataframe of result metadata and a
    function that embeds a query string. Hits returned by the index are
    looked up in the dataframe by row position, so the dataframe rows must be
    in the same order as the vectors stored in the index.
    """
    def __init__(self,
                 nmslib_index,
                 ref_df,
                 query2emb_func):
        """
        Parameters
        ==========
        nmslib_index : nmslib object
            This is pre-computed search index.
        ref_df : pandas.DataFrame
            This dataframe contains meta-data for search results,
            must contain the columns 'code' and 'url'.
        query2emb_func : callable
            This is a function that takes as input a string and returns a vector
            that is in the same vector space as what is loaded into the search index.

        Raises
        ======
        AssertionError
            If `ref_df` lacks the 'url' or the 'code' column.
        """
        assert 'url' in ref_df.columns, "ref_df must contain a 'url' column"
        assert 'code' in ref_df.columns, "ref_df must contain a 'code' column"

        self.search_index = nmslib_index
        self.ref_df = ref_df
        self.query2emb_func = query2emb_func

    def find(self, str_search, k=2):
        """
        Returns the nearest neighbors of the search query without printing.

        Parameters
        ==========
        str_search : str
            a search query.  Ex: "read data into pandas dataframe"
        k : int
            the number of nearest neighbors to request.  Defaults to 2.

        Returns
        =======
        list of (dist, url, code) tuples, in the order returned by the index.
        """
        query = self.query2emb_func(str_search)
        idxs, dists = self.search_index.knnQuery(query, k=k)

        hits = []
        for idx, dist in zip(idxs, dists):
            # Fetch the row once and read both fields from it.
            row = self.ref_df.iloc[idx]
            hits.append((dist, row['url'], row['code']))
        return hits

    def search(self, str_search, k=2):
        """
        Prints the code that are the nearest neighbors (by cosine distance)
        to the search query.

        Returns None on purpose: the notebook calls this as the last
        expression of a cell, and a return value would be echoed as output.

        Parameters
        ==========
        str_search : str
            a search query.  Ex: "read data into pandas dataframe"
        k : int
            the number of nearest neighbors to return.  Defaults to 2.

        """
        for dist, url, code in self.find(str_search, k=k):
            print(f'cosine dist:{dist:.4f}  url: {url}\n---------------\n')
            print(code)

In [None]:
# Wire the loaded index, the metadata dataframe and the query embedder
# (mean-pooled language-model embedding) into one search object.
se = search_engine(nmslib_index=search_index,
                   ref_df=ref_df,
                   query2emb_func=q2emb.emb_mean)

# Run Some Queries Against The Index!!

Now that we have instantiated the search engine, we can use the `search` method to display the results.

**Warning:** some of the displayed links may no longer work, since the data comes from a [historical open dataset that Google hosts on BigQuery](https://cloud.google.com/bigquery/public-data/github).

In [None]:
# Free-text query; prints the nearest code snippets (k defaults to 2).
se.search('Django rest framework')



cosine dist:0.4625  url: https://github.com/Khroki/MCEdit-Unified/blob/master/albow/extended_widgets.py#L155
---------------

def calc_width(self):
    widths = [self.font.size(_(c, self.doNotTranslate))[0] for c in self.
        choices] + [self.width]
    if len(widths):
        self.width = max(widths) + self.margin * 2

cosine dist:0.4627  url: https://github.com/ocadotechnology/aimmo/blob/master/aimmo-game/simulation/game_logic/map_updaters.py#L24
---------------

def update(self, world_map, context):
    for cell in world_map.score_cells():
        if random.random() < world_map.settings['SCORE_DESPAWN_CHANCE']:
            cell.generates_score = False
    new_num_score_locations = len(list(world_map.score_cells()))
    target_num_score_locations = int(math.ceil(context.num_avatars *
        world_map.settings['TARGET_NUM_SCORE_LOCATIONS_PER_AVATAR']))
    num_score_locations_to_add = (target_num_score_locations -
        new_num_score_locations)
    locations = world_map._spawn_

# Use Custom Ipython Magic Function To Create A Fake Search Box

You don't know how to build a website?  No problem!  You can still impress your friends by using a [custom magic function](https://ipython.org/ipython-doc/3/config/custommagics.html) to allow you to do a live demonstration in a Jupyter notebook.  This is what I did when I first created this prototype!

In [None]:
from IPython.core.magic import (register_line_magic, register_cell_magic,
                                register_line_cell_magic)
@register_cell_magic
def search(line, cell):
    """Cell magic `%%search`: run the cell body as a query against `se`.

    `line` (the text after `%%search` on the magic line) is ignored; the
    whole cell body is passed unchanged to `se.search`, whose return value
    is handed back to IPython.
    """
    query_text = cell
    return se.search(query_text)

### Live Semantic Search of Code (Searching Holdout Set Only)

In [None]:
%%search
binary search



cosine dist:0.4469  url: https://github.com/openstack/searchlight/blob/master/searchlight/common/wsme_utils.py#L62
---------------

@classmethod
def get_mandatory_attrs(cls):
    return [attr.name for attr in cls._wsme_attributes if attr.mandatory]

cosine dist:0.4504  url: https://github.com/openstack/searchlight/blob/master/searchlight/common/wsme_utils.py#L24
---------------

def to_dict(self):
    my_dict = {}
    for attribute in self._wsme_attributes:
        value = getattr(self, attribute.name)
        if value is not wsme_types.Unset:
            my_dict.update({attribute.name: value})
    return my_dict

