In [9]:
import os
from glob import glob
import pandas as pd
import openai

In [10]:
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# typed into (and saved with) the notebook. The placeholder is kept as the
# fallback so the cell still runs when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "Enter your OpenAI API Key here")

In [11]:
def get_function_name(code):
    """
    Extract the function name from source text beginning with "def ".

    The name is the text between the leading "def " and the first "(",
    with surrounding whitespace removed, so "def foo (x):" yields "foo".

    Raises AssertionError if `code` does not start with "def ", and
    ValueError (from str.index) if `code` contains no "(".
    """
    assert code.startswith("def ")
    # Strip so that extra spaces around the name do not leak into the result.
    return code[len("def "): code.index("(")].strip()

In [12]:
def get_until_no_space(all_lines, i) -> str:
    """
    Collect the function that starts at line `i` and return it as one string.

    all_lines[i] is always included. Each following line is appended while it
    is empty or begins with a space, a tab, or a closing parenthesis; the first
    line that does not match ends the collection. The scan runs to the end of
    `all_lines`, so there is no upper limit on the length of a function.

    Returns the collected lines joined with newline characters.
    """
    ret = [all_lines[i]]
    # Bound the scan by the file length rather than an arbitrary line count.
    for j in range(i + 1, len(all_lines)):
        line = all_lines[j]
        # Blank and indented lines count as body; a leading ")" closes a
        # signature that was split over several lines.
        if len(line) == 0 or line[0] in (" ", "\t", ")"):
            ret.append(line)
        else:
            break
    return "\n".join(ret)

In [13]:
def get_functions(filepath):
    """
    Yield every function defined at column 0 of a Python file.

    The file is read as UTF-8, any carriage returns are turned into newlines,
    and the text is split into lines. Each line that starts with "def " marks
    a function; indented definitions (for example methods) do not match.

    Yields one dict per function with the keys "code" (the function source),
    "function_name" and "filepath" (the path that was passed in).
    """
    # Use a context manager so the handle is closed as soon as the text is read.
    with open(filepath, encoding='utf8') as source_file:
        whole_code = source_file.read().replace("\r", "\n")
    all_lines = whole_code.split("\n")
    for i, l in enumerate(all_lines):
        if l.startswith("def "):
            code = get_until_no_space(all_lines, i)
            function_name = get_function_name(code)
            yield {"code": code, "function_name": function_name, "filepath": filepath}

In [14]:
# Directory that holds the Python sources to index.
code_root = "Enter path of your directory here"

# Visit every directory below code_root and collect the *.py files in each one.
code_files = []
for dir_path, _dir_names, _file_names in os.walk(code_root):
    code_files.extend(glob(os.path.join(dir_path, '*.py')))
print("Total number of py files:", len(code_files))

# An empty result usually means code_root points at the wrong place.
if not code_files:
    print("Double check that you have downloaded the openai-python repo and set the code_root variable correctly.")

# Flatten the functions of all files into a single list of records.
all_funcs = [func for code_file in code_files for func in get_functions(code_file)]

print("Total number of functions extracted:", len(all_funcs))

Total number of py files: 3
Total number of functions extracted: 8


In [15]:
from openai.embeddings_utils import get_embedding

# One row per extracted function: code, function_name, filepath.
df = pd.DataFrame(all_funcs)
# Embed the source text of every function with the ada-002 embedding model.
df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))
# Drop the code_root part from each path so only the location inside the repository remains.
df['filepath'] = df['filepath'].apply(lambda x: x.replace(code_root, ""))
# to_csv does not create missing parent directories, so make sure the
# output directory exists before writing.
os.makedirs("dataset", exist_ok=True)
df.to_csv("dataset/code_search_openai-python.csv", index=False)
df.head()

Unnamed: 0,code,function_name,filepath,code_embedding
0,"def similarity(result, title):\n desc_vecto...",similarity,\content_app.py,"[-0.002190084895119071, 0.014384182170033455, ..."
1,"def get_rec(title,sig=sig):\n i=indices[tit...",get_rec,\content_app.py,"[-0.016391238197684288, -0.015392252244055271,..."
2,"def similarity(result, title):\n desc_vecto...",similarity,\demo.py,"[-0.0019446078222244978, 0.015260418877005577,..."
3,def get_rec(title):\n i=indices[title]\n ...,get_rec,\demo.py,"[-0.01709446869790554, -0.01214296743273735, -..."
4,def hybrid(user_id):\n if user_id!=0 or len...,hybrid,\demo.py,"[-0.014618700370192528, -0.00861226487904787, ..."


In [16]:
from openai.embeddings_utils import cosine_similarity

def search_functions(df, code_query, n=3, pprint=True, n_lines=7):
    """
    Rank the functions in `df` by similarity to `code_query`.

    The query is embedded with 'text-embedding-ada-002' and compared to every
    entry of df.code_embedding using cosine similarity. The scores are stored
    in a 'similarities' column on `df` itself (added or overwritten).

    Returns the `n` rows with the highest score, best first. When `pprint`
    is true, each returned row is also printed: a "filepath:function_name
    score" header, the first `n_lines` lines of its code, and a dashed rule.
    """
    query_vector = get_embedding(code_query, engine='text-embedding-ada-002')

    def score(candidate):
        return cosine_similarity(candidate, query_vector)

    df['similarities'] = df.code_embedding.apply(score)
    top_matches = df.sort_values('similarities', ascending=False).head(n)

    if not pprint:
        return top_matches

    separator = '-'*70
    for _, row in top_matches.iterrows():
        header = row.filepath+":"+row.function_name + "  score=" + str(round(row.similarities, 3))
        preview = row.code.split("\n")[:n_lines]
        print(header)
        print("\n".join(preview))
        print(separator)
    return top_matches

# Search the embedded functions for this query; the 3 best matches are printed and returned.
res = search_functions(df, 'Completions API tests', n=3)

\demo_two.py:get_rec  score=0.688
def get_rec(title,sig=sig):
    i=indices[title]
    x=data.iloc[i]['label']
    t=[x]
    idx=list(data[data['label'].isin(t)].index)
    sig_temp=list(enumerate(sig[i]))

----------------------------------------------------------------------
\demo.py:get_rec  score=0.687
def get_rec(title):
    i=indices[title]
    x=data.iloc[i]['label']
    t=[x]
    idx=list(data[data['label'].isin(t)].index)
    sig_temp=list(enumerate(sig[i]))
    sig_scores=itemgetter(*idx)(sig_temp)
----------------------------------------------------------------------
\content_app.py:get_rec  score=0.687
def get_rec(title,sig=sig):
    i=indices[title]
    x=data.iloc[i]['label']
    t=[x]
    idx=list(data[data['label'].isin(t)].index)
    sig_temp=list(enumerate(sig[i]))

----------------------------------------------------------------------


In [18]:
# Search the embedded functions for this query; up to 7 best matches are printed and returned.
res = search_functions(df, 'Similarity Index', n=7)

\content_app.py:similarity  score=0.789
def similarity(result, title):
    desc_vector = tfidf.fit_transform(result['description'].apply(lambda x:x.lower()).apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])))
    similarity_matrix = linear_kernel(desc_vector, desc_vector)
    
    mapping = pd.Series(result.index, index=result['title'])
    product_index = mapping[title]
    
----------------------------------------------------------------------
\demo_two.py:similarity  score=0.789
def similarity(result, title):
    desc_vector = tfidf.fit_transform(result['description'].apply(lambda x:x.lower()).apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])))
    similarity_matrix = linear_kernel(desc_vector, desc_vector)
    
    mapping = pd.Series(result.index, index=result['title'])
    product_index = mapping[title]
    
----------------------------------------------------------------------
\demo.py:similarity  score=0.788
def similarity(r