In [1]:
# Always reload modules to have the current version:
# autoreload mode 2 re-imports all modules before each cell is executed, so
# edits to the imported project modules take effect without a kernel restart.
%reload_ext autoreload
%autoreload 2

In [2]:
from ranking.util import json_lines as jl
from ranking.util import dataset_paths as dp
from ranking.normalization import normalizer as n
import pandas as pd

# Location of the raw input handed to `jl.read_jsonl`; left blank here, so set
# it to the dataset path before running the loading cell.
input_file = ''
# Base file name for the grouped dataset. The tokenized and lemmatized
# variants reuse it with a short prefix.
cleaned_unique_functions_output = 'complete-all-unique-functions.jsonl'
tokenized_output = f'tok-{cleaned_unique_functions_output}'
lemmatized_output = f'lem-{cleaned_unique_functions_output}'

In [3]:
def strip_extra_spaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    ``str.split()`` without arguments splits on any whitespace and drops empty
    fields, so leading and trailing whitespace disappears as well.
    """
    words = text.split()
    return " ".join(words)

def equalize_docItem(docItem):
    """Return a canonical form of a signature string for duplicate detection.

    Every occurrence of the empty constraint context ``() =>`` is removed and
    the remaining text is whitespace-normalized, so signatures that differ
    only in those respects compare equal.
    """
    without_empty_context = docItem.replace('() =>', '')
    return strip_extra_spaces(without_empty_context)

# Sanity check for equalize_docItem: the empty context is stripped, and an
# already canonical signature is returned unchanged.
sign = '($$!) :: () => (i -> r) -> Number r i -> r'
exptectedSign = '($$!) :: (i -> r) -> Number r i -> r'

assert equalize_docItem(sign) == exptectedSign
assert equalize_docItem(exptectedSign) == exptectedSign

In [4]:
def group_unique_functions(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``storageId`` that acts as a group id to each function.

    Rows whose ``docItem`` is identical after ``equalize_docItem`` form one
    group. Every row of a group receives the same ``storageId`` and the
    ``docContent``, ``docItem`` and ``docType`` of the group's representative,
    i.e. the row with the longest ``docContent``. ``docId`` and ``docPackage``
    are passed through unchanged.

    The caller's frame is not modified; all work happens on a copy.

    Args:
        df: Frame with the columns ``docId``, ``docItem``, ``docContent``,
            ``docType`` and ``docPackage``.

    Returns:
        A new frame with the columns ``docId``, ``storageId``, ``docContent``,
        ``docItem``, ``docType`` and ``docPackage``, ordered by descending
        ``docContent`` length.
    """
    # Work on a copy: the helper columns and the re-ordering must not leak into
    # the caller's frame, and adding columns to a filtered slice would
    # otherwise trigger pandas' SettingWithCopyWarning.
    df = df.copy()

    # Series.map is the element-wise form of the former row-wise apply and,
    # unlike apply(axis=1), still yields a Series for an empty frame.
    df['equalizedDocItem'] = df['docItem'].map(equalize_docItem)
    df['docContentLen'] = df['docContent'].str.len()

    # Longest documentation first, so 'first' below picks it as representative.
    # The stable sort keeps the input order for equally long entries, which
    # makes the chosen representative deterministic.
    df = df.sort_values('docContentLen', ascending=False, kind='stable')
    groups = df.groupby('equalizedDocItem')

    df['storageId'] = groups.ngroup()
    # Broadcast the representative's values to every member of its group.
    for column in ('docContent', 'docItem', 'docType'):
        df[column] = groups[column].transform('first')
    return df[['docId', 'storageId', 'docContent', 'docItem', 'docType', 'docPackage']]


In [5]:
def normalize_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Replace ``docContent`` with its fully normalized form, in place.

    Each row first receives the first ``docContent`` of its ``storageId``
    group; that text is then passed through ``n.normalize`` together with the
    stop words returned by ``n.get_wn_stopwords()``.

    The passed frame itself is modified and also returned.
    """
    def _normalize(content):
        # NOTE(review): get_wn_stopwords() is evaluated once per row; it could
        # be hoisted if the call is pure - confirm against the normalizer.
        return n.normalize(content, stop=n.get_wn_stopwords())

    group_content = df.groupby('storageId')['docContent'].transform('first')
    df['docContent'] = group_content.apply(_normalize)
    return df

def only_tokenize_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Replace ``docContent`` with a tokenized-only form, in place.

    Each row first receives the first ``docContent`` of its ``storageId``
    group; that text is then passed through ``n.normalize`` with an identity
    function as second argument.

    The passed frame itself is modified and also returned.
    """
    def _identity(text):
        return text

    group_content = df.groupby('storageId')['docContent'].transform('first')
    # NOTE(review): the second positional parameter of n.normalize is presumably
    # the lemmatizing step, which the identity function disables - confirm.
    df['docContent'] = group_content.apply(
        lambda content: n.normalize(content, _identity))
    return df


In [6]:
# Load the raw records from `input_file` (presumably into a DataFrame, since
# the following lines index it like one).
df = jl.read_jsonl(input_file)
# Clean every documentation text with the project normalizer before grouping.
df['docContent'] = df['docContent'].apply(n.clean)
df = df[df['docItem'] != '']  # ignore all items that are not functions
# Assign group ids to equivalent signatures, then order the rows by group id.
df = group_unique_functions(df).sort_values('storageId')
# Persist the grouped dataset; the cells below derive further variants from `df`.
jl.to_jsonl(df, cleaned_unique_functions_output)


In [9]:
# Build both variants from independent copies: the helper functions overwrite
# `docContent` on the frame they receive, and `df` must stay intact.
df_tok = only_tokenize_dataset(df.copy())
df_lem = normalize_dataset(df.copy())

# Write each variant to its own prefixed output file.
jl.to_jsonl(df_tok, tokenized_output)
jl.to_jsonl(df_lem, lemmatized_output)