In [17]:
# Always reload modules to have the current version.
# `%reload_ext` (re)loads the autoreload extension; `%autoreload 2` then asks
# IPython to re-import all modules before each cell is executed, so edits to
# the imported `ranking` modules take effect without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [18]:
from ranking.normalization import normalizer as n
from ranking.util import dataset_paths as dp
from ranking.util import json_lines as jl
from tqdm import tqdm
from tree_sitter import Language, Parser
import pandas as pd

# Input corpus and the output file name of each pipeline stage. The tokenized
# and lemmatized outputs reuse the base name with a stage prefix.
input_file = dp.raw_corpus
cleaned_unique_functions_output = 'parsed-complete-all-unique-functions.jsonl'
tokenized_output = f'tok-{cleaned_unique_functions_output}'
lemmatized_output = f'lem-{cleaned_unique_functions_output}'

In [19]:
# Compile the Haskell grammar into a shared library and load it from there.
# Every path uses forward slashes because they resolve on Windows and POSIX
# alike; backslash-separated paths only resolve on Windows. The library path
# is defined once so the build step and the load step cannot point at
# different files.
# NOTE(review): `Language.build_library` / `Parser.set_language` look like the
# older py-tree-sitter API — confirm the pinned version still provides them.
HS_LIBRARY_PATH = 'tree-sitter/build/my-languages.so'

Language.build_library(
  # Store the library in the `build` directory
  HS_LIBRARY_PATH,

  # Include one or more languages
  [
    'tree-sitter/bindings/tree-sitter-haskell'
  ]
)

HS_LANGUAGE = Language(HS_LIBRARY_PATH, 'haskell')
parser = Parser()
parser.set_language(HS_LANGUAGE)

In [20]:
def contains_sig(text: str) -> bool:
    """Tell whether `text` parses without errors and starts with a signature.

    The text is parsed with the notebook-level `parser`. The result is True
    only when the parse tree reports no error and the first child of the root
    node has the type ``'signature'``.

    Args:
        text: Source snippet to check.

    Returns:
        True for an error-free parse whose first top-level node is a
        signature, False otherwise (including a root without any children).
    """
    root = parser.parse(bytes(text, "utf8")).root_node
    # A root without children (e.g. for empty input) has no first node to
    # inspect; indexing `children[0]` would raise IndexError instead of
    # answering "no signature here".
    if root.has_error or not root.children:
        return False
    return root.children[0].type == 'signature'


In [21]:
def get_sig_type(sig: str, query) -> str:
    """Return the text of the first node captured by `query` in `sig`.

    `sig` is parsed with the notebook-level `parser`; the first child of the
    root node must be a signature.

    Args:
        sig: Source text whose first top-level node is a signature.
        query: Object whose `captures(node)` result is indexable as
            `[0][0]`, yielding a node with a UTF-8 encoded `text`.

    Returns:
        The decoded text of the first captured node.

    Raises:
        AssertionError: If the first top-level node is not a signature.
    """
    root = parser.parse(sig.encode("utf8")).root_node
    first_node = root.children[0]
    assert first_node.type == 'signature'
    matches = query.captures(root)
    captured_node = matches[0][0]
    return captured_node.text.decode('UTF-8')


In [22]:
def strip_extra_spaces(text):
    """Collapse every whitespace run in `text` to one space and trim the ends."""
    words = text.split()
    return " ".join(words)

def equalize_docItem(docItem):
    """Bring a signature into a comparable form.

    Every literal ``'() =>'`` (an empty context) is removed and the remaining
    text is passed through `strip_extra_spaces`.
    """
    without_empty_context = docItem.replace('() =>', '')
    return strip_extra_spaces(without_empty_context)

# Sanity checks for equalize_docItem: each raw signature must equalize to the
# expected form, and equalizing an already-equalized signature must leave it
# unchanged.
for sign, exptectedSign in (
    (
        '($$!) :: () => (i -> r) -> Number r i -> r',
        '($$!) :: (i -> r) -> Number r i -> r',
    ),
    (
        'hoistDiT :: () => (forall x . () => n x -> m x) -> (forall x . () => m x -> n x) -> DiT level path msg m a -> DiT level path msg n a',
        'hoistDiT :: (forall x . n x -> m x) -> (forall x . m x -> n x) -> DiT level path msg m a -> DiT level path msg n a',
    ),
):
    assert equalize_docItem(sign) == exptectedSign
    assert exptectedSign == equalize_docItem(exptectedSign)

In [23]:
def group_unique_functions(df: pd.DataFrame) -> pd.DataFrame:
    """Add a `storageId` that acts as a group id to each function.

    Rows whose `docItem` is identical after `equalize_docItem` form one group.
    Within a group, `docContent`, `docItem` and `docType` are replaced by the
    group's first value after sorting by documentation length (longest
    first).

    The input frame is left untouched: all helper columns and the sorting are
    applied to a private copy, so a filtered slice can be passed without
    triggering chained-assignment warnings or leaking helper columns.

    Args:
        df: Frame with the columns `docId`, `docItem`, `docContent`,
            `docType` and `docPackage`.

    Returns:
        A new frame with the columns `docId`, `storageId`, `docContent`,
        `docItem`, `docType` and `docPackage`, ordered by descending
        documentation length.
    """
    # `assign` returns a new frame; `Series.map` avoids building a row object
    # per record as `apply(axis=1)` would.
    work = df.assign(
        equalizedDocItem=df['docItem'].map(equalize_docItem),
        docContentLen=df['docContent'].str.len(),
    ).sort_values('docContentLen', ascending=False)  # longest documentation first in each group

    groups = work.groupby('equalizedDocItem')
    work['storageId'] = groups.ngroup()
    for column in ('docContent', 'docItem', 'docType'):
        work[column] = groups[column].transform('first')

    return work[['docId', 'storageId', 'docContent', 'docItem', 'docType', 'docPackage']]


In [31]:
def normalize_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Overwrite `docContent` with its normalized form (stop words removed).

    Every row receives `n.normalize(...)` of the first `docContent` of its
    `storageId` group, with the WordNet stop words passed as `stop`. The
    column is replaced on `df` itself, so pass a copy when the original frame
    is still needed.

    Args:
        df: Frame with `storageId` and `docContent` columns.

    Returns:
        The same `df` object, with `docContent` replaced.
    """
    # Fetch the stop words once instead of once per row.
    # NOTE(review): assumes n.get_wn_stopwords() returns the same value on
    # every call and n.normalize does not modify it — confirm.
    stop_words = n.get_wn_stopwords()

    groups = df.groupby('storageId')
    tqdm.pandas(desc='Lemmatizing + Stop Word Removal')
    first_content = groups['docContent'].transform('first')
    df['docContent'] = first_content.progress_apply(
        lambda content: n.normalize(content, stop=stop_words))
    return df

def only_tokenize_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Overwrite `docContent` with `n.normalize` applied with an identity function.

    Every row receives the result for the first `docContent` of its
    `storageId` group. The column is replaced on `df` itself.

    Args:
        df: Frame with `storageId` and `docContent` columns.

    Returns:
        The same `df` object, with `docContent` replaced.
    """
    def tokenize(content):
        # The second argument is an identity function, so the text it receives
        # is passed through unchanged.
        return n.normalize(content, lambda text: text)

    first_content = df.groupby('storageId')['docContent'].transform('first')
    tqdm.pandas(desc='Tokenization Only')
    df['docContent'] = first_content.progress_apply(tokenize)
    return df


In [28]:
# Load the raw corpus and clean the documentation text.
df = jl.read_jsonl(input_file)
tqdm.pandas(desc='Cleaning')
df['docContent'] = df['docContent'].progress_apply(n.clean)

# Keep only the items that are signatures and parse without errors.
tqdm.pandas(desc='Extracting signatures')
df['is_signature'] = df['docItem'].progress_apply(contains_sig)
# `.copy()` makes the filtered frame independent of the unfiltered one, so the
# `docType` assignment below does not write into a slice
# (SettingWithCopyWarning). `astype(bool)` keeps the mask boolean even when
# the frame is empty.
df = df[df['is_signature'].astype(bool)].copy()

# Extract the type part of every signature and prefix it with ':: '.
tqdm.pandas(desc='Extracting types of signatures')
query = HS_LANGUAGE.query('(signature _ type: _ _ @type)')
df['docType'] = df['docItem'].progress_apply(lambda item: f':: {get_sig_type(item, query)}')

# Group equal functions under a shared storageId and persist the result.
df = group_unique_functions(df).sort_values('storageId')
jl.to_jsonl(df, cleaned_unique_functions_output)


Cleaning: 100%|██████████| 805414/805414 [00:21<00:00, 36704.80it/s]
Extracting signatures: 100%|██████████| 805414/805414 [00:31<00:00, 25519.70it/s]
Extracting types of signatures: 100%|██████████| 470571/470571 [00:29<00:00, 16214.02it/s]


In [9]:
# Derive both variants from independent copies: the two helpers overwrite the
# `docContent` column of the frame they receive.
df_tok = only_tokenize_dataset(df.copy())
df_lem = normalize_dataset(df.copy())

# Write each variant to its own output file, tokenized first.
for _frame, _target in ((df_tok, tokenized_output), (df_lem, lemmatized_output)):
    jl.to_jsonl(_frame, _target)