In [5]:
# Always reload modules to have the current version
%reload_ext autoreload
%autoreload 2


In [2]:
from ranking.util import json_lines as jl
from ranking.util import dataset_paths as dp
from ranking.normalization import normalizer as n
import pandas as pd


In [4]:
raw_df = jl.read_jsonl(dp.unique_functions_corpus)
raw_df = raw_df[raw_df['docItem'] != '']  # ignore all items that are no functions
unique_types_group = raw_df.groupby('docType')
unique_types_count = unique_types_group.ngroups
print(unique_types_count)

248714


KeyboardInterrupt: 

In [10]:
unique_functions = jl.read_jsonl(dp.unique_functions_corpus)
unique_functions_group = unique_functions.groupby('storageId')
unique_functions_count = unique_functions_group.ngroups


In [11]:
tok_unique_functions = jl.read_jsonl(dp.tokenized_unique_functions_corpus)
tok_unique_functions = tok_unique_functions.groupby('storageId').first()

lem_unique_functions = jl.read_jsonl(dp.lemmatized_unique_functions_corpus)
lem_unique_functions = lem_unique_functions.groupby('storageId').first()

tok_unique_functions_doc_len = tok_unique_functions['docContent'].str.split().str.len()
lem_unique_functions_doc_len = lem_unique_functions['docContent'].str.split().str.len()

# Total
tok_unique_functions_doc_len_sum = tok_unique_functions_doc_len.sum()
lem_unique_functions_doc_len_sum = lem_unique_functions_doc_len.sum()

unique_functions_stat = pd.concat([tok_unique_functions_doc_len.describe().rename('tokenized'), lem_unique_functions_doc_len.describe().rename('lemmatized')], axis=1)
print(unique_functions_stat)
print('total tokenized words:', tok_unique_functions_doc_len_sum)
print('total lemmatized words:', lem_unique_functions_doc_len_sum)

           tokenized     lemmatized
count  406295.000000  406295.000000
mean       10.392424       5.973711
std        29.691560      17.078435
min         0.000000       0.000000
25%         0.000000       0.000000
50%         2.000000       2.000000
75%        12.000000       7.000000
max      3160.000000    1936.000000
total tokenized words: 4222390
total lemmatized words: 2427089


In [12]:
from collections import Counter

tok_fastText_corpus = pd.read_csv(dp.tokenized_unique_sentences_corpus, header=None, converters={0: str}).squeeze('columns')
lem_fastText_corpus = pd.read_csv(dp.lemmatized_unique_sentences_corpus, header=None, converters={0: str}).squeeze('columns')
total_tok_corpus = tok_fastText_corpus.str.split().str.len().sum()
total_lem_corpus = lem_fastText_corpus.str.split().str.len().sum()

unique_words_tok_res = Counter()
unique_words_lem_res = Counter()
unique_words_tok = tok_fastText_corpus.str.split().apply(unique_words_tok_res.update)
unique_words_lem = lem_fastText_corpus.str.split().apply(unique_words_lem_res.update)
print('unique words tokenized', len(unique_words_tok_res))
print('unique words lemmatized', len(unique_words_lem_res))

print('total tokenized words:', total_tok_corpus)
print('total lemmatized words:', total_lem_corpus)
# lem_fastText_corpus = pd.read_csv(dp.lemmatized_unique_sentences_corpus)

unique words tokenized 114803
unique words lemmatized 110939
total tokenized words: 5616723
total lemmatized words: 3259828


In [13]:
tok_tfidf_eval_set = jl.read_jsonl(dp.tokenized_tfidf_evaluation_set)['docQuery'].str.split().str[:6].str.len()
tok_query_len_med = tok_tfidf_eval_set.median()
tok_query_len_mean = tok_tfidf_eval_set.mean() 

lem_tfidf_eval_set = jl.read_jsonl(dp.lemmatized_tfidf_evaluation_set)['docQuery'].str.split().str[:6].str.len()
lem_query_len_med = lem_tfidf_eval_set.median()
lem_query_len_mean = lem_tfidf_eval_set.mean() 

print('Tokenized query length median:', tok_query_len_med)
print('Lemmatzied query length median:', lem_query_len_med)

print('Tokenized query length mean:', tok_query_len_mean)
print('Lemmatzied query length mean:', lem_query_len_mean)

print('tokenized length', len(tok_tfidf_eval_set.index))
print('lemmatized length', len(lem_tfidf_eval_set.index))

Tokenized query length median: 4.0
Lemmatzied query length median: 3.0
Tokenized query length mean: 3.9126810714574733
Lemmatzied query length mean: 3.7583040871430473
tokenized length 172998
lemmatized length 172957


In [14]:
unique_functions = jl.read_jsonl(dp.unique_functions_corpus).groupby('storageId').first()
tok_manual_eval_set = jl.read_jsonl(dp.manual_evaluation_set)[['storageId','docQuery','sourceLink']]
manual_eval_set_with_items = tok_manual_eval_set.set_index('storageId').join(unique_functions)
tokenize_only = lambda query: n.normalize(query, stem=lambda x: x)
pre_process = lambda query: n.normalize(query, stop=n.get_wn_stopwords())
manual_eval_set_with_items = manual_eval_set_with_items[['docQuery', 'docItem', 'sourceLink']]
manual_eval_set_with_items['docQueryNorm'] = manual_eval_set_with_items['docQuery'].apply(tokenize_only)
manual_eval_set_with_items['docQueryFullyPreProc'] = manual_eval_set_with_items['docQueryNorm'].apply(pre_process)

tok_man_query_len = manual_eval_set_with_items['docQueryNorm'].str.split().str.len()
lem_man_query_len = manual_eval_set_with_items['docQueryFullyPreProc'].str.split().str.len()

print('Tokenized query length median:', tok_man_query_len.median())
print('Lemmatzied query length median:', lem_man_query_len.median())

print('Tokenized query length mean:', tok_man_query_len.mean())
print('Lemmatzied query length mean:', lem_man_query_len.mean())

print('tokenized length', len(tok_tfidf_eval_set.index))
print('lemmatized length', len(lem_tfidf_eval_set.index))


Tokenized query length median: 9.0
Lemmatzied query length median: 5.0
Tokenized query length mean: 9.52
Lemmatzied query length mean: 4.84
tokenized length 172998
lemmatized length 172957
