In [2]:
# Load (or reload) IPython's autoreload extension and switch it to mode 2, so
# imported modules are re-imported before each cell executes. Edits to the
# local project modules are then picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from ranking.util import json_lines as jl
from ranking.util import dataset_paths as dp
import pandas as pd
import numpy as np


In [25]:
def calculate_tfidf_for_dataset(corpus) -> pd.DataFrame:
    """Fit a TF-IDF model on ``corpus`` and return the weights as a sparse frame.

    Parameters
    ----------
    corpus : iterable of str
        One document per element. When a ``pd.Series`` is passed, its index is
        reused as the row labels of the result, so rows can be addressed with
        the same keys as the documents (the caller in this notebook looks rows
        up by storage id).

    Returns
    -------
    pd.DataFrame
        Sparse DataFrame with one row per document and one column per
        vocabulary term found by ``TfidfVectorizer``; cells hold TF-IDF weights.
    """
    vectorizer = TfidfVectorizer()
    result = vectorizer.fit_transform(corpus)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement but keep working on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Without an explicit index the frame is labelled 0..n-1, which only matches
    # the documents' own keys by coincidence.
    row_labels = corpus.index if isinstance(corpus, pd.Series) else None
    return pd.DataFrame.sparse.from_spmatrix(result, index=row_labels, columns=feature_names)

def lookup_tfidf_weight(df_tfidf: pd.DataFrame, storage_id, word):
    """Return the TF-IDF weight stored for ``word`` in the row ``storage_id``.

    Words that have no column in ``df_tfidf`` are given a weight of 0.
    ``storage_id`` is used as a row *label* (``DataFrame.at``).
    """
    known_word = word in df_tfidf.columns
    if not known_word:
        return 0
    return df_tfidf.at[storage_id, word]

def apply_tfidf_weights_to_doc(tfidf: pd.DataFrame, storage_id, doc):
    """Pair every whitespace-separated token of ``doc`` with its TF-IDF weight.

    Returns a list of ``(word, weight)`` tuples in document order; repeated
    words appear once per occurrence. Weights come from
    ``lookup_tfidf_weight`` for the row ``storage_id``.
    """
    weighted_words = []
    for token in doc.split():
        weight = lookup_tfidf_weight(tfidf, storage_id, token)
        weighted_words.append((token, weight))
    return weighted_words

def create_tfidf_dataset(df: pd.DataFrame, min_unique_words=5):
    """Build an evaluation set with a TF-IDF based query for every document.

    Parameters
    ----------
    df : pd.DataFrame
        Must be groupable by ``'storageId'`` and contain a ``'docContent'``
        column of whitespace-separated text.
    min_unique_words : int, default 5
        Documents with fewer distinct words than this are dropped.

    Returns
    -------
    pd.DataFrame
        One row per ``storageId`` (first entry of each group) with the added
        columns ``'n_unique_words'``, ``'tfidf'`` (list of ``(word, weight)``
        tuples) and ``'docQuery'`` (the highest-weighted words joined by
        spaces).
    """
    groups = df.groupby('storageId')
    corpus = groups['docContent'].first()
    tfidf = calculate_tfidf_for_dataset(corpus)
    # Rows of the TF-IDF matrix follow the order of `corpus`; label them with
    # the storage ids so the label-based lookups below cannot hit the wrong
    # row when the ids are not exactly 0..n-1.
    tfidf.index = corpus.index

    eval_dataset = groups.first()
    eval_dataset['n_unique_words'] = eval_dataset['docContent'].str.split().apply(lambda x: np.unique(x).size)
    # .copy(): the columns added below must be written to an independent frame,
    # not to a filtered slice (avoids SettingWithCopyWarning).
    eval_dataset = eval_dataset[eval_dataset['n_unique_words'] >= min_unique_words].copy()

    # Build the columns explicitly instead of DataFrame.apply(axis=1): on an
    # empty frame apply() hands back a DataFrame, which cannot be assigned to
    # a single column.
    eval_dataset['tfidf'] = pd.Series(
        [apply_tfidf_weights_to_doc(tfidf, storage_id, doc)
         for storage_id, doc in eval_dataset['docContent'].items()],
        index=eval_dataset.index,
        dtype=object,
    )
    eval_dataset['docQuery'] = pd.Series(
        [get_query_from_doc(row, get_n_words_to_extract(row))
         for _, row in eval_dataset.iterrows()],
        index=eval_dataset.index,
        dtype=object,
    )
    return eval_dataset

def get_n_words_to_extract(row, percent=0.3):
    """Return how many words to pull from a document for its query.

    The count is ``percent`` of ``row['n_unique_words']``, rounded with the
    built-in ``round`` (ties go to the nearest even integer).
    """
    return round(row['n_unique_words'] * percent)

def get_query_from_doc(row, n):
    """Join the ``n`` highest-weighted distinct words of a document into a query.

    ``row['tfidf']`` is expected to hold ``(word, weight)`` tuples. Duplicate
    tuples are collapsed (first occurrence wins), the remainder is ordered by
    weight from high to low with ties kept in document order, and the first
    ``n`` words are joined with single spaces.
    """
    def weight_of(pair):
        return pair[1]

    # dict keys preserve insertion order, so this de-duplicates without reordering
    distinct_pairs = dict.fromkeys(row['tfidf']).keys()
    ranked = sorted(distinct_pairs, key=weight_of, reverse=True)
    return ' '.join(word for word, _ in ranked[:n])

In [5]:
# Read the corpus referenced by dp.lemmatized_unique_functions_corpus and derive
# the TF-IDF evaluation set from it (default min_unique_words=5).
# create_tfidf_dataset groups by 'storageId' and reads 'docContent', so the
# loaded dataset has to provide both.
dataset = jl.read_dataset(dp.lemmatized_unique_functions_corpus)
tfidf_ds = create_tfidf_dataset(dataset)
# Export of the generated queries, currently disabled:
# jl.write_dataset(tfidf_ds[['docType', 'docQuery']], 'test-tfidf-evalset.jsonl')


Unnamed: 0_level_0,Unnamed: 1_level_0,docContent,docItem,docType
docId,storageId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
157976903,0,lookup function partially safe min n index,(!!!) :: [a] -> Int -> Maybe a,:: [a] -> Int -> Maybe a
174280492,1,like selects nth element x wrap end x map,(!!!) :: [a] -> Int -> a,:: [a] -> Int -> a
104624615,2,x n return element x nb vector element ascend ...,"(!!) :: (KnownNat n, Enum i) => Vec n a -> i -> a",":: (KnownNat n, Enum i) => Vec n a -> i -> a"
104513233,2,x n return element x nb vector element ascend ...,"(!!) :: (KnownNat n, Enum i) => Vec n a -> i -> a",":: (KnownNat n, Enum i) => Vec n a -> i -> a"
104438376,2,x n return element x nb vector element ascend ...,"(!!) :: (KnownNat n, Enum i) => Vec n a -> i -> a",":: (KnownNat n, Enum i) => Vec n a -> i -> a"
...,...,...,...,...
44388021,215651,π pi greek small letter pi,π :: Floating α => α,:: Floating α => α
93307248,215652,imaginary unit eisenstein integer ω sqrt 3 ι e...,ω :: EisensteinInteger,:: EisensteinInteger
44391036,215653,epsilon empty word list ε greek small letter e...,𝜀 :: [a],:: [a]
44382834,215653,epsilon empty word list ε greek small letter e...,𝜀 :: [a],:: [a]
