In [1]:
import numpy as np
import pandas as pd
import re

import nltk
from nltk.tokenize import word_tokenize
# nltk.download('stopwords')
from nltk.corpus import stopwords
# from nltk.lemmatize import 

df_file = pd.read_csv('korpus.csv')

In [None]:
# Tokenizer, Stopword removal. Punctuation remover
stop_words = set(stopwords.words('danish'))
df_tokens = pd.DataFrame(columns=['file', 'clean_token'])

# IN: String, OUT = Lst
def TextPrep (file, content, df_tokens):
    # content = input['content']
    # file = input['file']
    
    punct_removal = re.sub(r"[^\w\s]", " ", content.lower())
    token_lst = word_tokenize(punct_removal)
    token_lst_swrm =[token for token in token_lst if token not in stop_words]

    append_rows = pd.DataFrame({'file': [file] * len(token_lst_swrm), 'clean_token': token_lst_swrm})

    return pd.concat([df_tokens, append_rows], ignore_index=True)

for file in df_file.iterrows():
    file_name = file[1]['file']
    file_content = file[1]['content']
    df_tokens = TextPrep(file_name, file_content, df_tokens)




In [3]:
import lemmy
lemmyier = lemmy.load('da')

def Lemmatizer (df_tokens):
    df_tokens['lemmatized_tokens'] = df_tokens['clean_token'].apply(lambda x: lemmyier.lemmatize('', x)[0])
    return df_tokens

df_tokens = Lemmatizer(df_tokens)

In [4]:
import math
tf = df_tokens.groupby(['file', 'lemmatized_tokens']).size().reset_index(name='tf')
n = df_tokens.groupby('lemmatized_tokens')['file'].nunique().reset_index(name='n')
N = df_tokens['file'].nunique()

n_dict = dict(zip(n['lemmatized_tokens'], n['n']))  
idf = {token: math.log(N / n_dict[token], 10) for token in n_dict}

tfidf_df = tf.merge(n, on='lemmatized_tokens')
tfidf_df['tfidf'] = tfidf_df.apply(
    lambda row: row['tf'] * math.log((N+1) / row['n'] + 1, 10),
    axis=1
)
df_tokens = df_tokens.merge(tfidf_df, on=['file', 'lemmatized_tokens'], how='left')
df_tokens = df_tokens.drop_duplicates(subset=['file', 'lemmatized_tokens'])


In [5]:
#https://github.com/tormand86/hunspell-danish
import enchant
dk_dict = enchant.Dict('da_DK')

def CompoundSplitter(token, recursion_depth = 2):
    splits = []
    for i in range(3, len(token) - 3):  
        fst, snd = token[:i], token[i:]
        if dk_dict.check(fst) and dk_dict.check(snd):
            splits.append([recursion_depth, fst])
            splits.append([recursion_depth, snd])

            recurse_snd = CompoundSplitter(snd, recursion_depth + 1)
            if recurse_snd:
                splits.extend(recurse_snd)

            recurse_fst = CompoundSplitter(fst, recursion_depth + 1)
            if recurse_fst:
                splits.extend(recurse_fst)

    return splits if splits else None


In [6]:
def RemoveDupes(splits):
    unique_splits = {}
    for split in splits:
        # Key is the tuple of the words (ignore depth)
        key = tuple(split[1:])
        depth = split[0]
        # Keep if new or if smaller depth than existing
        if key not in unique_splits or depth < unique_splits[key][0]:
            unique_splits[key] = split
    # Return list of unique splits
    return list(unique_splits.values())

In [7]:
def SplitSorter(token):
    split = CompoundSplitter(token)
    if split is None:
        split = []
    split.append([1, token])
    try:
        split = RemoveDupes(split)
        sorted_split = sorted(split, key=lambda x: x[0])
        return sorted_split
    except:
        return None
    
df_tokens['root'] = df_tokens['lemmatized_tokens'].apply(lambda x: SplitSorter(x))

In [8]:
# [['medicin', 'gruppe'], ['medicing', 'ruppe'], ['med', 'icing']]


#[['binyre', 'barkhormonpræparat'], ['bark', 'hormonpræparat']]

SplitSorter('binyrebarkhormonpræparat')

[[1, 'binyrebarkhormonpræparat'],
 [2, 'binyre'],
 [2, 'barkhormonpræparat'],
 [2, 'hormonpræparat'],
 [2, 'præparat'],
 [2, 'binyrebark'],
 [2, 'binyrebarkhormon'],
 [3, 'bark'],
 [3, 'hor'],
 [3, 'monpræparat'],
 [3, 'præ'],
 [3, 'parat'],
 [3, 'hormon'],
 [3, 'hormonpræ'],
 [3, 'barkhor'],
 [3, 'barkhormon'],
 [3, 'biny'],
 [3, 'rebark'],
 [4, 'mon'],
 [4, 'monpræ']]

In [9]:
def KeywordSearch(keyword, df_tokens):

    keyword_components = SplitSorter(keyword)
    hit_rows = []
    
    dumbass_breaker = False
    for i, row in df_tokens.iterrows():
        for key_comp in keyword_components:
            for root_comp in row['root']:
                if key_comp[1] == root_comp[1]:
                    print(root_comp, key_comp)

                    divisor =  root_comp[0] * key_comp[0]
                    print(divisor)
                    hit_score = (row['tfidf']) / divisor

                    hit_rows.append({
                        'index': i, 
                        'file': row['file'], 
                        'search word': row['lemmatized_tokens'], 
                        'hitword': row['root'][0][1],
                        'key_component': key_comp[1], 
                        'root_component': root_comp[1], 
                        'tf_idf': row['tfidf'], 
                        'tf_idf_adjusted': hit_score})
                    dumbass_breaker = True
                    break
            if dumbass_breaker:
                dumbass_breaker = False
                break


    return pd.DataFrame(hit_rows)


In [10]:

def ExpressionDecomp(search, df_tokens):

    df_all_hits = pd.DataFrame()
    search_punct_removal = re.sub(r"[^\w\s]", " ", search.lower())
    search_to_token_lst = word_tokenize(search_punct_removal)
    keywords =[token for token in search_to_token_lst if token not in stop_words]

    for keyword in keywords:
        df_temp = KeywordSearch(keyword, df_tokens)
        df_all_hits = pd.concat([df_all_hits, df_temp])
    return df_all_hits
        
a = ExpressionDecomp('sår diabetes', df_tokens)

[2, 'sår'] [1, 'sår']
2
[2, 'sår'] [1, 'sår']
2
[2, 'sår'] [1, 'sår']
2
[2, 'sår'] [1, 'sår']
2
[2, 'sår'] [1, 'sår']
2
[1, 'sår'] [1, 'sår']
1
[1, 'diabetes'] [1, 'diabetes']
1
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'diabetes'] [1, 'diabetes']
2
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'diabetes'] [1, 'diabetes']
2
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'diabetes'] [1, 'diabetes']
2


In [11]:
   
def SearchResult (quary, df_tokens):
    df_hits = ExpressionDecomp(quary, df_tokens)
    return (df_hits.groupby('file')['tf_idf_adjusted'].sum())

b = SearchResult('sår diabetes', df_tokens)

[2, 'sår'] [1, 'sår']
2
[2, 'sår'] [1, 'sår']
2
[2, 'sår'] [1, 'sår']
2
[2, 'sår'] [1, 'sår']
2
[2, 'sår'] [1, 'sår']
2
[1, 'sår'] [1, 'sår']
1
[1, 'diabetes'] [1, 'diabetes']
1
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'diabetes'] [1, 'diabetes']
2
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'diabetes'] [1, 'diabetes']
2
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'diabetes'] [1, 'diabetes']
2


In [12]:
gold_standard = {
    'sår diabetes': {
        'Hælsår.txt',
        'Neuropatiske og neuroiskæmiske sår.txt',
        'Charcotfod.txt',
        'Diabetisk neuropati.txt',
        'Behandling for diabetes 2.txt',
        'Netdoktor.txt'
    },
    'livsstilssygdom': {
        'Livsstil.txt',
        'Risiko.txt',
        'Behandling for diabetes 2.txt',
        'Motion.txt',
        'Mad.txt',
        'Diabetes hos børn.txt'
    },
    'type 2-diabetes': {
        'Behandling for diabetes 2.txt',
        'Risiko.txt',
        'Motion.txt',
        'Mad.txt',
        'Livsstil.txt',
        'Diabetisk neuropati.txt'
    },
    'kost sukkersyge': {
        'Mad.txt',
        'Behandling for diabetes 2.txt',
        'Livsstil.txt',
        'Risiko.txt',
        'Motion.txt',
        'Diabetes hos børn.txt'
    }
}

In [13]:
results = {}
def Searcher (quary, df_tokens):
    result = SearchResult(quary, df_tokens)
    result_sort = result.sort_values(ascending=False)
    threshhold = result_sort.mean()

    result_thresh = result_sort[result_sort >= threshhold] 
    retrieved_docs = list(result_thresh.index)  # all files that had hits
    return retrieved_docs

for quary in gold_standard:
    results[quary] = Searcher(quary, df_tokens)

    

[2, 'sår'] [1, 'sår']
2
[2, 'sår'] [1, 'sår']
2
[2, 'sår'] [1, 'sår']
2
[2, 'sår'] [1, 'sår']
2
[2, 'sår'] [1, 'sår']
2
[1, 'sår'] [1, 'sår']
1
[1, 'diabetes'] [1, 'diabetes']
1
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'diabetes'] [1, 'diabetes']
2
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'diabetes'] [1, 'diabetes']
2
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[1, 'diabetes'] [1, 'diabetes']
1
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'diabetes'] [1, 'diabetes']
2
[2, 'sygdom'] [2, 'sygdom']
4
[2, 'livs'] [3, 'livs']
6
[2, 'livs'] [3, 'livs']
6
[2, 'sygdom'] [2, 'sygdom']
4
[2, 'livs'] [3, 'livs']
6
[2, 

In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score

def EvalQuary(gold_set, retrieved_list):
    retrieved_clean = [doc for doc in retrieved_list if doc is not None]

    # Union of all docs involved
    comparison_set = list(set(gold_set) | set(retrieved_clean))

    gold_vector = [1 if doc in gold_set else 0 for doc in comparison_set]
    retrieved_vector = [1 if doc in retrieved_clean else 0 for doc in comparison_set]

    precision = precision_score(gold_vector, retrieved_vector, zero_division=0)
    recall = recall_score(gold_vector, retrieved_vector, zero_division=0)
    f1 = f1_score(gold_vector, retrieved_vector, zero_division=0)

    return precision, recall, f1



In [20]:
scores = []

for query in gold_standard:
    gold = gold_standard[query]
    retrieved = results.get(query, [])[:6] 

    precision, recall, f1 = EvalQuary(gold, retrieved)
    scores.append((query, precision, recall, f1))

    print(f"{query}: Precision={precision:.2f}, Recall={recall:.2f}, F1={f1:.2f}")


sår diabetes: Precision=0.50, Recall=0.33, F1=0.40
livsstilssygdom: Precision=0.60, Recall=0.50, F1=0.55
type 2-diabetes: Precision=0.75, Recall=0.50, F1=0.60
kost sukkersyge: Precision=1.00, Recall=0.50, F1=0.67


In [16]:
avg_precision = np.mean([s[1] for s in scores])
avg_recall = np.mean([s[2] for s in scores])
avg_f1 = np.mean([s[3] for s in scores])

print("\nAverage Scores (Top 6 only):")
print(f"Precision: {avg_precision:.2f}")
print(f"Recall:    {avg_recall:.2f}")
print(f"F1 Score:  {avg_f1:.2f}")



Average Scores (Top 6 only):
Precision: 0.71
Recall:    0.46
F1 Score:  0.55
