In [8]:
import pandas as pd
import re
import nltk
from keybert._model import KeyBERT

In [3]:
# Module-level KeyBERT instance built with default arguments; get_tags reads it as a global.
kw_model = KeyBERT()

In [22]:
def split_words(x: str) -> list:
    """Strip repr-style quote characters from a keyphrase and split it into words.

    The phrase arrives as the text of a stringified Python string literal
    (e.g. ``"'australia wide use'"``). Python's repr switches to double
    quotes when the phrase itself contains an apostrophe, so both quote
    characters are removed.

    Args:
        x: Quoted keyphrase text.

    Returns:
        List of words obtained by splitting on single spaces. Consecutive
        spaces produce empty strings, and an empty phrase yields ``['']``.
    """
    unquoted = x.replace('\'', '').replace('"', '')
    return unquoted.split(' ')

def remove_keywords(row: pd.Series) -> pd.Series:
    """Return a copy of ``row`` whose ``'text'`` has every ``'key_orig'`` entry removed.

    Intended for ``DataFrame.apply(..., axis=1)``. The passed row is not
    modified: pandas does not support user functions that mutate the object
    handed to ``apply``, so the edit is made on a copy.

    Args:
        row: Row with a string under ``'text'`` and an iterable of strings
            under ``'key_orig'``.

    Returns:
        A new row. Matching is plain substring replacement, so a keyword is
        also removed where it occurs inside a longer word.
    """
    cleaned = row.copy()
    text = cleaned['text']
    for key in cleaned['key_orig']:
        text = text.replace(key, '')
    cleaned['text'] = text
    return cleaned

# NOTE(review): this list is not referenced anywhere in the visible code; get_tags
# below only passes the built-in 'english' stop-word setting. Confirm whether it
# was meant to be applied.
spec_stopwords = ['aus', 'australian', 'australia']
def get_tags(x: pd.Series) -> list:
    """Extract weighted keyphrases from ``x['text']`` with the global ``kw_model``.

    The first attempt uses MMR with ``diversity=0.2``. If that yields nothing,
    extraction is retried without MMR. When no keyphrase is available, a
    single placeholder tag is returned so callers always receive a non-empty
    list.

    Args:
        x: Row (or any mapping) exposing the document under ``'text'``.

    Returns:
        The list returned by ``kw_model.extract_keywords`` (presumably
        ``(phrase, score)`` tuples, matching the placeholder's shape), or
        ``[('[NoneTag]', 0)]`` when nothing could be extracted.
    """
    no_tag = [('[NoneTag]', 0)]
    text = x['text']
    # Missing (NaN/None) or blank text cannot produce keyphrases; skip the model.
    if not isinstance(text, str) or not text.strip():
        return no_tag

    keywords = kw_model.extract_keywords(text,
                                         keyphrase_ngram_range=(1, 3),
                                         stop_words='english',
                                         use_mmr=True,
                                         diversity=0.2)
    if len(keywords) == 0:
        # Retry without MMR before giving up.
        keywords = kw_model.extract_keywords(text,
                                             keyphrase_ngram_range=(1, 3),
                                             stop_words='english',
                                             use_mmr=False)
    return keywords if len(keywords) > 0 else no_tag

def agg_tags(input_df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """Select, per input row, the stemmed word with the largest summed weight.

    ``input_df[target_col]`` is converted to text and every ``(...)`` group in
    it is read as ``key, value``. ``key`` is split into words with
    ``split_words``, each word is Porter-stemmed, and weights are summed per
    (row, stem, original word). For each row, the stem of the top-weighted
    entry is kept together with all original words that share that stem.

    Args:
        input_df: Frame holding the stringifiable tag column. Side effect:
            ``input_df[target_col]`` is overwritten in place with its ``str``
            form (behavior kept from the original implementation).
        target_col: Name of the column containing the tag tuples.

    Returns:
        Frame with columns ``items`` (value of the first index level of
        ``input_df`` for the originating row), ``key`` (stem), ``key_orig``
        (original word) and ``value`` (summed weight).
    """
    reg_rule = r'\((.*?)\)'
    input_df[target_col] = input_df[target_col].astype(str)

    # Extract and reformat term-weight set.
    # extractall returns one row per "(...)" group, indexed by (original index, match).
    matches = input_df[target_col].str.extractall(reg_rule)
    tag_df = (
        matches[0].str.split(',', expand=True)
        .rename(columns={0: 'key', 1: 'value'})[['key', 'value']]
        # Read the source row id straight from the index level instead of relying on
        # the auto-generated 'level_0' column name, which only exists for unnamed indexes.
        .assign(items=matches.index.get_level_values(0).to_numpy())
        .reset_index(drop=True)
    )

    tag_df['key'] = tag_df['key'].map(split_words)
    mapping_df = tag_df.explode('key')[['items', 'key', 'value']]
    mapping_df['key_orig'] = mapping_df['key']
    stemmer = nltk.PorterStemmer()
    mapping_df['key'] = mapping_df['key'].map(stemmer.stem)

    # Compute weight according to each word and ordering.
    # NOTE(review): weights are summed per (stem, original word), not per stem alone,
    # so the ranking compares individual surface forms - confirm this is intended.
    mapping_df['value'] = mapping_df['value'].astype(float)
    sum_df = (
        mapping_df.groupby(['items', 'key', 'key_orig'])['value']
        .sum()
        .reset_index()
        .sort_values(['items', 'value'], ascending=False)
    )

    # Sampling top key: first entry per row after the descending sort.
    key_df = sum_df.groupby(['items']).head(1)[['items', 'key']]
    return sum_df.merge(key_df, on=['items', 'key'])


def get_label_by_iteration(input_df: pd.DataFrame, pk: str, text_col: str, interation = 5) -> pd.DataFrame:
    """Label each row over several rounds, removing the chosen keyword from the text each time.

    The text column is lower-cased and digit runs are replaced by a space.
    Each round then runs ``get_tags`` on every row, picks the top stemmed key
    with ``agg_tags``, stores it in ``key_<round>`` and strips that key's
    original word forms from the text before the next round.

    Args:
        input_df: Source frame. It is not modified; work happens on a copy.
        pk: Not used by this function; kept for interface compatibility.
        text_col: Name of the column holding the text to label.
        interation: Number of rounds (the parameter spelling is kept so existing
            keyword callers keep working).

    Returns:
        A new frame with a fresh 0..n-1 index, an ``index`` column carrying the
        same positions, ``raw_result`` from the last round, one ``key_<i>``
        column per round, and ``text_col`` holding the remaining text.
    """
    # Work on a copy so the caller's frame keeps its original text.
    work_df = input_df.copy()
    # NOTE(review): the `\.:,` alternative only matches the literal sequence ".:,";
    # a character class such as `[.:,]` may have been intended - confirm before changing.
    # Matches are replaced with a space (not '') so the neighbouring words are not
    # glued together; the vectorised .str methods also let missing text pass through.
    work_df[text_col] = (
        work_df[text_col]
        .str.lower()
        .str.replace(r'\s*(\.:,|\d+)\s*', ' ', regex=True)
    )
    # `display` is the IPython/Jupyter builtin; this function expects a notebook kernel.
    display(work_df[text_col].unique())
    work_df = work_df.reset_index(drop=True).reset_index()

    for i in range(interation):
        # get_tags / remove_keywords read the column named 'text', so present the
        # configured text column under that name instead of assuming text_col == 'text'.
        rows_as_text = work_df[[text_col]].rename(columns={text_col: 'text'})
        work_df['raw_result'] = rows_as_text.apply(get_tags, axis=1)

        merge_df = agg_tags(work_df, 'raw_result')
        # Longest form first (ties alphabetical): deterministic across runs and avoids
        # leaving fragments when a shorter form is a prefix of a longer one.
        key_df = (
            merge_df.groupby(['items', 'key'])['key_orig']
            .apply(lambda forms: sorted(set(forms), key=lambda w: (-len(w), w)))
            .reset_index()
        )

        first_df = (
            work_df[['index', text_col]]
            .rename(columns={text_col: 'text'})
            .merge(key_df, left_on='index', right_on='items', how='left')
        )
        first_df = (
            first_df[first_df['key_orig'].notna()]
            .apply(remove_keywords, axis=1)
            .rename(columns={'key': f'key_{i}', 'text': text_col})
            .drop(['items', 'key_orig'], axis=1)
        )

        # Swap the old text for the stripped text and attach this round's key.
        work_df = work_df.drop(text_col, axis=1).merge(first_df, on='index', how='left')
    return work_df

In [23]:
rel_df = pd.read_csv('./dataset/tenders/relevant.csv')
# Build the text to label as "<Description>.<Title>". Missing values are treated as
# empty strings; otherwise a single NaN would turn the whole concatenation into NaN.
rel_df['text'] = rel_df['Description'].fillna('') + '.' + rel_df['Title'].fillna('')

In [30]:
# Positional row numbers (iloc) of the rel_df rows used for this trial run.
SAMPLE_ROW_POSITIONS = [542, 546, 1411, 1423, 1822]
# Select the rows in one positional lookup; unlike concatenating row Series and
# transposing, this keeps each column's dtype. .copy() gives an independent frame.
df = rel_df.iloc[SAMPLE_ROW_POSITIONS].copy()

In [31]:
# Run the iterative labelling on the sample rows with the default number of rounds.
input_df = get_label_by_iteration(df, pk='_id', text_col='text')

array(['survey australia-wide: drug use and related issues.    two modes: drop-and-collect (~   respondents) and/or cati (~   )-tender for either or both.    project manager: the australian institute of health and welfare. national drug strategy household survey',
       'we are seeking a dynamic helpful highly capable provider with an excellent track record in the delivery of iso : certification of complex service oriented business processes and the development and implementation of sustainable corporate quality systems.  the organisation will become a strategic partner with ip australia along its quality journey and provide expert assistance to the cqo in the delivery of quality outcomes..provision of iso : advisory services',
       'invitation to register to deliver the training requirements of the shadow tuas. .shadow tuas training',
       'the department invites suitably qualified consultants/contractors to submit a tender for the review of the procurement statement to determine

In [32]:
# Show the labelled frame returned by get_label_by_iteration (bare last expression -> rich display).
input_df

Unnamed: 0,index,ATM ID,Category,Description,Title,_id,raw_result,key_0,key_1,key_2,key_3,text,key_4
0,0,2006-003,Management advisory services,"Survey, Australia-wide: drug use and related i...",2007 National Drug Strategy Household Survey,6162aa1fe1b7f5c73e6fe1ee,"[('australia wide use', 0.5302), ('drop collec...",survey,drug,cati,respond,australia-wide: use and related issues. t...,collect
1,1,IPAC2006/13686,Management advisory services,"We are seeking a dynamic, helpful, highly capa...",Provision of ISO 9001:2000 Advisory Services,6162aa1fe1b7f5c73e6fe1f2,"[('assistance cqo delivery', 0.5427), ('servic...",iso,certif,qualiti,busi,we are seeking a dynamic helpful highly capabl...,assist
2,2,DMOASD/ITR0269/2011,Education and Training Services,Invitation to Register to deliver the training...,Shadow 200 TUAS Training,6162aa1fe1b7f5c73e6fe553,"[('invitation deliver requirements', 0.822), (...",shadow,tua,train,regist,invitation to to the requirements of the ....,deliv
3,3,PRI-00002374,Management advisory services,The Department invites suitably qualified cons...,Review of 2009 Procurement Statement,6162aa1fe1b7f5c73e6fe55f,"[('appropriateness efficiency integration', 0....",procur,effect,review,statement,the department invites suitably qualified cons...,appropri
4,4,RFT PRN AD15003424,Education and Training Services,"Develop, disseminate and ensure widespre...",Mathematics by Inquiry,6162aa1fe1b7f5c73e6fe6ee,"[('teachers school inquiry', 0.5667), ('resour...",mathemat,foundat,leader,teach,develop disseminate and ensure widesprea...,resourc
