In [1]:
import re
from typing import List, Tuple
import nltk
import numpy as np
import pandas as pd
from keybert._model import KeyBERT
from nltk.corpus import stopwords

In [2]:
RE_SYMBOL = "[`~!@#$^&*()=|{}':;',\\[\\].<>/?~！@#￥……&*（）\-–——|{}【】‘’；：”“'。，、？%+_]"
RE_UPPER = '[A-Z]{2,}'
RE_WEIGHT_RULE = r'\((.*?)\)'
PROJECT_STOP_WORDS = ['grant', 'tender', 'service', '-', 'australian', 'program', 'australasian',
                      'project', '/', '’', '•', 'australia', 'research', 'title']

In [3]:
kw_model = KeyBERT()

In [46]:
NN = 0
class KeyExtractor:
    def __init__(self, kw_model):
        self.STOP_WORDS = stopwords.words('english') + PROJECT_STOP_WORDS
        self.kw_model = kw_model

    def __preprocess(self, text: str):
        text = re.sub(RE_SYMBOL, ' ', text)
        text = re.sub(RE_UPPER, '', text)
        return text.lower()

    def __split_words(self, x: pd.DataFrame) -> pd.DataFrame:
        x = x.replace('\'', '')
        return x.split()

    def __remove_keywords(self, row: pd.DataFrame) -> pd.DataFrame:
        row['text'] = row['text'].replace(row['key'], '')
        return row

    def __get_tags(self, df: pd.DataFrame) -> List[Tuple[str, float]]:
        '''

        Parameters
        ----------
        df: pd.DataFrame, input dataframe

        This function will call KeyBert to extract the keyword from text description of a
        tender, return a list with tuple[keyword, weight]. There are three keyword extraction
        strategies:
        1.  When the text description is intact, Maximal Marginal Relevance function with a low
            diversity value will be called to extract several similar keywords or phrases.
        2.  If above function return empty list, the normal keyword extraction function will be
            called to extract single key word from text.
        3.  If above two functions cannot extract any keyword, an empty tag "[none_tag]" will be
            returned.

        Returns
        -------
        List[Tuple[str, float]], list of key-weight tuples, with key as extracted keywords,
        weight as the assessed weight of the corresponding keyword.
        '''

        keywords = self.kw_model.extract_keywords(df['text'],
                                                  keyphrase_ngram_range=(1, 3),
                                                  stop_words='english',
                                                  use_mmr=True,
                                                  diversity=0.2)
        if not keywords: return [('[none_tag]', 1)]
        if len(keywords) == 0:
            keywords = self.kw_model.extract_keywords(df['text'], keyphrase_ngram_range=(1, 3), stop_words='english')
        return keywords if len(keywords) > 0 else [('[none_tag]', 1)]

    def __convert_word_type(self, text):
        text = self.__preprocess(text)
        token = nltk.word_tokenize(text)
        lemmatizer = nltk.stem.WordNetLemmatizer()
        pos_tagged = nltk.pos_tag(token)
        # print(pos_tagged)
        words = []
        Noun = list(filter(
            lambda x: x[0] not in self.STOP_WORDS and x[0] != '/' and (x[1].startswith('NN') or x[1].startswith('JJ')),
            pos_tagged))
        for word, pos in Noun:
            if pos.startswith('NN'):
                word = lemmatizer.lemmatize(word, pos='n')
            elif pos.startswith('JJ'):
                word = lemmatizer.lemmatize(word, pos='a')
            words.append(word)
        text = " ".join(words)
        return text

    def __agg_tags(self, input_df: pd.DataFrame, target_col: str) -> pd.DataFrame:
        '''

        Parameters
        ----------
        input_df: pd.DataFrame, input dataframe
        target_col: str, KeyBert result column for aggregation,

        This function will count the total weight for each single stem by
        tenders, according to the result from KeyBert. Weight from same stem
        will be added and the stem with the highest weight for each tenders
        will be returned as the candidate tag. One thing to node, if there are
        few words in a tender that pointed to the highest stem
        (e.g., orig_word: [manager, management, managing] -> stem: [manage]),
        all these words will be returned.

        Returns
        -------
        A dataframe with the shape of n*4. PKs are [items, key_orig]
        Columns: 'items': matching with each tenders project.
                 'key': extracted stem with the highest weight.
                 'key_orig': original word corresponding to the key.
                 'value': total weight for the final stem
        '''

        input_df[target_col] = input_df[target_col].astype(str)

        # Extract and reformat term-weight set
        tmp_df = input_df[target_col].str.extractall(RE_WEIGHT_RULE).reset_index().reset_index()
        split_result = tmp_df[0].str.split(',', expand=True).rename(columns={0: 'key',
                                                                             1: 'value'}).reset_index()

        merge_df = tmp_df.merge(split_result, on='index').drop('index', axis=1).rename(columns={'level_0': 'items'})
        del tmp_df, split_result

        merge_df['key'] = merge_df['key'].map(self.__split_words)
        mapping_df = merge_df.explode('key')[['items', 'key', 'value']]
        # mapping_df['key'] = mapping_df['key'].map(nltk.PorterStemmer().stem)

        # Compute weight according to each word and ordering
        mapping_df['value'] = mapping_df['value'].astype(float)
        sum_df = mapping_df.groupby(['items', 'key'])['value'].sum().reset_index().sort_values(
            ['items', 'value'], ascending=False)

        # Sampling top key
        key_df = sum_df.groupby(['items']).head(1)[['items', 'key']]
        merge_df = sum_df.merge(key_df, on=['items', 'key'])
        return merge_df

    def extract_label(self, df, pk: str, iteration_time: int):
        # Get keywords in one around
        df['raw_result'] = df.apply(self.__get_tags, axis=1)

        # Aggregate keywords
        merge_df = self.__agg_tags(df, 'raw_result')
        first_df = df[['index', 'text']].merge(merge_df, left_on='index', right_on='items', how='left')

        # Generating new text
        df = df.drop('text', axis=1)
        first_df = first_df[first_df['key'].notna()].apply(self.__remove_keywords, axis=1).rename(
            columns={'key': f'key_{iteration_time}'}).drop('items', axis=1)
        df = df.merge(first_df, on='index', how='left')

#         assert len(df) == len(input_df), \
#             f'Tenders {set(input_df[pk].unique().tolist()) - set(df[pk].unique().tolist())} lost'
        return df[[pk, f'key_{iteration_time}', 'text']]

    def get_label_by_iteration(self, input_df: pd.DataFrame, pk: str, text_col: str, iterations=10) -> pd.DataFrame:
        '''

        Parameters
        ----------
        input_df: pd.DataFrame,
        pk: str, name of the primary key for the input_df
        text_col: str, name of tenders' text description column
        iterations: iterations for extraction process

        This function will generate iteration times of keywords for each tender,
        keys may include empty value if KeyBert cannot extract any keyword from
        the text.

        Returns
        -------
        pd.DataFrame, original dataframe appending with key columns:
                                [key_0, key_1, key_2...]
        '''

        assert text_col in input_df.columns, f'Missing column names "{text_col}".'
        tmp_df = input_df[input_df[text_col].notna()].copy()
        tmp_df[text_col] = tmp_df[text_col].map(lambda x: self.__preprocess(x))
        tmp_df[text_col] = tmp_df[text_col].map(lambda x: self.__convert_word_type(x))

        # Remove pure numbers, e.g., 2014, 20,000
        tmp_df[text_col] = tmp_df[text_col].map(lambda x: re.sub(r'\s*(\.:,|\d+)\s*', '', x))
        tmp_df = tmp_df.reset_index(drop=True).reset_index()
        tmp_df = tmp_df[[pk, text_col, 'index']]
        
        for i in range(iterations):
            tmp_df = self.extract_label(tmp_df, pk, i)
            input_df['text'] = tmp_df['text']
            input_df = input_df.merge(tmp_df[['_id', f'key_{i}']], on=pk, how='left')
            tmp_df = tmp_df[(tmp_df[f'key_{i}']!='[none_tag]')&(tmp_df['text'].notna())].reindex().reset_index()
            print(input_df['text'])

        input_df = input_df.replace('[none_tag]', np.nan)
        return input_df

In [47]:
input_df = pd.read_csv('Data/tenders_info.csv')
input_df['Description'] = input_df['Description'].fillna('')
input_df['text'] = input_df['Description'] + '.' + input_df['Title']

In [48]:
ke = KeyExtractor(kw_model)

In [51]:
return_df = ke.get_label_by_iteration(input_df, '_id', 'text')

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
10    NaN
11    NaN
12    NaN
13    NaN
14    NaN
15    NaN
16    NaN
17    NaN
18    NaN
19    NaN
20    NaN
21    NaN
22    NaN
23    NaN
24    NaN
25    NaN
26    NaN
27    NaN
28    NaN
29    NaN
30    NaN
31    NaN
32    NaN
33    NaN
34    NaN
35    NaN
36    NaN
37    NaN
38    NaN
39    NaN
Name: text, dtype: object
0     prime   science prestigious regarded award out...
1     dynamic helpful capable provider excellent tra...
2     business innovation initiative stage challenge...
3       protection grant major political party secur...
4       grant   grant opportunity funding   activity...
5      communication information technology art  pro...
6     transformation trust confidence  tax superannu...
7     purpose opportunity digital   disease web web ...
8      funding individual  project  significant comm...
9     vanuatu   support phase phase au implementatio...
10    laborato

0     prime    prestigious regarded award outstandin...
1     dynamic helpful  provider excellent track reco...
2     business   stage   minister industry science  ...
3       protection  major political party   manageme...
4           opportunity     involvement sacrifice  w...
5      communication information technology art  pro...
6     transformation trust     system real outcome  ...
7     purpose opportunity     web web platform infor...
8       individual    significant   town centenary c...
9     vanuatu         rural road maintenance priorit...
10    tender      standard materiel issue movement  ...
11    government  national   management   disposal l...
12    department education  department  small   smal...
13    important advance  prevention treatment    peo...
14    constraint genetic improvement    tolerance   ...
15       risk   ability prepare respond rapid slow o...
16                             government involvement  
17             department qualified      council

0     prime    prestigious regarded  outstanding  sc...
1     dynamic   provider  track record delivery  com...
2        stage   minister industry science     round...
3         major political     system   pertaining  r...
4           opportunity     involvement      operati...
5      communication information technology art    r...
6     transformation      system real outcome      s...
7     purpose      web web platform information     ...
8       individual    significant   town  celebratio...
9                maintenance priority short term job...
10                      tender           system version
11      national      disposal low level  temporary ...
12    department education  department  small   smal...
13    important    treatment    people      evidence...
14                                   course      course
15    objective      innovation support priority  gr...
16    department qualified      practice provision s...
17    early intervention equips empowers eligibl

In [50]:
return_df.to_csv('tt.csv', index=0, encoding='utf-8_sig')