In [1]:
import re
from typing import List, Tuple
import nltk
import numpy as np
import pandas as pd
from keybert._model import KeyBERT
from nltk.corpus import stopwords

In [10]:
RE_SYMBOL = "[`~!@#$^&*()=|{}':;',\\[\\].<>/?~！@#￥……&*（）\-–——|{}【】‘’；：”“'。，、？%+_]"
RE_UPPER = '[A-Z]{2,}'
RE_WEIGHT_RULE = r'\((.*?)\)'
PROJECT_STOP_WORDS = [
    "activity",
    "agreement",
    "aim",
    "australasian",
    "australia",
    "australian",
    "eligible",
    "government",
    "grant",
    "level",
    "list",
    "mission",
    "objective",
    "priority",
    "processing",
    "program",
    "programme",
    "project",
    "refer",
    "research",
    "round",
    "service",
    "tender",
    "title",
    "-",
    "-",
    "/",
    "’",
    "•"
  ]

In [3]:
kw_model = KeyBERT()

In [11]:
NN = 0
class KeyExtractor:
    def __init__(self, kw_model):
        self.STOP_WORDS = stopwords.words('english') + PROJECT_STOP_WORDS
        self.kw_model = kw_model

    def __preprocess(self, text: str):
        text = re.sub(RE_SYMBOL, ' ', text)
        text = re.sub(RE_UPPER, '', text)
        return text.lower()

    def __split_words(self, x: pd.DataFrame) -> pd.DataFrame:
        x = x.replace('\'', '')
        return x.split()

    def __remove_keywords(self, row: pd.DataFrame) -> pd.DataFrame:
        row['text'] = row['text'].replace(row['key'], '')
        return row

    def __get_tags(self, df: pd.DataFrame) -> List[Tuple[str, float]]:
        '''

        Parameters
        ----------
        df: pd.DataFrame, input dataframe

        This function will call KeyBert to extract the keyword from text description of a
        tender, return a list with tuple[keyword, weight]. There are three keyword extraction
        strategies:
        1.  When the text description is intact, Maximal Marginal Relevance function with a low
            diversity value will be called to extract several similar keywords or phrases.
        2.  If above function return empty list, the normal keyword extraction function will be
            called to extract single key word from text.
        3.  If above two functions cannot extract any keyword, an empty tag "[none_tag]" will be
            returned.

        Returns
        -------
        List[Tuple[str, float]], list of key-weight tuples, with key as extracted keywords,
        weight as the assessed weight of the corresponding keyword.
        '''

        keywords = self.kw_model.extract_keywords(df['text'],
                                                  keyphrase_ngram_range=(1, 3),
                                                  stop_words='english',
                                                  use_mmr=True,
                                                  diversity=0.2)
        if not keywords: return [('[none_tag]', 1)]
        if len(keywords) == 0:
            keywords = self.kw_model.extract_keywords(df['text'], keyphrase_ngram_range=(1, 3), stop_words='english')
        return keywords if len(keywords) > 0 else [('[none_tag]', 1)]

    def __convert_word_type(self, text):
        text = self.__preprocess(text)
        token = nltk.word_tokenize(text)
        lemmatizer = nltk.stem.WordNetLemmatizer()
        pos_tagged = nltk.pos_tag(token)
        words = []
        Noun = list(filter(
            lambda x: x[0] not in self.STOP_WORDS and x[0] != '/' and (x[1].startswith('NN') or x[1].startswith('JJ')),
            pos_tagged))
        for word, pos in Noun:
            if pos.startswith('NN'):
                word = lemmatizer.lemmatize(word, pos='n')
            elif pos.startswith('JJ'):
                word = lemmatizer.lemmatize(word, pos='a')
            words.append(word)
        text = " ".join(words)
        return text

    def __agg_tags(self, input_df: pd.DataFrame, target_col: str) -> pd.DataFrame:
        '''

        Parameters
        ----------
        input_df: pd.DataFrame, input dataframe
        target_col: str, KeyBert result column for aggregation,

        This function will count the total weight for each single stem by
        tenders, according to the result from KeyBert. Weight from same stem
        will be added and the stem with the highest weight for each tenders
        will be returned as the candidate tag. One thing to node, if there are
        few words in a tender that pointed to the highest stem
        (e.g., orig_word: [manager, management, managing] -> stem: [manage]),
        all these words will be returned.

        Returns
        -------
        A dataframe with the shape of n*4. PKs are [items, key_orig]
        Columns: 'items': matching with each tenders project.
                 'key': extracted stem with the highest weight.
                 'key_orig': original word corresponding to the key.
                 'value': total weight for the final stem
        '''

        input_df[target_col] = input_df[target_col].astype(str)

        # Extract and reformat term-weight set
        tmp_df = input_df[target_col].str.extractall(RE_WEIGHT_RULE).reset_index().reset_index()
        split_result = tmp_df[0].str.split(',', expand=True).rename(columns={0: 'key',
                                                                             1: 'value'}).reset_index()

        merge_df = tmp_df.merge(split_result, on='index').drop('index', axis=1).rename(columns={'level_0': 'items'})
        del tmp_df, split_result

        merge_df['key'] = merge_df['key'].map(self.__split_words)
        mapping_df = merge_df.explode('key')[['items', 'key', 'value']]

        # Compute weight according to each word and ordering
        mapping_df['value'] = mapping_df['value'].astype(float)
        sum_df = mapping_df.groupby(['items', 'key'])['value'].sum().reset_index().sort_values(
            ['items', 'value'], ascending=False)

        # Sampling top key
        removed_df = sum_df[~sum_df['key'].isin(PROJECT_STOP_WORDS)]
        removed_df = sum_df if sum_df.empty else removed_df
        key_df = sum_df.groupby(['items']).head(1)[['items', 'key']]
        merge_df = sum_df.merge(key_df, on=['items', 'key'])
        return merge_df

    def extract_label(self, df, pk: str, iteration_time: int):
        # Get keywords in one around
        df['raw_result'] = df.apply(self.__get_tags, axis=1)

        # Aggregate keywords
        merge_df = self.__agg_tags(df, 'raw_result')
        first_df = df[['index', 'text']].merge(merge_df, left_on='index', right_on='items', how='left')

        # Generating new text
        df = df.drop('text', axis=1)
        first_df = first_df[first_df['key'].notna()].apply(self.__remove_keywords, axis=1).rename(
            columns={'key': f'key_{iteration_time}'}).drop('items', axis=1)
        df = df.merge(first_df, on='index', how='left')

        return df[[pk, f'key_{iteration_time}', 'text']]

    def get_label_by_iteration(self, input_df: pd.DataFrame, pk: str, text_col: str, iterations=10) -> pd.DataFrame:
        '''

        Parameters
        ----------
        input_df: pd.DataFrame,
        pk: str, name of the primary key for the input_df
        text_col: str, name of tenders' text description column
        iterations: iterations for extraction process

        This function will generate iteration times of keywords for each tender,
        keys may include empty value if KeyBert cannot extract any keyword from
        the text.

        Returns
        -------
        pd.DataFrame, original dataframe appending with key columns:
                                [key_0, key_1, key_2...]
        '''

        assert text_col in input_df.columns, f'Missing column names "{text_col}".'
        tmp_df = input_df[input_df[text_col].notna()].copy()
        tmp_df[text_col] = tmp_df[text_col].map(lambda x: self.__preprocess(x))
        tmp_df[text_col] = tmp_df[text_col].map(lambda x: self.__convert_word_type(x))

        # Remove pure numbers, e.g., 2014, 20,000
        tmp_df[text_col] = tmp_df[text_col].map(lambda x: re.sub(r'\s*(\.:,|\d+)\s*', '', x))
        tmp_df = tmp_df.reset_index(drop=True).reset_index()
        tmp_df = tmp_df[[pk, text_col, 'index']]
        
        for i in range(iterations):
            tmp_df = self.extract_label(tmp_df, pk, i)
            input_df['text'] = tmp_df['text']
            input_df = input_df.merge(tmp_df[['_id', f'key_{i}']], on=pk, how='left')
            tmp_df = tmp_df[(tmp_df[f'key_{i}']!='[none_tag]')&(tmp_df['text'].notna())].reindex().reset_index()

        input_df = input_df.replace('[none_tag]', np.nan)
        return input_df

In [12]:
input_df = pd.read_csv('Data/tenders_info.csv')
input_df['Description'] = input_df['Description'].fillna('')
input_df['text'] = input_df['Description'] + '.' + input_df['Title']

In [13]:
ke = KeyExtractor(kw_model)

In [14]:
return_df = ke.get_label_by_iteration(input_df[1837:1839], '_id', 'text')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_df['text'] = tmp_df['text']


In [8]:
return_df.to_csv('tt.csv', index=0, encoding='utf-8_sig')

In [15]:
return_df

Unnamed: 0,_id,Agency,Title,Publish Date,Close Date,Category,Category_sub,Description,Eligibility,Value,...,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9
0,6162aa1fe1b7f5c73e6fe07a,,National Preschool Census - Aboriginal and Tor...,,,Education and Training Services,,The objective of the National Preschool Census...,,,...,preschool,census,aboriginal,strait,islander,national,extension,year,possible,
1,6162aa1fe1b7f5c73e6fe07b,,Request for Tender for the Provision of an Ind...,,,Education and Training Services,,The RFT seeks to proposals to engage Indigenou...,,,...,youth,employment,indigenous,consultant,department,workplace,relation,proposal,request,
