# Libraries

In [3]:
import os
from ast import literal_eval

import networkx as nx
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel

from src.controller import generator, trainer
from src.lib import extractor

# Data preparation

In [4]:
# Relative directory prefix for the input files read by prepare_data()
# (raw_data.csv, ep_df.csv, stopword.txt, topics.txt).
DATA_PATH = './data/'

In [5]:
def prepare_data(data_path=None):
    ''' This function reads data from the existing data source.

    Parameters
    ----------
    data_path: str or path-like, optional
        Directory holding ``raw_data.csv``, ``ep_df.csv``, ``stopword.txt``
        and ``topics.txt``. A trailing path separator is optional. When
        omitted, the notebook-level ``DATA_PATH`` is used.

    Returns
    -------
    tuple
        ``(doc_df, ed_df, stopwords, topics)``: the two CSV files loaded as
        DataFrames, followed by the Python literals parsed from the two
        text files.
    '''
    # Resolved at call time so a re-assigned DATA_PATH is picked up; no
    # ``global`` statement is needed because the name is only read.
    base_dir = DATA_PATH if data_path is None else data_path

    def _path(file_name):
        # os.path.join copes with a base directory with or without a
        # trailing separator, unlike plain string concatenation.
        return os.path.join(base_dir, file_name)

    def _read_literal(file_name):
        # The text files store a Python literal, parsed safely with
        # ast.literal_eval rather than eval.
        with open(_path(file_name)) as f:
            return literal_eval(f.read())

    doc_df = pd.read_csv(_path('raw_data.csv'))
    ed_df = pd.read_csv(_path('ep_df.csv'))
    stopwords = _read_literal('stopword.txt')
    topics = _read_literal('topics.txt')

    return doc_df, ed_df, stopwords, topics

In [6]:
# Load the two input tables plus the stopwords and topics from DATA_PATH.
doc_df, ed_df, stopwords, topics = prepare_data()

In [7]:
# Render both input tables, documents first.
display(doc_df, ed_df)

Unnamed: 0,doc_id,text
0,d1,Finding similar questions in large question an...
1,d2,Evaluating Document Clustering for Interactive...
2,d3,Automatically classifying database workloads. ...
3,d4,A new approach to intranet search based on inf...


Unnamed: 0,doc_id,exp_id,weight
0,d1,e1,1
1,d1,e2,1
2,d2,e2,1
3,d2,e4,1
4,d3,e1,1
5,d3,e3,1
6,d4,e1,1
7,d4,e4,1


# Data generation

## Expert-document matrix

In [8]:
# Build the expert-document matrix from the doc_id / exp_id / weight table.
ed_matrix = generator.generate_ed_matrix(ed_df)

In [9]:
# Inspect the generated expert-document matrix.
display(ed_matrix)

Unnamed: 0,d1,d2,d3,d4
e1,1,0,1,1
e2,1,1,0,0
e4,0,0,1,0
e3,0,1,0,1


## Document-phrase matrix

In [10]:
def dp_pipeline(doc_df, stopwords):
    ''' Pipeline that builds the document-phrase matrix.

    The ``text`` column of ``doc_df`` is passed to
    ``extractor.tokenise_doc`` (with ``max_phrase_len=3``), term
    frequencies are generated separately for its ``'tokens'`` and ``'np'``
    entries, and ``generator.generate_dp_matrix`` combines them using
    ``method="indirect"`` together with the ``doc_id`` column.

    Returns a dense DataFrame assembled from the ``'matrix'``, ``'index'``
    and ``'columns'`` entries of the generator's result. '''
    # Tokens and noun phrases extracted from the raw document texts
    texts = doc_df['text'].values
    tokenised = extractor.tokenise_doc(texts, stopwords, max_phrase_len=3)

    # Term frequencies: one for single terms, one for noun phrases
    token_tf = generator.generate_tf(tokenised['tokens'])
    phrase_tf = generator.generate_tf(tokenised['np'])

    # Combine both frequency tables into the document-phrase matrix
    packed = generator.generate_dp_matrix(
        token_tf, phrase_tf, doc_df['doc_id'], method="indirect"
    )

    # The generator returns a sparse matrix plus its labels; densify it
    dense_values = packed['matrix'].todense()
    return pd.DataFrame(dense_values,
                        index=packed['index'],
                        columns=packed['columns'])

In [11]:
# Run the document-phrase pipeline on the loaded documents.
dp_matrix = dp_pipeline(doc_df, stopwords)

In [12]:
# Wide frame: the rendered table elides the middle columns ("...").
display(dp_matrix)

Unnamed: 0_level_0,access,addition,advance,advantage,allocation,analysis,answer,answer_archive,answer_service,application_workload,...,users_question,valuable_linguistic_resource,valuable_sense,web,word_overlap,work,workload,workload_characteristic,workload_classifier,workload_type
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
d1,0.0,0.0,0.0,0.0,0.0,0.0,13.414035,12.45589,9.581453,0.0,...,10.539599,1.916291,0.958145,1.916291,1.916291,0.0,0.0,0.0,0.0,0.0
d2,0.0,0.0,0.0,1.916291,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.638764,1.916291,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d3,0.0,0.0,0.0,0.0,1.916291,0.0,0.0,0.0,0.0,9.581453,...,0.0,0.638764,0.0,0.0,0.0,1.916291,17.246616,9.581453,10.539599,11.497745
d4,1.916291,1.916291,1.916291,0.0,0.0,3.832582,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.665163


## Document-topic matrix

In [13]:
# Directory prefix for pretrained models. dtopic_pipeline() appends the
# folder name 'scibert_scivocab_uncased' directly to this string, so keep
# the trailing slash.
MODEL_PATH = './model/'

In [14]:
def dtopic_pipeline(dp_matrix, topics):
    ''' Pipeline that builds the document-topic matrix.

    Loads the BERT model and tokenizer stored under
    ``MODEL_PATH + 'scibert_scivocab_uncased'``, hands them (without any
    pretrained phrase vectors) to ``generator.generate_dtop_matrix`` with
    ``top_n=1``, and derives the topic vector from the generator's result.

    Returns ``(dtopic_matrix, topic_vec, topic_phrase)`` where
    ``dtopic_matrix`` is a dense DataFrame and the other two values are
    passed through from the generator calls unchanged. '''
    # Location of the SciBERT model files
    scibert_dir = '{}scibert_scivocab_uncased'.format(MODEL_PATH)

    # Model dictionary expected by the generator.
    # Note: pretrained phrase vectors would be read and supplied here;
    # this example does not contain any, hence None.
    model_dict = {
        'model': BertModel.from_pretrained(scibert_dir),
        'tokenizer': BertTokenizer.from_pretrained(scibert_dir),
        'trained_vectors': None,
    }

    # Document-topic matrix (still in the generator's packed form)
    packed, topic_phrase = generator.generate_dtop_matrix(
        dp_matrix, topics, model_dict, top_n=1
    )

    # The topic vector is computed from the packed result, before it is
    # converted into a DataFrame
    topic_vec = generator.generate_topic_vector(packed)

    dtopic_df = pd.DataFrame(packed['matrix'].todense(),
                             index=packed['index'],
                             columns=packed['columns'])

    return dtopic_df, topic_vec, topic_phrase

In [15]:
# Note: every call loads the BERT model and tokenizer from MODEL_PATH.
dtopic_matrix, topic_vec, topic_phrase = dtopic_pipeline(dp_matrix, topics)

In [16]:
# Show the topic -> [(phrase, value)] mapping returned by the pipeline.
display(topic_phrase)

{'question answer system': [('answer_service', 0.8885146)],
 'computational linguistics': [('online_transaction_processing', 0.71243674)],
 'clustering method': [('clustering_approach', 0.948566)],
 'data mining': [('question_retrieval', 0.8201956)]}

In [17]:
# Render the document-topic matrix followed by the topic weight vector.
display(dtopic_matrix, topic_vec)

Unnamed: 0_level_0,question answer system,computational linguistics,clustering method,data mining
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d1,0.454545,0.0,0.0,0.545455
d2,0.0,0.0,0.75,0.25
d3,0.0,0.666667,0.333333,0.0
d4,0.0,0.0,0.8,0.2


Unnamed: 0,weights
question answer system,0.113636
computational linguistics,0.166667
clustering method,0.470833
data mining,0.248864


## Personalised matrices

In [18]:
def personalised_pipeline(ed_df, ed_matrix, dtopic_matrix, topic_vec):
    ''' Pipeline that builds the personalised matrices.

    A directed graph is created with one edge per row of ``ed_df``,
    pointing from ``doc_id`` to ``exp_id``; every edge carries
    ``weight=1`` and ``length=0.05``. The graph, the expert-document
    matrix, the document-topic matrix and the ``weights`` column of
    ``topic_vec`` are handed to ``generator.generate_pr_matrix`` with
    ``alpha=0.0``.

    Returns ``(etop_matrix, dtop_matrix, G)``: the two generator results
    as dense DataFrames, plus the graph itself. '''
    def _to_frame(packed):
        # Densify a packed {'matrix', 'index', 'columns'} result
        return pd.DataFrame(packed['matrix'].todense(),
                            index=packed['index'],
                            columns=packed['columns'])

    # Expert-document graph: (doc_id, exp_id) pairs become directed edges
    doc_exp_pairs = ed_df[['doc_id', 'exp_id']].values
    G = nx.DiGraph()
    G.add_edges_from(doc_exp_pairs, weight=1, length=0.05)

    # Personalised matrices
    topic_weights = topic_vec['weights'].values
    etop_packed, dtop_packed = generator.generate_pr_matrix(
        ed_matrix, dtopic_matrix, topic_weights, G, alpha=0.0
    )

    # Convert both results, expert-topic first
    etop_df = _to_frame(etop_packed)
    dtop_df = _to_frame(dtop_packed)

    return etop_df, dtop_df, G

In [19]:
# exp_pr_df / doc_pr_df receive the function's etop_matrix / dtop_matrix;
# ed_graph is the networkx DiGraph built from ed_df.
exp_pr_df, doc_pr_df, ed_graph = personalised_pipeline(ed_df, ed_matrix, dtopic_matrix, topic_vec)

In [20]:
# Selecting with an explicit label list fixes the row order of the
# displayed tables (e1..e4, d1..d4) regardless of the frames' own order.
display(exp_pr_df.loc[['e1', 'e2', 'e3', 'e4']])
display(doc_pr_df.loc[['d1', 'd2', 'd3', 'd4']])

Unnamed: 0,question answer system,computational linguistics,clustering method,data mining
e1,0.454545,0.666667,1.133333,0.745455
e2,0.454545,0.0,0.75,0.795455
e3,0.0,0.0,1.55,0.45
e4,0.0,0.666667,0.333333,0.0


Unnamed: 0,question answer system,computational linguistics,clustering method,data mining
d1,0.454545,0.0,0.0,0.545455
d2,0.0,0.0,0.75,0.25
d3,0.0,0.666667,0.333333,0.0
d4,0.0,0.0,0.8,0.2


## Counted vectors

In [21]:
def cv_pipeline(ed_matrix, ed_graph):
    ''' Pipeline that builds the counted vectors.

    Delegates to ``generator.generate_ed_vector`` with the
    expert-document matrix and graph, and returns its two results as the
    tuple ``(exp_vec, doc_vec)``. '''
    # Generate CV expert-document
    counted = generator.generate_ed_vector(ed_matrix, ed_graph)
    exp_vec, doc_vec = counted

    return exp_vec, doc_vec

In [22]:
# Naming note: cv_pipeline's first return value (exp_vec) is bound to
# ed_count and its second (doc_vec) to de_count.
ed_count, de_count = cv_pipeline(ed_matrix, ed_graph)

In [23]:
# Render both counted vectors, ed_count first.
display(ed_count, de_count)

Unnamed: 0,count
d1,1
e1,3
e2,2
d2,1
e4,1
d3,1
e3,2
d4,1


Unnamed: 0,count
d1,2
e1,1
e2,1
d2,2
e4,1
d3,2
e3,1
d4,2


# ExpFinder algorithm

The algorithm runs on 4 experts, 4 documents and 4 topics in 5 iterations with $\lambda_{x} = 1.0$ and $\lambda_{d} = 0.7$. Figure 1 shows the directed bipartite graph from documents to experts.

![Expert-Document graph](https://github.com/Yongbinkang/ExpFinder/blob/main/images/EP_graph.png?raw=true)
<center>
    Figure 1: <i>The directed bipartite graph from documents to experts.</i>
</center>

In [25]:
def ef_pipeline(ed_matrix, ed_graph, exp_pr_df, doc_pr_df, ed_count, de_count):
    ''' Pipeline that runs the ExpFinder algorithm and displays its result.

    The inputs are bundled into the parameter dictionary expected by
    ``trainer.run_expfinder`` together with the fixed settings
    ``lamb_e=1.0``, ``lamb_d=0.7`` and ``max_iter=5``. The topics are the
    column labels of ``doc_pr_df``.

    The resulting matrix is shown with ``display``; nothing is returned. '''
    # Initialise parameters
    params = dict(
        ed_graph=ed_graph,
        ed_matrix=ed_matrix,
        et_matrix=exp_pr_df,
        dt_matrix=doc_pr_df,
        lamb_e=1.0,
        lamb_d=0.7,
        max_iter=5,
        ed_count=ed_count,
        de_count=de_count,
    )
    topic_labels = doc_pr_df.columns

    # Run model and show the outcome
    result = trainer.run_expfinder(topic_labels, params)
    display(result)

In [26]:
# Runs ExpFinder and displays the result; ef_pipeline returns None, so
# there is nothing to bind here.
ef_pipeline(ed_matrix, ed_graph, exp_pr_df, doc_pr_df, ed_count, de_count)

Unnamed: 0,e1,e2,e4,e3
question answer system,0.270876,0.351973,0.893352,0.068286
computational linguistics,0.273925,0.348171,0.893526,0.073161
clustering method,0.266421,0.34659,0.896962,0.065992
data mining,0.268491,0.350533,0.89477,0.066533
