In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import ast
import json

In [2]:
# Sentence-level embeddings table. The embedding columns are stored in the CSV
# as string literals and must be parsed back into Python lists before use.
sent_df = pd.read_csv('./data/test_sentence_embeddings.csv')
# 'doc2vec_emb' is averaged below and read by every evaluation cell, so it is
# parsed together with the other two embedding columns.
# NOTE(review): assumes 'doc2vec_emb' uses the same list-literal format as the
# other embedding columns — confirm against the CSV.
for emb_col in ('doc2vec_emb', 'sent_transf_emb', 'word2vec_emb'):
    sent_df[emb_col] = sent_df[emb_col].apply(ast.literal_eval)

# Mapping from category name to category id.
with open('./data/categories.json', 'r') as f:
    categ_id = json.load(f)

paper_ids = []
categs = []
paper_doc2vec_embs = []
paper_sent_transf_embs = []
paper_word2vec_embs = []


def _mean_embedding(embeddings):
    """Return the element-wise mean of a sequence of equal-length vectors as a list."""
    return list(np.mean(np.array([list(e) for e in embeddings]), axis = 0))


# Build one record per paper: its category (taken from the paper's first row)
# and the mean of its sentence embeddings for each embedding type.
for p_id in tqdm(sent_df.paper_id.unique()):
    # Use a dedicated name for the per-paper slice so it is not confused with
    # the paper-level `paper_df` table created after the loop.
    paper_sents = sent_df[sent_df.paper_id == p_id]
    paper_ids.append(p_id)
    categs.append(paper_sents.iloc[0].category)
    paper_doc2vec_embs.append(_mean_embedding(paper_sents.doc2vec_emb.values))
    paper_sent_transf_embs.append(_mean_embedding(paper_sents.sent_transf_emb.values))
    paper_word2vec_embs.append(_mean_embedding(paper_sents.word2vec_emb.values))


# Paper-level table. The doc2vec and sentence-transformer means are included
# because the evaluation cells look up paper_df['doc2vec_emb']; they are added
# after the existing columns so current column positions are preserved.
paper_df = pd.DataFrame({'paper_id': paper_ids, 'category': categs, 'categ_id': [categ_id[i] for i in categs],
                         'word2vec_emb': paper_word2vec_embs,
                         'doc2vec_emb': paper_doc2vec_embs,
                         'sent_transf_emb': paper_sent_transf_embs}).sort_values(['category', 'paper_id'])

100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [00:00<00:00, 39.18it/s]


## LSA results

In [12]:
# Evaluate the LSA summaries: mean cosine similarity between each paper's
# doc2vec embedding and the mean doc2vec embedding of its summary sentences.
summaries = {}

# Each non-empty line holds "<paper_id>\t<summary>".
with open('./results/lsa_summaries.txt', 'r', encoding = 'utf-8') as f:
    for line in f:
        record = line.strip()
        if not record:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue
        # Split on the first tab only, so a tab inside the summary text does
        # not truncate the summary.
        p_id, summ = record.split('\t', 1)
        summaries[p_id] = summ

similarity_scores = []
for paper_id, summ_text in summaries.items():
    # NOTE(review): ids read from the text file are strings; this assumes
    # sent_df.paper_id / paper_df.paper_id hold strings as well — confirm.
    paper = sent_df[sent_df.paper_id == paper_id]

    # Requires a 'doc2vec_emb' column in paper_df; .item() raises unless
    # exactly one row matches the paper id.
    paper_emb = paper_df[paper_df.paper_id == paper_id]['doc2vec_emb'].item()

    # A sentence counts as part of the summary when its text occurs verbatim
    # (as a substring) in the summary.
    summ_emb_list = paper[paper.sentence.apply(lambda x: x in summ_text)]['doc2vec_emb'].values
    if len(summ_emb_list) == 0:
        # Fail with an explicit message instead of an opaque error from
        # averaging an empty array.
        raise ValueError(f'No sentence of paper {paper_id!r} was found in its summary')
    summ_emb = list(np.mean(np.array([list(i) for i in summ_emb_list]), axis = 0))

    similarity_scores.append(cosine_similarity([paper_emb], [summ_emb]).item())

np.mean(similarity_scores)

0.7932863180239611

### Word2vec + Equal clusters

In [40]:
# Evaluate the "equal clusters" clustering summaries: mean cosine similarity
# between each paper's doc2vec embedding and the mean doc2vec embedding of its
# summary sentences.
summaries = {}

# Each non-empty line holds "<paper_id>\t<summary>".
with open('./results/clust_summaries_word2vec_equal_clusters.txt', 'r', encoding = 'utf-8') as f:
    for line in f:
        record = line.strip()
        if not record:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue
        # Split on the first tab only, so a tab inside the summary text does
        # not truncate the summary.
        p_id, summ = record.split('\t', 1)
        summaries[p_id] = summ

similarity_scores = []
for paper_id, summ_text in summaries.items():
    # NOTE(review): ids read from the text file are strings; this assumes
    # sent_df.paper_id / paper_df.paper_id hold strings as well — confirm.
    paper = sent_df[sent_df.paper_id == paper_id]

    # Requires a 'doc2vec_emb' column in paper_df; .item() raises unless
    # exactly one row matches the paper id.
    paper_emb = paper_df[paper_df.paper_id == paper_id]['doc2vec_emb'].item()

    # A sentence counts as part of the summary when its text occurs verbatim
    # (as a substring) in the summary.
    summ_emb_list = paper[paper.sentence.apply(lambda x: x in summ_text)]['doc2vec_emb'].values
    if len(summ_emb_list) == 0:
        # Fail with an explicit message instead of an opaque error from
        # averaging an empty array.
        raise ValueError(f'No sentence of paper {paper_id!r} was found in its summary')
    summ_emb = list(np.mean(np.array([list(i) for i in summ_emb_list]), axis = 0))

    similarity_scores.append(cosine_similarity([paper_emb], [summ_emb]).item())

np.mean(similarity_scores)

0.8913951508956429

### Word2vec + Top N closest

In [41]:
# Evaluate the "top N closest" clustering summaries: mean cosine similarity
# between each paper's doc2vec embedding and the mean doc2vec embedding of its
# summary sentences.
summaries = {}

# Each non-empty line holds "<paper_id>\t<summary>".
with open('./results/clust_summaries_word2vec_top_n.txt', 'r', encoding = 'utf-8') as f:
    for line in f:
        record = line.strip()
        if not record:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue
        # Split on the first tab only, so a tab inside the summary text does
        # not truncate the summary.
        p_id, summ = record.split('\t', 1)
        summaries[p_id] = summ

similarity_scores = []
for paper_id, summ_text in summaries.items():
    # NOTE(review): ids read from the text file are strings; this assumes
    # sent_df.paper_id / paper_df.paper_id hold strings as well — confirm.
    paper = sent_df[sent_df.paper_id == paper_id]

    # Requires a 'doc2vec_emb' column in paper_df; .item() raises unless
    # exactly one row matches the paper id.
    paper_emb = paper_df[paper_df.paper_id == paper_id]['doc2vec_emb'].item()

    # A sentence counts as part of the summary when its text occurs verbatim
    # (as a substring) in the summary.
    summ_emb_list = paper[paper.sentence.apply(lambda x: x in summ_text)]['doc2vec_emb'].values
    if len(summ_emb_list) == 0:
        # Fail with an explicit message instead of an opaque error from
        # averaging an empty array.
        raise ValueError(f'No sentence of paper {paper_id!r} was found in its summary')
    summ_emb = list(np.mean(np.array([list(i) for i in summ_emb_list]), axis = 0))

    similarity_scores.append(cosine_similarity([paper_emb], [summ_emb]).item())

np.mean(similarity_scores)

0.9008287849665815

## Query-Based Summarization results

### Sentence Transformer

In [54]:
# Load the query-based summaries (sentence-transformer variant, per the file
# name). Both summary columns are stored as Python-literal strings in the CSV
# and are parsed back into Python objects here.
summ_df = pd.read_csv('./results/query_based_summ_sent_transf.csv')
for summary_col in ('summary_by_closeness', 'summary_by_frequency'):
    summ_df[summary_col] = summ_df[summary_col].apply(ast.literal_eval)

In [55]:
# Score the "closeness" summaries: for every paper, compare the paper-level
# doc2vec embedding with the mean doc2vec embedding of the sentences whose text
# occurs (as a substring) in the joined summary.
similarity_scores = []
for paper_id in summ_df['paper_id'].values:
    # The parsed summary cell exposes .values(); its entries are joined into a
    # single text. .item() raises unless exactly one row matches the paper id.
    summary_cell = summ_df.loc[summ_df['paper_id'] == paper_id, 'summary_by_closeness']
    summ_text = ' '.join(summary_cell.item().values())

    paper_sentences = sent_df[sent_df['paper_id'] == paper_id]
    in_summary = paper_sentences['sentence'].apply(lambda sentence: sentence in summ_text)
    selected_embs = paper_sentences[in_summary]['doc2vec_emb'].values

    # Requires a 'doc2vec_emb' column in paper_df.
    paper_emb = paper_df.loc[paper_df['paper_id'] == paper_id, 'doc2vec_emb'].item()
    summ_emb = list(np.array([list(emb) for emb in selected_embs]).mean(axis = 0))

    score = cosine_similarity([paper_emb], [summ_emb])
    similarity_scores.append(score.item())

np.mean(similarity_scores)

0.9147987223628427

In [56]:
# Score the "frequency" summaries: for every paper, compare the paper-level
# doc2vec embedding with the mean doc2vec embedding of the sentences whose text
# occurs (as a substring) in the joined summary.
similarity_scores = []
for paper_id in summ_df['paper_id'].values:
    # The parsed summary cell exposes .values(); its entries are joined into a
    # single text. .item() raises unless exactly one row matches the paper id.
    summary_cell = summ_df.loc[summ_df['paper_id'] == paper_id, 'summary_by_frequency']
    summ_text = ' '.join(summary_cell.item().values())

    paper_sentences = sent_df[sent_df['paper_id'] == paper_id]
    in_summary = paper_sentences['sentence'].apply(lambda sentence: sentence in summ_text)
    selected_embs = paper_sentences[in_summary]['doc2vec_emb'].values

    # Requires a 'doc2vec_emb' column in paper_df.
    paper_emb = paper_df.loc[paper_df['paper_id'] == paper_id, 'doc2vec_emb'].item()
    summ_emb = list(np.array([list(emb) for emb in selected_embs]).mean(axis = 0))

    score = cosine_similarity([paper_emb], [summ_emb])
    similarity_scores.append(score.item())

np.mean(similarity_scores)

0.8744386886213783

### Word2vec

In [48]:
# Load the query-based summaries (word2vec variant, per the file name). Both
# summary columns are stored as Python-literal strings in the CSV and are
# parsed back into Python objects here.
summ_df = pd.read_csv('./results/query_based_summ_word2vec.csv')
for summary_col in ('summary_by_closeness', 'summary_by_frequency'):
    summ_df[summary_col] = summ_df[summary_col].apply(ast.literal_eval)

In [49]:
# Score the "closeness" summaries: for every paper, compare the paper-level
# doc2vec embedding with the mean doc2vec embedding of the sentences whose text
# occurs (as a substring) in the joined summary.
similarity_scores = []
for paper_id in summ_df['paper_id'].values:
    # The parsed summary cell exposes .values(); its entries are joined into a
    # single text. .item() raises unless exactly one row matches the paper id.
    summary_cell = summ_df.loc[summ_df['paper_id'] == paper_id, 'summary_by_closeness']
    summ_text = ' '.join(summary_cell.item().values())

    paper_sentences = sent_df[sent_df['paper_id'] == paper_id]
    in_summary = paper_sentences['sentence'].apply(lambda sentence: sentence in summ_text)
    selected_embs = paper_sentences[in_summary]['doc2vec_emb'].values

    # Requires a 'doc2vec_emb' column in paper_df.
    paper_emb = paper_df.loc[paper_df['paper_id'] == paper_id, 'doc2vec_emb'].item()
    summ_emb = list(np.array([list(emb) for emb in selected_embs]).mean(axis = 0))

    score = cosine_similarity([paper_emb], [summ_emb])
    similarity_scores.append(score.item())

np.mean(similarity_scores)

0.9173872327014787

In [50]:
# Score the "frequency" summaries: for every paper, compare the paper-level
# doc2vec embedding with the mean doc2vec embedding of the sentences whose text
# occurs (as a substring) in the joined summary.
similarity_scores = []
for paper_id in summ_df['paper_id'].values:
    # The parsed summary cell exposes .values(); its entries are joined into a
    # single text. .item() raises unless exactly one row matches the paper id.
    summary_cell = summ_df.loc[summ_df['paper_id'] == paper_id, 'summary_by_frequency']
    summ_text = ' '.join(summary_cell.item().values())

    paper_sentences = sent_df[sent_df['paper_id'] == paper_id]
    in_summary = paper_sentences['sentence'].apply(lambda sentence: sentence in summ_text)
    selected_embs = paper_sentences[in_summary]['doc2vec_emb'].values

    # Requires a 'doc2vec_emb' column in paper_df.
    paper_emb = paper_df.loc[paper_df['paper_id'] == paper_id, 'doc2vec_emb'].item()
    summ_emb = list(np.array([list(emb) for emb in selected_embs]).mean(axis = 0))

    score = cosine_similarity([paper_emb], [summ_emb])
    similarity_scores.append(score.item())

np.mean(similarity_scores)

0.8713999620499607