In [1]:
USE_SERVER = True
if USE_SERVER:
    !pip install elasticsearch
    !pip install elasticsearch_dsl
    !pip install pymed
    !pip install gensim
    !pip install torchsummary



In [13]:
from __future__ import print_function
import argparse
import numpy as np
import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F
from torchvision import datasets, transforms
from torchvision.utils import save_image
import pandas as pd
import utils
import os
from utils import PROJECT_ROOT, DATA_PATH
import yuval_module.paper_source as PaperSource

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

import py_4.get_mesh_vec as get_mesh_vec
import py_3.sim_matrix_3 as sim_matrix_3
import py_4.get_all_features as get_all_features 
from sklearn.cluster import DBSCAN as DBS

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import metric_eval_2


from torch.utils.data import Dataset, DataLoader

from sklearn.manifold import TSNE
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import time
import pickle

from sklearn.preprocessing import StandardScaler

from tqdm import tqdm_notebook

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
mesh_embed=get_mesh_vec.MeshEmbeddings(PROJECT_ROOT + "data/mesh_data/MeSHFeatureGeneratedByDeepWalk.csv")

In [4]:
FILE = "enriched_labeled_dataset_large.json" 
if os.path.exists(PROJECT_ROOT + DATA_PATH + FILE):
    print("READING FROM LOCAL")
    if FILE.split(".")[1] == "json":
        df = pd.read_json(PROJECT_ROOT + DATA_PATH + FILE)
    else:
        df = pd.read_csv(PROJECT_ROOT + DATA_PATH + FILE)
    #ps = PaperSource()
else:
    print("PULLING FROM S3")
    ps = sim_matrix_3.load_dataset(FILE)
    df = ps.get_dataset()

df.drop(columns=["last_author_country"],inplace=True)
df.rename(columns={'ORG_STATE':'last_author_country'},inplace=True)

print("FILE PULLED")

READING FROM LOCAL
FILE PULLED


In [5]:
cd ..

/home/ubuntu/AYP


In [5]:
class VAE_A(nn.Module):
    def __init__(self, hidden_dim, latent_dim, input_dim):
        super(VAE_A, self).__init__()
        self.input_dim = input_dim
        self.hd = hidden_dim
        self.latent_dim = latent_dim
         
        self.fc1 = nn.Linear(self.input_dim, self.hd)
        self.fc21 = nn.Linear(self.hd, self.latent_dim)
        self.fc22 = nn.Linear(self.hd, self.latent_dim)
        self.fc3 = nn.Linear(self.latent_dim, self.hd)
        self.fc4 = nn.Linear(self.hd,self.input_dim)
        
        self.dropout = torch.nn.Dropout(p=0.4) 
        self.bn1 = nn.BatchNorm1d(self.hd)
        self.bn5 = nn.BatchNorm1d(self.hd)

    def encode(self, x):
        h = F.relu(self.fc1(x))
        return self.fc21(h), self.fc22(h)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return eps.mul(std).add_(mu) # return z sample
        

    def decode(self, z):
        h = F.relu(self.fc3(z))
        return self.fc4(h)

    def forward(self, x):
        mu, logvar = self.encode(x.view(-1, self.input_dim))
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

In [6]:
class ToyDS(Dataset):
    def __init__(self,df,selection,vae_features = None):
        super().__init__()
        self.df = df[df['last_author_name'].isin(selection)]
        if vae_features is None:
            print("Creating new VAE FEATURES")
            self.vae_features = get_all_features.VAE_Features(self.df)
        else:
            print("Using pre-defined VAE FEATURES")
            self.vae_features = vae_features
        self.features = self.vae_features.get_all_features(self.df)
        print(list(self.vae_features.mesh_features.mesh_missing))
        self.input_dim = self.vae_features.input_dims
            
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        features = self.features[idx]
        return features
    
    def __getvae__(self):
        return self.vae_features

In [8]:
MODEL_PATH = "../code/models/vae_a_8dim_allfeat.pt"

model = torch.load(MODEL_PATH)
model.eval()

VAE_A(
  (fc1): Linear(in_features=1102, out_features=32, bias=True)
  (fc21): Linear(in_features=32, out_features=8, bias=True)
  (fc22): Linear(in_features=32, out_features=8, bias=True)
  (fc3): Linear(in_features=8, out_features=32, bias=True)
  (fc4): Linear(in_features=32, out_features=1102, bias=True)
)

In [9]:
with open('../code/models/vae_allfeat.pkl', 'rb') as input:
    vae_feat = pickle.load(input)

In [10]:
auth_eps = pd.read_csv("../data/val_set_author_names.csv")["0"]
auth_usecase = pd.read_csv("../data/test_set_author_names.csv")["0"]

### HYPERPARAMETER OPTIMIZATION ###

In [11]:
#settings

BATCH_SIZE= 32
EPOCHS = 30
cuda = torch.cuda.is_available()
seed = 42
log_interval = 10
num_workers = 2

#check for cuda
device = torch.device('cuda' if cuda else 'cpu')

torch.manual_seed(seed)
kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}

device

device(type='cpu')

In [12]:
df_all_cases = []

for auth in auth_eps:
    df_auth = df[df['last_author_name'] == auth] 
    data = ToyDS(df,[auth], vae_features = vae_feat)
    data = torch.from_numpy(data.features).to(device,dtype=torch.float32)
    _, mu, __ = model(data)
    latent_feat = mu.detach().cpu().numpy()
    df_all_cases.append([df_auth,latent_feat])

 pregnanediol', 'MESH NAME NOT FOUND: agmatine', 'MESH NAME NOT FOUND: catfishes', 'MESH NAME NOT FOUND: water pipe smoking', 'MESH NAME NOT FOUND: crystalloid solutions', 'MESH NAME NOT FOUND: lymphatic irradiation', 'MESH NAME NOT FOUND: mobile health units', 'MESH NAME NOT FOUND: human migration', 'MESH NAME NOT FOUND: household articles', 'MESH NAME NOT FOUND: dithizone', 'MESH NAME NOT FOUND: agouti signaling protein', 'MESH NAME NOT FOUND: abortion, therapeutic', 'MESH NAME NOT FOUND: gene rearrangement, delta-chain t-cell antigen receptor', 'MESH NAME NOT FOUND: neoplasms, fibrous tissue', 'MESH NAME NOT FOUND: computer user training', 'MESH NAME NOT FOUND: war-related injuries', 'MESH NAME NOT FOUND: type vii secretion systems', 'MESH NAME NOT FOUND: venae cavae', 'MESH NAME NOT FOUND: rectal diseases', 'MESH NAME NOT FOUND: oman', 'MESH NAME NOT FOUND: professional practice gaps', 'MESH NAME NOT FOUND: nuclear medicine', 'MESH NAME NOT FOUND: animal nutrition sciences', 'MESH 

In [18]:
db__eps = np.linspace(.40,.60,15)

best_eps = None
best_f1 = 0.0

sil_score_log = {}

for db_eps in tqdm_notebook(db__eps):
        
    y_hat_comb = []

    for case in df_all_cases:
        df_clus, latent_feat = case
        y_hat = DBS(eps=db_eps,min_samples=1,metric="euclidean").fit(latent_feat)
        df_clus = df_clus[["pmid","PI_IDS"]]
        df_clus['cluster_pred'] = y_hat.labels_
        y_hat_comb.append(df_clus)

        
    f1, precision, recall, df_eval = metric_eval_2.get_metrics_many(y_hat_comb)

    if f1 > best_f1:
        best_f1 = f1
        best_eps = db_eps

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))

  21
Num Clusters:  20
Num Unique Authors:  3
Precision:  1.0
Recall:  0.19047619047619047
               mis_integration  mis_separation
3 cluster(s)                 1               1
7 cluster(s)                 1               1
10 cluster(s)                1               1

-------------------

Situation 51
Num Papers:  18
Num Clusters:  17
Num Unique Authors:  3
Precision:  1.0
Recall:  0.16666666666666666
              mis_integration  mis_separation
3 cluster(s)                1               1
6 cluster(s)                1               1
9 cluster(s)                1               1

-------------------

Situation 52
Num Papers:  17
Num Clusters:  16
Num Unique Authors:  3
Precision:  1.0
Recall:  0.23529411764705882
              mis_integration  mis_separation
3 cluster(s)                1               1
4 cluster(s)                1               1
9 cluster(s)                1               1

-------------------

Situation 53
Num Papers:  42
Num Clusters:  42
Num Unique

In [19]:
print(best_eps)
print(best_f1)

0.5857142857142856
0.25559680886133246


### GET SCORES FROM USE CASES

In [19]:
def pipeline_vae_db(df,auth_df,authors,use_case,eps):
    
    if use_case == "1_da" or use_case == "mix_bag":
        authors = sim_matrix_3.get_use_case(df,use_case)
    else:
        auth_df = df[df['last_author_name'].isin(authors)]   
        authors = sim_matrix_3.get_use_case(auth_df,use_case)
        
    num_cases = len(authors)
    
    df_all_cases = []
    all_papers = []

    for i,auth in enumerate(authors):
        print("Processing combination number {} from {}".format(i+1,num_cases))
        df_auth = df[df['last_author_name'] == auth]
        all_papers.append(df_auth.shape[0])
        #Calculate the distance matrix
        
        df_auth = df[df['last_author_name'] == auth] 
        data = ToyDS(df, [auth], vae_features = vae_feat)
        data = torch.from_numpy(data.features).to(device,dtype=torch.float32)
        _, mu, __ = model(data)
        latent_feat = mu.detach().cpu().numpy()
        df_all_cases.append([df_auth,latent_feat])

    y_hat_comb = []
    
    for case in df_all_cases:
        df_clus, latent_feat = case
        y_hat = DBS(eps=db_eps,min_samples=1,metric="euclidean").fit(latent_feat)
        df_clus = df_clus[["pmid","PI_IDS"]]
        df_clus['cluster_pred'] = y_hat.labels_
        y_hat_comb.append(df_clus)
    
    return y_hat_comb, num_cases, np.mean(np.array(all_papers))

In [30]:
def pipeline_vae_kmeans(df,auth_df,authors,use_case):
    
    if use_case == "1_da" or use_case == "mix_bag":
        authors = sim_matrix_3.get_use_case(df,use_case)
    else:
        auth_df = df[df['last_author_name'].isin(authors)]   
        authors = sim_matrix_3.get_use_case(auth_df,use_case)
        
    num_cases = len(authors)
    
    df_all_cases = []
    all_papers = []

    for i,auth in enumerate(authors):
        print("Processing combination number {} from {}".format(i+1,num_cases))
        df_auth = df[df['last_author_name'] == auth]
        all_papers.append(df_auth.shape[0])
        #Calculate the distance matrix
        
        df_auth = df[df['last_author_name'] == auth] 
        data = ToyDS(df, [auth], vae_features = vae_feat)
        data = torch.from_numpy(data.features).to(device,dtype=torch.float32)
        _, mu, __ = model(data)
        latent_feat = mu.detach().cpu().numpy()
        df_all_cases.append([df_auth,latent_feat])

    y_hat_comb = []
    
    for case in df_all_cases:
        df_clus, latent_feat = case
        
        
        
#         y_hat = DBS(eps=db_eps,min_samples=1,metric="euclidean").fit(latent_feat)
        range_n_clusters = np.arange(1,11)
        best_score = 0.0
        best_n = None
        best_labels = None
        for n_clusters in range_n_clusters:
            clusterer = KMeans(n_clusters=n_clusters, random_state=10)
            cluster_labels = clusterer.fit_predict(latent_feat)
            print(cluster_labels)
            silhouette_avg = silhouette_score(latent_feat, cluster_labels)
            sil_score_log[n_clusters] = silhouette_avg
            if silhouette_avg > best_score:
                best_score = silhouette_avg
                best_n = n_clusters
                best_labels = cluster_labels
        
        df_clus = df_clus[["pmid","PI_IDS"]]
        df_clus['cluster_pred'] = best_labels.labels_
        y_hat_comb.append(df_clus)
    
    return y_hat_comb, num_cases, np.mean(np.array(all_papers))

In [28]:
usecase = list(pd.read_csv("./data/test_set_author_names.csv").drop(columns=["Unnamed: 0"])["0"])
df_usecase = df[df['last_author_name'].isin(usecase)]

In [31]:
total_metric = []
use_cases = ['2_da_same','2_da_dif','3_da', '1_da','mix_bag']

for case in use_cases:
    y_hat_comb, num_cases, num_papers = pipeline_vae_kmeans(df,df_usecase,authors=usecase,use_case=case)
    f1, precision, recall, df_eval = metric_eval_2.get_metrics_many(y_hat_comb)
    total_metric.append([case,f1,precision,recall,df_eval, num_cases, num_papers])

Processing combination number 1 from 13
Using pre-defined VAE FEATURES
['MESH NAME NOT FOUND: Male', 'MESH NAME NOT FOUND: Reinforcement, Psychology', 'MESH NAME NOT FOUND: Neural Networks, Computer', 'MESH NAME NOT FOUND: Claustrum', 'MESH NAME NOT FOUND: Illicit Drugs', 'MESH NAME NOT FOUND: Cyclophilin D', "MESH NAME NOT FOUND: Practice Patterns, Physicians'", 'MESH NAME NOT FOUND: Pyruvate Dehydrogenase (Acetyl-Transferring) Kinase', 'MESH NAME NOT FOUND: Psychology, Positive', 'MESH NAME NOT FOUND: Copper-Transporting ATPases', 'MESH NAME NOT FOUND: Retention, Psychology', 'MESH NAME NOT FOUND: CSK Tyrosine-Protein Kinase', 'MESH NAME NOT FOUND: Inhibition, Psychological', 'MESH NAME NOT FOUND: Chlorocebus aethiops', 'MESH NAME NOT FOUND: Recognition, Psychology', 'MESH NAME NOT FOUND: Lactate Dehydrogenase 5', 'MESH NAME NOT FOUND: Early Intervention, Educational', 'MESH NAME NOT FOUND: Broadly Neutralizing Antibodies', 'MESH NAME NOT FOUND: Heterogeneous Nuclear Ribonucleoprotei

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['co_authors']=df.authors.apply( lambda x: [i['name'] for i in x] )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['co_authors']=df.authors.apply( lambda x: [i['name'] for i in x] )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['co_authors']=df.authors.apply( lambda x: [i['name'] for i in 

Using pre-defined VAE FEATURES
['MESH NAME NOT FOUND: Male', 'MESH NAME NOT FOUND: Reinforcement, Psychology', 'MESH NAME NOT FOUND: Neural Networks, Computer', 'MESH NAME NOT FOUND: Claustrum', 'MESH NAME NOT FOUND: Illicit Drugs', 'MESH NAME NOT FOUND: Cyclophilin D', "MESH NAME NOT FOUND: Practice Patterns, Physicians'", 'MESH NAME NOT FOUND: Pyruvate Dehydrogenase (Acetyl-Transferring) Kinase', 'MESH NAME NOT FOUND: Psychology, Positive', 'MESH NAME NOT FOUND: Copper-Transporting ATPases', 'MESH NAME NOT FOUND: Retention, Psychology', 'MESH NAME NOT FOUND: CSK Tyrosine-Protein Kinase', 'MESH NAME NOT FOUND: Inhibition, Psychological', 'MESH NAME NOT FOUND: Chlorocebus aethiops', 'MESH NAME NOT FOUND: Recognition, Psychology', 'MESH NAME NOT FOUND: Lactate Dehydrogenase 5', 'MESH NAME NOT FOUND: Early Intervention, Educational', 'MESH NAME NOT FOUND: Broadly Neutralizing Antibodies', 'MESH NAME NOT FOUND: Heterogeneous Nuclear Ribonucleoprotein D0', "MESH NAME NOT FOUND: Practice Pa

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['co_authors']=df.authors.apply( lambda x: [i['name'] for i in x] )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['co_authors']=df.authors.apply( lambda x: [i['name'] for i in x] )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['co_authors']=df.authors.apply( lambda x: [i['name'] for i in 

Using pre-defined VAE FEATURES
['MESH NAME NOT FOUND: Male', 'MESH NAME NOT FOUND: Reinforcement, Psychology', 'MESH NAME NOT FOUND: Neural Networks, Computer', 'MESH NAME NOT FOUND: Claustrum', 'MESH NAME NOT FOUND: Illicit Drugs', 'MESH NAME NOT FOUND: Cyclophilin D', "MESH NAME NOT FOUND: Practice Patterns, Physicians'", 'MESH NAME NOT FOUND: Pyruvate Dehydrogenase (Acetyl-Transferring) Kinase', 'MESH NAME NOT FOUND: Psychology, Positive', 'MESH NAME NOT FOUND: Copper-Transporting ATPases', 'MESH NAME NOT FOUND: Retention, Psychology', 'MESH NAME NOT FOUND: CSK Tyrosine-Protein Kinase', 'MESH NAME NOT FOUND: Inhibition, Psychological', 'MESH NAME NOT FOUND: Chlorocebus aethiops', 'MESH NAME NOT FOUND: Recognition, Psychology', 'MESH NAME NOT FOUND: Lactate Dehydrogenase 5', 'MESH NAME NOT FOUND: Early Intervention, Educational', 'MESH NAME NOT FOUND: Broadly Neutralizing Antibodies', 'MESH NAME NOT FOUND: Heterogeneous Nuclear Ribonucleoprotein D0', "MESH NAME NOT FOUND: Practice Pa

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['co_authors']=df.authors.apply( lambda x: [i['name'] for i in x] )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['co_authors']=df.authors.apply( lambda x: [i['name'] for i in x] )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['co_authors']=df.authors.apply( lambda x: [i['name'] for i in 

Using pre-defined VAE FEATURES
['MESH NAME NOT FOUND: Male', 'MESH NAME NOT FOUND: Reinforcement, Psychology', 'MESH NAME NOT FOUND: Neural Networks, Computer', 'MESH NAME NOT FOUND: Claustrum', 'MESH NAME NOT FOUND: Illicit Drugs', 'MESH NAME NOT FOUND: Cyclophilin D', "MESH NAME NOT FOUND: Practice Patterns, Physicians'", 'MESH NAME NOT FOUND: Pyruvate Dehydrogenase (Acetyl-Transferring) Kinase', 'MESH NAME NOT FOUND: Psychology, Positive', 'MESH NAME NOT FOUND: Copper-Transporting ATPases', 'MESH NAME NOT FOUND: Retention, Psychology', 'MESH NAME NOT FOUND: CSK Tyrosine-Protein Kinase', 'MESH NAME NOT FOUND: Inhibition, Psychological', 'MESH NAME NOT FOUND: Chlorocebus aethiops', 'MESH NAME NOT FOUND: Recognition, Psychology', 'MESH NAME NOT FOUND: Lactate Dehydrogenase 5', 'MESH NAME NOT FOUND: Early Intervention, Educational', 'MESH NAME NOT FOUND: Broadly Neutralizing Antibodies', 'MESH NAME NOT FOUND: Heterogeneous Nuclear Ribonucleoprotein D0', "MESH NAME NOT FOUND: Practice Pa

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['co_authors']=df.authors.apply( lambda x: [i['name'] for i in x] )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['co_authors']=df.authors.apply( lambda x: [i['name'] for i in x] )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['co_authors']=df.authors.apply( lambda x: [i['name'] for i in 

Using pre-defined VAE FEATURES
['MESH NAME NOT FOUND: Male', 'MESH NAME NOT FOUND: Reinforcement, Psychology', 'MESH NAME NOT FOUND: Neural Networks, Computer', 'MESH NAME NOT FOUND: Claustrum', 'MESH NAME NOT FOUND: Illicit Drugs', 'MESH NAME NOT FOUND: Cyclophilin D', "MESH NAME NOT FOUND: Practice Patterns, Physicians'", 'MESH NAME NOT FOUND: Pyruvate Dehydrogenase (Acetyl-Transferring) Kinase', 'MESH NAME NOT FOUND: Psychology, Positive', 'MESH NAME NOT FOUND: Copper-Transporting ATPases', 'MESH NAME NOT FOUND: Retention, Psychology', 'MESH NAME NOT FOUND: CSK Tyrosine-Protein Kinase', 'MESH NAME NOT FOUND: Inhibition, Psychological', 'MESH NAME NOT FOUND: Chlorocebus aethiops', 'MESH NAME NOT FOUND: Recognition, Psychology', 'MESH NAME NOT FOUND: Lactate Dehydrogenase 5', 'MESH NAME NOT FOUND: Early Intervention, Educational', 'MESH NAME NOT FOUND: Broadly Neutralizing Antibodies', 'MESH NAME NOT FOUND: Heterogeneous Nuclear Ribonucleoprotein D0', "MESH NAME NOT FOUND: Practice Pa

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['co_authors']=df.authors.apply( lambda x: [i['name'] for i in x] )


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [42]:
# %%capture cap --no-stderr
for val in total_metric:
    case, f1, prec, rec, df_eval, num_cases, num_papers = val
    print("CASE: {}\tNumber of authors: {}\tAvg. Number of papers: {}".format(case,num_cases,num_papers))
    print("\nF1-Score: {}\tTotal Precision: {}\tTotal Recall: {}".format(f1,prec,rec))
    print(df_eval)
    print("\n")
# with open('txt/test_scores_VAE_model.txt', 'w') as out:
#    out.write(cap.stdout)

CASE: 2_da_same	Number of authors: 13	Avg. Number of papers: 14.846153846153847

F1-Score: 0.34386121603832154	Total Precision: 0.9017271157167529	Total Recall: 0.21243523316062174
                 7 cluster(s)  8 cluster(s)
mis_integration           0.5           0.5
mis_separation            0.5           0.5


CASE: 2_da_dif	Number of authors: 29	Avg. Number of papers: 27.896551724137932

F1-Score: 0.3895097156165883	Total Precision: 0.9351619675476289	Total Recall: 0.24598269468479603
                 6 cluster(s)  22 cluster(s)
mis_integration      0.666667       0.333333
mis_separation       0.666667       0.333333


CASE: 3_da	Number of authors: 37	Avg. Number of papers: 26.486486486486488

F1-Score: 0.3983377817423265	Total Precision: 0.8834113128049125	Total Recall: 0.2571428571428572
                 2 cluster(s)  4 cluster(s)  7 cluster(s)
mis_integration      0.142857      0.714286      0.142857
mis_separation       0.142857      0.714286      0.142857


CASE: 1_da	Number o