In [None]:
### If you have the files on Google Drive
# We recommend using Google Colab
# Mount Google Drive at /content/drive so that the project folder stored on
# Drive can be reached through the runtime's file system.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
### Change the path to point to the source directory
# The cells below use paths relative to this folder (./embed_papers_hf.py,
# ../data/...), so the working directory has to be the project's `src` folder.
# %cd /YOUR_PATH/src
%cd ./src

/content/drive/.shortcut-targets-by-id/1VwEes_n1K_s8MWto1dlkl_2daHsitkQi/Fidelity/src


In [None]:
### Check that you are in the correct working directory
# The printed path should end in `/src`.
!pwd

/content/drive/.shortcut-targets-by-id/1VwEes_n1K_s8MWto1dlkl_2daHsitkQi/Fidelity/src


In [None]:
### STEP 1 - Convert CSV to JSON
import csv
import json


# Function to convert a CSV to JSON
# Takes the file paths as arguments
def make_json(csvFilePath, jsonFilePath):
    """Convert a CSV file into a JSON file keyed by the 'PaperID' column.

    Every CSV row becomes a dictionary (column name -> cell text) stored under
    the value of its 'PaperID' cell. If several rows share a PaperID, the last
    row wins.

    Args:
        csvFilePath: Path of the CSV file to read. It is decoded as
            'utf-8-sig', so a leading byte-order mark is stripped.
        jsonFilePath: Path of the JSON file to write (UTF-8, 4-space indent).

    Raises:
        FileNotFoundError: If csvFilePath cannot be opened or the folder of
            jsonFilePath does not exist.
        KeyError: If the CSV contains rows but no 'PaperID' column.
    """
    # newline='' is required by the csv module: the reader then handles line
    # endings itself, so line breaks inside quoted fields are kept unchanged.
    with open(csvFilePath, encoding='utf-8-sig', newline='') as csvf:
        # Assuming a column named 'PaperID' to be the primary key
        # -> Double check in the csv file that you load
        data = {row['PaperID']: row for row in csv.DictReader(csvf)}

    # Write the JSON directly to the file instead of building one large string.
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        json.dump(data, jsonf, indent=4)

        
# Decide the two file paths according to your
# computer system
# WARNING: Make sure these files exist and are pointing to the right location
import os

csvFilePath = r'../data/vhfc_vh_cluster_data_N32934.csv'
jsonFilePath = r'../data/vh_fidelity.json'

# The paths are relative to the current working directory (expected to be
# `src`). Fail early with the fully resolved location so a wrong working
# directory or a missing file is easy to diagnose.
if not os.path.isfile(csvFilePath):
    raise FileNotFoundError(
        "Input CSV not found: {} (current working directory: {})".format(
            os.path.abspath(csvFilePath), os.getcwd()
        )
    )

# Call the make_json function
make_json(csvFilePath, jsonFilePath)

FileNotFoundError: ignored

In [None]:
# Install dependencies
# NOTE(review): transformers is not imported in this notebook; presumably it is
# required by embed_papers_hf.py (STEP 2) — confirm.
!pip install transformers

In [None]:
# plotly provides plotly.express, used for the interactive scatter plots in STEP 7 and STEP 10.
!pip install plotly

In [None]:
# NOTE(review): the notebook imports `plotly.express` from the plotly package;
# confirm whether this separate plotly-express package is still needed.
!pip install plotly-express

In [None]:
# NOTE(review): pyyaml is pinned to 5.4.1 but never imported in this notebook;
# presumably the pin resolves a conflict with another dependency — confirm.
!pip install pyyaml==5.4.1

In [None]:
# check if GPU/CUDA is available
# WARNING: Under the current setup a GPU is required! Please make sure that you have access to one.
# nvidia-smi lists the NVIDIA GPUs visible to this runtime; if the command
# fails, no usable GPU is attached.
!nvidia-smi

In [None]:
### STEP 2 - Get the SPECTER Embeddings
# Run in terminal to get embeddings
# CUDA_VISIBLE_DEVICES=0 exposes only the first GPU to the script.
# --data-path is the JSON written in STEP 1; --output is the embeddings file
# that STEP 3 reads back as line-delimited JSON.
!CUDA_VISIBLE_DEVICES=0 python ./embed_papers_hf.py --data-path ../data/vh_fidelity.json --output ../data/vh_fidelity_embeddings.json

In [None]:
### STEP 3 - Import the data to pandas dataframe
import pandas as pd

# Meta data: the JSON from STEP 1 maps each PaperID to its record, so the frame
# is transposed to get one row per paper, then indexed by integer PaperID.
df_meta_data = (
    pd.read_json(r'../data/vh_fidelity.json', orient='records')
    .T
    .set_index('PaperID')
)
df_meta_data.index = df_meta_data.index.astype('int64')

# Specter embeddings: one JSON object per line, holding `Key` and `embedding`.
df_embeddings = pd.read_json(r'../data/vh_fidelity_embeddings.json', lines=True)

# prep the embeddings to be merged with meta data:
# expand each embedding list into one column per dimension and index by Key
df_embeddings_list = pd.DataFrame(df_embeddings['embedding'].tolist())
df_embeddings = pd.concat(
    [df_embeddings['Key'], df_embeddings_list], axis=1
).set_index('Key')

In [None]:
# check that everything looks good
# Expect one row per `Key` and one numbered column per embedding dimension.
df_embeddings


In [None]:
### STEP 4 - Combine the meta data and embeddings
# Column-wise concat aligned on the index; join='inner' keeps only the index
# values that occur in both the meta data and the embeddings.
df_data = pd.concat([df_meta_data, df_embeddings], axis=1, join='inner')
df_data

In [None]:
### STEP 5 - Get the tSNE 2d projections
from sklearn.manifold import TSNE

# Keep every `skip_factor`-th paper (1 = use all papers).
skip_factor = 1
# Take meta data and features from the SAME rows of the inner-joined frame.
# Reading the features from df_embeddings instead would attach projections to
# the wrong papers whenever the inner join in STEP 4 dropped or reordered rows.
df_projections_meta = df_data.iloc[::skip_factor, :]
features = df_projections_meta.loc[:, df_embeddings.columns]
tsne = TSNE(n_components=2, random_state=0)
projections = tsne.fit_transform(features)
df_projections = pd.DataFrame({'tSNE1': projections[:, 0], 'tSNE2': projections[:, 1]})
# reset_index gives both frames the same 0..n-1 index, so the column-wise
# concat pairs row i of the meta data with projection i.
df_projections_meta = df_projections_meta.reset_index()
df_embeddings_proj = pd.concat([df_projections_meta, df_projections], axis = 1)

In [None]:
### STEP 6 - Save the TSNE projections + rest of data
# to_csv also writes the row index as the first (unnamed) column by default.
df_embeddings_proj.to_csv(r'../data/fidelity_tsne.csv')

In [None]:
# Load the data to skip the initial steps if interested in alternative clustering and analysis methods
# index_col=0 turns the row index that to_csv() wrote as the first column back
# into the index; without it an extra 'Unnamed: 0' column shifts every column
# position (and therefore embedding_start_idx) by one.
# Note that column labels come back from CSV as strings ('0', '1', ...).
# df_embeddings_proj = pd.read_csv(r'../data/fidelity_tsne.csv', index_col=0)

In [None]:
# check everything looks good
# Expect the meta data columns, the numbered embedding columns, and tSNE1/tSNE2 at the end.
df_embeddings_proj

In [None]:
### STEP 7 - Visualize and Save
import numpy as np
import plotly.express as px

# remove empty records
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form acts on a temporary object, warns
# on recent pandas and has no effect under Copy-on-Write, which would leave the
# empty titles in the data.
df_embeddings_proj["Title"] = df_embeddings_proj["Title"].replace('', np.nan)
df_embeddings_proj = df_embeddings_proj.dropna(subset=["Title"])

# Interactive scatter of the 2D projection, coloured by publication year;
# hovering over a point shows the paper title.
fig = px.scatter(
    df_embeddings_proj, x='tSNE1', y='tSNE2',
    color = "Published Year", hover_data = ['Title']
)
fig.show()
# save interactive website
fig.write_html("../data/fidelity_data_visualization.html")

In [None]:
### STEP 8 - Identify the optimal k for k-means clustering
# Import kMeans
from sklearn.cluster import KMeans
# Import ElbowVisualizer
from yellowbrick.cluster import KElbowVisualizer

# Seed NumPy's global generator so the KMeans runs are repeatable.
np.random.seed(0)

model = KMeans()
# Make sure that this is the right index where the embeddings start in df_embeddings_proj -> Look for column with label '0'
embedding_start_idx = 13
df_elbow = df_embeddings_proj.iloc[:, embedding_start_idx:embedding_start_idx+768]
# Fail loudly when the hard-coded position is wrong. An out-of-range or shifted
# iloc slice does not raise by itself, so clustering could otherwise silently
# run on the wrong columns. The label is 0 in memory and '0' after a CSV reload.
if df_elbow.shape[1] != 768 or str(df_elbow.columns[0]) != '0':
    raise ValueError(
        "embedding_start_idx={} does not select the 768 embedding columns "
        "(selected {} columns starting at {!r}); look for the column labelled 0".format(
            embedding_start_idx,
            df_elbow.shape[1],
            df_elbow.columns[0] if df_elbow.shape[1] else None,
        )
    )
# k is range of number of clusters.
visualizer = KElbowVisualizer(model, k=(2,100), timings= True)
visualizer.fit(df_elbow.astype('float'))        # Fit data to visualizer
visualizer.show(outpath="../data/fidelity_elbow.pdf")        # Finalize and render figure

In [None]:
# Check that embeddings_start_idx is correct!
# The displayed slice should start at the column labelled 0 and be 768 columns
# wide. The value set here is the one STEP 9 uses.
embedding_start_idx = 13
df_embeddings_proj.iloc[:, embedding_start_idx:embedding_start_idx+768]

In [None]:
### STEP 9 - Finalize Clustering with Optimal k = X
# Seed NumPy's global generator, which KMeans draws from when it is given no
# random_state of its own.
np.random.seed(0)
# Optimal k from above figure in Elbow plot
k = 29
# Fit on the 768 embedding columns, then store each paper's cluster label.
km = KMeans(n_clusters=k).fit(
    df_embeddings_proj.iloc[:, embedding_start_idx:embedding_start_idx+768]
)
df_embeddings_proj['ClusterID'] = km.labels_

In [None]:
# Check that you added a last column with ClusterID
# ClusterID holds the k-means label assigned to each paper in STEP 9.
df_embeddings_proj.head()

In [None]:
# save the data
# Writes meta data, embeddings, tSNE coordinates and ClusterID to one CSV
# (the row index is written as the first column).
df_embeddings_proj.to_csv(r'../data/fidelity_cluster_data.csv')

In [None]:
### STEP 10 - Visualize and Save
import numpy as np
import plotly.express as px

# Order the rows by cluster and store ClusterID as a categorical column, so
# the plot gets one discrete colour (and legend entry) per cluster.
df_embeddings_proj = (
    df_embeddings_proj
    .sort_values(by=['ClusterID'])
    .assign(ClusterID=lambda frame: frame['ClusterID'].astype('category'))
)

# Scatter of the tSNE projection coloured by cluster; markers are enlarged and
# outlined so neighbouring clusters stay distinguishable.
fig = px.scatter(
    df_embeddings_proj,
    x='tSNE1',
    y='tSNE2',
    color='ClusterID',
    hover_data=['Title'],
    template='simple_white',
).update_traces(
    marker={'size': 12, 'line': {'width': 1, 'color': 'DarkSlateGrey'}},
    selector={'mode': 'markers'},
)
fig.show()
# save interactive website
fig.write_html("../data/fidelity_clusters.html")

In [None]:
# Derive and save wordclouds
# Wordclouds for Title and Abstracts
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import os
import re
import pandas as pd

# identify a set of stopwords that need to be removed to improve wordcloud quality
stop_words = ["virtual", "human", "agent", "ECA", "ECAs", "conversational", "humans", "agents", "user"] + list(STOPWORDS)

search_categories = ['Title', 'Abstract']

# Saving an image does not create missing folders, so make sure the target exists.
os.makedirs(r'../data/wordcloud', exist_ok=True)

# One image per (text column, cluster): cluster_<ClusterID>_<column>.png
for search_category in search_categories:
    for cluster_ID in range(k):
        temp_df = df_embeddings_proj.loc[df_embeddings_proj['ClusterID'] == cluster_ID]
        # Create and generate a word cloud image:
        # Missing cells are NaN (e.g. after reloading from CSV) and would make
        # str.join raise TypeError, so drop them and force text.
        all_text = ' '.join(temp_df[search_category].dropna().astype(str))
        wordcloud = WordCloud(width=1600, height=800, stopwords = stop_words, background_color="white").generate(all_text)
        wordcloud.to_file(r'../data/wordcloud/cluster_{}_{}.png'.format(cluster_ID, search_category))

In [None]:
# Wordclouds for Authors
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import os
import re
import pandas as pd

# Remove single letters and "Anonymous" as stopwords as well
# (range(97, 123) covers the lowercase letters 'a'..'z')
stop_words = list(map(chr, range(97, 123))) + ["Anonymous"] + list(STOPWORDS)

search_categories = ['Authors']

# Saving an image does not create missing folders, so make sure the target exists.
os.makedirs(r'../data/wordcloud', exist_ok=True)

# One image per cluster: cluster_<ClusterID>_Authors.png
for search_category in search_categories:
    for cluster_ID in range(k):
        temp_df = df_embeddings_proj.loc[df_embeddings_proj['ClusterID'] == cluster_ID]
        # Create and generate a word cloud image:
        # Missing cells are NaN (e.g. after reloading from CSV) and would make
        # str.join raise TypeError, so drop them and force text.
        all_text = ' '.join(temp_df[search_category].dropna().astype(str))
        wordcloud = WordCloud(width=1600, height=800, stopwords = stop_words, background_color="white").generate(all_text)
        wordcloud.to_file(r'../data/wordcloud/cluster_{}_{}.png'.format(cluster_ID, search_category))

In [None]:
# OPTIONAL - Automated Topic Identification
# pyate provides the term-extraction pipeline component used in the topic cell below.
!pip install pyate 

In [None]:
# spaCy runs the language pipeline that the term extraction is added to; -U upgrades an existing install.
!pip install -U spacy

In [None]:
# Download the English model that the topic cell loads via spacy.load("en_core_web_sm").
!spacy download en_core_web_sm

In [None]:
# Automatically derive cluster topics
import spacy
from pyate.term_extraction_pipeline import TermExtractionPipeline
# Imported here as well so this optional cell does not depend on the
# word-cloud cells having been run first.
from wordcloud import STOPWORDS
stop_words = list(STOPWORDS)

search_categories = ['Abstract']

# Importing TermExtractionPipeline above is what makes the "combo_basic"
# component name available to add_pipe.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("combo_basic")

# For every cluster, print the five highest-scoring candidate terms.
for search_category in search_categories:
    for cluster_ID in range(k):
        print('--- ClusterID ' + str(cluster_ID) + ' ---')
        temp_df = df_embeddings_proj.loc[df_embeddings_proj['ClusterID'] == cluster_ID]
        # Missing cells are NaN (e.g. after reloading from CSV) and would make
        # str.join raise TypeError, so drop them and force text.
        all_text = '; '.join(temp_df[search_category].dropna().astype(str))
        doc = nlp(all_text)
        print(doc._.combo_basic.sort_values(ascending=False).head(5))