In [None]:
# Import the libraries used by the cells below.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json
# NOTE(review): this binds the top-level matplotlib package to the name
# `rcParams`; it is not matplotlib.rcParams, and no cell shown here uses it.
import matplotlib as rcParams
import matplotlib.pyplot as plt
# Apply the 'ggplot' style sheet to the figures drawn afterwards.
plt.style.use('ggplot')

In [None]:
# Load metadata.csv from the CORD-19 dataset into a DataFrame.
# NOTE(review): 'filepath/metadata.csv' looks like a placeholder path -- confirm
# it points at the local copy of the dataset before running.

meta_data = pd.read_csv('filepath/metadata.csv')


In [None]:
# Dimensions of the loaded metadata frame, as (rows, columns).
meta_data.shape

In [None]:
# Preprocessing: remove duplicated records.

# Show every row whose ('sha', 'title', 'abstract') triple also appears in another row.
print(meta_data.loc[meta_data.duplicated(subset=['sha', 'title', 'abstract'], keep=False)])
# Keep only the last occurrence of each duplicated triple.
meta_data = meta_data.drop_duplicates(subset=['sha', 'title', 'abstract'], keep='last')
print('Data Size after dropping duplicated data (based on abstract attribute):', meta_data.shape)

In [None]:
# Replace the null values of every column with a placeholder.
# By default the placeholder is the string 'No Information Available'.
def dealing_with_null_values(dataset, fill_value='No Information Available'):
    """Fill the null entries of every column and report how many were found.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to clean. It is modified in place and the same object is
        returned, so existing ``df = dealing_with_null_values(df)`` calls
        keep working.
    fill_value : object, optional
        Value written over each null entry. Defaults to
        ``'No Information Available'``.

    Returns
    -------
    pandas.DataFrame
        ``dataset`` itself, with its nulls replaced by ``fill_value``.
    """
    for column in dataset.columns:
        # Count the nulls with a vectorized reduction instead of a Python loop.
        null_count = int(dataset[column].isnull().sum())
        print("Num of null values (",column,"):",null_count)
        if null_count:
            # Non-inplace fillna upcasts the column when the placeholder does
            # not fit its dtype (e.g. a string in a float column).
            dataset[column] = dataset[column].fillna(fill_value)
    return dataset

# Replace every null in the metadata frame with the placeholder and report the per-column counts.
meta_data = dealing_with_null_values(meta_data)

In [None]:
#Document Vectorization using doc2vec from gensim

#import gensim and doc2vec
import gensim
from gensim.models import Doc2Vec

def doc2vec():
    """Embed every abstract in the global ``meta_data`` frame with Doc2Vec.

    Each abstract is tokenized with ``gensim.utils.simple_preprocess`` and
    tagged with its position in ``meta_data['abstract']``. TaggedDocument
    expects a list of tokens; a raw string would be iterated character by
    character, so the model would learn character vectors, not word vectors.

    Returns
    -------
    numpy.ndarray
        The trained document vectors, one row per abstract, in the order of
        ``meta_data['abstract']``.
    """
    document_tagged = [
        gensim.models.doc2vec.TaggedDocument(
            # str() keeps a stray non-string value (e.g. NaN) from crashing the tokenizer.
            gensim.utils.simple_preprocess(str(abstract)),
            [position],
        )
        for position, abstract in enumerate(meta_data['abstract'].values)
    ]

    # Passing the corpus to the constructor already trains the model, so the
    # explicit train() call that followed ran a second full training pass.
    # Build the vocabulary and train exactly once instead.
    d2v = Doc2Vec()
    d2v.build_vocab(document_tagged)
    d2v.train(document_tagged, epochs=d2v.epochs, total_examples=d2v.corpus_count)

    # gensim >= 4 exposes the document vectors as `dv.vectors`; earlier
    # releases expose them as `docvecs.vectors_docs`.
    if hasattr(d2v, 'dv'):
        return d2v.dv.vectors
    return d2v.docvecs.vectors_docs


In [None]:


# Train Doc2Vec on the abstract column and keep the returned document vectors
# (the call is timed with %time).
# NOTE: this rebinds the name `doc2vec` from the function to its return value,
# so re-running this cell fails unless the cell defining doc2vec() is re-run first.
%time doc2vec = doc2vec()



In [None]:
# Heatmap of the document-vector matrix.
import seaborn as sns
fig, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(doc2vec, cmap="coolwarm", ax=ax)


In [None]:
#for clustering the documents we use Kmeans as our clustering algorithm

# importing KMeans library of sklearn
from sklearn.cluster import KMeans

def kmeans(n_clusters):
    """Return an unfitted KMeans estimator with ``n_clusters`` clusters.

    The estimator is created with ``random_state=0``.
    """
    return KMeans(n_clusters=n_clusters, random_state=0)



In [None]:
X = doc2vec
kmeans5 = KMeans(5)

%time km5 = kmeans5.fit_predict(X)

In [None]:
kmeans6 = KMeans(6)
%time km6 = kmeans6.fit_predict(X)


In [None]:
kmeans7 = KMeans(7)
%time km7 = kmeans7.fit_predict(X)


In [None]:
kmeans10 = KMeans(10)
%time km10 = kmeans10.fit_predict(X)

In [None]:
kmeans4= KMeans(4)
%time km4 = kmeans4.fit_predict(X)

In [None]:
kmeans3= KMeans(3)
%time km3 = kmeans3.fit_predict(X)

In [None]:
kmeans2= KMeans(2)
%time km2 = kmeans2.fit_predict(X)

In [None]:
kmeans1= KMeans(1)
%time km1 = kmeans1.fit_predict(X)

In [None]:
#Further analysis of clusters
models = [kmeans1, kmeans2, kmeans3,kmeans4, kmeans5, kmeans6, kmeans7, kmeans10]
def plot_WCSS_BCSS(models, data):
    """Plot within- and between-cluster sums of squares for fitted KMeans models.

    Parameters
    ----------
    models : sequence of fitted estimators
        Each must expose ``inertia_`` (used as the WCSS) and ``n_clusters``
        (used as the bar position).
    data : array-like of shape (n_samples, n_features)
        The matrix the models were fitted on.

    Returns
    -------
    None
        Draws two bar charts side by side and shows the figure.
    """
    fig, ax = plt.subplots(1, 2, figsize=(12,5))

    ## Plot WCSS
    wcss = [mod.inertia_ for mod in models]
    # Read the cluster counts from the models so the x axis always matches
    # whatever list of models is passed in.
    n_clusts = [mod.n_clusters for mod in models]

    ax[0].bar(n_clusts, wcss,color='orange', edgecolor='black', linewidth=1)
    ax[0].set_xlabel('Number of clusters')
    ax[0].set_ylabel('WCSS')
    ax[0].set_title('Within Cluster Analysis')

    ## Plot BCSS
    # Total sum of squares = squared distance of every sample to the overall
    # centroid (per-feature mean). Measuring it against a single scalar mean
    # of all matrix entries inflates it, so BCSS = TSS - WCSS would be
    # nonzero even for one cluster.
    points = np.asarray(data, dtype=float)
    tss = float(np.sum((points - points.mean(axis=0)) ** 2))
    bcss = [tss - x for x in wcss]
    ax[1].bar(n_clusts, bcss,edgecolor='black')
    ax[1].set_xlabel('Number of clusters')
    ax[1].set_ylabel('BCSS')
    ax[1].set_title('Between Cluster Analysis')
    plt.show()


plot_WCSS_BCSS(models,X)


In [None]:
#Calculating Silhouette coefficients for choosing the number of clusters for our model
from sklearn.metrics import silhouette_score

def plot_silhouette(kms,data,nclusts):
    """Print and bar-plot the silhouette score of several clusterings.

    Parameters
    ----------
    kms : sequence of array-like
        One label array per clustering, each with one label per row of ``data``.
    data : array-like of shape (n_samples, n_features)
        The samples that were clustered.
    nclusts : sequence of int
        Number of clusters of each entry in ``kms``, in the same order; used
        as the bar positions.

    Returns
    -------
    None
        Prints each score and shows the bar chart.
    """
    silhouette = []
    plotted_counts = []
    for n_clusters, labels in zip(nclusts, kms):
        # silhouette_score raises when it is given fewer than two distinct
        # labels, so skip such clusterings and carry on with the rest.
        if np.unique(labels).size < 2:
            print("Silhouette score is undefined for", n_clusters, "cluster(s); skipping")
            continue
        score = silhouette_score(data, labels)
        print(score)
        silhouette.append(score)
        plotted_counts.append(n_clusters)

    plt.bar(plotted_counts, silhouette,color = 'green')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.show()



In [None]:
# Silhouette analysis reference:
#https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
# NOTE(review): km1 holds the labels of the single-cluster model. scikit-learn
# documents the silhouette coefficient as defined only for
# 2 <= n_labels <= n_samples - 1 -- confirm plot_silhouette copes with km1.
%time plot_silhouette([km1, km2, km3, km4,km5,km6,km7, km10],X,[1,2,3,4,5,6,7,10])

In [None]:
# Same plot restricted to 4, 7 and 10 clusters.
%time plot_silhouette([km4,km7,km10],X,[4,7,10])

In [None]:
# Same plot restricted to 3, 4 and 5 clusters.
%time plot_silhouette([km3,km4,km5],X,[3,4,5])

In [None]:
# Score of the 2-cluster model on its own.
%time plot_silhouette([km2],X,[2])

In [None]:
# Score of the 1-cluster model on its own (see the note on km1 above).
%time plot_silhouette([km1],X,[1])

In [None]:

# Silhoute constant of 2 clusters came out closed to +1
meta_data['cluster_doc2vec_kmeans2'] = kmeans2.labels_
pca = PCA(n_components=2).fit(X)
data = pca.transform(X)
centroids =  pca.transform(kmeans2.cluster_centers_)


In [None]:
pca = PCA(n_components=2).fit(X)
data = pca.transform(X)
centroids =  pca.transform(kmeans2.cluster_centers_)
plt.scatter(data[:, 0], data[:, 1],c = color)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='#000000')
plt.title("Doc2Vec Matrix with 2 clusters_2Dimension")
plt.show()

In [None]:
dataframe =pd.DataFrame()
dataframe['cluster'] = meta_data['cluster_doc2vec_kmeans2']
dataframe['x'] =data[:, 0]
dataframe['y'] =data[:, 1]

In [None]:
dataframe.dropna(inplace=True)


In [None]:
#final dataframe after clustering
dataframe

In [None]:
# Train a Ludwig text classifier that predicts which of the two clusters a paper belongs to.
# The encoder used is parallel-cnn.
# The inputs are the abstracts of all papers in the dataset; the target is the cluster number determined above.
# NOTE(review): no cell shown here writes datframe_with_two_clusters.csv -- confirm
# the file (with that exact spelling) exists before running this command.
!ludwig experiment \
  --data_csv datframe_with_two_clusters.csv \
  --model_definition_file model_definition_cord.yaml

In [None]:
#accuracy of training set = 88.1%
#accuracy of validation set = 74.1%
#accuracy of test set = 74.4%


In [None]:
# Visualizations: draw the learning curves from the training statistics saved by experiment_run_3.
!ludwig visualize -v learning_curves --training_statistics results/experiment_run_3/training_statistics.json


![title](img/Training_clusters_hitsatk.png)

![title](img/Training_clusters.png)

![title](img/training_clsuters_combines.png)

![title](img/Training_Clusters.png)

![title](img/training_clusters_combined.png)