In [None]:
import numpy as np
import zipfile
import pandas as pd
import glob
import json
import seaborn as sns
import spacy
import nltk
from IPython.core.display import HTML
from matplotlib import pyplot as plt

In [None]:
# Read the papers table from the project-relative CSV and preview its first rows.
csv_path = "Dados/corona_df_completo.csv"
dados = pd.read_csv(csv_path)
dados.head()

In [None]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
dados.shape

In [None]:
# Count the missing values per column to see what the upcoming `dropna()` will
# remove. (Previously this cell called `dados.dropna()` and discarded the result,
# which only echoed a large frame that the next cell recomputes anyway.)
dados.isna().sum()

In [None]:
# Keep only the rows that have a value in every column, then report the
# (rows, columns) of the cleaned frame.
dados_covid = dados.dropna(axis="index", how="any")
dados_covid.shape

In [None]:
# Truncate long text cells to 100 characters when frames are displayed.
# `pd.set_option` returns None, so its result is not assigned to anything
# (the old `df = ...` only created a misleading `df` name bound to None).
pd.set_option("display.max_colwidth", 100)
dados_covid.head()

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Corpus for the vectorizer: one entry per row of the cleaned frame's "text" column.
dataset = dados_covid["text"].tolist()

# Preview only a few documents; echoing the whole list would write every
# document's full text into the notebook output.
dataset[:3]

In [None]:
# Number of documents that will be vectorized.
len(dataset)

In [None]:
# 2**12 = 4096; the same expression is used as `max_features` for the TF-IDF vectorizer.
2**12

In [None]:
# Fit a TF-IDF vectorizer limited to the 2**12 highest-ranked terms and
# transform the corpus in the same pass; `vect` is the resulting matrix.
max_vocab = 2 ** 12
tfidf = TfidfVectorizer(max_features=max_vocab)
vect = tfidf.fit_transform(dataset)
vect

# PCA

# Redução de dimensionalidade

In [None]:
from sklearn.decomposition import PCA

# Project the TF-IDF vectors onto their first two principal components so the
# documents can be drawn on a 2-D scatter plot.
# `random_state` is fixed because scikit-learn may pick the randomized SVD
# solver for large inputs, which would otherwise give slightly different
# coordinates on every run.
pca = PCA(n_components = 2, random_state = 0)

# PCA is given a dense array here, so the TF-IDF matrix is densified first.
x_pca = pca.fit_transform(vect.toarray())

# Fraction of the total variance captured by each of the two components.
comp = pca.explained_variance_ratio_
comp

In [None]:
# Shape of the projected data; the second dimension equals the number of PCA components.
x_pca.shape

In [None]:
# Default figure size for the seaborn plots in this notebook.
sns.set(rc={'figure.figsize': (10,8)})

# Plot the two principal components against each other.
# x/y are passed by keyword: positional x/y emits a FutureWarning on
# seaborn 0.11 and raises TypeError on seaborn >= 0.12.
sns.scatterplot(x=x_pca[:, 0], y=x_pca[:, 1])
plt.title('Covid-19 Papers', fontsize=20);

# K-Means

# Definição do número de clusters

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Elbow method: fit one MiniBatchKMeans model per candidate cluster count
# (1 to 20) on the TF-IDF matrix and collect each model's inertia (WCSS).
cluster_counts = range(1, 21)
wcss = [
    MiniBatchKMeans(n_clusters=n_clusters, random_state=0).fit(vect).inertia_
    for n_clusters in cluster_counts
]

# WCSS as a function of the number of clusters.
plt.plot(cluster_counts, wcss)
plt.xlabel('Número de cluster')
plt.ylabel('WCSS')

In [None]:
# Number of clusters used for the final model.
k = 10

# `random_state` is fixed (same seed as the elbow-method cell) so that cluster
# ids, and therefore the colours in the plots below, are reproducible across runs.
kmeans = MiniBatchKMeans(n_clusters = k, random_state = 0)

# Cluster id assigned to each document (one entry per row of `vect`).
y_pred = kmeans.fit_predict(vect)
y_pred

In [None]:
# Distinct cluster ids that were actually assigned to at least one document.
np.unique(y_pred)

In [None]:
# Number of cluster assignments (one per vectorized document).
len(y_pred)

In [None]:
# One distinct colour per cluster id that actually occurs in `y_pred`.
palette = sns.color_palette('bright', len(set(y_pred)))

# Same PCA scatter as before, now coloured by cluster.
# x/y are passed by keyword: positional x/y emits a FutureWarning on
# seaborn 0.11 and raises TypeError on seaborn >= 0.12.
sns.scatterplot(x=x_pca[:, 0], y=x_pca[:, 1], hue=y_pred, legend='full', palette=palette)
plt.title('Clustered Covid-19 Papers');

In [None]:
# Based on: https://www.kaggle.com/maksimeren/covid-19-literature-clustering
#
# Interactive Bokeh version of the clustered PCA scatter plot:
#   * hovering a point shows the paper's title and abstract,
#   * a radio-button group shows a single cluster (or all of them),
#   * a text box keeps only the papers whose title/abstract contains a keyword.
#
# NOTE(review): `plot_width`/`plot_height`, `legend=` and `widgetbox` are the
# spellings of older Bokeh releases, which this notebook targets (it imports
# `widgetbox`) — confirm against the installed Bokeh version before upgrading.

from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper, CustomJS
from bokeh.palettes import Category20
from bokeh.transform import linear_cmap
from bokeh.io import output_file, show
from bokeh.transform import transform
from bokeh.io import output_notebook
from bokeh.plotting import figure
from bokeh.layouts import column
from bokeh.models import RadioButtonGroup
from bokeh.models import TextInput
from bokeh.layouts import gridplot
from bokeh.models import Div
from bokeh.models import Paragraph
from bokeh.layouts import column, widgetbox

output_notebook()
y_labels = y_pred

# data sources
# Titles and abstracts are taken from `dados_covid`, the cleaned frame that the
# TF-IDF matrix (and therefore `x_pca` and `y_pred`) was built from. Using the
# raw `dados` frame would give columns of a different length whenever dropna()
# removed rows, and would pair points with the wrong papers.
# The text columns are converted to plain Python strings so the JavaScript
# callbacks can call `.includes()` on every entry.
source = ColumnDataSource(data=dict(
    x= x_pca[:,0],
    y= x_pca[:,1],
    # untouched copies of the coordinates, used by the callbacks to restore points
    x_backup = x_pca[:,0],
    y_backup = x_pca[:,1],
    desc= y_labels,
    titles= dados_covid['title'].astype(str).tolist(),
    abstract = dados_covid['abstract'].astype(str).tolist(),
    labels = ["C-" + str(x) for x in y_labels]
    ))

# hover over information
hover = HoverTool(tooltips=[
    ("Title", "@titles{safe}"),
    ("Abstract", "@abstract{safe}"),
],
                 point_policy="follow_mouse")

# map colors
mapper = linear_cmap(field_name='desc',
                     palette=Category20[20],
                     low=int(min(y_labels)), high=int(max(y_labels)))

# prepare the figure
p = figure(plot_width=800, plot_height=800,
           tools=[hover, 'pan', 'wheel_zoom', 'box_zoom', 'reset'],
           title="Covid-19 Papers",
           toolbar_location="right")

# plot
p.scatter('x', 'y', size=5,
          source=source,
          fill_color=mapper,
          line_alpha=0.3,
          line_color="black",
          legend = 'labels')

# callback for the cluster selector
# The button index equals the cluster id, and the last button is "All".
# Every variable is declared with `var` so the code does not rely on implicit
# globals (which fail when the callback is executed in strict mode).
callback = CustomJS(args=dict(p=p, source=source), code="""

            var radio_value = cb_obj.active;
            var all_index = cb_obj.labels.length - 1;
            var data = source.data;

            var x = data['x'];
            var y = data['y'];

            var x_backup = data['x_backup'];
            var y_backup = data['y_backup'];

            var labels = data['desc'];
            var i;

            if (radio_value == all_index) {
                for (i = 0; i < x.length; i++) {
                    x[i] = x_backup[i];
                    y[i] = y_backup[i];
                }
            }
            else {
                for (i = 0; i < x.length; i++) {
                    if (labels[i] == radio_value) {
                        x[i] = x_backup[i];
                        y[i] = y_backup[i];
                    } else {
                        x[i] = undefined;
                        y[i] = undefined;
                    }
                }
            }

            source.change.emit();
        """)

# callback for searchbar
# Keeps only the points whose abstract or title contains the typed text
# (case-sensitive substring match); an empty search string matches everything.
keyword_callback = CustomJS(args=dict(p=p, source=source), code="""

            var text_value = cb_obj.value;
            var data = source.data;

            var x = data['x'];
            var y = data['y'];

            var x_backup = data['x_backup'];
            var y_backup = data['y_backup'];

            var abstract = data['abstract'];
            var titles = data['titles'];
            var i;

            for (i = 0; i < x.length; i++) {
                if (abstract[i].includes(text_value) ||
                    titles[i].includes(text_value)) {
                    x[i] = x_backup[i];
                    y[i] = y_backup[i];
                } else {
                    x[i] = undefined;
                    y[i] = undefined;
                }
            }

            source.change.emit();
        """)

# option
# One button per cluster (derived from `k`, so it always matches the model),
# followed by "All", which is selected initially.
cluster_buttons = ["C-" + str(c) for c in range(k)]
option = RadioButtonGroup(labels=cluster_buttons + ["All"],
                          active=len(cluster_buttons))
# `js_on_change` is used instead of the widget `callback=` keyword, which only
# exists on Bokeh 1.x.
option.js_on_change('active', callback)

# search box
keyword = TextInput(title="Search:")
keyword.js_on_change('value', keyword_callback)

#header
header = Div(text="""<h1>Covid-19 Papers</h1>""")

# show
show(column(header, widgetbox(option, keyword),p))