# Features de TextMining:

---------------------------------

In [1]:
import os
import sys
import re
import datetime
import dateutil

sys.path.insert(0,os.path.dirname(os.getcwd()))
sys.path.insert(0,os.path.join(os.getcwd(),'grobid'))
sys.path.insert(0,os.getcwd())

import numpy as np
import pandas as pd

from grobid import grobid_client
import grobid_tei_xml
from grobid_to_dataframe import grobid_cli, xmltei_to_dataframe

import plotly

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from pyvis.network import Network
import nltk

import random

import plotly.graph_objects as go

import networkx as nx

!pip install markupsafe==2.0.1

In [2]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

---------------------------------

### Comandos Docker

docker run -t --rm --init -p 8080:8070 -p 8081:8071 --memory="9g" lfoppiano/grobid:0.7.0

docker run -t --rm --init -p 8080:8070 -p 8081:8071 lfoppiano/grobid:0.6.2

### Definindo variáveis e caminhos

In [3]:
# Project root: the parent directory of the current working directory.
path = os.path.dirname(os.getcwd())
# Folder <root>/artifacts/articles/ml_material/teste.
# NOTE(review): not referenced by the later cells shown here, which use `input_folder_path` instead.
path_input = os.path.join(path,'artifacts','articles','ml_material','teste')

---------------------------------

### Funções para execução em batch

In [4]:
def get_path(path_input_path):
    """Return ``path_input_path`` when it exists on disk, else the current working directory."""
    return path_input_path if os.path.exists(path_input_path) else os.getcwd()


def batch_process_path(path_input_path, n_workers=2,
                       check_cache=True,
                       cache_folder_name='summarticles_cache',
                       config_path="./grobid/config.json"):
    """Process every PDF under ``path_input_path`` with GROBID's full-text service.

    Args:
        path_input_path: folder handed to ``grobid_cli.process_pdfs`` as ``input_path``.
        n_workers: number of workers forwarded to the client.
        check_cache: forwarded as-is to ``process_pdfs``.
        cache_folder_name: forwarded as-is to ``process_pdfs``.
        config_path: configuration file used to build the ``grobid_cli`` client.

    Returns:
        Whatever ``grobid_cli.process_pdfs`` returns for the batch.
    """
    # Fixed extraction options used for every batch.
    processing_options = dict(service="processFulltextDocument",
                              generateIDs=True,
                              include_raw_citations=True,
                              include_raw_affiliations=True,
                              consolidate_header=False,
                              consolidate_citations=False,
                              tei_coordinates=False,
                              segment_sentences=True,
                              verbose=True)

    client = grobid_cli(config_path=config_path)
    return client.process_pdfs(input_path=path_input_path,
                               check_cache=check_cache,
                               cache_folder_name=cache_folder_name,
                               n_workers=n_workers,
                               **processing_options)


def get_dataframes(result_batch):
    """Turn a GROBID batch result into dataframes.

    Delegates to ``xmltei_to_dataframe.get_dataframe_articles`` and returns its
    two results as a ``(dataframes, errors)`` tuple.
    """
    converter = xmltei_to_dataframe()
    frames, errors = converter.get_dataframe_articles(result_batch)
    return frames, errors


def files_path(path):
    """Return the full paths of the regular files directly inside ``path`` (sub-folders are skipped)."""
    candidates = (os.path.join(path, entry) for entry in os.listdir(path))
    return [candidate for candidate in candidates if os.path.isfile(candidate)]

In [5]:
def run_batch_process(path_input, n_workers=6, check_cache=True, 
                      cache_folder_name='summarticles_cache', 
                      config_path="./grobid/config.json"):
    """Run the GROBID pipeline on a folder and collect execution metadata.

    Args:
        path_input: folder containing the articles to process.
        n_workers: number of workers used for the batch.
        check_cache: forwarded to ``batch_process_path``.
        cache_folder_name: cache folder name, used both for processing and
            for saving the resulting XML-TEI files.
        config_path: GROBID client configuration file; resolved to an
            absolute path (the default resolves to ``<cwd>/grobid/config.json``).

    Returns:
        ``(dict_dfs, dict_exec, dic_errors)``: the dataframes, a dict with
        execution metadata (paths, file list, worker count, timings) and the
        errors reported by the dataframe conversion.
    """
    dict_exec = {'path':path_input}
    dict_exec['start_datetime'] = datetime.datetime.now()

    # Honour the caller's config_path instead of silently overwriting it.
    config_path = os.path.abspath(config_path)
    dict_exec['grobid_config'] = config_path

    gcli = grobid_client.GrobidClient(config_path=config_path, check_server=False)

    dict_exec['files'] = gcli.get_input_files(path_input)
    dict_exec['num_files'] = len(dict_exec['files'])
    dict_exec['n_workers'] = n_workers

    # Falls back to the current working directory when path_input does not exist.
    path_input_path = get_path(path_input)
    result_batch = batch_process_path(path_input_path,
                                      n_workers=dict_exec['n_workers'],
                                      check_cache=check_cache,
                                      cache_folder_name=cache_folder_name,
                                      config_path=config_path)
    dict_dfs, dic_errors = get_dataframes(result_batch)

    # Save next to the folder that was actually processed; the previous code
    # read a notebook-level global (`input_folder_path`) here.
    gcli.save_xmltei_files(result_batch, path_input_path, cache_folder_name=cache_folder_name)

    dict_exec['end_datetime'] = datetime.datetime.now()
    elapsed = dict_exec['end_datetime'] - dict_exec['start_datetime']
    dict_exec['time_exec_sec'] = int(elapsed.total_seconds())
    dict_exec['time_exec_min'] = elapsed.total_seconds() / 60

    return dict_dfs, dict_exec, dic_errors

In [6]:
# Folder with the article PDFs to process.
# NOTE(review): machine-specific absolute Windows path; the notebook will not run elsewhere
# without editing it. Consider deriving it from the project root instead.
input_folder_path = r"""C:\Users\vierb\OneDrive\Área de Trabalho\Projetos\PGC\artifacts\articles\ml_material"""

In [7]:
%%time
# Run the whole pipeline on `input_folder_path` with 10 workers, reusing cached results.
# Returns the dataframes, the execution metadata and the conversion errors.
dict_dfs, dict_exec, dic_errors = run_batch_process(path_input=input_folder_path, 
                                                    n_workers=10, 
                                                    check_cache=True, 
                                                    cache_folder_name='summarticles_cache', 
                                                    config_path="./grobid/config.json")

GROBID server is up and running
587 files to process in current batch
[Input Files] 587
[Cache Files] 587
In the end, we have: 0  new files to process!
And we have : 587  files to back from cache!
Processed articles: 581
Number articles with errors: 6
Wall time: 33.6 s


---------------------------------

### Trabalhando no tratamento do texto

In [8]:
# NOTE(review): nltk is already imported in the first cell; this import is redundant.
import nltk
# Alternative NLP libraries, currently unused:
#import spacy
#import corenlp
#import textblob
#import gensim
#import transformers

# Download the NLTK resources by name (no-op when already present).
# Presumably needed by the `text` module imported below - TODO confirm.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

--------------------------------

In [9]:
from text import text_prep, text_mining, text_viz

In [10]:
# Text-preparation helper from the project-local `text` module.
tprep = text_prep()

In [11]:
# Add a prepared "<column>_prep" version of each free-text column of df_doc_info.
for text_column in ('acknowledgement', 'abstract', 'body'):
    dict_dfs['df_doc_info'][text_column + '_prep'] = tprep.text_preparation_column(dict_dfs['df_doc_info'][text_column])

--------------------------------

Criando BOW e TFIDF:

In [12]:
# Text-mining helper from the project-local `text` module (used below for BOW / TF-IDF).
tmining = text_mining()

In [13]:
# One string per article; missing texts become a single space so every article keeps its row.
documents_abs = dict_dfs['df_doc_info']['abstract_prep'].fillna(' ').tolist()
documents_body = dict_dfs['df_doc_info']['body_prep'].fillna(' ').tolist()

In [14]:
# TF-IDF matrices for the abstracts and for the bodies.
# NOTE(review): despite its name, `df_tfidf_abstract_body` is built from the BODY texts.
df_tfidf_abstract_abs = tmining.get_df_tfidf(documents_abs)
df_tfidf_abstract_body = tmining.get_df_tfidf(documents_body)

In [15]:
# Bag-of-words matrices for the abstracts and for the bodies.
# NOTE(review): despite its name, `df_bow_abstract_body` is built from the BODY texts.
df_bow_abstract_abs = tmining.get_df_bow(documents_abs)
df_bow_abstract_body = tmining.get_df_bow(documents_body)

In [16]:
# Preview of the abstract TF-IDF matrix.
df_tfidf_abstract_abs.head()

Unnamed: 0,aa,aa aluminum,aare,ab,ab initio,abaqus,abilities,ability,ability feasible,ability gfa,...,zirconia,zn,zn alloy,zn coat,zncl,zone,zone fz,zr,zr hf,zro
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09396,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Preview of the body bag-of-words matrix.
df_bow_abstract_body.head()

Unnamed: 0,aa,aa aa,aa ab,aa alloy,aa aluminium,aa aluminum,aa bb,aa sample,aa solution,aa vector,...,zunger,zunger pseudopotential,zuo,zuo et,zwick,zwickroell,zx,zy,zz,zz xx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,4,4,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# (rows, columns) of the body bag-of-words matrix.
df_bow_abstract_body.shape

(581, 176907)

--------------------------------

---------------------------------------------

Network graph with Plotly and NetworkX

In [19]:
class graph_plotly_networkx(object):
    """Build Plotly traces (edges, nodes and text labels) for a NetworkX graph.

    Every method reads the position of a node from ``G.nodes[node]['pos']``,
    which must unpack into an ``(x, y)`` pair.
    """

    # Kept for backward compatibility (exposes ``graph_plotly_networkx.go`` / ``.nx``).
    # The methods below resolve ``go`` from the module-level import, not from here.
    import plotly.graph_objects as go
    import networkx as nx

    def __init__(self):
        pass


    @staticmethod
    def _edge_xy(G, edge):
        """Return ``([x0, x1, None], [y0, y1, None])`` for one edge; ``None`` breaks the line."""
        x0, y0 = G.nodes[edge[0]]['pos']
        x1, y1 = G.nodes[edge[1]]['pos']
        return [x0, x1, None], [y0, y1, None]


    @staticmethod
    def _node_colorbar(title):
        """Colorbar settings shared by the node traces."""
        # ``title=dict(text, side)`` replaces the deprecated ``titleside`` attribute.
        return dict(thickness=15,
                    title=dict(text=title, side='right'),
                    xanchor='left')


    def plot_add_edges(self, G, color='#888', width=0.5):
        """Return one line trace per edge of ``G``.

        Args:
            G: graph whose nodes carry a ``'pos'`` attribute.
            color: line colour used for every edge.
            width: line width used for every edge.

        Returns:
            list of ``go.Scatter`` traces, in ``G.edges()`` order.
        """
        list_edge_traces = []
        for edge in G.edges():
            edge_x, edge_y = self._edge_xy(G, edge)
            list_edge_traces.append(go.Scatter(x=edge_x, y=edge_y,
                                               line=dict(width=width, color=color),
                                               hoverinfo='none',
                                               mode='lines',
                                               line_shape='spline'))
        return list_edge_traces


    def plot_add_all_edges(self, G, color='#888', width=0.5):
        """Return a single line trace containing every edge of ``G``.

        The segments are separated by ``None`` so that they are not joined.
        """
        edge_x = []
        edge_y = []
        for edge in G.edges():
            xs, ys = self._edge_xy(G, edge)
            edge_x.extend(xs)
            edge_y.extend(ys)

        return go.Scatter(x=edge_x, y=edge_y,
                          line=dict(width=width, color=color),
                          hoverinfo='none',
                          mode='lines',
                          line_shape='spline')


    def plot_add_nodes(self, G):
        """Return one marker trace per node of ``G`` (fixed size 10, empty colour list)."""
        list_nodes_traces = []
        for node in G.nodes():
            x, y = G.nodes[node]['pos']
            # The former ``width=2`` entry was dropped: it is not a scatter marker
            # property; the outline width is set through ``line_width``.
            node_trace = go.Scatter(x=[x],
                                    y=[y],
                                    mode='markers',
                                    hoverinfo='text',
                                    marker=dict(showscale=True,
                                                colorscale='YlGnBu',
                                                reversescale=True,
                                                color=[],
                                                size=10,
                                                colorbar=self._node_colorbar('Node Connections'),
                                                line_width=2))
            list_nodes_traces.append(node_trace)
        return list_nodes_traces


    def plot_add_all_nodes(self, G, size_list, color_list, text, opacity, colorbar_title="Node Connections"):
        """Return a single marker trace containing every node of ``G``.

        Args:
            G: graph whose nodes carry a ``'pos'`` attribute.
            size_list: marker size(s), passed to ``marker.size``.
            color_list: marker colour value(s), mapped through the 'YlGnBu' colour scale.
            text: hover text, passed to the trace ``text``.
            opacity: opacity of the whole trace.
            colorbar_title: title shown next to the colour bar.
        """
        node_x = []
        node_y = []
        for node in G.nodes():
            x, y = G.nodes[node]['pos']
            node_x.append(x)
            node_y.append(y)

        # https://plotly.com/python-api-reference/generated/plotly.graph_objects.scatter.html#plotly.graph_objects.scatter.Marker
        return go.Scatter(x=node_x,
                          y=node_y,
                          mode='markers',
                          hoverinfo='text',
                          text=text,
                          opacity=opacity,
                          marker=dict(showscale=True,
                                      colorscale='YlGnBu',
                                      reversescale=True,
                                      color=color_list,
                                      size=size_list,
                                      colorbar=self._node_colorbar(colorbar_title),
                                      line_width=2))


    def plot_add_all_text(self, G, size_list, color_list, text, opacity, font='Arial'):
        """Return one text trace per node of ``G``.

        ``size_list``, ``color_list``, ``text`` and ``opacity`` are indexed by
        the position of the node in ``G.nodes()``, so each needs at least one
        entry per node.
        """
        list_text_traces = []
        for i, node in enumerate(G.nodes()):
            x, y = G.nodes[node]['pos']
            text_trace = go.Scatter(x=[x],
                                    y=[y],
                                    mode='text',
                                    text=[text[i]],
                                    opacity=opacity[i],
                                    textfont=dict(color=[color_list[i]],
                                                  family=[font],
                                                  size=size_list[i]))
            list_text_traces.append(text_trace)

        return list_text_traces

Create random graph

In [20]:
# Named colours as a pandas Series (lower-cased, stripped), so that one can be drawn with `.sample(1)`.
list_colours = """aliceblue,antiquewhite,aqua,aquamarine,azure,beige,bisque,black,blanchedalmond,blue,blueviolet,brown,burlywood,cadetblue,chartreuse,chocolate,coral,cornflowerblue,cornsilk,crimson,cyan,darkblue,darkcyan,darkgoldenrod,darkgray,darkgrey,darkgreen,darkkhaki,darkmagenta,darkolivegreen,darkorange,darkorchid,darkred,darksalmon,darkseagreen,darkslateblue,darkslategray,darkslategrey,darkturquoise,darkviolet,deeppink,deepskyblue,dimgray,dimgrey,dodgerblue,firebrick,floralwhite,forestgreen,fuchsia,gainsboro,ghostwhite,gold,goldenrod,gray,grey,green,greenyellow,honeydew,hotpink,indianred,indigo,ivory,khaki,lavender,lavenderblush,lawngreen,lemonchiffon,lightblue,lightcoral,lightcyan,lightgoldenrodyellow,lightgray,lightgrey,lightgreen,lightpink,lightsalmon,lightseagreen,lightskyblue,lightslategray,lightslategrey,lightsteelblue,lightyellow,lime,limegreen,linen,magenta,maroon,mediumaquamarine,mediumblue,mediumorchid,mediumpurple,mediumseagreen,mediumslateblue,mediumspringgreen,mediumturquoise,mediumvioletred,midnightblue,mintcream,mistyrose,moccasin,navajowhite,navy,oldlace,olive,olivedrab,orange,orangered,orchid,palegoldenrod,palegreen,paleturquoise,palevioletred,papayawhip,peachpuff,peru,pink,plum,powderblue,purple,red,rosybrown,royalblue,saddlebrown,salmon,sandybrown,seagreen,seashell,sienna,silver,skyblue,slateblue,slategray,slategrey,snow,springgreen,steelblue,tan,teal,thistle,tomato,turquoise,violet,wheat,white,whitesmoke,yellow,yellowgreen""".split(',')
list_colours = [str(i).lower().strip() for i in list_colours]
list_colours = pd.Series(list_colours)

In [21]:
# Demo graph: 200 random points, connected when closer than 0.125.
# The seed is fixed so that Restart & Run All always draws the same graph.
G = nx.random_geometric_graph(200, 0.125, seed=42)

gpn = graph_plotly_networkx()

# Degree of every node, in G.adjacency() order.
node_adjacencies = [len(neighbours) for _, neighbours in G.adjacency()]
node_text = ['Connections: '+str(degree) for degree in node_adjacencies]
node_size = [2*degree for degree in node_adjacencies]

# Label font size: squared marker size, rescaled to the range 2..22.
node_size_reforced = [s**2 for s in node_size]
largest_reforced = max(node_size_reforced) or 1   # `or 1` avoids dividing by zero on an edgeless graph
node_size_reforced = [np.ceil(20*(s/largest_reforced)) + 2 for s in node_size_reforced]

# Label opacity grows with the degree of the node (0..1).
largest_size = max(node_size) or 1
opacity_list = [round(o/largest_size,3) for o in node_size]

edge_traces = gpn.plot_add_all_edges(G)
node_trace = gpn.plot_add_all_nodes(G, node_size, node_adjacencies, node_text, 1)
text_trace = gpn.plot_add_all_text(G, 
                                   node_size_reforced,
                                   ['black' for i in node_size],
                                   len(node_size)*['Testing Words'],
                                   opacity_list)

Create Network Graph

In [22]:
# Assemble the demo figure: edges first, nodes on top, then one text trace per node.
fig = go.Figure(data=[edge_traces, node_trace],
                layout=go.Layout(
                    # `title=dict(text, font)` replaces the deprecated `titlefont_size`.
                    title=dict(text='<br>Network Graph', font=dict(size=12)),
                    height=750,
                    width=None,
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=1,l=1,r=1,t=1), # b=20,l=5,r=5,t=40
                    annotations=[dict(text="Graph <a href=''>Link</a>",
                                      showarrow=False,
                                      xref="paper", yref="paper",
                                      x=0.005,
                                      y=-0.002)],
                    xaxis=dict(showgrid=False,
                               zeroline=False,
                               showticklabels=False),
                    yaxis=dict(showgrid=False,
                               zeroline=False,
                               showticklabels=False),)
                )

for trace in text_trace:
    fig.add_trace(trace)

fig.show()

--------------------------------

Making a Graph: CO-CITATION

In [23]:
def getColumnsWithData(df, return_percent=False, n_round=2):
    """List the columns of ``df`` that hold at least one non-null value.

    Args:
        df: dataframe to inspect.
        return_percent: when True, return ``(column, fraction)`` tuples, where
            ``fraction`` is the share of non-null rows (between 0 and 1).
        n_round: number of decimals used to round ``fraction``.

    Returns:
        A list of column names, or of ``(column, fraction)`` tuples when
        ``return_percent`` is True. Empty when ``df`` has no rows.
    """
    n_rows = len(df)
    list_col_with_data = []
    if n_rows == 0:
        # No rows means no column can contain data (and avoids dividing by zero).
        return list_col_with_data

    for col in df.columns.tolist():
        n_null = int(df[col].isnull().sum())
        if n_null == n_rows:
            continue
        if return_percent:
            not_null_data_perc = 1 - n_null / n_rows
            list_col_with_data.append((col, float(np.round(not_null_data_perc, n_round))))
        else:
            list_col_with_data.append(col)

    return list_col_with_data

In [24]:
def _only_columns_with_data(frame):
    """Return ``frame`` restricted to the columns selected by getColumnsWithData."""
    return frame.loc[:, getColumnsWithData(frame)]


# Drop the columns without data from each of the dataframes used below.
df_doc_info = _only_columns_with_data(dict_dfs['df_doc_info'])
df_doc_head = _only_columns_with_data(dict_dfs['df_doc_head'])
df_doc_authors = _only_columns_with_data(dict_dfs['df_doc_authors'])
df_doc_authors_citations = _only_columns_with_data(dict_dfs['df_doc_authors_citations'])

In [25]:
# Keep only the name columns and turn the index into regular columns.
# Assumes reset_index() yields an 'article_id' column (used below) - TODO confirm.
df_authors_citations = df_doc_authors_citations.loc[:,['full_name_citation']].reset_index()
df_authors = df_doc_authors.loc[:,['full_name_author']].reset_index()

# One row per (article, name): a name repeated inside the same article counts once.
df_authors_citations = df_authors_citations.drop_duplicates(subset=['article_id','full_name_citation'])
df_authors = df_authors.drop_duplicates(subset=['article_id','full_name_author'])

# Pair every author of an article with every cited name of that same article.
df_authors_and_citations = df_authors.merge(df_authors_citations, on='article_id')

# Number of merged rows (i.e. articles) for each (author, cited name) pair.
df_aux = df_authors_and_citations.groupby(by=['full_name_author',
                                              'full_name_citation'],
                                          as_index=False)['article_id'].count()

# Final edge table with columns: author, citation, count.
df_authors_and_citations = df_aux.rename(columns={'article_id':'count',
                                                  'full_name_author':'author',
                                                  'full_name_citation':'citation'})

In [26]:
# Empty undirected graph for the author/citation network.
# NOTE: rebinds `G`, which earlier held the random demo graph.
G = nx.Graph(name="Graph", title="Graph with something...")

In [27]:
# Creating nodes for graph
nodes_author = df_authors_and_citations.author.tolist()
nodes_citation = df_authors_and_citations.citation.tolist()

nodes_author = pd.value_counts(nodes_author)
nodes_author = pd.DataFrame({'node':nodes_author.index.tolist(),
                             'size':nodes_author.values.tolist()})

nodes_citation = pd.value_counts(nodes_citation)
nodes_citation = pd.DataFrame({'node':nodes_citation.index.tolist(),
                               'size':nodes_citation.values.tolist()})

# Select top 10 based in size 
nodes_citation = nodes_citation.nlargest(10, columns='size')

list_cols_edges = ['author','citation','count']
edges = df_authors_and_citations.loc[:,list_cols_edges].copy()

# Select most relevant edges by edge count
filtro_edges = edges.citation.isin(nodes_citation.node.tolist()) # edges.author.isin(nodes.node.tolist()) &
edges = df_authors_and_citations.loc[filtro_edges].copy()
edges = edges.nlargest(100, "count")


final_nodes_author = edges.author.tolist()
final_nodes_author = pd.value_counts(final_nodes_author)
final_nodes_author = pd.DataFrame({'node':final_nodes_author.index.tolist(),
                                   'size':final_nodes_author.values.tolist()})

final_nodes_citation = edges.citation.tolist()
final_nodes_citation = pd.value_counts(final_nodes_citation)
final_nodes_citation = pd.DataFrame({'node':final_nodes_citation.index.tolist(),
                                     'size':final_nodes_citation.values.tolist()})

In [28]:
# Register author nodes first, then citation nodes, each with its 'size' attribute.
for nodes_table in (final_nodes_author, final_nodes_citation):
    for node_name, node_weight in zip(nodes_table['node'], nodes_table['size']):
        G.add_node(node_name, size=node_weight)

In [29]:
# One edge per selected (author, citation) pair, weighted by its count.
for source, target, n_articles in zip(edges['author'], edges['citation'], edges['count']):
    G.add_edge(source, target, weight=n_articles)

Usar o MDS para duas dimensões e usar as posições como propriedades do grafo

In [30]:
# 2-D node coordinates from the Kamada-Kawai layout; indexed below as positions[node][0] / [1].
pos_kkl = nx.kamada_kawai_layout(G)
# Alternative layouts, currently disabled:
# pos_spe = nx.spectral_layout(G)
# pos_spr = nx.spring_layout(G)
# pos_cir = nx.circular_layout(G)
# pos_fru = nx.fruchterman_reingold_layout(G)

# Layout used by the plotting cells below.
positions = pos_kkl

In [31]:
# https://plotly.com/python-api-reference/generated/plotly.graph_objects.scatter.html#plotly.graph_objects.scatter.Marker

# --- Author nodes: green hexagons, hover text shows the number of connections ---
author_rows = list(zip(final_nodes_author['node'], final_nodes_author['size']))
node_x = [positions[name][0] for name, _ in author_rows]
node_y = [positions[name][1] for name, _ in author_rows]
node_text = [f"{name}<br>Author Conections: {weight}" for name, weight in author_rows]

# Marker size between 10 and 20, proportional to the node weight.
sizes = [int(10*(1+i/max(final_nodes_author['size']))) for i in final_nodes_author['size']]
node_trace_author = go.Scatter(x=node_x,
                               y=node_y,
                               mode='markers',
                               hoverinfo='text',
                               text=node_text,
                               opacity=1,
                               marker=dict(size=sizes,
                                           symbol="hexagon",
                                           color="forestgreen"))

# --- Citation nodes: blue circles ---
citation_rows = list(zip(final_nodes_citation['node'], final_nodes_citation['size']))
node_x = [positions[name][0] for name, _ in citation_rows]
node_y = [positions[name][1] for name, _ in citation_rows]
node_text = [f"{name}<br>Citation Conections: {weight}" for name, weight in citation_rows]

sizes = [int(10*(1+i/max(final_nodes_citation['size']))) for i in final_nodes_citation['size']]
node_trace_citation = go.Scatter(x=node_x,
                                 y=node_y,
                                 mode='markers',
                                 hoverinfo='text',
                                 text=node_text,
                                 opacity=1,
                                 marker=dict(size=sizes,
                                             symbol="circle",
                                             color="dodgerblue"))

In [32]:
def _rescale(values, low, high):
    """Min-max rescale ``values`` linearly into ``[low, high]``.

    When all values are equal (or the list is empty) there is no range to
    rescale, so every element maps to ``low`` instead of dividing by zero.
    """
    if not values:
        return []
    smallest, largest = min(values), max(values)
    if largest == smallest:
        return [low for _ in values]
    return [((v-smallest)/(largest-smallest))*(high-low)+low for v in values]


def _label_trace(name, label_opacity, font_size):
    """Single-point text trace that writes ``name`` at the node's layout position."""
    return go.Scatter(x=[positions[name][0]],
                      y=[positions[name][1]],
                      mode='text',
                      text=name,
                      opacity=label_opacity,
                      textfont=dict(color="black",
                                    family='Arial',
                                    size=font_size))


node_size_citation = final_nodes_citation['size'].tolist()
node_size_author = final_nodes_author['size'].tolist()

# Citation labels: opacity 0.75..1 and font size 16..20; author labels: font size 12..16.
opacity = _rescale(node_size_citation, 0.75, 1)
size_text_author = _rescale(node_size_author, 12, 16)
size_text_citation = _rescale(node_size_citation, 16, 20)

list_text_traces = []

# Each label gets ITS OWN font size; previously the whole list was passed to
# every single-point trace, so only the first value was ever applied.
for position, name in enumerate(final_nodes_author['node']):
    list_text_traces.append(_label_trace(name, 0.5, size_text_author[position]))

for position, name in enumerate(final_nodes_citation['node']):
    list_text_traces.append(_label_trace(name, opacity[position], size_text_citation[position]))

In [33]:
# One grey line trace per edge, drawn between the layout positions of its two endpoints.
# The trailing None marks the end of the segment.
list_edge_traces = [
    go.Scatter(x=[positions[source][0], positions[target][0], None],
               y=[positions[source][1], positions[target][1], None],
               line=dict(width=1,
                         color='#888'),
               hoverinfo='none',
               mode='lines',
               line_shape='spline')
    for source, target in G.edges()
]

In [34]:
# Assemble the co-citation figure: edges at the bottom, node markers above, labels on top.
fig = go.Figure(data=list_edge_traces + [node_trace_author, node_trace_citation],
                layout=go.Layout(
                    # `title=dict(text, font)` replaces the deprecated `titlefont_size`.
                    title=dict(text='<br>Co-citation graph', font=dict(size=12)),
                    height=750,
                    width=None,
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=1,l=1,r=1,t=1), # b=20,l=5,r=5,t=40
                    annotations=[dict(text="Green dots are authors, blue dots are citations.",
                                      showarrow=False,
                                      xref="paper",
                                      yref="paper",
                                      x=0.005,
                                      y=-0.002)],
                    xaxis=dict(showgrid=False,
                               zeroline=False,
                               showticklabels=False),
                    yaxis=dict(showgrid=False,
                               zeroline=False,
                               showticklabels=False),)
                )

for trace in list_text_traces:
    fig.add_trace(trace)

fig.show()