In [0]:
import string
from urllib.parse import quote_plus

import networkx as nx
import nltk
import numpy as np
import pandas as pd
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('rslp')

!pip install plotly.express
from plotly import graph_objs as go

In [0]:
# Fetch a sample of events matching a search query

query = "febre amarela" # query sent to the events service (at most 500 results)

# None disables truncation of long cell text; the legacy -1 sentinel is
# rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

# The query is percent-encoded before being appended: a raw space or accented
# character in the URL makes the HTTP request fail on current Python versions.
dataset = pd.read_csv('http://websensors.net.br/minicurso/2019/eventos-br-2017.php?q='+quote_plus(query),sep='\t', lineterminator='\n')
dataset

In [0]:
# Punctuation and stopword removal

def remove_stopwords(text,lang,domain_stopwords=()):
  """Lower-case ``text``, strip punctuation and drop stopwords and pure-digit tokens.

  Args:
    text: value to clean; it is converted with ``str()`` first.
    lang: NLTK stopword corpus name, e.g. ``'portuguese'`` or ``'english'``.
    domain_stopwords: extra tokens to discard in addition to the NLTK list.

  Returns:
    The surviving tokens joined by single spaces.
  """
  # Merge both stopword sources into one set so each token costs a single
  # hash lookup instead of two linear scans over lists.
  excluded = set(nltk.corpus.stopwords.words(lang)) # lang='portuguese' or lang='english'
  excluded.update(domain_stopwords)

  lowered = str(text).lower()
  # Only the ASCII characters in string.punctuation are removed.
  no_punct = lowered.translate(str.maketrans('', '', string.punctuation))
  tokens = word_tokenize(no_punct)

  kept = [tok for tok in tokens if tok not in excluded and not tok.isdigit()]
  return " ".join(kept).strip()

def stemming(text,lang):
  """Replace every token of ``text`` by its stem.

  The RSLP stemmer is used when ``lang`` is ``'portuguese'``; any other value
  selects the Porter stemmer.

  Returns:
    The stems joined by single spaces.
  """
  if lang == 'portuguese':
    stemmer = nltk.stem.RSLPStemmer()
  else:
    stemmer = PorterStemmer()

  stems = (stemmer.stem(token) for token in word_tokenize(text))
  return " ".join(stems).strip()

# Build the vector space model (VSM) with TF-IDF weights
def compute_vsm_tfidf(dataset, lang, domain_stopwords=()):
  """Turn the ``title`` column of ``dataset`` into a TF-IDF document-term table.

  Every title goes through ``remove_stopwords`` and then ``stemming`` before
  being vectorized.

  Args:
    dataset: DataFrame with a ``title`` column.
    lang: language name forwarded to ``remove_stopwords`` and ``stemming``.
    domain_stopwords: extra tokens forwarded to ``remove_stopwords``.

  Returns:
    A DataFrame with one row per title (in the same order as ``dataset``)
    and one column per vocabulary term.
  """
  docs = [
    stemming(remove_stopwords(title, lang, domain_stopwords), lang)
    for title in dataset['title']
  ]

  vectorizer = TfidfVectorizer()
  weights = vectorizer.fit_transform(docs)

  # get_feature_names() was removed in scikit-learn 1.2; use its replacement
  # when available and fall back only on installs that predate it.
  if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
  else:
    terms = vectorizer.get_feature_names()

  return pd.DataFrame(weights.toarray(), columns=terms)


# TF-IDF table for the event titles, processed as Portuguese text.
vsm = compute_vsm_tfidf(dataset, lang='portuguese')
vsm

In [0]:
# Cosine dissimilarity between two rows

def dis_cosine(matrix, e1, e2):
  """Return the cosine distance between rows ``e1`` and ``e2`` of ``matrix``.

  Rows are selected by position (``.iloc``) and passed to SciPy's ``cosine``.
  """
  first_row = matrix.iloc[e1, :]
  second_row = matrix.iloc[e2, :]
  return cosine(first_row, second_row)



In [0]:
# Build the event network from content proximity
def compute_network(matrix, seed=0, min_dcos=0.7,max_neighbors=3,max_nodes=1000):
  """Grow a graph from row ``seed`` by linking each expanded row to its closest rows.

  Args:
    matrix: table whose rows are compared with ``dis_cosine``.
    seed: position of the row where the expansion starts.
    min_dcos: a row is a candidate neighbour only when its cosine distance
      to the expanded row is less than or equal to this value.
    max_neighbors: stop linking after this many candidates per expansion
      (the check runs after each edge is added).
    max_nodes: stop once the graph holds at least this many nodes; the test
      runs after a full expansion, so the graph may end up larger.

  Returns:
    An undirected ``nx.Graph`` whose edges carry ``weight = 1 - distance``.
  """
  graph = nx.Graph()
  pending = [seed]   # LIFO stack of rows waiting to be expanded
  expanded = set()   # rows already used as the origin of an expansion
  n_rows = matrix.shape[0]

  while True:
    current = pending.pop()
    expanded.add(current)

    # Distances to every row that has not been expanded yet (this also
    # excludes `current` itself), kept only when within the threshold.
    close = {}
    for other in range(n_rows):
      if other in expanded:
        continue
      distance = dis_cosine(matrix, other, current)
      if distance <= min_dcos:
        close[other] = distance

    # Nearest first; sorted() is stable, so ties keep ascending row order.
    # A row can be pushed more than once and is then expanded again later.
    for rank, other in enumerate(sorted(close, key=close.get), start=1):
      graph.add_edge(current, other, weight=(1-close[other]))
      pending.append(other)
      if rank >= max_neighbors:
        break

    if len(graph) >= max_nodes or len(pending) == 0:
      break

  return graph
    
G = compute_network(vsm,min_dcos=0.8,seed=1)
# nx.info() was removed in NetworkX 3.0; the summary is built from the
# graph's own counters, which exist in every NetworkX release.
"Graph with {} nodes and {} edges".format(G.number_of_nodes(), G.number_of_edges())

In [0]:
# Visualize the event network built from content proximity
def plot_event_network():
  """Draw the global graph ``G`` with Plotly, colouring nodes by number of connections.

  Reads the module-level ``G`` and ``dataset``. A spring layout is computed
  and stored on every node under the ``'pos'`` attribute, then the figure is
  shown. Hover text holds the node id and the ``title`` and ``local`` values
  of the ``dataset`` row at that position.
  """
  # Lay the graph out in 2-D and keep each coordinate pair on its node.
  # G.nodes[...] is used because the G.node[...] alias was removed in NetworkX 2.4.
  pos = nx.drawing.layout.spring_layout(G)
  for item in pos:
    G.nodes[item]['pos'] = [pos[item][0], pos[item][1]]

  # Every edge contributes (start, end, None); the None separates edges so the
  # single 'lines' trace does not connect one edge to the next.
  edge_x = []
  edge_y = []
  for source, target in G.edges():
    x0, y0 = G.nodes[source]['pos']
    x1, y1 = G.nodes[target]['pos']
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

  edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

  node_x = []
  node_y = []
  for node in G.nodes():
    x, y = G.nodes[node]['pos']
    node_x.append(x)
    node_y.append(y)

  node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    marker=dict(
      showscale=True,
      # colorscale options
      #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
      #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
      #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
      colorscale='YlGnBu',
      reversescale=True,
      color=[],
      size=10,
      colorbar=dict(
        thickness=15,
        # Nested title dict: the flat `titleside` key is no longer accepted
        # by recent Plotly releases.
        title=dict(text='Node Connections', side='right'),
        xanchor='left'
      )))

  # Degree of each node, in the same order as G.nodes().
  node_adjacencies = [len(neighbors) for _, neighbors in G.adjacency()]

  node_text = []
  for node in G.nodes():
    node_text.append("id="+str(node)+" - "+str(dataset.iloc[node,:]['title'])+" - "+str(dataset.iloc[node,:]['local']))

  node_trace.marker.color = node_adjacencies
  node_trace.text = node_text

  fig = go.Figure(data=[edge_trace, node_trace],
               layout=go.Layout(
                  title='Event Network',
                  showlegend=False,
                  hovermode='closest',
                  margin=dict(b=20,l=5,r=5,t=40),
                  annotations=[ dict(
                      text="",
                      showarrow=False,
                      xref="paper", yref="paper",
                      x=0.005, y=-0.002 ) ],
                  xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                  yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                  )
  fig.show()

# Render the network with nodes coloured by their number of connections.
plot_event_network()

In [0]:
def simple_label_propagation(G,labels,max_iter=30):
  """Spread a score from the labelled nodes to the rest of the graph.

  Each node gets an ``'f'`` attribute (a one-element NumPy array). Nodes whose
  id is a key of ``labels`` start at 1.0, also receive ``'y' = [1.0]``, and
  are never updated. Every other node starts at 0.0 and, on each iteration,
  is replaced by the unweighted mean of its neighbours' current ``'f'``.
  Updates are applied in place, so later nodes in an iteration already see
  the new values of earlier ones.

  Args:
    G: graph whose node attributes are written in place.
    labels: container of labelled node ids (only membership is used).
    max_iter: number of passes over the nodes.

  Prints the accumulated change of each iteration; returns nothing.
  """
  # Initialise. G.nodes[...] replaces the G.node[...] alias removed in NetworkX 2.4.
  for n in G.nodes():
    attrs = G.nodes[n]
    if n in labels:
      attrs['y'] = np.array([1.0])
      attrs['f'] = np.array([1.0])
    else:
      attrs['f'] = np.array([0.0])

  for i in range(0,max_iter):

    # Propagate
    diff = 0
    for node in G.nodes():
      if node in labels: continue
      f_new = np.array([0.0])
      count = 0
      for neighbor in G.neighbors(node):
        f_new += G.nodes[neighbor]['f']
        count += 1

      # A node without neighbours has nothing to average; skipping it avoids
      # the 0/0 division that would turn its score into NaN.
      if count == 0: continue

      f_new /= count
      diff += np.linalg.norm(G.nodes[node]['f']-f_new)
      G.nodes[node]['f']=f_new
    print("Iteration #"+str(i)+" e="+str(diff))
    

# Select the events of interest; the keys are the node ids to label.
labels = {14: 1, 46: 1}
simple_label_propagation(G,labels)
G.nodes(data=True)

In [0]:
# Visualize the event network built from content proximity, plus the propagated labels
def plot_event_network():
  """Draw the global graph ``G`` with Plotly, colouring nodes by their ``'f'`` score.

  Reads the module-level ``G`` and ``dataset``. Node positions come from the
  ``'pos'`` node attribute; when any node lacks it, a spring layout is
  computed and stored on all nodes first. Every node must already carry an
  ``'f'`` attribute whose first element is used as the colour value. Hover
  text holds the node id and the ``title`` and ``local`` values of the
  ``dataset`` row at that position.
  """
  # G.nodes[...] is used throughout because the G.node[...] alias was removed
  # in NetworkX 2.4.
  # Reuse stored positions when every node has one, so that this cell does
  # not fail merely because no layout was stored beforehand.
  if any('pos' not in G.nodes[n] for n in G.nodes()):
    layout = nx.drawing.layout.spring_layout(G)
    for n in layout:
      G.nodes[n]['pos'] = [layout[n][0], layout[n][1]]

  # Every edge contributes (start, end, None); the None separates edges so the
  # single 'lines' trace does not connect one edge to the next.
  edge_x = []
  edge_y = []
  for source, target in G.edges():
    x0, y0 = G.nodes[source]['pos']
    x1, y1 = G.nodes[target]['pos']
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

  edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

  node_x = []
  node_y = []
  for node in G.nodes():
    x, y = G.nodes[node]['pos']
    node_x.append(x)
    node_y.append(y)

  node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    marker=dict(
      showscale=True,
      # colorscale options
      #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
      #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
      #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
      colorscale='Greys',
      reversescale=True,
      color=[],
      size=10,
      colorbar=dict(
        thickness=15,
        # Nested title dict: the flat `titleside` key is no longer accepted
        # by recent Plotly releases.
        title=dict(text='Confidence', side='right'),
        xanchor='left'
      )))

  node_text = []
  for node in G.nodes():
    node_text.append("id="+str(node)+" - "+str(dataset.iloc[node,:]['title'])+" - "+str(dataset.iloc[node,:]['local']))

  # Colour value per node: the first element of its 'f' array.
  node_labels = [G.nodes[node]['f'][0] for node in G.nodes()]

  node_trace.marker.color = node_labels
  node_trace.text = node_text

  fig = go.Figure(data=[edge_trace, node_trace],
               layout=go.Layout(
                  title='Event Network',
                  showlegend=False,
                  hovermode='closest',
                  margin=dict(b=20,l=5,r=5,t=40),
                  annotations=[ dict(
                      text="",
                      showarrow=False,
                      xref="paper", yref="paper",
                      x=0.005, y=-0.002 ) ],
                  xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                  yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                  )
  fig.show()

# Render the network with nodes coloured by their 'f' score.
plot_event_network()