In [None]:
import pandas as pd 
import numpy as np
import re

In [None]:
# Column names assigned to the CSV, which is read with header=None (its first
# line is treated as data, not as a header).
nomes = ['target', 'ids', 'date', 'flag', 'user', 'text']

# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv(
    '/home/filipe/Documentos/Redes Complexas/Codigos/7-ProjetoFinal/data/training.1600000.processed.noemoticon.csv',
    names=nomes,
    header=None,
    encoding='latin1',
)

In [None]:
# Number of rows for each distinct value of the 'target' column.
df.groupby('target')['target'].count()

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Undirected user graph: nx.Graph does not keep edge direction.
G = nx.Graph()

# One node per distinct tweet author. unique() keeps first-appearance order,
# and add_nodes_from inserts them in bulk instead of building a throwaway
# Series of None values through apply().
G.add_nodes_from(df['user'].unique())
G.number_of_nodes()

In [None]:
# A mention is an '@' followed by word characters; compiled once for the loop.
mention_pattern = re.compile(r"@(\w+)")

# Walk the two needed columns directly: iterrows() plus per-row df.iloc lookups
# is far slower and silently relies on the index labels being positions.
for author, text in zip(df['user'], df['text']):
    replieds = mention_pattern.findall(text)

    # Only tweets with more than two mentions contribute edges.
    # NOTE(review): this threshold ignores ordinary one-to-one replies — confirm it is intentional.
    if len(replieds) > 2 and G.has_node(author):
        for replied in replieds:
            # Link only to mentioned users that are already nodes of the graph.
            if G.has_node(replied):
                G.add_edge(author, replied)

In [None]:
# Drop self-loops (users mentioning themselves). The edge list is materialised
# first so the graph is not modified while it is being iterated.
G.remove_edges_from(list(nx.selfloop_edges(G)))

# connected_components applies to undirected graphs, which G is.
connected = nx.connected_components(G)

# Keep only the largest connected component. .copy() turns the read-only
# subgraph view into an independent graph: the cell can be re-run (a view
# cannot have edges removed), later algorithms do not pay the view's filtering
# cost, and the full graph is no longer kept alive through the view.
maior = max(connected, key=len)
G = G.subgraph(maior).copy()

print(G.number_of_nodes())
print(G.number_of_edges())

In [None]:
import math 

def calculate_measures(G, verbose=True):
    """Compute structural and centrality summaries of a graph.

    Parameters
    ----------
    G : networkx graph
        Graph to analyse. The callers in this notebook pass a largest connected
        component; NetworkX raises for disconnected input in ``diameter`` and
        ``average_shortest_path_length``.
    verbose : bool, default True
        When True (the original behaviour) every measure is printed as soon as
        it is computed. Pass False to silence the output, e.g. when the
        function is called once per tweet.

    Returns
    -------
    tuple
        ``(nodes, edges, first moment, second moment, variance,
        average clustering, Shannon entropy, transitivity, diameter,
        global efficiency, local efficiency, average shortest path length,
        mean degree, mean closeness, mean betweenness, mean eigenvector
        centrality, kcore)``.

        ``kcore`` is always 0 because the k-core computation is disabled.
        The assortativity coefficient and the mean PageRank are computed and
        reported but are not part of the returned tuple.
    """

    def report(label, value):
        # Single place that decides whether a measure is echoed to stdout.
        if verbose:
            print(label, value)

    # Degree of every node, read once and shared by all degree-based measures.
    degrees = [degree for _, degree in G.degree()]

    def moment_of_degree_distribution(m):
        # m-th raw moment of the degree sequence: the mean of k**m.
        return sum(k**m for k in degrees) / len(degrees)

    def shannon_entropy():
        # Entropy, in bits, of the degree distribution P(k).
        counts = np.bincount(degrees)      # counts[k] = number of nodes with degree k
        Pk = counts / counts.sum()         # normalise so that P(k) sums to one
        H = 0
        for p in Pk:
            if p > 0:                      # skip empty bins: log(0) is undefined
                H = H - p*math.log(p, 2)
        return H

    # Number of nodes
    nN = G.number_of_nodes()
    report('Number of nodes', nN)

    # Number of edges
    nE = G.number_of_edges()
    report('Number of edges', nE)

    # First moment of the degree distribution
    k1 = moment_of_degree_distribution(1)
    report('1º Momentum', k1)

    # Second moment of the degree distribution
    k2 = moment_of_degree_distribution(2)
    report('2º Momentum', k2)

    # Variance of the degree distribution
    variance = k2 - k1**2
    report('Variância Momentum', variance)

    # Average clustering coefficient
    av_cl = nx.average_clustering(G)
    report('Average Clustering', av_cl)

    # Shannon entropy of the degree distribution
    sh_ent = shannon_entropy()
    report('Shannon Entropy', sh_ent)

    # Transitivity
    trans = nx.transitivity(G)
    report('Transitivity', trans)

    # Diameter
    diameter = nx.diameter(G)
    report('Diameter', diameter)

    # Global efficiency
    gl_ef = nx.global_efficiency(G)
    report('Global Efficiency', gl_ef)

    # Local efficiency
    lc_ef = nx.local_efficiency(G)
    report('Local Efficiency', lc_ef)

    # Average shortest path length
    avg_path_length = nx.average_shortest_path_length(G)
    report('Average Shortest Path Lenght', avg_path_length)

    # Degree assortativity (reported only, not returned)
    r = nx.degree_assortativity_coefficient(G)
    report('Assortativity Coefficient', r)

    ################## Centrality measures ####################################
    # Mean degree
    g = np.mean(degrees)
    report('Mean Degree', g)

    # Mean closeness centrality
    cl_cent = np.mean(list(nx.closeness_centrality(G).values()))
    report('Closeness Centrality', cl_cent)

    # Mean betweenness centrality
    bet_cent = np.mean(list(nx.betweenness_centrality(G).values()))
    report('Betwenees Centrality', bet_cent)

    # Mean eigenvector centrality
    eig_value = np.mean(list(nx.eigenvector_centrality(G, max_iter = 1000).values()))
    report('Average Eigenvector Centrality', eig_value)

    # Mean PageRank (reported only, not returned)
    page_rank = np.mean(list(nx.pagerank(G, alpha=0.85, max_iter=1000, weight='weight').values()))
    report('Page rank', page_rank)

    # K-core is disabled; the placeholder keeps the returned tuple at 17 items.
    kcore = 0

    return (nN, nE, k1, k2, variance, av_cl, sh_ent, trans, diameter, gl_ef,
            lc_ef, avg_path_length, g, cl_cent, bet_cent, eig_value, kcore)

In [None]:
# Print the measures of the graph currently bound to G and show the returned tuple.
calculate_measures(G)

In [None]:
# Save G in GraphML format to a file in the current working directory.
nx.write_graphml(G, "grafo_tweets.graphml")

In [None]:
plt.figure(figsize=(20,8))
# spring_layout starts from random positions; a fixed seed makes the drawing
# identical on every run of the notebook.
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos, with_labels=True, node_color='lightblue', node_size=50, font_size=8, font_weight='bold')
plt.show()

In [None]:
def identify_keywords(G, number = 10):
    """Rank the nodes of ``G`` under five centrality measures.

    For degree, PageRank, betweenness, closeness and eigenvector centrality
    (in that order) the ``number`` highest-scoring nodes are printed and
    collected.

    Returns a tuple of five lists of nodes, one per measure, each ordered from
    the highest score to the lowest.
    """

    def top_ranked(scores):
        # Sort the nodes by score, keep the best `number`, and echo them.
        ranking = sorted(scores, key=scores.get, reverse=True)[:number]
        print(ranking)
        return ranking

    # Each measure is wrapped in a callable so that it is only computed when
    # its turn comes, keeping the compute/print interleaving of the five steps.
    measures = (
        lambda: dict(G.degree()),
        lambda: nx.pagerank(G),
        lambda: nx.betweenness_centrality(G),
        lambda: nx.closeness_centrality(G),
        lambda: nx.eigenvector_centrality(G, max_iter=1000),
    )

    return tuple(top_ranked(compute()) for compute in measures)

In [None]:
# Print and show the ten top-ranked nodes of G under each centrality measure.
identify_keywords(G)

In [None]:
# Users that are nodes of the graph currently bound to G.
nodes = list(G.nodes())

# Keep only the tweets written by those users. reset_index() renumbers the
# rows from 0 and keeps the previous index as an 'index' column.
written_by_graph_user = df['user'].isin(nodes)
df_filtered = df.loc[written_by_graph_user].reset_index()
df_filtered

In [None]:
import pandas as pd
import numpy as np
import spacy
import math
import re
from string import punctuation
from unicodedata import normalize
from nltk.corpus import stopwords
from spacy.lang.pt.stop_words import STOP_WORDS
import networkx as nx

In [None]:
def preprocessing_text(text):
    """Normalise raw text into lowercase ASCII words separated by single spaces.

    Steps, in order:
      1. strip accents/cedillas and any other non-ASCII character, lowercase;
      2. remove all digits;
      3. replace every ASCII punctuation character with a space;
      4. collapse runs of whitespace and trim both ends.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text, with no leading or trailing space.
    """
    # NFKD splits accented characters into base letter + combining mark; the
    # ASCII round trip then drops every code point that is not plain ASCII.
    text = normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII').lower()

    # Remove all digits (raw string: '\d' in a plain literal is an invalid escape).
    text = re.sub(r'\d+', "", text)

    # Replace punctuation with spaces so neighbouring words are not glued
    # together. Only ASCII punctuation can still be present at this point.
    text = text.translate(str.maketrans(punctuation, ' ' * len(punctuation)))

    # Collapse repeated whitespace and drop it at both ends.
    return " ".join(text.split())

In [None]:
def prepare_graph(palavras):
    """Build the co-occurrence graph of a token list and return its largest component.

    Consecutive tokens (a window of two words) are linked by an undirected
    edge; self-loops produced by a token repeated back to back are removed.

    Parameters
    ----------
    palavras : list
        Tokens in the order in which they appear in the text.

    Returns
    -------
    networkx.Graph
        An independent copy of the largest connected component.

    Raises
    ------
    IndexError
        If ``palavras`` has fewer than two tokens, because no node is created.
    """
    graph = nx.Graph()

    # add_edges_from creates missing endpoints itself, in order of appearance,
    # so no separate node-insertion pass is needed. nx.Graph is already
    # undirected, so no conversion is needed either.
    graph.add_edges_from(zip(palavras, palavras[1:]))

    # Remove self-loops; the list is materialised so the graph is not modified
    # while it is being iterated.
    graph.remove_edges_from(list(nx.selfloop_edges(graph)))

    components = list(nx.connected_components(graph))
    if not components:
        # Same exception type the former `Gcc[0]` lookup produced, with a reason.
        raise IndexError("cannot build a co-occurrence graph from fewer than two tokens")

    # max() returns the first of the largest components, like a stable sort would.
    largest = max(components, key=len)

    # .copy() returns a plain graph instead of a read-only view of `graph`.
    return graph.subgraph(largest).copy()

In [None]:
nlp = spacy.load("en_core_web_lg")

# Clean every tweet first, then stream the cleaned texts through the pipeline
# with nlp.pipe, which processes them in batches instead of one nlp() call per
# row. The resulting token lists are the same as with per-row calls.
cleaned_texts = df_filtered['text'].astype(str).map(preprocessing_text)

# For each tweet keep the text of every token that is not a stop word. The
# Series wrapper makes pandas store one list per row.
df_filtered['preprocessed'] = pd.Series(
    [[word.text for word in doc if not word.is_stop] for doc in nlp.pipe(cleaned_texts)],
    index=df_filtered.index,
)

In [None]:
keyword_columns = [
    'Keyword Degree','Keyword Pagerank','Keyword Betweenness','Keyword Closeness',
    'Keyword Eigenvector']

# Same order as the tuple returned by calculate_measures.
metric_columns = [
    'Nodes','Edges','Momento 1','Momento 2','Variance','Average Clustering',
    'Shannon Entropy','Transitivity','Diameter', 'Global Efficiency', 'Local Efficiency',
    'Average Shortest Path', 'Mean Degree', 'Average Closeness Centrality',
    'Average Betweenness Centrality', 'Average Eigenvector Centrality', 'K-Core']

# Rows are collected in plain lists and turned into DataFrames once, after the
# loop. Each row remembers the df_filtered index label of its tweet, so a tweet
# that fails does not shift the rows of the tweets that follow it.
metric_rows, metric_index = [], []
keyword_rows, keyword_index = [], []
skipped = []

# Progress counter
x = 0
for row_id, words in df_filtered['preprocessed'].items():

    # Progress control
    x += 1
    print(x)

    try:
        # Co-occurrence graph with a window of two words. A dedicated name is
        # used so the user graph bound to G is not overwritten.
        word_graph = prepare_graph(words)

        # Quantitative metrics
        measures = calculate_measures(word_graph)
        metric_rows.append(dict(zip(metric_columns, measures)))
        metric_index.append(row_id)

        # Keywords under each centrality measure
        sorted_keywords = identify_keywords(word_graph, 10)
        keyword_rows.append(dict(zip(keyword_columns, sorted_keywords)))
        keyword_index.append(row_id)
    except Exception:
        # Best effort: a tweet whose graph cannot be built or measured is
        # skipped. `Exception` (not a bare except) keeps Ctrl-C / kernel
        # interrupt working during this long loop.
        skipped.append(row_id)
        continue

df_metrics = pd.DataFrame(metric_rows, index=metric_index, columns=metric_columns)
keywords = pd.DataFrame(keyword_rows, index=keyword_index, columns=keyword_columns)
print(f'{len(skipped)} of {x} tweets skipped')

In [None]:
# Put the tweets, their graph metrics and their keyword lists side by side.
# NOTE(review): axis=1 concatenation pairs rows by index label, so df_metrics and
# keywords must carry the index of the df_filtered row they describe — verify.
consolidated_data = pd.concat([df_filtered, df_metrics, keywords], axis=1)

In [None]:
# Number of rows for each distinct value of 'target' in the consolidated table.
consolidated_data['target'].value_counts()

In [None]:
# Relabel the numeric target: values above zero become 'positive', everything
# else 'negative'.
is_positive = consolidated_data['target'] > 0
consolidated_data['target'] = np.where(is_positive, 'positive', 'negative')

# Write the table to CSV without the index column, then display it.
consolidated_data.to_csv('dataset_tweets_consolidated.csv', index=False)
consolidated_data