In [None]:
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import community as community_louvain
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from networkx.algorithms.community import girvan_newman,  greedy_modularity_communities


# Sentiment_analysis

In [None]:
# Columns that are parsed with ast.literal_eval after loading.
cols = [
    'preprocessed',
    'Keyword Degree',
    'Keyword Pagerank',
    'Keyword Betweenness',
    'Keyword Closeness',
    'Keyword Eigenvector',
]

df = pd.read_csv('dataset_tweets_consolidated.csv')


def _parse_cell(cell):
    """Evaluate a cell as a Python literal; missing values become an empty list."""
    if pd.notnull(cell):
        return ast.literal_eval(cell)
    return []


for c in cols:
    df[c] = df[c].apply(_parse_cell)

# Empty frame that receives one sentiment-score column per analyser/input pair.
sa = pd.DataFrame()

In [None]:
# Shared analyser instance, created on first use by sentiment_vader.
_VADER_ANALYZER = None


def sentiment_vader(x: str) -> float:
    """Return the VADER ``compound`` polarity score of ``x``.

    The idea behind VADER is to pass the whole sentence, including
    emoticons and punctuation.

    The analyser is created once and reused, so applying this function to a
    whole column does not rebuild a ``SentimentIntensityAnalyzer`` per row.

    Parameters
    ----------
    x : str
        Text to score.

    Returns
    -------
    float
        The ``'compound'`` entry of ``polarity_scores(x)``.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER.polarity_scores(x)['compound']

In [None]:
def sentiment_textblob(x: str) -> float:
    """Return the TextBlob polarity of ``x``.

    The idea behind TextBlob is to analyse a whole sentence. The differences
    between text with and without punctuation are minimal. Its range is
    equivalent to VADER's, varying over [-1, 1].

    Parameters
    ----------
    x : str
        Text to score.

    Returns
    -------
    float
        ``TextBlob(x).sentiment.polarity``.
    """
    # No VADER analyser is needed here; the original built one per call and
    # never used it.
    return TextBlob(x).sentiment.polarity

In [None]:
# Score the raw tweet text with each analyser; the scorers are passed
# directly since they already take a single string.
sa['Vader Text'] = df['text'].apply(sentiment_vader)
sa['TextBlob Text'] = df['text'].apply(sentiment_textblob)


In [None]:
# For every list-valued column, join the tokens with spaces and score the
# resulting sentence with both analysers.
for c in cols:
    joined = df[c].apply(" ".join)
    sa['TextBlob '+c] = joined.apply(sentiment_textblob)
    sa['Vader '+c] = joined.apply(sentiment_vader)


In [None]:
# Display the frame of sentiment scores.
sa

In [None]:
# Write the sentiment scores to CSV without the index column.
sa.to_csv('dataset_tweets_vader_textblob.csv', index=False)

# Analysis 

In [None]:
# Read the six CSV files that are combined below.
df_consolidated = pd.read_csv('dataset_tweets_consolidated.csv')
df_vader_textblob = pd.read_csv('dataset_tweets_vader_textblob.csv')
df_rnn = pd.read_csv('dataset_tweets_rnn.csv')
df_transformer = pd.read_csv('dataset_tweets_transformer.csv')
df_cnn = pd.read_csv('dataset_tweets_cnn.csv')
df_nbag = pd.read_csv('dataset_tweets_nbag.csv')

# Column-wise concatenation: rows are matched on their index.
# NOTE(review): this presumes all files list the tweets in the same order — confirm.
frames = [
    df_consolidated,
    df_vader_textblob,
    df_rnn,
    df_transformer,
    df_cnn,
    df_nbag,
]
df = pd.concat(frames, axis=1, join='outer')
df

In [None]:
# List the column names of the combined frame.
df.columns

In [None]:
# Score columns to be converted into labels: the two raw-text scores, then a
# TextBlob/Vader pair for each derived text column.
col = ['Vader Text', 'TextBlob Text'] + [
    f'{analyser} {source}'
    for source in (
        'preprocessed',
        'Keyword Degree',
        'Keyword Pagerank',
        'Keyword Betweenness',
        'Keyword Closeness',
        'Keyword Eigenvector',
    )
    for analyser in ('TextBlob', 'Vader')
]

In [None]:
def prediction_tostring(x):
    """Map a polarity score to a label.

    Returns 'positive' when ``x >= 0`` (zero included) and 'negative'
    otherwise, which also covers NaN since the comparison is then False.
    """
    return 'positive' if x >= 0 else 'negative'

# Replace every score column by its 'positive' / 'negative' label.
for c in col:
    df[c] = df[c].apply(prediction_tostring)
df

In [None]:
# Count how many rows carry each value of the 'target' column.
df['target'].value_counts()

In [None]:
# Every prediction column that is compared against df['target'].
col = [
    'Vader Text', 'TextBlob Text',
    'TextBlob preprocessed', 'Vader preprocessed',
    'TextBlob Keyword Degree', 'Vader Keyword Degree',
    'TextBlob Keyword Pagerank', 'Vader Keyword Pagerank',
    'TextBlob Keyword Betweenness', 'Vader Keyword Betweenness',
    'TextBlob Keyword Closeness', 'Vader Keyword Closeness',
    'TextBlob Keyword Eigenvector', 'Vader Keyword Eigenvector',
    'RNN Prediction', 'Transformer Prediction', 'CNN Prediction',
    'Neural Bag of Words Prediction',
]

# Compute the evaluation metrics.
# Each metric is stored as two parallel lists: the column names ('Medidas')
# and the score obtained by each column ('Valores').
y_true = df['target']

acuracia = {
    'Medidas': list(col),
    'Valores': [accuracy_score(y_true, df[name]) for name in col],
}
precision = {
    'Medidas': list(col),
    'Valores': [precision_score(y_true, df[name], pos_label='positive') for name in col],
}
recall = {
    'Medidas': list(col),
    'Valores': [recall_score(y_true, df[name], pos_label='positive') for name in col],
}
f1score = {
    'Medidas': list(col),
    'Valores': [f1_score(y_true, df[name], pos_label='positive') for name in col],
}

# Display name of each metric -> its table.
metrics = {
    'Acurácia': acuracia,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1score
}

In [None]:
# The colour map is the same for every chart, so look it up once.
cmap = plt.colormaps['tab20']

for m in metrics:
    # One colour per bar, sampled evenly across the colour map.
    num_cores = len(metrics[m]['Medidas'])
    colors = [cmap(i) for i in np.linspace(0, 1, num_cores)]

    plt.figure(figsize=(10, 6))
    ax = sns.barplot(x='Medidas', y='Valores', data=metrics[m], palette=colors, hue=metrics[m]['Medidas'])

    # Label every bar with its value. The scores are produced by the
    # scikit-learn metric functions as fractions, so they are scaled by 100
    # before the "%" sign is appended (the label used to read e.g. "0.753%").
    for p in ax.patches:
        ax.annotate('{:.1f}%'.format(100 * p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=8, fontweight='bold', color='black', xytext=(0, 5),
                textcoords='offset points')

    # Add the title.
    plt.title(f'Gráfico de Barras Comparando {m}', fontsize=16, fontweight='bold')
    plt.xticks(rotation=90)

    # Show the chart.
    plt.show()

# word_cloud

In [None]:
# Load the graph from its GraphML file.
graph = nx.read_graphml('./data/grafo_tweets.graphml')

In [None]:
# Report the size of the graph using the graph's own counters.
print(f'Nodes: {graph.number_of_nodes()}')
print(f'Edges: {graph.number_of_edges()}')

In [None]:
# Louvain partition of the graph, as a node -> community id mapping.
community = community_louvain.best_partition(graph)

# Keep an independent copy of that mapping under a method-specific name.
louvain = dict(community)

In [None]:
# Number of distinct community ids in the Louvain mapping.
print(f'Número de comunidades: {len(set(louvain.values()))}')

In [None]:
# Call Girvan-Newman through its package path rather than the imported bare
# name: this cell rebinds `girvan_newman` to a dict below, so on a second
# execution the bare name would no longer be callable.
community = nx.algorithms.community.girvan_newman(graph)

# The call returns an iterator of partitions; only its first item is used.
communities = next(community)

# node -> index of the community that contains it
girvan_newman = {n: i for i, c in enumerate(communities) for n in c}

In [None]:
# Number of distinct community ids in the Girvan-Newman mapping.
print(f'Número de comunidades: {len(set(girvan_newman.values()))}')

In [None]:
# Communities found by greedy modularity maximisation, as a list of node sets.
community = list(greedy_modularity_communities(graph))

# node -> index of the community that contains it
greedy_modularity = dict(
    (member, label)
    for label, members in enumerate(community)
    for member in members
)

In [None]:
# Number of distinct community ids in the greedy-modularity mapping.
print(f'Número de comunidades: {len(set(greedy_modularity.values()))}')

In [None]:
# One row per graph node, with the community id given by each method.
nodes = list(graph.nodes)

data = {
    'user': nodes,
    'louvain': [louvain[node] for node in nodes],
    'girvan_newman': [girvan_newman[node] for node in nodes],
    'greedy_modularity': [greedy_modularity[node] for node in nodes],
}

df_community = pd.DataFrame(data)

In [None]:
# Display the node-to-community table.
df_community


In [None]:
# Load the consolidated tweets for the word-cloud section.
# NOTE(review): earlier cells read 'dataset_tweets_consolidated.csv' without
# the './data/' prefix — confirm which location is the intended one.
df_consolidated = pd.read_csv('./data/dataset_tweets_consolidated.csv')

In [None]:
# Keep a single row per user (drop_duplicates keeps the first occurrence by default).
df_consolidated = df_consolidated.drop_duplicates(subset=['user'])

In [None]:
def conv(text):
    """Parse ``text`` as a Python list literal.

    Parameters
    ----------
    text : Any
        Usually the string form of a list, e.g. ``"['a', 'b']"``.

    Returns
    -------
    list
        The parsed list, or ``[]`` when ``text`` cannot be parsed as a
        literal or parses to something that is not a list.
    """
    try:
        lista = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval raises SyntaxError for text that is not valid Python,
        # ValueError for non-literal or non-string input (e.g. NaN) and
        # TypeError for literals such as a dict with an unhashable key.
        # All of them mean "no usable list here".
        return []
    return lista if isinstance(lista, list) else []

# Turn the stringified keyword columns into real lists.
for kw in ('Keyword Betweenness', 'Keyword Closeness', 'Keyword Eigenvector'):
    df_consolidated[kw] = df_consolidated[kw].map(conv)

In [None]:
# Display the consolidated tweets after the keyword columns were converted.
df_consolidated

In [None]:
# Inner join on 'user': only users present in both tables are kept.
df_cloud = df_community.merge(df_consolidated, on=['user'], how='inner')

In [None]:
# Numeric encoding of the label; values absent from the mapping become NaN.
map_target = dict(positive=1, negative=-1)

df_cloud['target_num'] = df_cloud['target'].map(map_target)

In [None]:
# Display the merged community/tweet table.
df_cloud

In [None]:
communities_types = ['louvain', 'girvan_newman', 'greedy_modularity']

# For each community-detection method, give every row the mean 'target_num'
# of the community that row belongs to. groupby().transform('mean') does this
# in one pass and, unlike the previous manual mask loop, does not rebind the
# notebook-level names `df` and `community` as a side effect.
for community_type in communities_types:
    df_cloud[f'{community_type}_mean'] = (
        df_cloud.groupby(community_type)['target_num'].transform('mean')
    )

In [None]:
# Louvain community ids whose mean 'target_num' is above zero.
df_cloud.loc[df_cloud['louvain_mean'] > 0, 'louvain'].unique()

In [None]:
# Girvan-Newman community ids whose mean 'target_num' is above zero.
df_cloud.loc[df_cloud['girvan_newman_mean'] > 0, 'girvan_newman'].unique()

In [None]:
# Greedy-modularity community ids whose mean 'target_num' is above zero.
df_cloud.loc[df_cloud['greedy_modularity_mean'] > 0, 'greedy_modularity'].unique()

In [None]:
def cloud(text):
    """Render a word cloud of ``text`` with matplotlib.

    Parameters
    ----------
    text : str
        Text whose words are drawn.

    Notes
    -----
    ``WordCloud.generate`` raises ``ValueError`` when the text yields no
    word to draw. That case is reported and skipped, so a single empty
    community does not abort a loop that plots many of them.
    """
    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    except ValueError as err:
        print(f'Word cloud skipped: {err}')
        return

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

In [None]:
def cloud_user(keyword, user):
    """Plot the word cloud of each community ``user`` belongs to and compare them.

    For every community-detection method, the ``keyword`` column of all rows
    of ``df_cloud`` in the user's community is joined into one text and drawn
    with ``cloud``. The three texts are then vectorised with TF-IDF and their
    pairwise cosine similarities are printed.

    Parameters
    ----------
    keyword : str
        Name of the ``df_cloud`` column that supplies the words.
    user : Any
        Value looked up in ``df_cloud['user']``.
    """
    communities_types = ['louvain', 'girvan_newman', 'greedy_modularity']
    communities_text = []

    user_community = df_cloud[df_cloud['user'] == user]

    for community_type in communities_types:
        # Community assigned to the user by this method (first matching row).
        community = user_community[community_type].iloc[0]
        members = df_cloud[df_cloud[community_type] == community]

        # Join the words themselves when a cell holds a list; stringifying the
        # list would feed its brackets and quotes to the tokenisers as well.
        text = ' '.join(
            ' '.join(map(str, cell)) if isinstance(cell, (list, tuple)) else str(cell)
            for cell in members[keyword]
        )
        communities_text.append(text)

        print(f'================================= COMMUNITY ({community_type}) #{community} =================================')
        cloud(text)

    # The similarity section compares all methods, so the header must not
    # interpolate the loop variable left over from the loop above.
    print('================================= SIMILARITY (between community types) =================================')

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(communities_text)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Print each unordered pair once (lower triangle of the matrix).
    for i in range(len(communities_types)):
        for j in range(i):
            print(f"{communities_types[i]} vs {communities_types[j]}: {similarity_matrix[i, j]}")

In [None]:
# Pick one row of df_cloud at random and take its user.
# NOTE(review): no random_state is passed, so a different user may be drawn
# on every run — pass a seed if the output should be reproducible.
users = df_cloud.sample(1)['user'].to_numpy()
# TODO: switch to the 'preprocessed' column

for user in users:
    print(f'\n\n\n######################################### {user} #########################################\n')
    cloud_user('Keyword Betweenness', user)

In [None]:
def plot_cloud_community(keyword, community_type):
    """Plot one word cloud per community of the given partitioning.

    Parameters
    ----------
    keyword : str
        Name of the ``df_cloud`` column that supplies the words.
    community_type : str
        Name of the ``df_cloud`` column holding the community ids.
    """
    for community in sorted(df_cloud[community_type].unique()):
        print(f'================================= COMMUNITY ({community_type}) #{community} =================================')

        members = df_cloud[df_cloud[community_type] == community]

        # Join the words themselves when a cell holds a list; stringifying the
        # list would feed its brackets and quotes to the tokeniser as well.
        text = ' '.join(
            ' '.join(map(str, cell)) if isinstance(cell, (list, tuple)) else str(cell)
            for cell in members[keyword]
        )

        cloud(text)

In [None]:
# Word clouds of the 'Keyword Betweenness' column for every greedy-modularity community.
plot_cloud_community('Keyword Betweenness', 'greedy_modularity')

In [None]:
# Word clouds of the 'Keyword Betweenness' column for every Girvan-Newman community.
plot_cloud_community('Keyword Betweenness', 'girvan_newman')

In [None]:
# Word clouds of the 'Keyword Betweenness' column for every Louvain community.
plot_cloud_community('Keyword Betweenness', 'louvain')