# COMMUNITY ANALYSIS 

In [4]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
from itertools import combinations
import re
import string
import unicodedata
import nltk
from bertopic import BERTopic


nltk.download(['stopwords', 'rslp'])
stopwords = nltk.corpus.stopwords.words('portuguese')

# ytbr = "felipeneto"
ytbr = "enaldinho"


[nltk_data] Downloading package stopwords to /home/thiago/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /home/thiago/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


### functions

In [6]:
def clean_text_data(df: pd.DataFrame) -> pd.DataFrame:
    df["comment_text_cleaned"] = df["comment_text"].str.replace("<br>", " ")
    df["comment_text_cleaned"] = convert_lowercase(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = remove_a_links(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = mask_user_handles(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = remove_punctuation(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = remove_non_ascii(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = remove_laughing(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = replace_repeated_letters(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = remove_stopwords(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = remove_single_letters(df["comment_text_cleaned"])
    df = df[df["comment_text_cleaned"].str.strip() != ""]
    return df


def remove_non_ascii(words):
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode(
            'ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words


def remove_stopwords(words):
    result = []
    for word in words:
        processed_text = ''
        list_words = word.split()
        for w in list_words:
            if w not in stopwords:
                processed_text = processed_text + ' ' + w  
        result.append(processed_text)
    return result


def convert_lowercase(words):
    lower = []
    for s in words:
        lower.append(s.lower())
    return lower



def remove_a_links(words):
    cleaned_words = []
    for word in words:
        cleaned_words.append(re.sub(r'<a\s+href="[^"]*">[^<]*<\/a>', '', word))
    return cleaned_words


def remove_punctuation(words):
    result = []
    for word in words: 
        word = _replace_html_codes(word)
        result.append(''.join(c for c in word if c not in string.punctuation))
    return result

def _replace_html_codes(text):
    html_escape_table = {
        "&amp;": "&",
        "&quot;": '"',
        "&apos;": "'",
        "&gt;": ">", 
        "&lt;": "<", 
    }
    
    for code, value in html_escape_table.items():
        text = text.replace(code, value)
    
    return text

def remove_laughing(words):
    # remove kkkkkkk's
    new_words = [re.sub(r'k{2,}', '', word, flags=re.IGNORECASE) for word in words]

    # remove other laughing patterns
    laugh_pattern = r'\b(kk|ak|ka|ks|sk)[a-zA-Z]*?(kk|ak|ka|ks|sk)\b'
    new_words  = [re.sub(laugh_pattern, '', word, flags=re.IGNORECASE) for word in new_words]
    return new_words

def remove_single_letters(words):
    result = []
    for w in words:
        split_w = w.split(" ")
        new_w = [word for word in split_w if len(word) > 1] 
        result.append(" ".join(new_w))
    return result

def replace_repeated_letters(words):
    return [re.sub(r'(.)\1+', r'\1', word) for word in words]

def mask_user_handles(words):
    handle_pattern = r'@[\w]+'

    return [re.sub(handle_pattern, '@user', word) for word in words]

In [3]:
# # Example DataFrame
# deita = {'commenter_id': ['commenter1', 'commenter2', 'commenter3', 'commenter4'],
#         'comment_text': ['akakakakakak this is fun', 'sksksksk botafogo is the best', 'kkkkk that was funny', 'fodase @juninhocabecudo'],
#         'video_id': ['video1', 'video1', 'video2', 'video3']}

# fd = pd.DataFrame(deita)

# fd = clean_text_data(fd)
# fd.head()

###  getting video-comments network 

In [7]:
G = pickle.load(open(f"../data/{ytbr}/video_commenter_network.pickle", 'rb'))
print(f"Graph Nodes: {G.number_of_nodes()}")
print(f"Graph Edges: {G.number_of_edges()}")

Graph Nodes: 19348
Graph Edges: 22484


In [8]:
commenters = [node for node in G.nodes() if len(node) > 11]
videos = [node for node in G.nodes() if len(node) == 11]

print(f"Number of video nodes {len(videos)}")
print(f"Number of commenter nodes {len(commenters)}")

Number of video nodes 50
Number of commenter nodes 19298


In [9]:
single_video_commenters = [c for c in commenters if G.degree(c) == 1]
len(single_video_commenters)

17743

In [10]:
del G

## Reading DF filtered by single commenters

In [11]:
df = pd.read_csv(f"../data/{ytbr}/comments.csv")
df = df[df["comment_author_channel_id"].isin(single_video_commenters)]
# df.head()

In [12]:
df = clean_text_data(df)

df['comment_text_cleaned'].to_csv('out.csv', index=False)

## Bertopic

In [13]:
model = BERTopic(
    language="multilingual",
    verbose=True,
)
topics, probs = model.fit_transform(df["comment_text_cleaned"].to_list())
freq = model.get_topic_info()
display(freq)

2024-09-25 00:18:52,677 - BERTopic - Embedding - Transforming documents to embeddings.


Batches: 100%|██████████| 537/537 [00:11<00:00, 47.30it/s]
2024-09-25 00:19:09,337 - BERTopic - Embedding - Completed ✓
2024-09-25 00:19:09,338 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-25 00:19:34,160 - BERTopic - Dimensionality - Completed ✓
2024-09-25 00:19:34,161 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked,

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4518,-1_voce_nao_dinho_nada,"[voce, nao, dinho, nada, ta, vai, ai, so, pra,...",[hoje hein vc vai ter fazer vou fazer pra vc t...
1,0,489,0_oi_oao_oj_iuh1,"[oi, oao, oj, iuh1, oq, joao, , , , ]","[oi, oi, oi]"
2,1,444,1_video_videos_ese_faz,"[video, videos, ese, faz, postou, outro, legai...","[enaldinho ja vi ese video, video voce faz ena..."
3,2,419,2_enaldinho_enaudinho_isolou_enaldinhoh,"[enaldinho, enaudinho, isolou, enaldinhoh, tor...","[enaldinho, enaldinho, enaldinho]"
4,3,231,3_vc_ta_entrou_brincadeira,"[vc, ta, entrou, brincadeira, tambem, xingou, ...","[ate proprio enaldi nho entrou brincadeira, fa..."
...,...,...,...,...,...
350,349,10,349_love_valentinauser_apaixonada_anita,"[love, valentinauser, apaixonada, anita, amamo...","[love, love, love love]"
351,350,10,350_best_dotu_feito_mr,"[best, dotu, feito, mr, bem, achar, pasar, leg...","[oi dotu bem, iso cara mr best, achar vai pasa..."
352,351,10,351_levantaran_inscrevime_els_theo,"[levantaran, inscrevime, els, theo, praime, br...","[enaldo volta praime, inscrevime vai enaldinho..."
353,352,10,352_saudavel_amarvai_riscar_qu,"[saudavel, amarvai, riscar, qu, chapeu, predio...",[voce pode comer coisa saudavel coisa nao saud...


## Barchart

In [14]:
model.visualize_barchart(top_n_topics=20)

## Hierarchical Clustering

In [15]:
model.visualize_hierarchy(top_n_topics=50, width=1200)

### Similarity Matrix

In [17]:
# model.visualize_heatmap(top_n_topics=50, width=1200)

## Build Topics DF

In [18]:
# atribuíndo tópico
df_topics = df.drop(columns=['comment_author_name','comment_publish_date',]).copy()
df_topics['topics'] = topics

# atribuíndo nome do tópico
topic_name = freq.drop(columns=['Count']).rename(columns={'Topic': 'topics', 'Name': 'name'})
df_topics = df_topics.merge(topic_name, how='left') 

# atribuíndo topic proba
df_topics['topic_proba'] = probs

df_topics["comment_text"] = remove_a_links(df_topics["comment_text"])
df_topics["comment_text"] = remove_laughing(df_topics["comment_text"])
df_topics["comment_text"] = mask_user_handles(df_topics["comment_text"])
df_topics["comment_text"] = remove_punctuation(df_topics["comment_text"])
df_topics["comment_text"] = remove_non_ascii(df_topics["comment_text"])

## Sentiment Analysis

In [19]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

sentiment_task = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, config=config, device=0, max_length=512, truncation=True)

def classify_comment_sentiment(df: pd.DataFrame) -> pd.DataFrame:
    sentiments = []
    sentiment_scores = []

    for comment in df['comment_text']:
        result = sentiment_task(comment)
        
        sentiment = result[0]['label']
        score = result[0]['score']
        
        sentiments.append(sentiment)
        sentiment_scores.append(score)

    df['sentiment'] = sentiments
    df['sentiment_score'] = sentiment_scores

    return df

In [20]:
df_topics = classify_comment_sentiment(df_topics)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [21]:
df_topics['topic_sentiment'] = df_topics['name'] + " " + "(" + df_topics['sentiment'] + ")"


In [22]:
df_topics.head()

Unnamed: 0,id,video_id,video_title,comment_id,comment_text,comment_author_channel_id,comment_like_count,comment_reply_count,is_reply,parent_comment_id,comment_text_cleaned,topics,name,Representation,Representative_Docs,topic_proba,sentiment,sentiment_score,topic_sentiment
0,0,iUBD_IS32VI,ACORDEI O MEU AMIGO COM UMA CABRA! #shorts,UgwMVGpjEjQut8aICUd4AaABAg,o enaldo tocando violao e a cabra aaaaaaaaaah...,UCQ7F9jKoGU1jv294C4WB2bA,4,0,False,0,enaldo tocando violao cabra ahah abriu boca,-1,-1_voce_nao_dinho_nada,"[voce, nao, dinho, nada, ta, vai, ai, so, pra,...",[hoje hein vc vai ter fazer vou fazer pra vc t...,0.0,neutral,0.409725,-1_voce_nao_dinho_nada (neutral)
1,4,iUBD_IS32VI,ACORDEI O MEU AMIGO COM UMA CABRA! #shorts,UgxRsN_oLy0eIRPA1114AaABAg.A8B6kXMDqPNA8DLufljeqt,among us mas os inocentes morrem se nao conseg...,UC1OCUt9EDzDvxT_OPGxhBDw,1,0,True,UgxRsN_oLy0eIRPA1114AaABAg,among us inocentes morem nao conseguirem dinhe...,119,119_dinheiro_rico_pensa_pesoas,"[dinheiro, rico, pensa, pesoas, gastou, gastan...",[ese dinheiro enaldinho gasta podia ajudar pes...,0.840809,negative,0.876035,119_dinheiro_rico_pensa_pesoas (negative)
2,5,iUBD_IS32VI,ACORDEI O MEU AMIGO COM UMA CABRA! #shorts,Ugwq6S4ONWY6PRW2taR4AaABAg,O grito da cabra me quebro,UCfEPXQmfVKUgDi-yQXLOQNw,4,0,False,0,grito cabra quebro,-1,-1_voce_nao_dinho_nada,"[voce, nao, dinho, nada, ta, vai, ai, so, pra,...",[hoje hein vc vai ter fazer vou fazer pra vc t...,0.0,negative,0.930355,-1_voce_nao_dinho_nada (negative)
3,16,iUBD_IS32VI,ACORDEI O MEU AMIGO COM UMA CABRA! #shorts,UgyGPqtYpgxXlBQTSNl4AaABAg,Ficou muito bom,UC3JLPnz0UFk3GJzpOTzwYkA,1,0,False,0,ficou bom,-1,-1_voce_nao_dinho_nada,"[voce, nao, dinho, nada, ta, vai, ai, so, pra,...",[hoje hein vc vai ter fazer vou fazer pra vc t...,0.0,positive,0.905104,-1_voce_nao_dinho_nada (positive)
4,17,iUBD_IS32VI,ACORDEI O MEU AMIGO COM UMA CABRA! #shorts,UgzGrzXzg6wNPc42Yph4AaABAg,TE AMO ENALDINHO VOCE E O MELHOR YOUTUBE R,UCfZIFYsHos9cxnfxPQuXzvg,1,0,False,0,amo enaldinho voce melhor youtube,4,4_youtube_youtuber_youtubers_melhor,"[youtube, youtuber, youtubers, melhor, favorit...","[enaldinho vc melhor youtube, enaldinho melhor...",1.0,positive,0.878908,4_youtube_youtuber_youtubers_melhor (positive)


In [23]:
df_topics.to_csv(f"../data/{ytbr}/topics_sentiment.csv")

In [None]:
# plot most relevant terms for topic