# COMMUNITY ANALYSIS 

In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
from itertools import combinations
import re
import string
import unicodedata
import nltk
from bertopic import BERTopic


nltk.download(['stopwords', 'rslp'])
stopwords = nltk.corpus.stopwords.words('portuguese')

ytbr = "camilaloures"


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to /home/thiago/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /home/thiago/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


### functions

In [2]:
def clean_text_data(df: pd.DataFrame) -> pd.DataFrame:
    df["comment_text_cleaned"] = df["comment_text"].str.replace("<br>", " ")
    df["comment_text_cleaned"] = convert_to_str(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = convert_lowercase(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = remove_a_links(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = mask_user_handles(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = remove_punctuation(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = remove_non_ascii(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = remove_laughing(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = replace_repeated_letters(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = remove_stopwords(df["comment_text_cleaned"])
    df["comment_text_cleaned"] = remove_single_letters(df["comment_text_cleaned"])
    df = df[df["comment_text_cleaned"].str.strip() != ""]
    return df

def convert_to_str(words):
    new_words = []
    for word in words:
        if type(word) != str:
            word = f"{word}"
            print(type(word), word)
        new_words.append(word)        
    return new_words

def remove_non_ascii(words):
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode(
            'ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words


def remove_stopwords(words):
    result = []
    for word in words:
        processed_text = ''
        list_words = word.split()
        for w in list_words:
            if w not in stopwords:
                processed_text = processed_text + ' ' + w  
        result.append(processed_text)
    return result


def convert_lowercase(words):
    lower = []
    for s in words:
        # if type(s) == str:
        lower.append(s.lower())
        # else:
        #     lower.append(s)
    return lower



def remove_a_links(words):
    cleaned_words = []
    for word in words:
        # if type(word) == str:
        cleaned_words.append(re.sub(r'<a\s+href="[^"]*">[^<]*<\/a>', '', word))
        # else:
        #     cleaned_words.append(word)
    return cleaned_words


def remove_punctuation(words):
    result = []
    for word in words: 
        word = _replace_html_codes(word)
        result.append(''.join(c for c in word if c not in string.punctuation))
    return result

def _replace_html_codes(text):
    html_escape_table = {
        "&amp;": "&",
        "&quot;": '"',
        "&apos;": "'",
        "&gt;": ">", 
        "&lt;": "<", 
    }
    
    for code, value in html_escape_table.items():
        text = text.replace(code, value)
    
    return text

def remove_laughing(words):
    # remove kkkkkkk's
    new_words = [re.sub(r'k{2,}', '', word, flags=re.IGNORECASE) for word in words]

    # remove other laughing patterns
    laugh_pattern = r'\b(kk|ak|ka|ks|sk)[a-zA-Z]*?(kk|ak|ka|ks|sk)\b'
    new_words  = [re.sub(laugh_pattern, '', word, flags=re.IGNORECASE) for word in new_words]
    return new_words

def remove_single_letters(words):
    result = []
    for w in words:
        split_w = w.split(" ")
        new_w = [word for word in split_w if len(word) > 1] 
        result.append(" ".join(new_w))
    return result

def replace_repeated_letters(words):
    return [re.sub(r'(.)\1+', r'\1', word) for word in words]

def mask_user_handles(words):
    handle_pattern = r'@[\w]+'

    return [re.sub(handle_pattern, '@user', word) for word in words]

###  getting community 

In [3]:
# natanporai_lowest_clustering_coefs = [2, 4, 10, 12, 13]
# natanporai_highes_clustering_coefs = [7, 9, 11, 14, 15, 17, 22, 26]
# cadresplayer_lowest_clustering_coefs = [1,2,5,6,3]
# cadresplayer_highes_clustering_coefs = [0,8,10]

In [4]:
community_df = pd.read_csv(f"../data/{ytbr}/metrics/communities.csv")
# central_communities = community_df[community_df["community"].isin(cadresplayer_lowest_clustering_coefs)]
# isolated_communities = community_df[community_df["community"].isin(cadresplayer_highes_clustering_coefs)]

## Reading DF filtered by single commenters

In [5]:
df = pd.read_csv(f"../data/{ytbr}/comments.csv")

current = "all_communities"
df = df[df["comment_author_channel_id"].isin(community_df["commenter"])]
df = pd.merge(df, community_df, left_on="comment_author_channel_id",
                       right_on="commenter", how="left").drop(columns=["commenter"])

In [6]:
df = clean_text_data(df)
df['comment_text_cleaned'].to_csv('out.csv', index=False)

## Bertopic

In [7]:
model = BERTopic(
    language="multilingual",
    verbose=True,
)
topics, probs = model.fit_transform(df["comment_text_cleaned"].to_list())
freq = model.get_topic_info()
display(freq)

2024-10-29 01:41:29,859 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 518/518 [00:12<00:00, 41.54it/s]
2024-10-29 01:41:47,227 - BERTopic - Embedding - Completed ✓
2024-10-29 01:41:47,228 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-29 01:42:12,677 - BERTopic - Dimensionality - Completed ✓
2024-10-29 01:42:12,678 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5077,-1_video_vai_videos_nao,"[video, vai, videos, nao, so, camila, ja, gent...","[amo videos glenda aila, camis acho dica quald..."
1,0,283,0_nada_nunca_ninguem_nao,"[nada, nunca, ninguem, nao, bigode, nu, dica, ...","[nada ver, nada ver, nao nada ver]"
2,1,277,1_up_verem__,"[up, verem, , , , , , , , ]","[up, up, up]"
3,2,273,2_video_videos_nese_dese,"[video, videos, nese, dese, grava, ese, faz, p...",[sonho conhecer vcs pesoalmente grava varios v...
4,3,231,3_24_horas_24h_24hrs,"[24, horas, 24h, 24hrs, hrs, 24horas, fazenda,...","[24 horas, 24 horas, 24 horas]"
...,...,...,...,...,...
334,333,11,333_japonesa_coreria_corea_round,"[japonesa, coreria, corea, round, atacando, vi...",[amo canal todos voces nunca imaginei xis fose...
335,334,11,334_luta_desista_sonhos_youtuber,"[luta, desista, sonhos, youtuber, suceso, desi...",[to luta faz tempo nao vou desistir youtuber s...
336,335,10,335_foganoli_poser_piniao_menota,"[foganoli, poser, piniao, menota, chutem, foga...","[cams foganoli, cams foganoli, foganoli fogano..."
337,336,10,336_zoa_palse_fortinho_istaga,"[zoa, palse, fortinho, istaga, istagram, chute...",[camila vi olho xis marom escuro ta bom falado...


## Barchart

In [8]:
model.visualize_barchart(top_n_topics=20)

## Hierarchical Clustering

In [9]:
# model.visualize_hierarchy(top_n_topics=50, width=1200)

### Similarity Matrix

In [10]:
# model.visualize_heatmap(top_n_topics=50, width=1200)

## Build Topics DF

In [11]:
# atribuíndo tópico
df_topics = df.drop(columns=['comment_author_name','comment_publish_date',]).copy()
df_topics['topics'] = topics

# atribuíndo nome do tópico
topic_name = freq.drop(columns=['Count']).rename(columns={'Topic': 'topics', 'Name': 'name'})
df_topics = df_topics.merge(topic_name, how='left') 

# atribuíndo topic proba
df_topics['topic_proba'] = probs

df_topics["comment_text"] = convert_to_str(df_topics["comment_text"])
df_topics["comment_text"] = remove_a_links(df_topics["comment_text"])
df_topics["comment_text"] = remove_laughing(df_topics["comment_text"])
df_topics["comment_text"] = mask_user_handles(df_topics["comment_text"])
df_topics["comment_text"] = remove_punctuation(df_topics["comment_text"])
df_topics["comment_text"] = remove_non_ascii(df_topics["comment_text"])

## Sentiment Analysis

In [12]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

sentiment_task = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, config=config, device=0, max_length=512, truncation=True)

def classify_comment_sentiment(df: pd.DataFrame) -> pd.DataFrame:
    sentiments = []
    sentiment_scores = []

    for comment in df['comment_text']:
        result = sentiment_task(comment)
        
        sentiment = result[0]['label']
        score = result[0]['score']
        
        sentiments.append(sentiment)
        sentiment_scores.append(score)

    df['sentiment'] = sentiments
    df['sentiment_score'] = sentiment_scores

    return df

In [13]:
df_topics = classify_comment_sentiment(df_topics)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [14]:
df_topics['topic_sentiment'] = df_topics['name'] + " " + "(" + df_topics['sentiment'] + ")"

In [15]:
df_topics.to_csv(f"../data/{ytbr}/topics_sentiment_{current}.csv")