# Data Cleaning 

## Imports

In [1]:
import pandas as pd
import json
import unicodedata
import re

import nltk 
from nltk.corpus import stopwords 

import gensim 
import gensim.corpora as corpora 
from gensim.utils import simple_preprocess 
from gensim.models import LdaModel, LsiModel  

from bertopic import BERTopic
import string

# from enelvo.normaliser import Normaliser
# norm = Normaliser(tokenizer='readable')

PATH = "../data/felipeneto/comments.csv"

  from .autonotebook import tqdm as notebook_tqdm


## Data Cleaning

In [10]:
nltk.download('wordnet') 
nltk.download('punkt')  
nltk.download('stopwords') 
language = 'portuguese'  

stopwords = stopwords.words(language) 

stop_list = ['ja','viu','vai','ne','ta','gente','nao','aqui',
             'tambem','vc','voce','entao','ate','agora','ser',
             'sempre','ter','so','porque','pq','tal','pra','pro',
             'sobre','ainda','la','tudo','ninguem','de',
             'to', 'dps', 'tbm','cm', 'dai'] ## ver se tem stopwords melhor, risadas ???
stopwords += stop_list
print(stopwords)


['a', 'à', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aquilo', 'as', 'às', 'até', 'com', 'como', 'da', 'das', 'de', 'dela', 'delas', 'dele', 'deles', 'depois', 'do', 'dos', 'e', 'é', 'ela', 'elas', 'ele', 'eles', 'em', 'entre', 'era', 'eram', 'éramos', 'essa', 'essas', 'esse', 'esses', 'esta', 'está', 'estamos', 'estão', 'estar', 'estas', 'estava', 'estavam', 'estávamos', 'este', 'esteja', 'estejam', 'estejamos', 'estes', 'esteve', 'estive', 'estivemos', 'estiver', 'estivera', 'estiveram', 'estivéramos', 'estiverem', 'estivermos', 'estivesse', 'estivessem', 'estivéssemos', 'estou', 'eu', 'foi', 'fomos', 'for', 'fora', 'foram', 'fôramos', 'forem', 'formos', 'fosse', 'fossem', 'fôssemos', 'fui', 'há', 'haja', 'hajam', 'hajamos', 'hão', 'havemos', 'haver', 'hei', 'houve', 'houvemos', 'houver', 'houvera', 'houverá', 'houveram', 'houvéramos', 'houverão', 'houverei', 'houverem', 'houveremos', 'houveria', 'houveriam', 'houveríamos', 'houvermos', 'houvesse', 'houvessem', 'houvésse

[nltk_data] Downloading package wordnet to /home/thiago/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/thiago/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/thiago/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
df = pd.read_csv(PATH)
df.head()

Unnamed: 0,id,youtuber,comment_id,comment_text,comment_like_count
0,0,felipeneto,UgxusoW07JzG031wtex4AaABAg,felipe a participação da equipe está muito boa...,742
1,1,felipeneto,UgwOUtQJShA4s3nCH0t4AaABAg,"O timing da Samantha pra piadas é incrível, eu...",143
2,2,felipeneto,Ugx7vKkJf9m8Khu6zAN4AaABAg,"<a href=""https://www.youtube.com/watch?v=pstd5...",432
3,3,felipeneto,Ugy8DTP6iTzSCRMI9d94AaABAg,"Oi gente, sei que não é adequado, mas tô fican...",33
4,4,felipeneto,UgyrFDBC8cV1WlgueFF4AaABAg,Eu queria muito viver 1 só dia no planeta em q...,200


In [20]:
def convert_lowercase(words):
    lower = []
    for s in words:
        lower.append(s.lower())
    return lower


def remove_non_ascii(words):
    ascii_words = []
    for word in words:
       w = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
       ascii_words.append(w)
    return ascii_words 

def remove_stopwords(words):
    result = []
    for doc in words:
        processed_doc = []
        for word in simple_preprocess(str(doc)):
            if word not in stopwords:
                processed_doc.append(word)
        result.append(processed_doc)
    return result

def _replace_html_codes(text):
    html_escape_table = {
        "&amp;": "&",
        "&quot;": '"',
        "&apos;": "'",
        "&gt;": ">", 
        "&lt;": "<", 
    }
    
    for code, value in html_escape_table.items():
        text = text.replace(code, value)
    
    return text

def remove_punctuation(words):
    result = []
    html_escape_table = {
        "&amp;": "&",
        "&quot;": '"',
        "&apos;": "'",
        "&gt;": ">", 
        "&lt;": "<", 
    }
    for word in words: 
        word = _replace_html_codes(word)
        result.append(''.join(c for c in word if c not in string.punctuation))
    return result

def remove_a_links(words):
    cleaned_words = []
    for word in words:
        cleaned_words.append(re.sub(r'<a\s+href="[^"]*">[^<]*<\/a>', '', word))
    return cleaned_words


_df = df.copy()

_df["comment_text"] = _df["comment_text"].str.replace("<br>", " ")
_df["comment_text"] = convert_lowercase(_df["comment_text"])
_df["comment_text"] = remove_a_links(_df["comment_text"])
_df["comment_text"] = remove_punctuation(_df["comment_text"])
_df["comment_text"] = remove_non_ascii(_df["comment_text"])
# _df["comment_text"] = remove_stopwords(_df["comment_text"])
comments_list = _df["comment_text"].tolist()


In [31]:
dictionary = corpora.Dictionary(comments_list)

corpus = [dictionary.doc2bow(doc) for doc in comments_list]

lda = LdaModel(doc_term_matrix, 
               num_topics=10, 
               id2word=dictionary)

print(lda.print_topics())

[(0, '0.011*"mano" + 0.011*"incrivel" + 0.010*"primeiro" + 0.009*"vi" + 0.008*"vou" + 0.008*"abencoe" + 0.008*"pensei" + 0.008*"achei" + 0.007*"kkkk" + 0.007*"sam"'), (1, '0.021*"ai" + 0.017*"felipe" + 0.015*"ver" + 0.010*"vitor" + 0.010*"nome" + 0.007*"deus" + 0.007*"sei" + 0.007*"renda" + 0.007*"casado" + 0.007*"video"'), (2, '0.016*"faz" + 0.013*"felipe" + 0.010*"victor" + 0.010*"amo" + 0.009*"exatamente" + 0.008*"fefo" + 0.008*"videos" + 0.007*"kkkkkkkkk" + 0.006*"tanto" + 0.006*"msm"'), (3, '0.027*"felipe" + 0.019*"oi" + 0.013*"vdd" + 0.013*"video" + 0.012*"kkkkkkk" + 0.012*"los" + 0.010*"lives" + 0.009*"brino" + 0.008*"sao" + 0.008*"canal"'), (4, '0.023*"dia" + 0.017*"kkkkkk" + 0.015*"cade" + 0.014*"concordo" + 0.013*"felipe" + 0.012*"bom" + 0.010*"nome" + 0.009*"sete" + 0.009*"aconteceu" + 0.007*"ano"'), (5, '0.023*"up" + 0.012*"queria" + 0.011*"kkkkk" + 0.010*"bem" + 0.009*"felipe" + 0.008*"fica" + 0.007*"ficou" + 0.007*"video" + 0.006*"hahaha" + 0.006*"nada"'), (6, '0.043*"sim

In [21]:
docs = _df["comment_text"].tolist()
bert = BERTopic(language="portuguese")
bert.fit_transform(docs)


([22,
  10,
  0,
  112,
  14,
  -1,
  0,
  -1,
  80,
  1,
  -1,
  17,
  31,
  9,
  20,
  -1,
  0,
  0,
  0,
  9,
  2,
  20,
  0,
  0,
  -1,
  -1,
  -1,
  16,
  68,
  -1,
  -1,
  31,
  -1,
  -1,
  31,
  -1,
  5,
  -1,
  18,
  35,
  9,
  22,
  -1,
  -1,
  27,
  2,
  2,
  -1,
  -1,
  7,
  2,
  27,
  -1,
  86,
  -1,
  10,
  59,
  -1,
  -1,
  67,
  95,
  95,
  88,
  0,
  20,
  10,
  2,
  3,
  -1,
  14,
  -1,
  -1,
  11,
  -1,
  20,
  -1,
  0,
  122,
  11,
  67,
  22,
  59,
  -1,
  -1,
  0,
  0,
  3,
  31,
  -1,
  83,
  -1,
  -1,
  -1,
  11,
  -1,
  20,
  5,
  -1,
  0,
  9,
  86,
  0,
  -1,
  70,
  10,
  -1,
  2,
  2,
  63,
  -1,
  4,
  -1,
  -1,
  2,
  -1,
  39,
  22,
  70,
  31,
  -1,
  -1,
  70,
  -1,
  10,
  59,
  -1,
  2,
  7,
  0,
  -1,
  130,
  -1,
  10,
  103,
  -1,
  -1,
  19,
  70,
  3,
  70,
  11,
  28,
  -1,
  2,
  1,
  0,
  4,
  20,
  0,
  9,
  -1,
  2,
  71,
  -1,
  2,
  63,
  11,
  70,
  13,
  0,
  -1,
  9,
  2,
  18,
  0,
  60,
  -1,
  60,
  46,
  -1,
  -1,
  112,
  54,
  39,

In [33]:
bert.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2147,-1_nao_felipe_de_que,"[nao, felipe, de, que, bruno, do, com, eu, da,...",[acho incrivel como felipe fica bem mais feliz...
1,0,690,0_kkkkk_kkkkkk_kkkkkkk_kkkk,"[kkkkk, kkkkkk, kkkkkkk, kkkk, kkkkkkkk, kkkkk...","[o final foi kkkkkkk, cara eu amo o mozka kkkk..."
2,1,222,1_lordz1nho20_gabrielcarlos2125_craxus8107_falei,"[lordz1nho20, gabrielcarlos2125, craxus8107, f...","[lordz1nho20 mds, lordz1nho20 mds, lordz1nho20..."
3,2,208,2_video_videos_esse_quero,"[video, videos, esse, quero, um, desse, bom, m...",[eu amei esse video quero mais videos desse jo...
4,3,173,3_pensei_eu_tambem_tive,"[pensei, eu, tambem, tive, ja, aconteceu, quas...","[sim pensei que era so eu que tinha percebido,..."
...,...,...,...,...,...
129,128,11,128_ksksksksksskkskskskskksksskksksksksksksksk...,[ksksksksksskkskskskskksksskkskskskskskskskksk...,[ksksksksksskkskskskskksksskkskskskskskskskksk...
130,129,11,129_hotel_capsula_menor_quarto,"[hotel, capsula, menor, quarto, copacabana, qu...","[pensava que era o hotel capsula, pode ser o m..."
131,130,11,130_camisa_camiseta_comprei_rebulico,"[camisa, camiseta, comprei, rebulico, samantha...","[a samantha revivendo a camisa do rebulico, eu..."
132,131,11,131_ne_neee_problem_not,"[ne, neee, problem, not, my, infelizmente, nao...","[ne, ne, ne]"
