In [1]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
import re

In [2]:
def word_occurence_matrix(text, target=None, stop_words=None, binary=True, preprocess_text=False):
    '''
    Build a messages x (unique) words matrix from a collection of messages.

    Parameters
    ----------
    text : iterable of str
        The messages to vectorize.
    target : str, optional
        When given (and non-empty), only messages that contain this substring
        are kept. The test is done on the messages as received, i.e. before
        any optional preprocessing.
    stop_words : list of str, optional
        Words to ignore. They are lowercased and accent-stripped with the same
        preprocessor the vectorizer uses, so they match the vectorized tokens.
    binary : bool, default True
        If True, each element tells whether the word is in the message or not.
        Otherwise, it is the count of how many times the word appears in it.
    preprocess_text : bool, default False
        If True, the messages are lowercased and accent-stripped up front.

    Returns
    -------
    X : numpy.ndarray
        Dense matrix with one row per message and one column per word.
    labels : list of str
        The word associated with each column of ``X``.
    '''
    if target:
        # Keep only the messages in which the target word is present
        text = [msg for msg in text if target in msg]

    preprocessor = CountVectorizer(strip_accents='unicode').build_preprocessor()
    if stop_words:
        # Normalize stop words the same way the vectorizer normalizes the text
        stop_words = [preprocessor(word) for word in stop_words]
    if preprocess_text:
        text = [preprocessor(msg) for msg in text]

    # Word count (or presence flag, when binary=True) for each message
    vectorizer = CountVectorizer(strip_accents='unicode', stop_words=stop_words, binary=binary)
    X = vectorizer.fit_transform(text).toarray()

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer get_feature_names_out() and fall back only on older releases.
    # list(...) keeps the return type the callers have always received.
    if hasattr(vectorizer, 'get_feature_names_out'):
        labels = list(vectorizer.get_feature_names_out())
    else:
        labels = vectorizer.get_feature_names()

    return X, labels

### Dados

In [3]:
comments = pd.read_csv('../../dados/instagram/filtered_comments.csv')

# Base stop-word list, one word per line. The context manager guarantees the
# file handle is closed once the list has been read.
# NOTE(review): the file is opened with the platform default encoding - confirm it matches the file.
with open('../stopwords.txt') as stopwords_file:
    stop_words = [word.rstrip() for word in stopwords_file]

preprocessor = CountVectorizer(strip_accents='unicode').build_preprocessor() 
# Lowercase and strip accents
stop_words = [preprocessor(word) for word in stop_words]
# English and Spanish words
stop_words.extend(['you', 'good', 'the', 'to', 'live', 'very', 'your', 'work', 'is', 'my', 'from', 'love', 'and', 
                   'in', 'thank', 'informative', 'are', 'of', 'un', 'english', 'what', 'mi', 'hello', 'el', 
                   'but', 'doctor'])
# Proper names
stop_words.extend(['drauzio', 'varella', 'lucy', 'kerr', 'julio', 'pereira','fernando', 'pinto', 'gomes', 'lair', 
                   'ribeiro', 'alvaro', 'galvao', 'felipe', 'ades', 'gomez', 'alain', 'dutra'])
# Pronouns and prepositions
stop_words.extend(['pra', 'vc', 'todos', 'tudo', 'cada', 'nada', 'sobre'])
# Conjunctions
stop_words.extend(['porque', 'pois', 'pq'])
# Adverbs
stop_words.extend(['assim', 'bem', 'ainda', 'agora', 'sim', 'sempre', 'aqui', 'la', 'tbm', 'ai', 'hoje'])
# Frequent verbs
stop_words.extend(['vai', 'ser', 'ter', 'ta', 'fazer', 'fiz', 'faz', 'vou'])
# Others
stop_words.extend(['boa', 'bom', 'obrigado', 'ola'])

# Helpers built once for the whole cleaning pass instead of once per message.
word_tokenizer = RegexpTokenizer(r'\w+')
# Set membership is O(1); `stop_words` itself stays a list because it is
# passed to the vectorizer later on.
stop_word_set = frozenset(stop_words)

# Remove URLs
comments['texto_do_comentario'] = [re.sub(r'http\S+', '', msg) for msg in comments['texto_do_comentario']]
# Remove e-mail addresses (any whitespace-delimited token containing '@')
comments['texto_do_comentario'] = [re.sub(r'\S*@\S*\s?', '', msg) for msg in comments['texto_do_comentario']]
# Lowercase and strip accents
comments['texto_do_comentario'] = [preprocessor(msg) for msg in comments['texto_do_comentario']]
# Remove stop words and punctuation: keep only the \w+ tokens that are not stop words
comments['texto_do_comentario'] = [' '.join(word for word in word_tokenizer.tokenize(msg)
                                            if word not in stop_word_set)
                                   for msg in comments['texto_do_comentario']]
# Collapse whitespace runs (including new lines) into a single space
comments['texto_do_comentario'] = [re.sub(r'\s+', ' ', msg) for msg in comments['texto_do_comentario']]
# Remove distracting single quotes
comments['texto_do_comentario'] = [re.sub(r"\'", "", msg) for msg in comments['texto_do_comentario']]
# Remove special characters: anything that is not an ASCII letter, a digit or
# whitespace (this also drops underscores and non-ASCII letters that survived
# the tokenizer)
comments['texto_do_comentario'] = [re.sub(r'([^a-zA-Z0-9\s]+?)', '', msg) for msg in comments['texto_do_comentario']]

# Remove words that occur exactly once in the whole corpus
word_tokenizer = RegexpTokenizer(r'\w+')
word_counts = FreqDist(word_tokenizer.tokenize(' '.join(comments['texto_do_comentario'])))
# `fdist` keeps the meaning it had at the end of the original cell:
# the list of words whose corpus frequency is 1.
fdist = [word for word, count in word_counts.items() if count == 1]
# Set lookup avoids scanning the whole rare-word list for every token.
rare_words = set(fdist)
comments['texto_do_comentario'] = [' '.join(word for word in word_tokenizer.tokenize(msg)
                                            if word not in rare_words)
                                   for msg in comments['texto_do_comentario']]

N = 30 # Number of top-ranked words kept by the ranking cells below

In [4]:
# One (channel, concatenated comments) pair per post owner, in groupby order.
per_channel = [(owner, ' '.join(owner_rows['texto_do_comentario']))
               for owner, owner_rows in comments.groupby('dono_do_post')]

# Split the pairs into the two parallel lists used by the analysis cells.
channels = [owner for owner, _ in per_channel]
comments_by_channel = [joined_text for _, joined_text in per_channel]

### Frequência relativa

#### 1. Em relação ao total
Não usar essa. Deixei aqui por precaução.

Aqui foi usada a distribuição de probabilidade conjunta das palavras e canais, i.e., $p(a_i, c_j) = P(A=a_i, C=c_j)$.

In [11]:
# Raw word counts: one row per channel, one column per word.
X, labels = word_occurence_matrix(comments_by_channel, binary=False, stop_words=stop_words)

# Normalize by the grand total so the whole matrix sums to 1.
X = X / X.sum()

# For every channel, pair each formatted frequency with its word and rank the
# pairs from highest to lowest (ties fall back to the word, also descending).
freq_lists = []
for channel_row in X:
    ranked_pairs = [('%.4f'%(freq), word) for freq, word in zip(channel_row, labels)]
    ranked_pairs.sort(reverse=True)
    freq_lists.append(ranked_pairs)

# Transpose so that each channel becomes a column and each row a rank position.
df_freq = pd.DataFrame(freq_lists, index=channels).T

df_freq.head(10)

Unnamed: 0,dr.galvao,dralaindutra,drfelipeades,drfernandoneuro,drlairribeiro,imedlkep,juliommais,sitedrauziovarella
0,"(0.0001, parabens)","(0.0007, vitamina)","(0.0002, dr)","(0.0140, dr)","(0.0010, dr)","(0.0003, tomar)","(0.0001, parabens)","(0.0064, dr)"
1,"(0.0001, dr)","(0.0007, dr)","(0.0001, virus)","(0.0055, dia)","(0.0003, saude)","(0.0003, ivermectina)","(0.0001, deus)","(0.0042, obrigada)"
2,"(0.0000, zycze)","(0.0005, tomar)","(0.0001, uso)","(0.0038, deus)","(0.0002, virus)","(0.0003, dra)","(0.0000, zycze)","(0.0031, pessoas)"
3,"(0.0000, zyciu)","(0.0004, pode)","(0.0001, triste)","(0.0037, parabens)","(0.0002, sistema)","(0.0002, dose)","(0.0000, zyciu)","(0.0031, casa)"
4,"(0.0000, zycie)","(0.0004, obrigada)","(0.0001, tratamento)","(0.0027, lindo)","(0.0002, senhor)","(0.0002, dias)","(0.0000, zycie)","(0.0029, virus)"
5,"(0.0000, zyc)","(0.0004, dias)","(0.0001, tomar)","(0.0027, it)","(0.0002, obrigada)","(0.0001, video)","(0.0000, zyc)","(0.0025, pode)"
6,"(0.0000, zwei)","(0.0003, sol)","(0.0001, saber)","(0.0026, noite)","(0.0002, melhor)","(0.0001, uso)","(0.0000, zwei)","(0.0024, video)"
7,"(0.0000, zusammen)","(0.0003, saude)","(0.0001, risco)","(0.0022, doutor)","(0.0002, medico)","(0.0001, unica)","(0.0000, zusammen)","(0.0024, senhor)"
8,"(0.0000, zur)","(0.0003, pessoas)","(0.0001, pode)","(0.0021, that)","(0.0002, imunidade)","(0.0001, tratamento)","(0.0000, zur)","(0.0018, deus)"
9,"(0.0000, zumbido)","(0.0003, parabens)","(0.0001, pessoas)","(0.0020, we)","(0.0002, falar)","(0.0001, tomei)","(0.0000, zumbido)","(0.0017, saude)"


In [12]:
#Selects only the N with higher std
word_std = X.std(axis=0)
higher_std_zipped = sorted(zip(word_std, labels, X.T), reverse=True)[:N]
word_std, labels, X_t = zip(*higher_std_zipped)

word_std_i = ['%.4f'%(x) for x in word_std]
print(f'{N} palavras com maior desvio padrão entre os canais:\n')
print(list(zip(labels, word_std_i)))

30 palavras com maior desvio padrão entre os canais:

[('dr', '0.0047'), ('dia', '0.0018'), ('obrigada', '0.0014'), ('deus', '0.0013'), ('parabens', '0.0012'), ('pessoas', '0.0010'), ('casa', '0.0010'), ('virus', '0.0010'), ('it', '0.0009'), ('lindo', '0.0009'), ('noite', '0.0008'), ('senhor', '0.0008'), ('pode', '0.0008'), ('video', '0.0008'), ('doutor', '0.0008'), ('that', '0.0007'), ('we', '0.0007'), ('saude', '0.0006'), ('all', '0.0006'), ('this', '0.0006'), ('have', '0.0006'), ('with', '0.0006'), ('be', '0.0006'), ('abencoe', '0.0006'), ('medico', '0.0005'), ('amo', '0.0005'), ('gente', '0.0005'), ('covid', '0.0005'), ('dias', '0.0005'), ('tarde', '0.0005')]


#### 2. Em relação ao canal
Foi obtida a distribuição de probabilidade das palavras em cada canal $c_k$, i.e., $p(a_i \mid c_k) = P(A=a_i \mid C=c_k)$. 
Com isso, podemos comparar corretamente os canais entre si. Foram, então, selecionadas as palavras cuja ocorrência mais difere entre eles.

In [5]:
# Raw word counts: one row per channel, one column per word.
X, labels = word_occurence_matrix(comments_by_channel, binary=False, stop_words=stop_words)

# Divide every row by its own total so that each channel's row becomes the
# probability distribution of words within that channel.
channel_totals = X.sum(axis=1)
X = (X.T / channel_totals).T

# For every channel, pair each formatted frequency with its word and rank the
# pairs from highest to lowest (ties fall back to the word, also descending).
freq_lists = []
for channel_row in X:
    ranked_pairs = [('%.4f'%(freq), word) for freq, word in zip(channel_row, labels)]
    ranked_pairs.sort(reverse=True)
    freq_lists.append(ranked_pairs)

# Transpose so that each channel becomes a column and each row a rank position.
df_freq = pd.DataFrame(freq_lists, index=channels).T

print('Palavras com maior frequência relativa em cada canal:')
df_freq.head(10)

Palavras com maior frequência relativa em cada canal:


Unnamed: 0,dr.galvao,dralaindutra,drfelipeades,drfernandoneuro,drlairribeiro,imedlkep,juliommais,sitedrauziovarella
0,"(0.0362, parabens)","(0.0129, dr)","(0.0158, dr)","(0.0309, dr)","(0.0304, dr)","(0.0200, tomar)","(0.0822, deus)","(0.0150, dr)"
1,"(0.0362, dr)","(0.0125, vitamina)","(0.0129, deus)","(0.0121, dia)","(0.0093, saude)","(0.0185, dra)","(0.0789, parabens)","(0.0099, obrigada)"
2,"(0.0125, 15)","(0.0086, tomar)","(0.0120, triste)","(0.0084, deus)","(0.0079, deus)","(0.0178, ivermectina)","(0.0362, abencoe)","(0.0073, pessoas)"
3,"(0.0111, medico)","(0.0074, dias)","(0.0109, pessoas)","(0.0081, parabens)","(0.0070, imunidade)","(0.0127, dias)","(0.0263, saude)","(0.0072, casa)"
4,"(0.0111, excelente)","(0.0070, obrigada)","(0.0091, risco)","(0.0060, lindo)","(0.0068, obrigada)","(0.0122, dose)","(0.0263, continue)","(0.0068, virus)"
5,"(0.0084, ji)","(0.0065, pode)","(0.0088, tratamento)","(0.0060, it)","(0.0065, melhor)","(0.0089, pode)","(0.0197, feliz)","(0.0058, pode)"
6,"(0.0084, doutor)","(0.0059, sol)","(0.0088, casa)","(0.0057, noite)","(0.0062, virus)","(0.0089, covid)","(0.0197, aniversario)","(0.0057, senhor)"
7,"(0.0084, deus)","(0.0059, dia)","(0.0070, parabens)","(0.0048, doutor)","(0.0058, sistema)","(0.0077, comprimidos)","(0.0164, abencoando)","(0.0056, video)"
8,"(0.0070, video)","(0.0055, dose)","(0.0068, pode)","(0.0047, that)","(0.0052, falar)","(0.0075, video)","(0.0132, vida)","(0.0042, deus)"
9,"(0.0070, pessoa)","(0.0052, pessoas)","(0.0068, grupo)","(0.0044, we)","(0.0051, senhor)","(0.0073, deus)","(0.0132, proteja)","(0.0040, saude)"


In [6]:
#Selects only the N with higher std
word_std = X.std(axis=0)
higher_std_zipped = sorted(zip(word_std, labels, X.T), reverse=True)[:N]
word_std, labels, X_t = zip(*higher_std_zipped)

word_std_i = ['%.4f'%(x) for x in word_std]
print(f'{N} palavras com maior desvio padrão entre os canais:\n')
print(list(zip(labels, word_std_i)))

30 palavras com maior desvio padrão entre os canais:

[('parabens', '0.0255'), ('deus', '0.0248'), ('abencoe', '0.0112'), ('dr', '0.0110'), ('continue', '0.0085'), ('saude', '0.0075'), ('aniversario', '0.0065'), ('feliz', '0.0063'), ('tomar', '0.0063'), ('dra', '0.0061'), ('ivermectina', '0.0057'), ('abencoando', '0.0054'), ('felicidades', '0.0043'), ('dose', '0.0042'), ('proteja', '0.0042'), ('dias', '0.0040'), ('15', '0.0040'), ('vitamina', '0.0040'), ('triste', '0.0038'), ('vida', '0.0038'), ('excelente', '0.0034'), ('sucesso', '0.0031'), ('amigo', '0.0031'), ('dia', '0.0031'), ('pessoas', '0.0030'), ('senhor', '0.0030'), ('doutor', '0.0030'), ('grande', '0.0029'), ('video', '0.0029'), ('familia', '0.0029')]
