In [1]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

In [2]:
def word_occurence_matrix(text, target=None, stop_words=None, binary=True, preprocess_text=False):  
    '''
    Output is messages x (unique) words
    
    If binary=True, then each element represents if the word is in the message or not.
    Otherwise, it represents the count of how many times that word appears in that message.
    ''' 
    if target:
        text = list(filter(lambda x : target in x, text)) #Filter comments in which target word is present
        
    preprocessor = CountVectorizer(strip_accents='unicode').build_preprocessor()   
    if stop_words:        
        stop_words = [preprocessor(word) for word in stop_words] #preprocesses stop words
    if preprocess_text:
        text = [preprocessor(msg) for msg in text] #preprocesses text
        
    #calculates word count for each message
    vectorizer = CountVectorizer(strip_accents='unicode', stop_words=stop_words, binary=binary)
    X = vectorizer.fit_transform(text).toarray()
    
    labels = vectorizer.get_feature_names()
    
    return X, labels

### Dados

In [3]:
comments = pd.read_csv('../comentarios_sorted_votes.csv')
stop_words = [word.rstrip() for word in open('stopwords.txt')]

#Preprocesses text
preprocessor = CountVectorizer(strip_accents='unicode').build_preprocessor() #lowercase and strip accents
stop_words = [preprocessor(word) for word in stop_words]
comments['text'] = [preprocessor(msg) for msg in comments['text']]
comments['text'] = [' '.join([word for word in RegexpTokenizer(r'\w+').tokenize(msg) if not word in stop_words])
                    for msg in comments['text']]

N = 30 #Consider only the N most frequent words

In [4]:
comments_by_channel = []
channels = []
for channel, group in comments.groupby('uploader'):
    channel_comments = ' '.join(group['text'])
    comments_by_channel.append(channel_comments)
    channels.append(channel)

### Frequencia relativa

#### 1. Em relação ao total
Aqui foi usada a distribuição de probabilidade conjunta das palavras e canais, i.e., $p(a_i, c_j) = P(A=a_i, C= C_j)$.

In [21]:
X, labels = word_occurence_matrix(comments_by_channel, stop_words=stop_words, binary=False)

X = X / X.sum()
df_total = pd.DataFrame(X.T, columns = channels, index = labels, dtype='float')
print('Slice de exemplo do df:')
df_total[12080:12085]

Slice de exemplo do df:


Unnamed: 0,Dr. Alain Dutra,Dr. Alvaro Galvão,Dr. Felipe Ades MD PhD,Dr. Fernando Gomes,Dr. Lair Ribeiro Oficial,Drauzio Varella,Julio Pereira - Neurocirurgião,Lucy Kerr
covid,0.00062,0.000159,5.9e-05,6e-06,9e-05,0.000912,0.001879,0.000897
covid19,6.2e-05,9e-06,2e-06,2e-06,8e-06,0.000122,0.000191,0.000104
covid19afarsadadecada,0.0,0.0,0.0,0.0,4e-06,0.0,0.0,0.0
covid2019tr,0.0,0.0,0.0,0.0,0.0,0.0,1e-06,0.0
covid_19,1e-06,0.0,0.0,0.0,0.0,1e-06,0.0,1e-06


In [None]:
#Selects only the N with higher std
word_std = X.std(axis=0)
higher_std_zipped = sorted(zip(word_std, labels, X.T), reverse=True)[:N]
word_std, labels, X_t = zip(*higher_std_zipped)

word_std_i = ['%.4f'%(x) for x in word_std]
print(f'{N} palavras com maior desvio padrão entre os canais:\n')
print(list(zip(labels, word_std_i)))

#### 2. Em relação ao canal
Foi obtida a distribuição de probabilidade das palavras em cada canal $C_k$, i.e., $p(a_i|C_k) = P(A=a_i|C_k)$. 
Com isso, podemos fazer corretamente comparações entre os canais. Foram, então, selecionadas as palavras cuja ocorrência mais se difere entre eles.

In [22]:
X, labels = word_occurence_matrix(comments_by_channel, stop_words=stop_words, binary=False)
#Makes frequencies relative to each channel so that we have the probability distribution of words for each channel
X = (X.T / X.sum(axis=1)).T
df_canal = pd.DataFrame(X.T, columns = channels, index = labels)
print('Slice de exemplo do df:')
df_total[12080:12085]

Slice de exemplo do df:


Unnamed: 0,Dr. Alain Dutra,Dr. Alvaro Galvão,Dr. Felipe Ades MD PhD,Dr. Fernando Gomes,Dr. Lair Ribeiro Oficial,Drauzio Varella,Julio Pereira - Neurocirurgião,Lucy Kerr
covid,0.00062,0.000159,5.9e-05,6e-06,9e-05,0.000912,0.001879,0.000897
covid19,6.2e-05,9e-06,2e-06,2e-06,8e-06,0.000122,0.000191,0.000104
covid19afarsadadecada,0.0,0.0,0.0,0.0,4e-06,0.0,0.0,0.0
covid2019tr,0.0,0.0,0.0,0.0,0.0,0.0,1e-06,0.0
covid_19,1e-06,0.0,0.0,0.0,0.0,1e-06,0.0,1e-06


In [None]:
#Selects only the N with higher std
word_std = X.std(axis=0)
higher_std_zipped = sorted(zip(word_std, labels, X.T), reverse=True)[:N]
word_std, labels, X_t = zip(*higher_std_zipped)

word_std_i = ['%.4f'%(x) for x in word_std]
print(f'{N} palavras com maior desvio padrão entre os canais:\n')
print(list(zip(labels, word_std_i)))