In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

In [2]:
# Fetches the NLTK 'punkt' package (NLTK reports it as up to date when already installed).
# NOTE(review): the cells visible here tokenize only with RegexpTokenizer and str.split —
# confirm that a later cell actually needs 'punkt'.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/barbara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def get_unique_words(text):
    '''
    Collects the distinct items found across all messages.

    Every message in text is iterated element by element, so messages are expected to be
    sequences of tokens (a plain string would be broken into single characters).
    Output is a set, hence unordered.
    '''
    vocabulary = set()
    for message in text:
        vocabulary.update(message)
    return vocabulary

def co_occurence_matrix_with_window(text, window_size, stop_words):
    '''
    Calculates the co-occurrence matrix using a sliding window of size window_size as context.

    Parameters:
        text: re-iterable collection of tokenized messages; each message must support
              len() and integer indexing (e.g. a list of words).
        window_size: how many positions to the left and to the right of a word count as its context.
        stop_words: accepted for interface compatibility but not used by this function.
                    NOTE(review): decide whether stop words should be filtered here.

    Output is a (unique) words x (unique) words DataFrame of integer counts: cell [word, context]
    is the number of times context was found within window_size positions of word.
    The word's own position is not part of its context, but another occurrence of the same
    word inside the window is counted. Rows and columns follow the order in which words
    first appear in text, so the layout is reproducible between runs.
    '''
    # dict.fromkeys removes duplicates while keeping first-appearance order
    # (a set would give an arbitrary, run-dependent label order).
    vocabulary = list(dict.fromkeys(word for message in text for word in message))
    position = {word: k for k, word in enumerate(vocabulary)}
    n = len(vocabulary) #number of unique words

    # Accumulate in a plain integer array; label-based DataFrame updates inside the
    # innermost loop are far slower and add nothing here.
    counts = np.zeros((n, n), dtype=int)

    #for each word in each message, counts +1 for each pair [word,context]
    for message in text:
        msg_len = len(message)
        for i, word in enumerate(message):
            first = max(i - window_size, 0)
            last = min(i + window_size + 1, msg_len)
            row = position[word]
            for j in range(first, last):
                if j == i:
                    continue #the centre word is not its own context
                counts[row, position[message[j]]] += 1

    return pd.DataFrame(data=counts, index=vocabulary, columns=vocabulary)
    
def word_occurence_matrix(text, target=None, stop_words=None, binary=True, preprocess_text=False):
    '''
    Builds the word occurrence matrix of a collection of messages with CountVectorizer.

    Parameters:
        text: iterable of messages (strings).
        target: if given, only messages for which `target in message` holds are kept.
                This is a substring test applied before any preprocessing, so e.g.
                'cloroquina' also keeps messages that only contain 'hidroxicloroquina'.
        stop_words: optional list of words to ignore; they are lowercased and accent-stripped
                    with the same preprocessor the vectorizer uses.
        binary: If True, each element represents if the word is in the message or not.
                Otherwise, it represents the count of how many times that word appears in that message.
        preprocess_text: if True, the messages are lowercased and accent-stripped before vectorizing.

    Returns:
        X: dense array, messages x (unique) words.
        labels: list with the word of each column of X.
    '''
    if target:
        text = [msg for msg in text if target in msg] #Filter comments in which target word is present

    preprocessor = CountVectorizer(strip_accents='unicode').build_preprocessor()
    if stop_words:
        stop_words = [preprocessor(word) for word in stop_words] #preprocesses stop words
    if preprocess_text:
        text = [preprocessor(msg) for msg in text] #preprocesses text

    #calculates word count for each message
    vectorizer = CountVectorizer(strip_accents='unicode', stop_words=stop_words, binary=binary)
    X = vectorizer.fit_transform(text).toarray()

    # get_feature_names() no longer exists in recent scikit-learn releases; prefer its
    # replacement and fall back for old installations. Either way labels stays a list.
    if hasattr(vectorizer, 'get_feature_names_out'):
        labels = vectorizer.get_feature_names_out().tolist()
    else:
        labels = vectorizer.get_feature_names()

    return X, labels

def co_occurence_matrix(word_occurence_matrix):
    '''
    Derives a words x words co-occurrence matrix from a messages x words count/occurrence matrix.

    Each message acts as the context for every word in it: the result is the product of the
    transposed input with the input, with the diagonal reset to zero so that a word is not
    paired with itself.
    '''
    occurrences = word_occurence_matrix
    pair_counts = occurrences.T @ occurrences
    np.fill_diagonal(pair_counts, 0) #discard word-with-itself entries
    return pair_counts

def score_ngrams(text, freq_filter=1, score_metric=BigramAssocMeasures().pmi):
    '''
    Score bigrams using score_metric. Defaults to PMI.

    Parameters:
        text: iterable of messages (strings); each one is split on whitespace.
        freq_filter: value handed to the finder's apply_freq_filter before scoring.
        score_metric: association measure handed to the finder's score_ngrams.

    Returns whatever finder.score_ngrams(score_metric) produces.
    '''
    # Keep every message as its own document so that the last word of one message is
    # never paired with the first word of the next one (flattening everything into a
    # single word stream would create those spurious bigrams).
    documents = [msg.split() for msg in text]
    finder = BigramCollocationFinder.from_documents(documents)
    finder.apply_freq_filter(freq_filter)
    return finder.score_ngrams(score_metric)

### Dados

In [4]:
comments = pd.read_csv('../comentarios_sorted_votes.csv')

# Read the stop-word list inside a context manager so the file handle gets closed.
with open('stopwords.txt') as stop_words_file:
    stop_words = [word.rstrip() for word in stop_words_file]

#Preprocesses text
preprocessor = CountVectorizer(strip_accents='unicode').build_preprocessor() #lowercase and strip accents
stop_words = [preprocessor(word) for word in stop_words]
stop_word_set = set(stop_words) #fast membership tests; stop_words itself stays a list for later calls
tokenizer = RegexpTokenizer(r'\w+') #built once instead of once per comment

def clean_comment(msg):
    '''Lowercases and strips accents, tokenizes, drops stop words and joins the remaining tokens with single spaces.'''
    tokens = tokenizer.tokenize(preprocessor(msg))
    return ' '.join(word for word in tokens if word not in stop_word_set)

# na_action='ignore' leaves missing comments as NaN instead of passing them to the
# preprocessor, so the dropna() below is what removes them.
comments['text'] = comments['text'].map(clean_comment, na_action='ignore')

text = comments['text'].dropna()

N = 30 #Consider only the N most frequent words

### 1. Co-ocorrência 

#### 1.1. Co-ocorrência com uma palavra pré-definida

In [5]:
# Word of interest: word_occurence_matrix keeps only the comments that contain it
target = 'cloroquina'

X, labels = word_occurence_matrix(text, stop_words=stop_words, target=target)

# X is binary here, so the column sums give, for each word, the number of kept comments it appears in
comments_per_word = X.sum(axis=0)
word_count = list(zip(labels, comments_per_word))
word_count.sort(key=lambda pair: pair[1], reverse=True) #most frequent first
header = f'{N} palavras que mais ocorrem nos mesmos comentários em que "{target}" ocorre:\n'
print(header)
print(word_count[:N])

30 palavras que mais ocorrem nos mesmos comentários em que "cloroquina" ocorre:

[('cloroquina', 615), ('hidroxicloroquina', 497), ('ivermectina', 360), ('covid', 209), ('azitromicina', 163), ('medicos', 154), ('tratamento', 153), ('dr', 151), ('sobre', 149), ('uso', 147), ('pessoas', 145), ('pra', 142), ('estudo', 138), ('19', 135), ('medico', 134), ('pode', 119), ('todos', 114), ('bem', 108), ('contra', 106), ('dra', 103), ('tomar', 103), ('agora', 102), ('protocolo', 99), ('https', 96), ('porque', 96), ('sim', 95), ('sintomas', 92), ('saude', 91), ('virus', 91), ('pois', 90)]


#### 1.2. N-gramas

Considerações:

- PMI: favorece n-gramas raros. Muitos n-gramas ficam empatados com a mesma score (a mais alta).

#### PMI

In [6]:
# Rank bigrams by pointwise mutual information, using a frequency filter of 30
pmi = BigramAssocMeasures().pmi
print(f'{N} com score mais alta:')
pmi_scores = score_ngrams(text, freq_filter=30, score_metric=pmi)
pmi_scores[:N]

30 com score mais alta:


[(('democraticos', 'solidarios'), 14.776091834429147),
 (('98200', '1348'), 14.650560952345288),
 (('roubos', 'assedios'), 14.535083734925351),
 (('351969602263', 'valdemir'), 14.092565499224401),
 (('watsapp', '351969602263'), 14.065598451624133),
 (('pet', 'shops'), 13.748336168031768),
 (('joaquim', 'inacio'), 13.73620904325611),
 (('debaixo', 'cuberta'), 13.59166726329172),
 (('cestas', 'basicas'), 13.419606517505416),
 (('albert', 'dickson'), 13.386083129641825),
 (('121', '150'), 13.293008947727204),
 (('irmaos', 'irmas'), 13.269739168404357),
 (('sars', 'cov'), 13.226866590935995),
 (('aplicativo', 'protetor'), 13.21167671111208),
 (('botao', 'alerta'), 13.187160431505044),
 (('inimigo', 'invisivel'), 13.18553531723477),
 (('bla', 'bla'), 13.180160669201733),
 (('congestao', 'nasal'), 13.142883752420245),
 (('carol', 'bandeira'), 13.10799553979401),
 (('evite', 'roubos'), 13.055915898226793),
 (('lojas', 'agropecuarias'), 13.023155283351159),
 (('atila', 'iamarino'), 12.99680651

#### likelihood ratio

In [7]:
# Rank bigrams by likelihood ratio, using a frequency filter of 3
lr = BigramAssocMeasures().likelihood_ratio
print(f'{N} com score mais alta:')
lr_scores = score_ngrams(text, freq_filter=3, score_metric=lr)
lr_scores[:N]

30 com score mais alta:


[(('covid', '19'), 16892.947470233976),
 (('falta', 'ar'), 16186.704051448698),
 (('olfato', 'paladar'), 14115.073150518603),
 (('gracas', 'deus'), 12369.146510580387),
 (('dor', 'cabeca'), 11614.437174829865),
 (('deus', 'abencoe'), 11288.143898218037),
 (('youtu', 'be'), 11039.053082917235),
 (('dra', 'lucy'), 10155.46757782443),
 (('dr', 'drauzio'), 10016.641748516702),
 (('https', 'youtu'), 8958.369513217573),
 (('sentir', 'cheiro'), 8539.500821771626),
 (('15', 'dias'), 8298.801986814038),
 (('https', 'www'), 8007.512202706735),
 (('lucy', 'kerr'), 7973.473043179992),
 (('3', 'dias'), 7645.367432256628),
 (('boa', 'noite'), 6445.995892016467),
 (('cheiro', 'gosto'), 6187.1150865093),
 (('www', 'youtube'), 6153.505077922475),
 (('corona', 'virus'), 5224.399715203055),
 (('sinto', 'cheiro'), 4572.425125205145),
 (('vitamina', 'c'), 4523.5120698652),
 (('bom', 'dia'), 4473.085106279236),
 (('3', 'comprimidos'), 4406.6050363202885),
 (('lair', 'ribeiro'), 4213.562386985365),
 (('todo'