In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [85]:
def get_unique_words(text):
    words = [word for message in text for word in message]
    return set(words)

def co_occurence_matrix(text, window_size, stop_words):
    unique_words = get_unique_words(text)
    n = len(unique_words) #number of unique words
    co_matrix = pd.DataFrame(data=np.zeros([n,n]), columns=unique_words, index=unique_words, dtype='int')

    
    for message in text:
        msg_len = len(message)
        for i, word in enumerate(message):
            first = max(i - window_size, 0)
            last = min(i + window_size + 1, msg_len)
            for context in message[first:last]:
                co_matrix.loc[word,context] += 1
                #co_matrix.loc[context,word] += 1
    return co_matrix        
    
def co_occurence_in_comments_list(text, target, stop_words=None):
    #Filter comments in which target word is present
    text = text[text.str.contains(target)]
    
    preprocessor = CountVectorizer(strip_accents='unicode').build_preprocessor()
    target = preprocessor(target) #preprocesses target word
    stop_words = [preprocessor(word) for word in stop_words] #preprocesses stop words
        
    #calculates word count for each message
    vectorizer = CountVectorizer(strip_accents='unicode', stop_words=stop_words, binary=True)
    X = vectorizer.fit_transform(text).toarray()
       
    
    comments_count = X.sum(axis=0) #no. of comments containing A in which each word appears
    
    return list(zip(vectorizer.get_feature_names(), comments_count))
    
#word2vec?

In [71]:
data = pd.read_csv('../comments.csv', usecols=['text'])['text'].dropna()
stop_words = [word.rstrip() for word in open('stopwords.txt')]
#co_occurence_matrix(data, window_size=2, stop_words=0)

In [93]:
co_list = co_occurence_in_comments_list(data, 'virus', stop_words=stop_words)
co_list = sorted(co_list, reverse=True, key=lambda tupl : tupl[1])

In [94]:
co_list[:30]

[('virus', 1017),
 ('coronavirus', 638),
 ('pessoas', 307),
 ('pra', 222),
 ('pode', 220),
 ('vai', 195),
 ('corona', 189),
 ('ainda', 179),
 ('sobre', 160),
 ('atila', 143),
 ('agora', 140),
 ('brasil', 136),
 ('casa', 133),
 ('aqui', 118),
 ('todos', 117),
 ('video', 114),
 ('vc', 113),
 ('sintomas', 112),
 ('sim', 110),
 ('pois', 108),
 ('fazer', 107),
 ('gente', 107),
 ('bem', 106),
 ('ai', 104),
 ('gripe', 104),
 ('doenca', 102),
 ('entao', 102),
 ('todo', 102),
 ('ficar', 101),
 ('caso', 100)]