## Co-occurrence graphs
This notebook shows how to build a word co-occurrence graph from a text. For a full description of the approach, see [this paper](https://arxiv.org/pdf/2010.06710.pdf).

In [40]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

import numpy as np
import networkx as nx


In [2]:
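#nltk.download('all')

#Uncomment the line above and run it once to download the NLTK data, then comment it out again.
#If you don't want to download everything, these are the specific resources this notebook uses:
#- english stopwords: nltk.download('stopwords')
#- tokenizer models for word_tokenize (e.g. 'punkt'; the exact name depends on the NLTK version)
#- POS tagger model for nltk.pos_tag (e.g. 'averaged_perceptron_tagger'; the exact name depends on the NLTK version)
#- WordNet for the lemmatizer: nltk.download('wordnet')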

In [129]:
#Module-level lookup tables shared by the helper functions defined below.

#English stopword list taken from the NLTK stopwords corpus.
stop_list = stopwords.words('english')

#First letter of a POS tag -> WordNet POS constant.
#Letters that are not listed here are handled as nouns by wordnet_pos.
pos_to_wordnet = dict(
    J=wordnet.ADJ,
    V=wordnet.VERB,
    R=wordnet.ADV,
)

def tokenize(string_text):
    """Split a raw string into tokens using NLTK's ``word_tokenize``.

    Args:
        string_text: The text to tokenize.

    Returns:
        The token sequence produced by ``word_tokenize`` for ``string_text``.
    """
    tokens = word_tokenize(string_text)
    return tokens

def remove_stopwords(tokenized_text):
    """Drop every token that is found in the module-level ``stop_list``.

    The comparison is an exact match, so it is case-sensitive.

    Args:
        tokenized_text: Iterable of tokens.

    Returns:
        A new list holding the remaining tokens in their original order.
    """
    kept = []
    for token in tokenized_text:
        if token in stop_list:
            continue
        kept.append(token)
    return kept

def wordnet_pos(tokenized_text):
    """Return one WordNet POS constant per token.

    The tokens are tagged with ``nltk.pos_tag``. The first character of each
    tag is looked up in ``pos_to_wordnet``; any letter that is not a key there
    falls back to ``wordnet.NOUN``.

    Args:
        tokenized_text: Sequence of tokens to tag.

    Returns:
        A list of WordNet POS constants, aligned with the tagged tokens.
    """
    return [
        pos_to_wordnet.get(tag[0], wordnet.NOUN)
        for _, tag in nltk.pos_tag(tokenized_text)
    ]

def lemmatizer(tokenized_text_no_sw, pos_list):
    """Lemmatize each token using the WordNet POS constant paired with it.

    Args:
        tokenized_text_no_sw: Sequence of tokens to lemmatize.
        pos_list: Sequence of WordNet POS constants, one per token.

    Returns:
        A list with the lemma of every (token, POS) pair. The two inputs are
        paired with ``zip``, so if they differ in length the extra items of
        the longer one are ignored.
    """
    # Build the lemmatizer once and reuse it instead of creating one per token.
    wordnet_lemmatizer = WordNetLemmatizer()
    return [
        wordnet_lemmatizer.lemmatize(word, tag)
        for word, tag in zip(tokenized_text_no_sw, pos_list)
    ]

def preprocess_text(string_text):
    """Lower-case, tokenize, POS-tag and lemmatize a raw string.

    Args:
        string_text: The raw text to process.

    Returns:
        A ``(lemmas, pos_tags)`` pair: the lemmatized tokens and the WordNet
        POS constant that was used for each of them.
    """
    tokens = tokenize(string_text.lower())
    # Stopword removal (remove_stopwords) is not applied here: every token
    # goes on to tagging and lemmatization.
    pos_tags = wordnet_pos(tokens)
    return lemmatizer(tokens, pos_tags), pos_tags

In [134]:
class CNetwork(object):
    """Word co-occurrence network built from a list of tokens.

    Every distinct token becomes a node, identified by an integer index, and
    a token is linked to each distinct token found within ``window``
    positions to its left or right. The text is padded with ``window`` copies
    of ``"<PAD>"`` on both sides, so ``"<PAD>"`` is part of the vocabulary
    and appears as a node: tokens close to the start or end of the text get
    an edge to it. A token repeated inside its own window is linked to itself.

    Attributes:
        text: The padded token list.
        window: Number of positions considered on each side of a token.
        vocabulary: Set of the distinct tokens in the padded text.
        vocabulary_index: Dict mapping every token to its node index.
        G: Graph built by ``networkx`` from the co-occurrence matrix.
    """

    def __init__(self, text, window):
        """Pad ``text``, index its vocabulary and build the graph.

        Args:
            text: List of tokens (it is concatenated with the padding list).
            window: Number of neighbours taken on each side of a token.
        """
        padding = ["<PAD>"] * window

        self.text = padding + text + padding
        self.window = window
        self.vocabulary = set(self.text)
        # Number the tokens by order of first appearance so that node indices
        # are reproducible between runs; iterating a set of strings is not.
        self.vocabulary_index = {
            token: i for i, token in enumerate(dict.fromkeys(self.text))
        }
        self.G, self.vocabulary_index = self.get_network()

    def get_network(self):
        """Build the co-occurrence matrix and turn it into a graph.

        Returns:
            A ``(G, vocabulary_index)`` pair: the graph created from the
            matrix and the token-to-index dict used to fill it.
        """
        size = len(self.vocabulary)
        matrix = np.zeros((size, size))
        # Only the real tokens are used as centre words; the padding at both
        # ends is skipped but can still show up as a neighbour.
        first = self.window
        last = len(self.text) - self.window
        for index in range(first, last):
            index_central_word = self.vocabulary_index[self.text[index]]
            left_neighbors = self.text[index - self.window:index]
            # The upper bound is exclusive, hence the +1: the right side must
            # span ``window`` tokens exactly like the left side does.
            right_neighbors = self.text[index + 1:index + self.window + 1]
            for neighbor in set(left_neighbors + right_neighbors):
                matrix[index_central_word, self.vocabulary_index[neighbor]] = 1
        # nx.from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array
        # is its replacement.
        G = nx.from_numpy_array(matrix)
        return G, self.vocabulary_index

    def words_to_index(self, word):
        """Return the node index of ``word``.

        Raises:
            KeyError: If ``word`` is not in the vocabulary.
        """
        return self.vocabulary_index[word]

    def get_degree(self, word):
        """Return the degree of the node that represents ``word``.

        Raises:
            KeyError: If ``word`` is not in the vocabulary.
        """
        return self.G.degree(self.words_to_index(word))
        
        

In [135]:
# Demo: preprocess a sample sentence into lemmas (t) and their WordNet POS
# constants (pos), build a co-occurrence network with a window of 2 and print
# the degree of the node for the token 'graph'.
# Note: the "<PAD>" padding token is a node of the network too, so tokens near
# the start or end of the text have an edge to it that counts in their degree.
t, pos = preprocess_text("this is a test to create a cooccurrences graph")
n = CNetwork(t,2).get_degree('graph')
print(n)

3
