In [2]:
import spacy
import pandas as pd
from spacy import displacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
import requests, json

# Load the spaCy pipeline registered under the 'pt' shortcut name.
# NOTE(review): `nlp` is not referenced by any cell visible in this part of
# the notebook — confirm it is still needed before keeping the (slow) load.
nlp = spacy.load('pt')

In [2]:
# Questions and answers are stored in two separate CSV files; the answers'
# 'resposta' column is attached to the questions frame (pandas aligns the
# column on the row index).
# NOTE(review): presumably both files list the same items in the same order —
# confirm, since nothing here checks it.
answers = pd.read_csv('data/Raw/quinhentas_respostas.csv', sep=",", header=0)
corpus = pd.read_csv('data/Raw/quinhentas_perguntas.csv', sep=',', header=0).assign(
    resposta=answers['resposta']
)

In [75]:
# Imports and shared state for the parallel Wikipedia crawl defined in this cell.
from IPython.display import clear_output
from multiprocessing import Manager, Pool
import wikipedia
import time
# Reference point for the "Current runtime" progress line printed by searchRelation.
start_time = time.time()
from threading import RLock
# Held by searchRelation while it updates `graph` and prints progress.
# NOTE(review): this is a threading lock, but the work is dispatched to a
# multiprocessing Pool; each worker process would have its own copy, so it may
# not serialize anything across workers — confirm whether a Manager lock was intended.
lock = RLock()


def searchRelation(idx, terms, page, word, nonexistant, disambiguate = True):
    """Record which known terms are linked from a Wikipedia page.

    Fetches the page titled ``page``, lemmatizes every link title as a noun
    with the module-level ``lemmatizer`` and keeps the lemmas found in
    ``terms``. The resulting set is stored in the module-level ``graph``
    under the key ``word`` (replacing any previous entry for that key).

    Args:
        idx: Label printed in the progress line; not used otherwise.
        terms: Container supporting ``in`` that holds the known terms.
        page: Title handed to ``wikipedia.page``.
        word: Key used in ``graph``; also the value appended to
            ``nonexistant`` when the lookup fails.
        nonexistant: List-like collecting the words whose lookup failed.
        disambiguate: When true, a disambiguation error is retried once for
            each suggested option. The retries run with ``disambiguate=False``,
            so an option that is itself ambiguous is silently skipped.

    Returns:
        None. Results are written to ``graph`` and ``nonexistant``.
    """
    try:
        article = wikipedia.page(page)
        # Walk the pages linked from this one and keep those that are known terms.
        linked = set()
        for title in article.links:
            lemma = lemmatizer(title, "NOUN")[0]
            if lemma in terms:
                linked.add(lemma)
        # `with` guarantees the lock is released even if the update or print raises.
        with lock:
            graph.update({word: linked})
            clear_output()
            print(idx, "Current runtime: %.4f seconds"% (time.time() - start_time))
    except wikipedia.DisambiguationError as e:
        # Ambiguous title: try every suggested option once.
        # The retry must use this function's own signature; the previous call
        # passed an extra, possibly unbound, local and therefore always raised.
        # NOTE(review): each successful option overwrites graph[word], so the
        # last option processed wins — confirm whether a union was intended.
        if disambiguate:
            for option in e.options:
                searchRelation(idx, terms, option, word, nonexistant, False)
    except wikipedia.PageError:
        # No page with this title.
        # An automated attempt would be to use some dictionary to try "related"
        # words and have a score for them.
        nonexistant.append(word)
    except KeyError:
        nonexistant.append(word)
    except ConnectionError:
        nonexistant.append(word)
    
# Lemmatizer shared by every searchRelation call.
# NOTE(review): the tables come from spacy.lang.en while Wikipedia is queried
# in Portuguese below — confirm English rules are intended for these titles.
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
wikipedia.set_lang('pt')

# Term lists; the second column of each file is the one used as the term text.
terms = pd.read_csv('data/TreatedTerms/tfidfterms.csv')
bigrams = pd.read_csv('data/TreatedTerms/bigrams.csv')
trigrams = pd.read_csv('data/TreatedTerms/trigrams.csv')
bigramset = set(bigrams.iloc[:, 1])
trigramset = set(trigrams.iloc[:, 1])

# `terms` becomes the full work list: single terms, then bigrams, then trigrams.
terms = pd.concat([terms, bigrams], ignore_index=True)
terms = pd.concat([terms, trigrams], ignore_index=True)

# Vocabulary a linked page title must belong to in order to become an edge.
termset = set(terms.iloc[:, 1]) | trigramset | bigramset

# Manager-backed proxies so the pool workers and this kernel share the results.
manager = Manager()
graph = manager.dict()
termset = manager.dict({term: 0 for term in termset})
nonexistant = manager.list()

# One asynchronous task per row; the term is both the page title and the graph key.
# NOTE(review): the AsyncResult objects are never waited on or .get(), so worker
# errors stay hidden and later cells may read `graph` before the crawl finishes.
pool = Pool(50)
results = [
    pool.apply_async(searchRelation, (idx, termset, row[1], row[1], nonexistant))
    for idx, row in terms.iterrows()
]

#print("Final Runtime: %.4f seconds --- Finished!" % (time.time() - start_time))

1127 Current runtime: 63.9435 seconds


In [77]:
# Second pass over terms that failed the first crawl (see `print(nonexistant)`).
# NOTE(review): `newlist` appears to hold hand-corrected spellings of entries
# from `nonexistant` — confirm how it was produced.
newlist = ['enxerto', 'decumbens', 'aveia', 'inseminação', 'lactação', 'secreção', 'aglomeração', 'ração', 'ordenha', 'cisto folicular', 'quarto mamário', 'digestível total', 'inimigo natural', 'silo aéreo', 'adubo', 'dieta completa', 'reserva corporal', 'babesia', 'índice reprodutivo', 'exigência nutricional', 'Tobiatã', 'solo ácido', 'espécie arbórea']

# Vocabulary for the second pass: the original terms plus the corrected spellings.
newtermset = set(terms[terms.columns[1]])
newtermset = newtermset.union(trigramset)
newtermset = newtermset.union(bigramset)
newtermset = newtermset.union(set(newlist))
managedNewTermSet = manager.dict(dict.fromkeys(newtermset, 0))

# searchRelation takes (idx, terms, page, word, nonexistant). The previous call
# supplied only four arguments, so every task failed with a TypeError that was
# never surfaced, and it passed the old `termset` instead of the extended
# vocabulary built above. Each word is both the page title and the graph key,
# mirroring the first pass.
results = [pool.apply_async(searchRelation, args=(1, managedNewTermSet, word, word, nonexistant)) for word in newlist]

In [80]:
# Flatten the crawl graph into [from, to] pairs, dropping any key or linked
# term shorter than two characters, then save the edge list as CSV.
row_list = [
    [key, rel]
    for key in graph
    if len(key) >= 2
    for rel in graph[key]
    if len(rel) >= 2
]
result = pd.DataFrame(row_list, columns=['from', 'to'])
result.to_csv('data/Networks/Wikipedia/wikipediaTerms.csv')

In [3]:
# Imports and shared state for the parallel Wikipedia crawl defined in this cell.
from IPython.display import clear_output
from multiprocessing import Manager, Pool
import wikipedia
import time
# Reference point for the "Current runtime" progress line printed by searchRelation.
start_time = time.time()
from threading import RLock
# Held by searchRelation while it updates `graph` and prints progress.
# NOTE(review): this is a threading lock, but the work is dispatched to a
# multiprocessing Pool; each worker process would have its own copy, so it may
# not serialize anything across workers — confirm whether a Manager lock was intended.
lock = RLock()


def searchRelation(idx, terms, page, word, nonexistant, disambiguate = True):
    """Record which known terms are linked from a Wikipedia page.

    Fetches the page titled ``page``, lemmatizes every link title as a noun
    with the module-level ``lemmatizer`` and keeps the lemmas found in
    ``terms``. The resulting set is stored in the module-level ``graph``
    under the key ``word`` (replacing any previous entry for that key).

    Args:
        idx: Label printed in the progress line; not used otherwise.
        terms: Container supporting ``in`` that holds the known terms.
        page: Title handed to ``wikipedia.page``.
        word: Key used in ``graph``; also the value appended to
            ``nonexistant`` when the lookup fails.
        nonexistant: List-like collecting the words whose lookup failed.
        disambiguate: When true, a disambiguation error is retried once for
            each suggested option. The retries run with ``disambiguate=False``,
            so an option that is itself ambiguous is silently skipped.

    Returns:
        None. Results are written to ``graph`` and ``nonexistant``.
    """
    try:
        article = wikipedia.page(page)
        # Walk the pages linked from this one and keep those that are known terms.
        linked = set()
        for title in article.links:
            lemma = lemmatizer(title, "NOUN")[0]
            if lemma in terms:
                linked.add(lemma)
        # `with` guarantees the lock is released even if the update or print raises.
        with lock:
            graph.update({word: linked})
            clear_output()
            print(idx, "Current runtime: %.4f seconds"% (time.time() - start_time))
    except wikipedia.DisambiguationError as e:
        # Ambiguous title: try every suggested option once.
        # The retry must use this function's own signature; the previous call
        # passed an extra, possibly unbound, local and therefore always raised.
        # NOTE(review): each successful option overwrites graph[word], so the
        # last option processed wins — confirm whether a union was intended.
        if disambiguate:
            for option in e.options:
                searchRelation(idx, terms, option, word, nonexistant, False)
    except wikipedia.PageError:
        # No page with this title.
        # An automated attempt would be to use some dictionary to try "related"
        # words and have a score for them.
        nonexistant.append(word)
    except KeyError:
        nonexistant.append(word)
    except ConnectionError:
        nonexistant.append(word)
    
# Lemmatizer shared by every searchRelation call.
# NOTE(review): the tables come from spacy.lang.en while Wikipedia is queried
# in Portuguese below — confirm English rules are intended for these titles.
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
wikipedia.set_lang('pt')

# Term lists; the second column of each file is the one used as the term text.
terms = pd.read_csv('data/TreatedTerms/tfidfterms.csv')
questionterms = pd.read_csv('data/TreatedTerms/wordsFromQuestionsFiltered.csv')
bigrams = pd.read_csv('data/TreatedTerms/bigrams.csv')
trigrams = pd.read_csv('data/TreatedTerms/trigrams.csv')
qtermset = set(questionterms.iloc[:, 1])
bigramset = set(bigrams.iloc[:, 1])
trigramset = set(trigrams.iloc[:, 1])

# `terms` is the work list: single terms, then bigrams, then trigrams.
# The question terms only widen the vocabulary below; they are not crawled.
terms = pd.concat([terms, bigrams], ignore_index=True)
terms = pd.concat([terms, trigrams], ignore_index=True)

# Vocabulary a linked page title must belong to in order to become an edge.
termset = set(terms.iloc[:, 1]) | trigramset | bigramset | qtermset

# Manager-backed proxies so the pool workers and this kernel share the results.
manager = Manager()
graph = manager.dict()
termset = manager.dict({term: 0 for term in termset})
nonexistant = manager.list()

# One asynchronous task per row; the term is both the page title and the graph key.
# NOTE(review): the AsyncResult objects are never waited on or .get(), so worker
# errors stay hidden and later cells may read `graph` before the crawl finishes.
pool = Pool(50)
results = [
    pool.apply_async(searchRelation, (idx, termset, row[1], row[1], nonexistant))
    for idx, row in terms.iterrows()
]

#print("Final Runtime: %.4f seconds --- Finished!" % (time.time() - start_time))

1139 Current runtime: 62.9103 seconds


In [5]:
# Flatten the crawl graph into [from, to] pairs, dropping any key or linked
# term shorter than two characters, then save the edge list as CSV.
row_list = [
    [key, rel]
    for key in graph
    if len(key) >= 2
    for rel in graph[key]
    if len(rel) >= 2
]
result = pd.DataFrame(row_list, columns=['from', 'to'])
result.to_csv('data/Networks/Wikipedia/wikipediaTermsWithQuestionTerms.csv')

['instalaçõe', 'provado', 'identificar', 'inseminaçõe', 'aaveia', 'lactaçõe', 'secreçõe', 'alimentada', 'melhorar', 'essenciais', 'oideal', 'aglomeraçõe', 'substituir', 'ocorte', 'obter', 'apresentam', 'quais', 'raçõe', 'relacionado', 'tricross', 'envolvido', 'causada', 'sincronização', 'ordenhada', 'melhoria', 'devido', 'adistribuição', 'aplicar', 'orufião', 'revacinar', 'recomendaçõe', 'apresentar', 'degradada', 'sobreordenha', 'portanto', 'demora', 'cistos foliculares', 'quer dizer', 'polpa cítrica', 'cochos cobertos', 'reação inflamatória', 'digestíveis totais', 'quartos mamários', 'núcleo Moet', 'silos aéreos', 'adubo químico', 'desaleitamento precoce', 'reservas corporais', 'práticas agronômicas', 'dietas completas', 'abrigos individuais', 'valores genéticos', 'exame andrológico', 'agente causador', 'períodos prolongados', 'desmama precoce', 'dejeto líquido', 'seleção genômica', 'danos causados', 'ração concentrada', 'cocho coberto', 'mesma coisa', 'culturas anuais', 'estresse ca