In [1]:
import math
from pyspark import SparkContext

In [2]:
sc = SparkContext(appName="flaflu")
sc

In [3]:
rdd = sc.sequenceFile("part-00000")
N_documentos = rdd.count()

In [4]:
DOC_COUNT_MIN = 10
DOC_COUNT_MAX = N_documentos * 0.7

In [5]:
def limpa_conteudo(conteudo):
    to_remove = ['!', '.', ',', ':', '@', '#', '$', '%', '/', '\\', '|', '´', '`',
                 '*', '&', '(', ')', '[', ']', '}', '{', '+', '-', '<', '>', '?', '°', '=', '"', 
                 '_', "'", ';', '^', '~', '¨']
    for i in to_remove:
        conteudo = conteudo.replace(i, ' ')
    return conteudo

def conta_documento(item):
    conteudo = limpa_conteudo(item[1])
    palavras = conteudo.strip().split()
    return [(i.lower(), 1) for i in set(palavras)]

def calcula_idf(item):
    palavra, contagem = item
    idf = math.log10(N_documentos/contagem)
    return (palavra, idf)

def filtra_doc(item):
    contagem = item[1]
    return (contagem < DOC_COUNT_MAX) and (contagem > DOC_COUNT_MIN) 

rdd_idf = rdd \
    .flatMap(conta_documento) \
    .reduceByKey(lambda x,y: x+y) \
    .filter(filtra_doc) \
    .map(lambda x: (x[0], math.log10(N_documentos/x[1])))

In [6]:
rdd.take(1)[0][0]

'https://sebodomessias.com.br/suas_compras.aspx?retirar=0&action=1&idItem=2023136'

In [7]:
def conta_palavra(item):
    conteudo = limpa_conteudo(item[1])
    palavras = conteudo.strip().split()
    return [(i.lower(), 1) for i in palavras]

def calcula_freq(item):
    palavra, contagem = item
    freq = math.log10(1 + contagem)
    return (palavra, freq)

rdd_freq_fla = rdd \
    .filter(lambda x: "flamengo" in x[1]) \
    .flatMap(conta_palavra) \
    .reduceByKey(lambda x,y: x+y) \
    .map(calcula_freq)

rdd_freq_flu = rdd \
    .filter(lambda x: "fluminense" in x[1]) \
    .flatMap(conta_palavra) \
    .reduceByKey(lambda x,y: x+y) \
    .map(calcula_freq)

rdd_freq = rdd_freq_fla.intersection(rdd_freq_flu)

In [8]:
rdd_relevancia = rdd_freq.join(rdd_idf) \
    .map(lambda x: (x[0], x[1][0] * x[1][1]))

In [9]:
top_relevancia = rdd_relevancia.takeOrdered(100, key=lambda x: -x[1])

In [12]:
# import pandas as pd
# df = pd.DataFrame(top_relevancia, columns=['Palavra', 'Relevancia'])
# df.to_csv("teste.csv")

In [10]:
rdd_freq_flaOnly = rdd_freq_fla.subtractByKey(rdd_freq)
rdd_freq_fluOnly = rdd_freq_flu.subtractByKey(rdd_freq)

rdd_relevanciaFLA = rdd_freq_flaOnly.join(rdd_idf) \
    .map(lambda x: (x[0], x[1][0] * x[1][1]))
top_relevanciaFLA = rdd_relevanciaFLA.takeOrdered(100, key=lambda x: -x[1])


rdd_relevanciaFLU = rdd_freq_fluOnly.join(rdd_idf) \
    .map(lambda x: (x[0], x[1][0] * x[1][1]))
top_relevanciaFLU = rdd_relevanciaFLU.takeOrdered(100, key=lambda x: -x[1])


In [13]:
top_relevancia

[('pituaçu', 5.435292942683553),
 ('061', 3.674464476978498),
 ('021', 3.405110010963132),
 ('0711', 3.36961840547164),
 ('0614', 3.35560460164042),
 ('072', 3.3155444830412844),
 ('052', 3.2801423234298244),
 ('2714', 3.1291047257097606),
 ('291', 3.1275183996670086),
 ('031', 3.1013188496774258),
 ('042', 3.0773517307512783),
 ('251', 3.026218129529924),
 ('023', 3.023886208371118),
 ('1214', 3.0153719889167245),
 ('1412', 2.9991180524801253),
 ('1614', 2.9991180524801253),
 ('0213', 2.971796837656449),
 ('011', 2.9682951119052885),
 ('0513', 2.939861798913858),
 ('067', 2.9260176835771636),
 ('301', 2.9169588307346626),
 ('201', 2.9116329225730095),
 ('033', 2.884468390038659),
 ('0412', 2.88328519821178),
 ('2920', 2.88328519821178),
 ('022', 2.8588417183282977),
 ('0722', 2.8579633368629933),
 ('2513', 2.8579633368629933),
 ('0512', 2.8579633368629933),
 ('2214', 2.8342763174843264),
 ('232', 2.8271130646854528),
 ('0912', 2.8120257829744135),
 ('2612', 2.791047421022259),
 ('047'

In [13]:
with open("top100_intersection.txt", "w", encoding="utf-8") as file:
    for line in top_relevancia:
        file.write(f"{line[0]} : {line[1]}\n")

with open("top100_FLA.txt", "w", encoding="utf-8") as file:
    for line in top_relevanciaFLA:
        file.write(f"{line[0]} : {line[1]}\n")

with open("top100_FLU.txt", "w", encoding="utf-8") as file:
    for line in top_relevanciaFLU:
        file.write(f"{line[0]} : {line[1]}\n")