In [1]:
import math

from pyspark import SparkConf, SparkContext

In [2]:
# Reuse the SparkContext that is already running when this cell is executed
# again: calling the SparkContext constructor a second time in the same kernel
# raises "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setAppName("pedroramos"))
sc

In [3]:
# Pair RDD read from the sequence file; the code below treats each element as
# (key, content): the key is matched against team names and the content is
# tokenised into words.
# The RDD feeds count() plus every pipeline below, so keep it cached instead of
# re-reading the file for each action.
rdd = sc.sequenceFile("part-00000").cache()

# Total number of documents, used as the numerator of the IDF.
N_documentos = rdd.count()

In [4]:
# Bounds on the number of documents a word may appear in to be kept by
# filtra_doc (both bounds are exclusive there).
MAX_DOC_FRACTION = 0.7  # upper bound expressed as a share of the corpus

DOC_COUNT_MIN = 10
DOC_COUNT_MAX = N_documentos * MAX_DOC_FRACTION

In [5]:
def limpa_conteudo(conteudo):
    """Return `conteudo` with each listed punctuation/symbol character replaced by a space.

    Characters are replaced one-for-one, so the length of the text is preserved
    and words that were glued together by a symbol become separate tokens.
    """
    simbolos = (
        "!.,:@#$%/\\|´`"
        "*&()[]}{+-<>?°=\""
        "_';^~¨"
    )
    # One translation table handles every symbol in a single pass over the text.
    tabela = str.maketrans({simbolo: " " for simbolo in simbolos})
    return conteudo.translate(tabela)

def conta_documento(item):
    """Emit one (word, 1) pair for every distinct word of a document.

    Args:
        item: a (key, content) pair; only the content (item[1]) is used.

    Returns:
        A list of (lower-cased word, 1) tuples with each word appearing at most
        once, so that summing the ones by key gives the number of documents
        that contain the word.
    """
    conteudo = limpa_conteudo(item[1])
    # Lower-case BEFORE deduplicating: otherwise "Gol" and "gol" in the same
    # document would both survive the set and be counted as two documents.
    palavras_unicas = {palavra.lower() for palavra in conteudo.split()}
    return [(palavra, 1) for palavra in palavras_unicas]

def calcula_idf(item):
    """Turn a (word, document count) pair into (word, idf).

    The idf is log10(N_documentos / document count), where N_documentos is the
    module-level total number of documents.
    """
    termo, n_docs_com_termo = item
    return (termo, math.log10(N_documentos / n_docs_com_termo))

def filtra_doc(item):
    """Tell whether a (word, document count) pair has an acceptable count.

    Returns True only when the count lies strictly between DOC_COUNT_MIN and
    DOC_COUNT_MAX.
    """
    n_docs = item[1]
    return DOC_COUNT_MIN < n_docs < DOC_COUNT_MAX

# (word, idf) for every word whose document count passes filtra_doc.
# Step 1: number of documents containing each word.
contagem_por_documento = rdd.flatMap(conta_documento).reduceByKey(lambda a, b: a + b)
# Step 2: drop words outside the document-count bounds, then convert the
# surviving counts to log10(N_documentos / count).
rdd_idf = contagem_por_documento.filter(filtra_doc).map(calcula_idf)

In [6]:
def conta_palavra(item):
    """Emit one (word, 1) pair for every word occurrence in a document.

    Args:
        item: a (key, content) pair; only the content (item[1]) is used.

    Returns:
        A list of (lower-cased word, 1) tuples, repeated words included, in the
        order they occur in the cleaned content.
    """
    texto_limpo = limpa_conteudo(item[1])
    # split() without arguments already ignores leading/trailing whitespace.
    return [(token.lower(), 1) for token in texto_limpo.split()]

def calcula_freq(item):
    """Turn a (word, occurrence count) pair into (word, log10(1 + count))."""
    termo, ocorrencias = item
    return (termo, math.log10(1 + ocorrencias))

def _conta_palavras_por_time(nome_time):
    """Return (word, occurrence count) over the documents whose key contains `nome_time`."""
    # presumably the key (x[0]) is a string such as a path or URL that names
    # the team -- TODO confirm against the input data
    return (
        rdd
        .filter(lambda x: nome_time in x[0])
        .flatMap(conta_palavra)
        .reduceByKey(lambda x, y: x + y)
    )


rdd_contagem_fla = _conta_palavras_por_time("flamengo")
rdd_contagem_flu = _conta_palavras_por_time("fluminense")

# (word, log10(1 + count)) for each team's documents.
rdd_freq_fla = rdd_contagem_fla.map(calcula_freq)
rdd_freq_flu = rdd_contagem_flu.map(calcula_freq)

# Words that occur in BOTH teams' documents.
# RDD.intersection compares whole (word, freq) tuples, so it would discard any
# shared word whose frequency differs between the two teams. Join on the word
# instead; the frequency of a shared word is taken over its combined number of
# occurrences in both sets of documents.
rdd_freq = (
    rdd_contagem_fla
    .join(rdd_contagem_flu)
    .mapValues(lambda contagens: contagens[0] + contagens[1])
    .map(calcula_freq)
)

In [7]:
# Relevance of every word in rdd_freq that also has an idf: the join yields
# (word, (freq, idf)) and the map collapses it to (word, freq * idf).
rdd_relevancia = (
    rdd_freq
    .join(rdd_idf)
    .map(lambda par: (par[0], par[1][0] * par[1][1]))
)

In [8]:
# Bring the 100 (word, relevance) pairs with the highest relevance to the
# driver; negating the score makes takeOrdered sort in descending order.
top_relevancia = rdd_relevancia.takeOrdered(
    100,
    key=lambda par: -par[1],
)

In [9]:
def _relevancia(rdd_frequencias):
    """Join (word, freq) pairs with rdd_idf and return (word, freq * idf) pairs."""
    return (
        rdd_frequencias
        .join(rdd_idf)
        .map(lambda x: (x[0], x[1][0] * x[1][1]))
    )


# Words exclusive to one team: remove every word that also appears for the
# other team. Subtracting rdd_freq is not enough when rdd_freq was built with a
# tuple-wise intersection, because shared words with different frequencies are
# missing from it and would leak into the "only" sets.
rdd_freq_flaOnly = rdd_freq_fla.subtractByKey(rdd_freq_flu)
rdd_freq_fluOnly = rdd_freq_flu.subtractByKey(rdd_freq_fla)

rdd_relevanciaFLA = _relevancia(rdd_freq_flaOnly)
top_relevanciaFLA = rdd_relevanciaFLA.takeOrdered(100, key=lambda x: -x[1])

rdd_relevanciaFLU = _relevancia(rdd_freq_fluOnly)
top_relevanciaFLU = rdd_relevanciaFLU.takeOrdered(100, key=lambda x: -x[1])


In [10]:
# Write each ranking to its own UTF-8 text file, one "word : relevance" line
# per entry, in ranking order.
rankings_por_arquivo = {
    "top100_intersection.txt": top_relevancia,
    "top100_FLA.txt": top_relevanciaFLA,
    "top100_FLU.txt": top_relevanciaFLU,
}

for nome_arquivo, ranking in rankings_por_arquivo.items():
    with open(nome_arquivo, "w", encoding="utf-8") as destino:
        destino.writelines(f"{par[0]} : {par[1]}\n" for par in ranking)