In [1]:
import pandas as pd
import math
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [2]:
# Obtain the SparkContext through getOrCreate() so this cell can be re-run:
# calling SparkContext(...) a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = SparkSession.builder.appName("pedroramos").getOrCreate().sparkContext
sc

In [3]:
# Load the corpus as an RDD of (key, content) records.
# The same RDD feeds count() and several independent pipelines below, so it is
# cached to avoid re-reading the sequence file for every action.
rdd = sc.sequenceFile("part-00000").cache()

# Total number of records (documents); used as the numerator of the IDF.
N_documentos = rdd.count()

In [4]:
# Document-frequency band used by filtra_doc: a word is kept only when the
# number of documents containing it lies strictly between these two bounds.
DOC_COUNT_MIN = 10                  # absolute lower bound (documents)
DOC_COUNT_MAX = 0.7 * N_documentos  # upper bound: 70% of the corpus

In [5]:
def conta_documento(item):
    """Emit one ``(word, 1)`` pair per distinct word of a document.

    Words are lower-cased *before* de-duplication, so case variants of the
    same word ("Gol", "gol", "GOL") contribute a single pair per document.
    Summing the pairs by key therefore yields the number of documents each
    word occurs in.

    Args:
        item: a ``(key, content)`` record; only ``content`` (``item[1]``) is
            read. It is split on whitespace.

    Returns:
        A list of ``(lowercase_word, 1)`` tuples with no repeated word, in
        arbitrary order. Empty when the content has no words.
    """
    conteudo = item[1]
    # Normalise case first, then de-duplicate with a set.
    palavras = {palavra.lower() for palavra in conteudo.strip().split()}
    return [(palavra, 1) for palavra in palavras]

def calcula_idf(item):
    """Convert a ``(word, document_count)`` pair into ``(word, idf)``.

    The idf is ``log10(N_documentos / document_count)``, where
    ``N_documentos`` is the module-level corpus size.
    """
    termo, n_docs = item
    return (termo, math.log10(N_documentos / n_docs))

def filtra_doc(item):
    """Return True when the pair's count is strictly inside the allowed band.

    ``item[1]`` is compared against the module-level ``DOC_COUNT_MIN`` and
    ``DOC_COUNT_MAX``; both bounds are exclusive.
    """
    n_docs = item[1]
    if not (n_docs < DOC_COUNT_MAX):
        return False
    return n_docs > DOC_COUNT_MIN

# Build the (word, idf) table:
#   1. one (word, 1) pair per document containing the word,
#   2. sum the pairs to get each word's document count,
#   3. keep only words whose count passes filtra_doc,
#   4. convert the count into log10(N_documentos / count).
rdd_idf = (
    rdd.flatMap(conta_documento)
    .reduceByKey(lambda a, b: a + b)
    .filter(filtra_doc)
    .map(calcula_idf)
)

In [6]:
def conta_palavra(item):
    """Emit a ``(word, 1)`` pair for every word occurrence in a document.

    Unlike a per-document count, repeated words are NOT collapsed: each
    occurrence yields its own pair, lower-cased, in reading order.

    Args:
        item: a ``(key, content)`` record; only ``item[1]`` is read.

    Returns:
        List of ``(lowercase_word, 1)`` tuples, one per whitespace-separated
        token of the content.
    """
    pares = []
    for token in item[1].strip().split():
        pares.append((token.lower(), 1))
    return pares

def calcula_freq(item):
    """Convert a ``(word, occurrence_count)`` pair into ``(word, freq)``.

    The frequency is the dampened count ``log10(1 + occurrence_count)``.
    """
    termo, ocorrencias = item
    return (termo, math.log10(1 + ocorrencias))

def _freq_por_time(termo):
    """Build the ``(word, freq)`` RDD for records whose key contains ``termo``.

    Records are selected by a substring test on the record key (``x[0]``);
    their words are counted per occurrence and turned into log-frequencies
    with ``calcula_freq``.
    """
    return (
        rdd.filter(lambda x: termo in x[0])
        .flatMap(conta_palavra)
        .reduceByKey(lambda x, y: x + y)
        .map(calcula_freq)
    )


rdd_freq_fla = _freq_por_time("flamengo")
rdd_freq_bot = _freq_por_time("botafogo")

# Words present in BOTH corpora.
# RDD.intersection() compares whole (word, freq) tuples, so it would only keep
# words whose frequency happens to be identical on both sides. Joining by key
# keeps every shared word; the smaller of the two frequencies is used as the
# shared value (it equals the original value whenever both sides agree).
rdd_freq = rdd_freq_fla.join(rdd_freq_bot) \
    .mapValues(lambda freqs: min(freqs))

In [7]:
# Relevance of each shared word = freq * idf.
# join() yields (word, (freq, idf)); words absent from rdd_idf are dropped by
# the inner join.
rdd_relevancia = rdd_freq.join(rdd_idf).mapValues(
    lambda freq_idf: freq_idf[0] * freq_idf[1]
)

In [8]:
# Bring the 100 highest-relevance words to the driver (ordering by the negated
# score gives descending order) and dump them as "word : score" lines.
top_relevancia = rdd_relevancia.takeOrdered(100, key=lambda par: -par[1])
with open("top100_intersection.txt", "w") as saida:
    saida.writelines(f"{line[0]} : {line[1]}\n" for line in top_relevancia)

In [None]:
def _escreve_ranking(caminho, ranking):
    """Write ``(word, score)`` pairs to ``caminho`` as "word : score" lines."""
    with open(caminho, "w") as arquivo:
        for palavra, relevancia in ranking:
            arquivo.write(f"{palavra} : {relevancia}\n")


# Words exclusive to each corpus: drop every word whose key is in rdd_freq.
rdd_freq_flaOnly = rdd_freq_fla.subtractByKey(rdd_freq)
rdd_freq_botOnly = rdd_freq_bot.subtractByKey(rdd_freq)

# Flamengo-only ranking (relevance = freq * idf, top 100 descending).
rdd_relevanciaFLA = rdd_freq_flaOnly.join(rdd_idf) \
    .map(lambda x: (x[0], x[1][0] * x[1][1]))
top_relevanciaFLA = rdd_relevanciaFLA.takeOrdered(100, key=lambda x: -x[1])
# Write this cell's own ranking, not top_relevancia (the intersection ranking).
_escreve_ranking("top100_FLA.txt", top_relevanciaFLA)

# Botafogo-only ranking, same procedure.
rdd_relevanciaBOT = rdd_freq_botOnly.join(rdd_idf) \
    .map(lambda x: (x[0], x[1][0] * x[1][1]))
top_relevanciaBOT = rdd_relevanciaBOT.takeOrdered(100, key=lambda x: -x[1])
_escreve_ranking("top100_BOT.txt", top_relevanciaBOT)