In [1]:
import math
from pyspark import SparkContext

In [2]:
sc = SparkContext(appName="flaflu")
sc

In [3]:
rdd = sc.sequenceFile("part-00000")
N_documentos = rdd.count()

In [4]:
DOC_COUNT_MIN = 10
DOC_COUNT_MAX = N_documentos * 0.5

In [5]:
def conta_documento(item):
    conteudo = item[1]
    palavras = conteudo.strip().split()
    palavras_ = [i for i in palavras if i.isalpha()]
    palavras_filtradas = [i for i in palavras_ if len(i) > 3]
    return [(i.lower(), 1) for i in set(palavras_filtradas)]


def calcula_idf(item):
    palavra, contagem = item
    idf = math.log10(N_docs / contagem)
    return (palavra, idf)


def filtra_doc(item):
    contagem = item[1]
    return (contagem < DOC_COUNT_MAX) and (contagem > DOC_COUNT_MIN)

rdd_idf = rdd \
    .flatMap(conta_documento) \
    .reduceByKey(lambda x,y: x+y) \
    .filter(filtra_doc) \
    .map(lambda x: (x[0], math.log10(N_documentos/x[1])))

In [6]:
rdd_idf.takeOrdered(20, key=lambda x: -x[1])

[('russians', 3.516511335899341),
 ('jora', 3.516511335899341),
 ('betway', 3.516511335899341),
 ('unemployed', 3.516511335899341),
 ('millionaire', 3.516511335899341),
 ('desempenhadas', 3.516511335899341),
 ('optativas', 3.516511335899341),
 ('mock', 3.516511335899341),
 ('suspended', 3.516511335899341),
 ('doghero', 3.516511335899341),
 ('esperidião', 3.516511335899341),
 ('exercidos', 3.516511335899341),
 ('coorte', 3.516511335899341),
 ('sebastián', 3.516511335899341),
 ('frisando', 3.516511335899341),
 ('ende', 3.516511335899341),
 ('absolve', 3.516511335899341),
 ('homeopáticas', 3.516511335899341),
 ('conduite', 3.516511335899341),
 ('researchgate', 3.516511335899341)]

In [7]:
def conta_palavra(item):
    conteudo = item[1]
    palavras = conteudo.strip().split()
    palavras_ = [i for i in palavras if i.isalpha()]
    palavras_filtradas = [i for i in palavras_ if len(i) > 3]
    return [(i.lower(), 1) for i in palavras_filtradas]


def calcula_freq(item):
    palavra, contagem = item
    freq = math.log10(1 + contagem)
    return (palavra, freq)

def gera_rdd_freq(rdd, palavra):
    rdd_freq = (
        rdd.filter(lambda x: palavra in x[1])
        .flatMap(conta_palavra)
        .reduceByKey(lambda x, y: x + y)
        .map(calcula_freq)
    )

    return rdd_freq

rdd_freq_fla = gera_rdd_freq(rdd, "flamengo")
rdd_freq_flu = gera_rdd_freq(rdd, "fluminense")

rdd_freq = rdd_freq_fla.intersection(rdd_freq_flu)

In [8]:
print(rdd_freq_fla.takeOrdered(20, key=lambda x: -x[1]))
print('\n')
print(rdd_freq_flu.takeOrdered(20, key=lambda x: -x[1]))
print('\n')
print(rdd_freq.takeOrdered(20, key=lambda x: -x[1]))

[('bairro', 3.472171146692363), ('jardim', 3.370883016777606), ('vila', 3.3220124385824006), ('belo', 3.1832698436828046), ('horizonte', 3.1749315935284423), ('santa', 3.0310042813635367), ('paulo', 3.0038911662369103), ('contagem', 2.997386384397313), ('nova', 2.9916690073799486), ('betim', 2.9508514588885464), ('para', 2.9304395947667), ('santo', 2.705007959333336), ('parque', 2.6893088591236203), ('josé', 2.611723308007342), ('lima', 2.5658478186735176), ('mais', 2.5550944485783194), ('sorocaba', 2.5538830266438746), ('centro', 2.534026106056135), ('andré', 2.514547752660286), ('luzia', 2.4899584794248346)]


[('para', 3.3914644118391033), ('mais', 2.8692317197309762), ('como', 2.7867514221455614), ('brasil', 2.754348335711019), ('sobre', 2.6180480967120925), ('conteúdo', 2.5428254269591797), ('tags', 2.510545010206612), ('educação', 2.503790683057181), ('estadão', 2.481442628502305), ('festa', 2.481442628502305), ('detalhes', 2.4727564493172123), ('saúde', 2.4712917110589387), ('pe

In [9]:
rdd_relevancia = rdd_freq.join(rdd_idf) \
    .map(lambda x: (x[0], x[1][0] * x[1][1]))

In [10]:
top_relevancia = rdd_relevancia.takeOrdered(100, key=lambda x: -x[1])
print(top_relevancia)

[('pituaçu', 5.502457231963849), ('ceni', 2.771780113056083), ('misael', 2.5915289103571633), ('joesley', 2.384729075294114), ('interiores', 2.3547194315566156), ('marcos', 2.273275009547502), ('sampaoli', 2.2475250063266974), ('marta', 2.236881180824885), ('loures', 2.2340312819160704), ('vino', 2.153374743545537), ('cpmf', 2.1434211037558506), ('vanderlan', 2.1434211037558506), ('urubu', 2.1244424982210126), ('araci', 2.1065809457071443), ('selecao', 2.094399803754871), ('neymar', 2.093886743804431), ('suzano', 2.0317265542586056), ('henrique', 2.031533362048021), ('cargo', 2.0308876440476933), ('wilder', 2.0302922354398523), ('iris', 2.025307892330542), ('ignorou', 1.9883823018548772), ('décimo', 1.9870735299349565), ('cesar', 1.9606051427560756), ('elétricas', 1.939375578749554), ('renuncia', 1.9379461908430653), ('aperibé', 1.9281531217082202), ('prejudicar', 1.879033465167268), ('delação', 1.8672377630080959), ('tendência', 1.8587284603939462), ('sánchez', 1.8548160455847345), ('

In [10]:
rdd_freq_flaOnly = rdd_freq_fla.subtractByKey(rdd_freq)
rdd_freq_fluOnly = rdd_freq_flu.subtractByKey(rdd_freq)

rdd_relevanciaFLA = rdd_freq_flaOnly.join(rdd_idf) \
    .map(lambda x: (x[0], x[1][0] * x[1][1]))
top_relevanciaFLA = rdd_relevanciaFLA.takeOrdered(100, key=lambda x: -x[1])


rdd_relevanciaFLU = rdd_freq_fluOnly.join(rdd_idf) \
    .map(lambda x: (x[0], x[1][0] * x[1][1]))
top_relevanciaFLU = rdd_relevanciaFLU.takeOrdered(100, key=lambda x: -x[1])
