In [1]:
import os

from pyspark import SparkConf, SparkContext
from tabulate import tabulate

In [2]:
# Build the context through getOrCreate() so that re-running this cell on a
# live kernel reuses the active SparkContext instead of failing with
# "Cannot run multiple SparkContexts at once".
spark_conf = SparkConf().setMaster("local").setAppName("Number RDD App")
sc = SparkContext.getOrCreate(spark_conf)

In [3]:
# os.listdir() returns entries in arbitrary order; sort them so the listing
# is the same on every run and on every filesystem.
sorted(os.listdir('./RawBooks'))

['German2.txt',
 'Germana.txt',
 'HistoriaDeVenezuelaPt1.txt',
 'HistoriaDeVenezuelaPt2.txt',
 'HistoriaNaturalYMoralDeLasIndasPt2.txt',
 'HistoriaNaturalYMoralDeLasIndias.txt',
 'LaNarizDeUnNotario.txt',
 'MerodeadoresDeFronterastxt.txt',
 'Misticas-poesias.txt',
 'ResenaVeridicaDeLaRevolucionFilipina.txt',
 'Tragedias.txt']

In [4]:
# Read the whole book into memory. The `with` block guarantees the file is
# closed as soon as the read finishes (the original left the handle open).
# NOTE(review): `book` is not referenced by any later cell visible here; the
# Spark pipeline reads the same path on its own via sc.textFile.
with open('./RawBooks/German2.txt', 'r', encoding="utf-8") as file_handler:
    book = file_handler.read()

In [5]:
# Reference vocabulary of common Spanish nouns (singular form only).
# is_noun() tests exact membership in this list, so plurals and other
# inflected forms are not recognised.
common_spanish_nouns = [
    "tiempo",  # time
    "año",  # year
    "día",  # day
    "cosa",  # thing
    "persona",  # person
    "hombre",  # man
    "mujer",  # woman
    "vida",  # life
    "niño",  # child, boy
    "mundo",  # world
    "momento",  # moment
    "mano",  # hand
    "parte",  # part
    "casa",  # house
    "trabajo",  # work, job
    "nombre",  # name
    "palabra",  # word
    "lugar",  # place
    "amigo",  # friend
    "gente",  # people
]

In [6]:
# Reference vocabulary of common Spanish adjectives (masculine singular form
# where the adjective inflects). is_adjective() tests exact membership in this
# list, so feminine/plural forms are not recognised.
# "viejo" used to be listed twice; the duplicate was dropped (membership
# results are unchanged).
common_spanish_adjectives = [
    "bueno",  # good
    "malo",  # bad
    "grande",  # big, large
    "pequeño",  # small
    "feliz",  # happy
    "triste",  # sad
    "fácil",  # easy
    "difícil",  # difficult
    "rápido",  # fast
    "lento",  # slow
    "caliente",  # hot
    "frío",  # cold
    "joven",  # young
    "viejo",  # old
    "bonito",  # pretty
    "feo",  # ugly
    "rico",  # rich, delicious
    "pobre",  # poor
    "nuevo",  # new
]

In [7]:
# Reference vocabulary of common Spanish verbs, in the infinitive.
# is_verb() tests exact membership in this list, so conjugated forms
# (e.g. "era", "tiene") are not recognised.
common_spanish_verbs = [
    "ser",  # to be (essence, identity)
    "estar",  # to be (state, location)
    "tener",  # to have
    "hacer",  # to do, to make
    "poder",  # to be able to, can
    "decir",  # to say, to tell
    "ir",  # to go
    "ver",  # to see
    "dar",  # to give
    "saber",  # to know (facts)
    "querer",  # to want, to love
    "llegar",  # to arrive
    "pasar",  # to pass, to happen
    "deber",  # to owe, must
    "poner",  # to put
    "parecer",  # to seem
    "quedar",  # to stay, to remain
    "creer",  # to believe
    "hablar",  # to speak, to talk
    "llevar"  # to carry, to take
]

In [8]:
def is_adjective(word):
    """Tell whether ``word`` is one of the listed common Spanish adjectives.

    The check is an exact membership test against
    ``common_spanish_adjectives``; no normalisation is applied here.
    """
    listed = word in common_spanish_adjectives
    return listed

In [9]:
def is_verb(word):
    """Tell whether ``word`` is one of the listed common Spanish verbs.

    The check is an exact membership test against ``common_spanish_verbs``;
    no normalisation is applied here.
    """
    listed = word in common_spanish_verbs
    return listed

In [10]:
def is_noun(word):
    """Tell whether ``word`` is one of the listed common Spanish nouns.

    The check is an exact membership test against ``common_spanish_nouns``;
    no normalisation is applied here.
    """
    listed = word in common_spanish_nouns
    return listed

In [11]:
# --- Settings and helpers for this analysis ---------------------------------
BOOK_PATH = './RawBooks/German2.txt'
TOP_N = 3  # number of top entries kept for every category


def count_occurrences(items_rdd):
    """Return an RDD of (item, count) pairs, one per distinct item in ``items_rdd``."""
    return items_rdd.map(lambda item: (item, 1)).reduceByKey(lambda a, b: a + b)


def top_by_count(counts_rdd, n=TOP_N):
    """Return a list with the ``n`` (item, count) pairs that have the highest count."""
    return counts_rdd.sortBy(lambda item_count: item_count[1], ascending=False).take(n)


# --- Tokenise ---------------------------------------------------------------
text_file_rdd = sc.textFile(BOOK_PATH)
words_rdd = text_file_rdd.flatMap(lambda line: line.split(" "))

# Lower-case every token, trim the listed punctuation from both ends and drop
# tokens of length 0 or 1 (this also removes the empty strings produced by
# consecutive spaces).
# NOTE(review): splitting on a single space keeps tabs inside tokens, the
# strip set does not cover quotes or dashes, and the length filter discards
# one-letter words -- confirm this is the intended tokenisation.
# The result is cached because several independent actions below (one per
# ranking) start from it; without the cache each of them would re-read and
# re-tokenise the file.
clean_words_rdd = (
    words_rdd
    .map(lambda word: word.lower().strip(".,:;()¡!¿?"))
    .filter(lambda word: len(word) > 1)
    .cache()
)

# --- Most frequent runs of three consecutive words --------------------------
# Every word gets its position in the cleaned sequence. Re-keying the same
# words by position-1 and position-2 lets a join on key k bring together the
# words at positions k, k+1 and k+2. The sequence is the flattened word list,
# so trios may span line and sentence boundaries.
indexed_words = clean_words_rdd.zipWithIndex()
current_words = indexed_words.map(lambda x: (x[1], x[0]))
next_words = indexed_words.map(lambda x: (x[1] - 1, x[0]))
next_next_words = indexed_words.map(lambda x: (x[1] - 2, x[0]))
joined_trios = current_words.join(next_words).join(next_next_words)
# Joined records look like (k, ((w_k, w_k+1), w_k+2)); flatten to a 3-tuple.
word_trios = joined_trios.map(lambda x: (x[1][0][0], x[1][0][1], x[1][1]))
word_trios_count = count_occurrences(word_trios)
# Unlike the other rankings, top_trios holds (count, trio) pairs.
top_trios = word_trios_count.map(lambda x: (x[1], x[0])).sortByKey(False).take(TOP_N)

# --- Most frequent verbs / adjectives / nouns / words -----------------------
# Each top_* list below holds (word, count) pairs, highest count first.
verbs_found = clean_words_rdd.filter(is_verb)
verbs_counts_rdd = count_occurrences(verbs_found)
top_verbs = top_by_count(verbs_counts_rdd)

adjectives_found = clean_words_rdd.filter(is_adjective)
adjectives_counts_rdd = count_occurrences(adjectives_found)
top_adjectives = top_by_count(adjectives_counts_rdd)

nouns_found = clean_words_rdd.filter(is_noun)
nouns_counts_rdd = count_occurrences(nouns_found)
top_nouns = top_by_count(nouns_counts_rdd)

word_counts_rdd = count_occurrences(clean_words_rdd)
top_words = top_by_count(word_counts_rdd)



In [12]:
def _ranked_entry(ranking, position, placeholder):
    """Return ``ranking[position]``, or ``placeholder`` when the ranking is shorter."""
    return ranking[position] if position < len(ranking) else placeholder


# Build one table row per rank. The number of rows follows the verb ranking
# (as before); any other ranking that has fewer entries is padded with an
# empty placeholder instead of raising IndexError.
# NOTE(review): the rows are labelled 'Germana' while the analysis cell reads
# './RawBooks/German2.txt' -- confirm the label matches the intended book.
results = []
for i in range(len(top_verbs)):
    word, word_count = _ranked_entry(top_words, i, ("", 0))
    verb, verb_count = top_verbs[i]
    adjective, adjective_count = _ranked_entry(top_adjectives, i, ("", 0))
    noun, noun_count = _ranked_entry(top_nouns, i, ("", 0))
    # top_trios stores (count, trio), the reverse of the other rankings.
    trio_count, trio = _ranked_entry(top_trios, i, (0, ""))
    results.append([
        'Germana',
        word,
        word_count,
        verb,
        verb_count,
        adjective,
        adjective_count,
        noun,
        noun_count,
        trio,
        trio_count,
    ])

In [13]:
# Column titles, listed in the same order as the fields of each row in
# `results`; "(c)" columns hold the count for the column to their left.
table_headers = [
    "Libro",
    "Palabra", "Palabra (c)",
    "Verbo", "Verbo (c)",
    "Adjetivo", "Adjetivo (c)",
    "Sustantivo", "Sustantivo (c)",
    "Trios", "Trios (c)",
]
rendered_table = tabulate(results, headers=table_headers)
print(rendered_table)

Libro    Palabra      Palabra (c)  Verbo      Verbo (c)  Adjetivo      Adjetivo (c)  Sustantivo      Sustantivo (c)  Trios                      Trios (c)
-------  ---------  -------------  -------  -----------  ----------  --------------  ------------  ----------------  -----------------------  -----------
Germana  de                    18  llevar             3  malo                     3  amigo                        3  ('era', 'era', 'era')              3
Germana  su                    15  ser                1  joven                    2  hombre                       2  ('era', 'era', 'un')               2
Germana  el                    12  decir              1  triste                   2  tiempo                       2  ('de', 'sus', 'éxitos')            2


In [14]:
# Stop the SparkContext `sc` now that the analysis is finished. Any cell that
# uses `sc` after this point needs the context to be created again first.
sc.stop()