Activity done by Adrian and Julian

In [14]:
import os

import pandas as pd
from pyspark import SparkConf, SparkContext

In [15]:
# Reuse the active SparkContext if this cell is re-run in a live kernel;
# constructing a second SparkContext directly raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("BookProject RDD App")
)

In [16]:
# Reference vocabulary of frequent Spanish nouns; is_noun() matches tokens
# against this list by exact string membership.
common_spanish_nouns = [
    "tiempo", "año", "día", "cosa", "persona",
    "hombre", "mujer", "vida", "niño", "mundo",
    "momento", "mano", "parte", "casa", "trabajo",
    "nombre", "palabra", "lugar", "amigo", "gente",
]

In [17]:
# Reference vocabulary of frequent Spanish adjectives; is_adjective() matches
# tokens against this list by exact string membership.
# Each adjective appears exactly once ("viejo" used to be listed twice, which
# had no effect on membership tests but made the list misleading to read).
common_spanish_adjectives = [
    "bueno",
    "malo",
    "grande",
    "pequeño",
    "feliz",
    "triste",
    "fácil",
    "difícil",
    "rápido",
    "lento",
    "caliente",
    "frío",
    "joven",
    "viejo",
    "bonito",
    "feo",
    "rico",
    "pobre",
    "nuevo",
]

In [18]:
# Reference vocabulary of frequent Spanish verbs (infinitive forms only);
# is_verb() matches tokens against this list by exact string membership.
common_spanish_verbs = [
    "ser", "estar", "tener", "hacer", "poder",
    "decir", "ir", "ver", "dar", "saber",
    "querer", "llegar", "pasar", "deber", "poner",
    "parecer", "quedar", "creer", "hablar", "llevar",
]

In [19]:
def is_adjective(word: str) -> bool:
    """Return True when `word` appears verbatim in `common_spanish_adjectives`."""
    listed = word in common_spanish_adjectives
    return listed

In [20]:
def is_verb(word: str) -> bool:
    """Return True when `word` appears verbatim in `common_spanish_verbs`."""
    listed = word in common_spanish_verbs
    return listed

In [21]:
def is_noun(word: str) -> bool:
    """Return True when `word` appears verbatim in `common_spanish_nouns`."""
    listed = word in common_spanish_nouns
    return listed

In [22]:
# os.listdir returns entries in arbitrary order and includes every entry in the
# directory, so sort for a reproducible run and keep only the .txt book files.
book_list = sorted(
    name for name in os.listdir('./RawBooks') if name.endswith('.txt')
)
book_list

['Germana.txt',
 'HistoriaDeVenezuelaPt1.txt',
 'HistoriaDeVenezuelaPt2.txt',
 'HistoriaNaturalYMoralDeLasIndasPt2.txt',
 'HistoriaNaturalYMoralDeLasIndias.txt',
 'LaNarizDeUnNotario.txt',
 'MerodeadoresDeFronterastxt.txt',
 'Misticas-poesias.txt',
 'ResenaVeridicaDeLaRevolucionFilipina.txt',
 'Tragedias.txt']

In [23]:
TOP_N = 3                    # entries reported per category and per book
STRIP_CHARS = ".,:;()¡!¿?"   # punctuation trimmed from both ends of each token


def _top_counts(items_rdd, limit=TOP_N):
    """Return the `limit` most frequent elements of `items_rdd` as (element, count) pairs."""
    counts = items_rdd.map(lambda item: (item, 1)).reduceByKey(lambda a, b: a + b)
    return counts.sortBy(lambda pair: pair[1], ascending=False).take(limit)


def _pad(ranking, size):
    """Return `ranking` extended with (None, None) placeholders up to `size` entries."""
    return ranking + [(None, None)] * (size - len(ranking))


# Reset on every run so re-executing the cell does not duplicate rows.
results = []

for book in book_list:
    text_file_rdd = sc.textFile(os.path.join('./RawBooks', book))
    words_rdd = text_file_rdd.flatMap(lambda line: line.split(" "))

    # Lower-case, trim punctuation and drop empty/one-character tokens.
    # Cached because each of the five rankings below runs its own job over it.
    clean_words_rdd = (
        words_rdd
        .map(lambda word: word.lower().strip(STRIP_CHARS))
        .filter(lambda word: len(word) > 1)
        .cache()
    )

    # Build consecutive word trios by joining the token stream with itself
    # shifted by one and by two positions: key i -> (w[i], w[i+1], w[i+2]).
    indexed_words = clean_words_rdd.zipWithIndex()
    current_words = indexed_words.map(lambda x: (x[1], x[0]))
    next_words = indexed_words.map(lambda x: (x[1] - 1, x[0]))
    next_next_words = indexed_words.map(lambda x: (x[1] - 2, x[0]))
    word_trios = (
        current_words.join(next_words).join(next_next_words)
        .map(lambda x: (x[1][0][0], x[1][0][1], x[1][1]))
    )

    # Every ranking is a list of (value, count) pairs, most frequent first.
    top_words = _top_counts(clean_words_rdd)
    top_verbs = _top_counts(clean_words_rdd.filter(is_verb))
    top_adjectives = _top_counts(clean_words_rdd.filter(is_adjective))
    top_nouns = _top_counts(clean_words_rdd.filter(is_noun))
    top_trios = _top_counts(word_trios)

    clean_words_rdd.unpersist()

    # A book may contain fewer than TOP_N distinct matches in some category.
    # Pad the short rankings with None instead of indexing past their end
    # (IndexError) or silently truncating the rows to the number of verbs.
    rankings = [top_words, top_verbs, top_adjectives, top_nouns, top_trios]
    depth = max(len(ranking) for ranking in rankings)
    padded = [_pad(ranking, depth) for ranking in rankings]

    # splitext removes only the file extension, not ".txt" occurring elsewhere.
    title = os.path.splitext(book)[0]
    for rank, entries in enumerate(zip(*padded), start=1):
        # Row layout: book, rank, then (value, count) for word, verb,
        # adjective, noun and trio, in that order.
        row = [title, rank]
        for value, count in entries:
            row.extend([value, count])
        results.append(row)

In [24]:
# Column labels follow the order of the values in each row of `results`:
# book, rank, then a (value, count) pair per category.
df = pd.DataFrame(
    data=results,
    columns=[
        "Libro", "Ranking",
        "Palabra", "Palabra (c)",
        "Verbo", "Verbo (c)",
        "Adjetivo", "Adjetivo (c)",
        "Sustantivo", "Sustantivo (c)",
        "Trios", "Trios (c)",
    ],
)

In [25]:
# Render the per-book ranking table built from `results`.
df

Unnamed: 0,Libro,Ranking,Palabra,Palabra (c),Verbo,Verbo (c),Adjetivo,Adjetivo (c),Sustantivo,Sustantivo (c),Trios,Trios (c)
0,Germana,1,de,3560,ser,60,pobre,63,casa,145,"(la, señora, chermidy)",149
1,Germana,2,la,2704,hacer,55,joven,57,mujer,117,"(la, tour, de)",63
2,Germana,3,que,2178,ver,54,viejo,48,hombre,111,"(tour, de, embleuse)",62
3,HistoriaDeVenezuelaPt1,1,de,12059,ser,375,grande,84,gente,866,"(la, edición, de)",338
4,HistoriaDeVenezuelaPt1,2,que,8760,dar,261,nuevo,34,parte,403,"(edición, de, caracas)",337
5,HistoriaDeVenezuelaPt1,3,la,5484,tener,128,pequeño,23,tiempo,232,"(en, la, edición)",336
6,HistoriaDeVenezuelaPt2,1,de,8770,ser,272,grande,81,parte,208,"(lope, de, aguirre)",390
7,HistoriaDeVenezuelaPt2,2,que,6755,dar,204,nuevo,19,gente,186,"(edición, de, caracas)",285
8,HistoriaDeVenezuelaPt2,3,la,3676,decir,121,bueno,15,cosa,151,"(la, edición, de)",285
9,HistoriaNaturalYMoralDeLasIndasPt2,1,de,4913,ser,135,grande,96,gente,150,"(de, los, indios)",48


In [26]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()