In [1]:
from pyspark.sql import SparkSession
import os
import shutil
import re

# Create (or reuse) the Spark session for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .appName("ProyectoSparkie-Vocabulario")
    .master("local[*]")
    .getOrCreate()
)

# Handle to the RDD-level API used by the rest of the notebook.
sc = spark.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/07 15:26:01 WARN Utils: Your hostname, DESKTOP-6OAF9F9, resolves to a loopback address: 127.0.1.1; using 172.28.59.214 instead (on interface eth0)
25/12/07 15:26:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/07 15:26:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the books into an RDD: wholeTextFiles reads every file under the raw-data
# folder, and later cells treat each element as a (path, text) pair.

base_path = os.path.abspath("../data/raw")
rdd_libros = sc.wholeTextFiles(base_path)

# count() is an action, so this line actually triggers reading the files.
total_libros = rdd_libros.count()
print(f" Libros cargados: {total_libros}")

                                                                                

 Libros cargados: 99


In [3]:
# Normalize case: convert the text of every book to lowercase, keeping its path as key.

rdd_lower = rdd_libros.mapValues(lambda texto: texto.lower())

In [4]:
# Strip characters and symbols that are not useful for the vocabulary.
def limpiar(texto):
    """Replace every character that is not a-z, 0-9 or whitespace with a space.

    The input is expected to be lowercased already; uppercase letters are not in
    the allowed set and are therefore replaced as well.

    Args:
        texto: The string to clean.

    Returns:
        A string of the same length with each disallowed character turned into
        a single space.
    """
    no_permitidos = re.compile(r"[^a-z0-9\s]")
    return no_permitidos.sub(" ", texto)

# Clean the text of each book while leaving its path (the key) untouched.
rdd_clean = rdd_lower.mapValues(limpiar)


In [5]:
# Tokenize: turn each (path, text) pair into one (path, token) pair per word.

def _tokenizar(libro):
    """Split a book's text on whitespace and pair every token with the book's path."""
    ruta, texto = libro[0], libro[1]
    return [(ruta, token) for token in texto.split()]


rdd_tokens = rdd_clean.flatMap(_tokenizar)

In [6]:
# Build the English stopword set from NLTK's corpus. Original author's note: NLTK is
# installed inside the project folder, so importing it is all that is required.

from nltk.corpus import stopwords

stop = {*stopwords.words("english")}
# Broadcast the set so the filtering code can read it through stop_b.value.
stop_b = sc.broadcast(stop)

In [7]:
# Use the broadcast stopword list to decide which tokens are kept.
def es_valida(palabra):
    """Tell whether a token should be kept in the vocabulary.

    Args:
        palabra: The token to check.

    Returns:
        True when the token is not in the broadcast stopword set and is longer
        than two characters; False otherwise.
    """
    if palabra in stop_b.value:
        return False
    return len(palabra) > 2

# Drop every (path, token) pair whose token is a stopword or too short.
rdd_filtrado = rdd_tokens.filter(lambda ruta_token: es_valida(ruta_token[1]))

In [8]:
# Extract the document name from its path.
def docname(path):
    """Return the final component of ``path`` (the file name).

    Args:
        path: A file path or URI-like string using the platform's separators.

    Returns:
        Everything after the last path separator; an empty string if ``path``
        ends with a separator.
    """
    _, nombre = os.path.split(path)
    return nombre

# Replace each file path with its bare file name, giving (document, token) pairs.
# The result is cached because several later actions (take / saveAsTextFile on the
# per-document vocabulary, the global vocabulary and the frequencies) all derive
# from this RDD; without caching, every one of those actions would re-read and
# re-tokenize all the books from scratch.
rdd_doc_token = rdd_filtrado.map(lambda x: (docname(x[0]), x[1])).cache()

In [9]:
# Per-document vocabulary: keep one (document, term) pair per distinct term.

rdd_vocab = rdd_doc_token.distinct()

print(f"- Primeros 10 términos del vocabulario:")
muestra_vocab = rdd_vocab.take(10)
for par in muestra_vocab:
    print(par)

# Save the result in the processed-data folder. Any previous output directory is
# removed first (presumably because saveAsTextFile fails if the target exists).
output_vocab = "../data/processed/vocabulario_rdd"
if os.path.exists(output_vocab):
    shutil.rmtree(output_vocab)
rdd_vocab.saveAsTextFile(output_vocab)
print(f"- Vocabulario guardado en: {output_vocab}")

- Primeros 10 términos del vocabulario:


                                                                                

('Romeo_and_Juliet_by_William_Shakespeare.txt', 'juliet')
('Romeo_and_Juliet_by_William_Shakespeare.txt', 'prologue')
('Romeo_and_Juliet_by_William_Shakespeare.txt', 'act')
('Romeo_and_Juliet_by_William_Shakespeare.txt', 'street')
('Romeo_and_Juliet_by_William_Shakespeare.txt', 'capulet')
('Romeo_and_Juliet_by_William_Shakespeare.txt', 'house')
('Romeo_and_Juliet_by_William_Shakespeare.txt', 'hall')
('Romeo_and_Juliet_by_William_Shakespeare.txt', 'open')
('Romeo_and_Juliet_by_William_Shakespeare.txt', 'garden')
('Romeo_and_Juliet_by_William_Shakespeare.txt', 'lawrence')


                                                                                

- Vocabulario guardado en: ../data/processed/vocabulario_rdd


In [10]:
# Global vocabulary: the distinct terms across all documents.
vocab_global = rdd_doc_token.values().distinct()

print(f"\n- Vocabulario global (primeras 20 palabras):")
muestra_global = vocab_global.take(20)
for palabra in muestra_global:
    print(palabra)

# Save the global vocabulary, removing any previous output directory first.
output_global = "../data/processed/vocabulario_unico_rdd"
if os.path.exists(output_global):
    shutil.rmtree(output_global)
vocab_global.saveAsTextFile(output_global)
print(f"- Vocabulario global guardado en: {output_global}")


- Vocabulario global (primeras 20 palabras):


                                                                                

tragedy
romeo
william
shakespeare
contents
scene
public
place
iii
room
chorus
adjoining
friar
overlooking
monument
belonging
capulets
dramatis
escalus
verona




- Vocabulario global guardado en: ../data/processed/vocabulario_unico_rdd


                                                                                

In [11]:
# Term frequencies: count how many times each term occurs in each document.

# Emit ((document, term), 1) for every occurrence, then add up the ones per key.
rdd_pairs = rdd_doc_token.map(lambda par: ((par[0], par[1]), 1))
rdd_freq = rdd_pairs.reduceByKey(lambda total, parcial: total + parcial)

print(f"\n- Primeras 20 frecuencias:")
muestra_freq = rdd_freq.take(20)
for frecuencia in muestra_freq:
    print(frecuencia)

# Save the frequencies, removing any previous output directory first.
output_freq = "../data/processed/frecuencias_rdd"
if os.path.exists(output_freq):
    shutil.rmtree(output_freq)
rdd_freq.saveAsTextFile(output_freq)
print(f"- Frecuencias guardadas en: {output_freq}")


- Primeras 20 frecuencias:


                                                                                

(('Romeo_and_Juliet_by_William_Shakespeare.txt', 'juliet'), 190)
(('Romeo_and_Juliet_by_William_Shakespeare.txt', 'prologue'), 3)
(('Romeo_and_Juliet_by_William_Shakespeare.txt', 'act'), 13)
(('Romeo_and_Juliet_by_William_Shakespeare.txt', 'street'), 10)
(('Romeo_and_Juliet_by_William_Shakespeare.txt', 'capulet'), 163)
(('Romeo_and_Juliet_by_William_Shakespeare.txt', 'house'), 33)
(('Romeo_and_Juliet_by_William_Shakespeare.txt', 'hall'), 8)
(('Romeo_and_Juliet_by_William_Shakespeare.txt', 'open'), 10)
(('Romeo_and_Juliet_by_William_Shakespeare.txt', 'garden'), 8)
(('Romeo_and_Juliet_by_William_Shakespeare.txt', 'lawrence'), 82)
(('Romeo_and_Juliet_by_William_Shakespeare.txt', 'cell'), 22)
(('Romeo_and_Juliet_by_William_Shakespeare.txt', 'gallery'), 2)
(('Romeo_and_Juliet_by_William_Shakespeare.txt', 'chamber'), 13)
(('Romeo_and_Juliet_by_William_Shakespeare.txt', 'bed'), 27)
(('Romeo_and_Juliet_by_William_Shakespeare.txt', 'mantua'), 16)
(('Romeo_and_Juliet_by_William_Shakespeare.txt',

                                                                                

- Frecuencias guardadas en: ../data/processed/frecuencias_rdd


In [12]:
# Shut down the SparkContext at the end of the notebook.
sc.stop()