# Trabajo 2 Almacenamiento y Recuperación de la información
##Pablo Correa Morales
##Luis Fernando Posada
##Juan Pablo Leal
##Universidad EAFIT
##2022-1

In [3]:
from google.colab import drive
drive.mount('/content/gdrive') #linea para la creación de un directorio virtual dentro del Google Drive

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Librerias necesarias para el desarrollo:
#### 1. findspark nos ayuda a encontrar donde esta spark en el sistema.
#### 2. pyspark nos permite hacer uso de spark desde el lenguaje Python, donde haremos todo el procesamiento de texto a partir de unas librerias que nos provee este

In [4]:
# Instalar Java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Descargar spark
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

# Unzip a la descarga de spark
!tar xf spark-3.0.0-bin-hadoop3.2.tgz

# Actualizar las variables de entorno de JAVA_HOME y SPARK_HOME
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"


# instalar findspark
!pip install -q findspark



In [5]:
# Instalar la herramienta pyspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 31 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 41.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=efc1b8b2d21891df1321ba6a47825a5f5e7b676645b12ed1f7fcbc1fba584352
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [6]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

#Crear la sesion de Spark y su contexto.
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext

Read the text files and create a dataFrame with the content and filenames

In [7]:
myrdd = sc.wholeTextFiles('/content/gdrive/MyDrive/datasets/wiki-multiple-files/*.txt')
df = myrdd.toDF(schema=['filename','content'])
df.show(5)

+--------------------+--------------------+
|            filename|             content|
+--------------------+--------------------+
|file:/content/gdr...|@@1514 Albert of ...|
|file:/content/gdr...|@@19514 # Events ...|
|file:/content/gdr...|@@185514 Spiritua...|
|file:/content/gdr...|@@5295514 narrato...|
|file:/content/gdr...|@@5297514 Lisy or...|
+--------------------+--------------------+
only showing top 5 rows



In [8]:
df.printSchema()

root
 |-- filename: string (nullable = true)
 |-- content: string (nullable = true)



In [9]:
# Tokenization
from pyspark.ml.feature import Tokenizer
tokenization=Tokenizer(inputCol='content',outputCol='tokens')
tokenized_df=tokenization.transform(df)
tokenized_df.printSchema()
tokenized_df.show(5)
 

root
 |-- filename: string (nullable = true)
 |-- content: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------------------+--------------------+--------------------+
|            filename|             content|              tokens|
+--------------------+--------------------+--------------------+
|file:/content/gdr...|@@1514 Albert of ...|[@@1514, albert, ...|
|file:/content/gdr...|@@19514 # Events ...|[@@19514, #, even...|
|file:/content/gdr...|@@185514 Spiritua...|[@@185514, spirit...|
|file:/content/gdr...|@@5295514 narrato...|[@@5295514, narra...|
|file:/content/gdr...|@@5297514 Lisy or...|[@@5297514, lisy,...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [10]:
# Remover los stopwords
from pyspark.ml.feature import StopWordsRemover
stopword_removal=StopWordsRemover(inputCol='tokens',outputCol='refined_tokens')
refined_df=stopword_removal.transform(tokenized_df)
refined_df.select(['tokens','refined_tokens']).show(20,False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [11]:

refined_df.columns

['filename', 'content', 'tokens', 'refined_tokens']

In [12]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import *

Conteo de los tokens 

In [13]:
len_udf = udf(lambda s: len(s), IntegerType())

refined_count_df = refined_df.withColumn("token_count", len_udf(col('refined_tokens')))

In [14]:
refined_count_df.orderBy(rand()).show(20)

+--------------------+--------------------+--------------------+--------------------+-----------+
|            filename|             content|              tokens|      refined_tokens|token_count|
+--------------------+--------------------+--------------------+--------------------+-----------+
|file:/content/gdr...|@@24962514 This i...|[@@24962514, this...|[@@24962514, comp...|       6428|
|file:/content/gdr...|@@2167514 The Nat...|[@@2167514, the, ...|[@@2167514, natio...|        121|
|file:/content/gdr...|@@22176514 Khadij...|[@@22176514, khad...|[@@22176514, khad...|        170|
|file:/content/gdr...|@@37157514 Mng Dn...|[@@37157514, mng,...|[@@37157514, mng,...|        112|
|file:/content/gdr...|@@34351514 weathe...|[@@34351514, weat...|[@@34351514, weat...|        193|
|file:/content/gdr...|@@24506514 Produc...|[@@24506514, prod...|[@@24506514, prod...|       1300|
|file:/content/gdr...|@@10135514 Leandr...|[@@10135514, lean...|[@@10135514, lean...|        898|
|file:/content/gdr..

In [15]:
#Calculo del BoW (Bag of Words)

from pyspark.ml.feature import CountVectorizer
count_vec=CountVectorizer(inputCol='refined_tokens',outputCol='features')
cv_df=count_vec.fit(refined_df).transform(refined_df)
cv_df.select(['refined_tokens','features']).show(20,False)
bow = count_vec.fit(refined_df).vocabulary
print(bow)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [16]:
from pyspark.ml.feature import HashingTF
l = len(bow)
hashing_vec=HashingTF(inputCol='refined_tokens',outputCol='tf_features',numFeatures=l)

hashing_df=hashing_vec.transform(refined_df)
hashing_df.show(20)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|            filename|             content|              tokens|      refined_tokens|         tf_features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|file:/content/gdr...|@@1514 Albert of ...|[@@1514, albert, ...|[@@1514, albert, ...|(130199,[488,548,...|
|file:/content/gdr...|@@19514 # Events ...|[@@19514, #, even...|[@@19514, #, even...|(130199,[45,279,3...|
|file:/content/gdr...|@@185514 Spiritua...|[@@185514, spirit...|[@@185514, spirit...|(130199,[76,80,24...|
|file:/content/gdr...|@@5295514 narrato...|[@@5295514, narra...|[@@5295514, narra...|(130199,[178,519,...|
|file:/content/gdr...|@@5297514 Lisy or...|[@@5297514, lisy,...|[@@5297514, lisy,...|(130199,[8603,863...|
|file:/content/gdr...|@@5299514 Indian ...|[@@5299514, india...|[@@5299514, india...|(130199,[1882,475...|
|file:/content/gdr...|@@5307514 There

In [17]:
#Caulculo del TF-IDF
from pyspark.ml.feature import IDF
tf_idf_vec=IDF(inputCol='tf_features',outputCol='tf_idf_features')
tf_idf_df=tf_idf_vec.fit(hashing_df).transform(hashing_df)
tf_idf_df.show(10,False)

+------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------