# **Proyecto de NLP, Spark y Redes Neuronales con Python**

-------

## Importación de librerías, inicio de sesión en Spark y lectura de los datos.

In [78]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import StopWordsRemover, Tokenizer, CountVectorizer, NGram
from nltk.corpus import stopwords
import findspark
import seaborn as sns

In [5]:
findspark.init()
spark = SparkSession.builder.appName("ProyectoNLPSpark").getOrCreate()


In [22]:
data = spark.read.csv('../data/ruddit_comments_score.csv', header=True, inferSchema=True, sep = ",", multiLine=True)
data = data.withColumnRenamed("comment_id", "ID").withColumnRenamed("body", "Comentario").withColumnRenamed("score", "Puntuacion")

> ##### Estos son los datos con los que vamos a trabajar

In [23]:
data.show()

+--------------------+--------------------+----------+
|                  ID|          Comentario|Puntuacion|
+--------------------+--------------------+----------+
|             cza1q49|> The difference ...|    -0.083|
|             cza1wdh|"The myth is that...|    -0.022|
|             cza23qx|           [deleted]|     0.167|
|             cza2bw8|The assertion is ...|    -0.146|
|             cza2iji|You said in the O...|    -0.083|
|             cza2jj3|">Men and women a...|      null|
|Edit: Changed 70 ...|              -0.042|      null|
|             cza31e2|> All the wage ga...|    -0.021|
|             cza321d|           [deleted]|    -0.021|
|             cza336e|           [deleted]|     0.208|
|             cza34dq|           [deleted]|    -0.191|
|             cza3500|           [deleted]|    -0.229|
|             cza37ue|No, the point is ...|    -0.174|
|             cza3802|           [deleted]|     0.021|
|             cza392y|So women are paid...|    -0.229|
|         

In [24]:
data.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Comentario: string (nullable = true)
 |-- Puntuacion: string (nullable = true)



##### Limpieza y transformación y análisis de los datos

In [28]:
data = data.withColumn("Puntuacion", data["Puntuacion"].cast("float"))
data.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Comentario: string (nullable = true)
 |-- Puntuacion: float (nullable = true)



In [31]:
data = data.dropna()
data = data.filter(data["Comentario"] != "[deleted]")
data.show()

+-------+--------------------+----------+
|     ID|          Comentario|Puntuacion|
+-------+--------------------+----------+
|cza1q49|> The difference ...|    -0.083|
|cza1wdh|"The myth is that...|    -0.022|
|cza2bw8|The assertion is ...|    -0.146|
|cza2iji|You said in the O...|    -0.083|
|cza31e2|> All the wage ga...|    -0.021|
|cza37ue|No, the point is ...|    -0.174|
|cza392y|So women are paid...|    -0.229|
|cza3m1b|But obviously tha...|       0.0|
|cza3r5u|"I think that Hol...|     0.098|
|cza47sd|"> I don't think ...|    -0.083|
|cza47xu|I don't think the...|    -0.062|
|cza4d2a|> Women are a who...|    -0.062|
|cza4gsv|"The gist of my p...|    -0.021|
|cza4ldq|Biological differ...|    -0.083|
|cza5maz|> It's the differ...|    -0.188|
|cza6q74|>The fact of the ...|     0.083|
|cza6wrd|Well, if your wif...|    -0.104|
|cza76eq|Women have not sp...|     0.188|
|cza79u4|Doesn't it also m...|     0.175|
|cza7gpu|So you do believe...|       0.0|
+-------+--------------------+----

In [32]:
data.collect()[1][1]

'"The myth is that the ""gap"" is entirely based on the sex of the  person. "'

In [35]:
data = data.rdd.map(lambda x: (x[0],x[1][1:] if (x[1][0] == '"') else x[1],x[2])).toDF()
data = data.rdd.map(lambda x: (x[0],x[1][:-1] if (x[1][-1] == '"') else x[1],x[2])).toDF()
data = data.rdd.map(lambda x: (x[0],x[1][1:] if (x[1][0] == '>') else x[1],x[2])).toDF()
data = data.rdd.map(lambda x: (x[0],x[1][1:] if (x[1][0] == ' ') else x[1],x[2])).toDF()
data = data.rdd.map(lambda x: (x[0],x[1][:-1] if (x[1][-1] == ' ') else x[1],x[2])).toDF()
data = data.rdd.map(lambda x: (x[0],x[1][:-1] if (x[1][-1] == '.') else x[1],x[2])).toDF()
data = data.rdd.map(lambda x: (x[0],x[1].replace('""', '"'),x[2])).toDF()
data = data.rdd.map(lambda x: (x[0],x[1].replace('\n', ''),x[2])).toDF()
data = data.rdd.map(lambda x: (x[0],x[1].replace('\t', ''),x[2])).toDF()
data.show(truncate = False)

+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+
|_1     |_2                                                                                                                                                                                                                                                                               

In [36]:
data.collect()[1][1]

'The myth is that the "gap" is entirely based on the sex of the  person'

In [37]:
data = data.withColumnRenamed("_1", "ID").withColumnRenamed("_2", "Comentario").withColumnRenamed("_3", "Puntuacion")

In [38]:
data.count()

5422

In [43]:
data = data.withColumn("LongitudLetras", length(data["Comentario"]))

In [44]:
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [45]:
data = Tokenizer(inputCol="Comentario", outputCol="ComenToken").transform(data).drop("Comentario")

In [46]:
data = StopWordsRemover(stopWords=stopwords.words('english'), inputCol="ComenToken", outputCol="ComenTokenLimpio").transform(data).drop("ComenToken")

In [47]:
data = CountVectorizer(inputCol="ComenTokenLimpio", outputCol="ConteoPalabras").fit(data).transform(data)

In [48]:
data.show()

+-------+--------------------+--------------+--------------------+--------------------+
|     ID|          Puntuacion|LongitudLetras|    ComenTokenLimpio|      ConteoPalabras|
+-------+--------------------+--------------+--------------------+--------------------+
|cza1q49|-0.08299999684095383|           171|[difference, aver...|(23213,[36,50,68,...|
|cza1wdh|-0.02199999988079071|            70|[myth, "gap", ent...|(23213,[0,33,59,1...|
|cza2bw8| -0.1459999978542328|           115|[assertion, women...|(23213,[4,36,56,2...|
|cza2iji|-0.08299999684095383|           160|[said, op, that's...|(23213,[10,25,36,...|
|cza31e2|-0.02099999971687...|           476|[wage, gap, is,, ...|(23213,[1,5,12,25...|
|cza37ue|-0.17399999499320984|            62|[no,, point, talk...|(23213,[61,167,31...|
|cza392y| -0.2290000021457672|            40|[women, paid, les...|(23213,[36,56,274...|
|cza3m1b|                 0.0|           377|[obviously, make,...|(23213,[1,8,10,11...|
|cza3r5u| 0.09799999743700027|  

In [77]:
data.select([max(data["Puntuacion"]), min(data["Puntuacion"])]).show()

+------------------+-------------------+
|   max(Puntuacion)|    min(Puntuacion)|
+------------------+-------------------+
|0.9789999723434448|-0.8889999985694885|
+------------------+-------------------+



In [61]:
data.select([max(data["LongitudLetras"]), min(data["LongitudLetras"])]).show()

+-------------------+-------------------+
|max(LongitudLetras)|min(LongitudLetras)|
+-------------------+-------------------+
|                913|                  9|
+-------------------+-------------------+



In [80]:
data.select("ConteoPalabras").show()

+--------------------+
|      ConteoPalabras|
+--------------------+
|(23213,[36,50,68,...|
|(23213,[0,33,59,1...|
|(23213,[4,36,56,2...|
|(23213,[10,25,36,...|
|(23213,[1,5,12,25...|
|(23213,[61,167,31...|
|(23213,[36,56,274...|
|(23213,[1,8,10,11...|
|(23213,[5,38,40,7...|
|(23213,[1,5,12,26...|
|(23213,[3,5,12,14...|
|(23213,[12,20,36,...|
|(23213,[10,129,14...|
|(23213,[13,22,93,...|
|(23213,[54,56,66,...|
|(23213,[11,12,13,...|
|(23213,[10,12,18,...|
|(23213,[3,20,36,3...|
|(23213,[11,16,26,...|
|(23213,[0,2,36,64...|
+--------------------+
only showing top 20 rows



In [None]:
ngramas = NGram(n = )

In [49]:
# data.rdd.map(lambda x: (x[0],x[1],x[2],x[3],len(x[3]), x[4])).toDF()

+-------+--------------------+---+--------------------+---+--------------------+
|     _1|                  _2| _3|                  _4| _5|                  _6|
+-------+--------------------+---+--------------------+---+--------------------+
|cza1q49|-0.08299999684095383|171|[difference, aver...| 13|(23213,[36,50,68,...|
|cza1wdh|-0.02199999988079071| 70|[myth, "gap", ent...|  7|(23213,[0,33,59,1...|
|cza2bw8| -0.1459999978542328|115|[assertion, women...| 12|(23213,[4,36,56,2...|
|cza2iji|-0.08299999684095383|160|[said, op, that's...| 15|(23213,[10,25,36,...|
|cza31e2|-0.02099999971687...|476|[wage, gap, is,, ...| 41|(23213,[1,5,12,25...|
|cza37ue|-0.17399999499320984| 62|[no,, point, talk...|  4|(23213,[61,167,31...|
|cza392y| -0.2290000021457672| 40|[women, paid, les...|  4|(23213,[36,56,274...|
|cza3m1b|                 0.0|377|[obviously, make,...| 35|(23213,[1,8,10,11...|
|cza3r5u| 0.09799999743700027|249|[think, hollywood...| 24|(23213,[5,38,40,7...|
|cza47sd|-0.0829999968409538