# Analisis de sentimientos 

##  Importaciones de bibliotecas y librerias

In [268]:
import pandas as pd
import re 
import numpy
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import * 
from pyspark.sql.functions import length
from pyspark.sql.functions import udf
from pyspark.sql import functions as f
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.functions import col, when
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Ingesta de datos de hdsf en dataframes

In [219]:
spark=SparkSession.builder.appName('mmarulandc').getOrCreate()
csv = '/user/mmarulandc/datasets/articles3.csv'
df1 = spark.read.csv(csv)
df1.show()

+------+------+--------------------+-----------+-----------------+----------+------+-----+--------------------+--------------------+
|   _c0|   _c1|                 _c2|        _c3|              _c4|       _c5|   _c6|  _c7|                 _c8|                 _c9|
+------+------+--------------------+-----------+-----------------+----------+------+-----+--------------------+--------------------+
|  null|    id|               title|publication|           author|      date|  year|month|                 url|             content|
|103459|151908|Alton Sterling’s ...|   Guardian|   Jessica Glenza|2016-07-13|2016.0|  7.0|https://www.thegu...|The son of a Loui...|
|103460|151909|Shakespeare’s fir...|   Guardian|             null|2016-05-25|2016.0|  5.0|https://www.thegu...|Copies of William...|
|103461|151910|My grandmother’s ...|   Guardian|    Robert Pendry|2016-10-31|2016.0| 10.0|https://www.thegu...|Debt: $20, 000, S...|
|103462|151911|I feared my life ...|   Guardian|   Bradford Frost|201

Definicion de expresiones regulares para la limpieza de los contenidos de las diferentes publicaciones 


In [220]:
reg = '[^a-zA-Z ]'
reg1 = '[\s+]{2,}'

# Limpieza del DataFrame

Creando un dataframe que contenga los contedidos de las publicaciones hechas y limpiando el contenido de caracteres especiales.

In [221]:
df_1 = df1.withColumn("clean", regexp_replace('_c9', reg ,""))
df_1.select('clean').show()

+--------------------+
|               clean|
+--------------------+
|             content|
|The son of a Loui...|
|Copies of William...|
|Debt   Source Col...|
|It was late I was...|
|A central Texas m...|
|I have been battl...|
|Three flatmates o...|
| Most people take...|
|The FBI has arres...|
|Donald Trump anno...|
|Most Olympic rowe...|
|Name Pamela Ander...|
|From the moment i...|
|A resolution to e...|
|Better Things ope...|
|Efforts are brewi...|
|Halldora was born...|
|As Hillary Clinto...|
|Barack Obamas ann...|
+--------------------+
only showing top 20 rows



En las siguientes lineas se eliminan las espacios en blanco(\s) de mas que son tokenisados como tokens independientes

In [222]:
df_2 = df_1.withColumn("clean1", regexp_replace('clean', reg1 ," "))
df_2.select('clean1').show()


+--------------------+
|              clean1|
+--------------------+
|             content|
|The son of a Loui...|
|Copies of William...|
|Debt Source Colle...|
|It was late I was...|
|A central Texas m...|
|I have been battl...|
|Three flatmates o...|
| Most people take...|
|The FBI has arres...|
|Donald Trump anno...|
|Most Olympic rowe...|
|Name Pamela Ander...|
|From the moment i...|
|A resolution to e...|
|Better Things ope...|
|Efforts are brewi...|
|Halldora was born...|
|As Hillary Clinto...|
|Barack Obamas ann...|
+--------------------+
only showing top 20 rows



## Tokenización de los contenidos de las publicaciones

Creacion de un dataframe con el contenido de la publicacion tokenizado 

In [223]:
tokenized_df=tokenization.transform(df_2)


In [224]:
tokenization=Tokenizer(inputCol='clean1',outputCol='tokens')
# tokenization=Tokenizer(inputCol='_c9',outputCol='tokens')
#tokenized_df=tokenization.transform(df_2)

In [225]:
tokenized_df.select('tokens').show()

+--------------------+
|              tokens|
+--------------------+
|           [content]|
|[the, son, of, a,...|
|[copies, of, will...|
|[debt, source, co...|
|[it, was, late, i...|
|[a, central, texa...|
|[i, have, been, b...|
|[three, flatmates...|
|[, most, people, ...|
|[the, fbi, has, a...|
|[donald, trump, a...|
|[most, olympic, r...|
|[name, pamela, an...|
|[from, the, momen...|
|[a, resolution, t...|
|[better, things, ...|
|[efforts, are, br...|
|[halldora, was, b...|
|[as, hillary, cli...|
|[barack, obamas, ...|
+--------------------+
only showing top 20 rows



## Remoción de stopWords

Eliminacion de stopWord de los contenidos de las publicaciones token tales como "I, and .or" 

In [226]:
stopword_removal=StopWordsRemover(inputCol='tokens',outputCol='refined_tokens')

In [227]:
refined_df=stopword_removal.transform(tokenized_df)

In [228]:
refined_df.select(['refined_tokens']).show()

+--------------------+
|      refined_tokens|
+--------------------+
|           [content]|
|[son, louisiana, ...|
|[copies, william,...|
|[debt, source, co...|
|[late, drunk, nea...|
|[central, texas, ...|
|[battling, depres...|
|[three, flatmates...|
|[, people, take, ...|
|[fbi, arrested, n...|
|[donald, trump, a...|
|[olympic, rowers,...|
|[name, pamela, an...|
|[moment, may, don...|
|[resolution, end,...|
|[better, things, ...|
|[efforts, brewing...|
|[halldora, born, ...|
|[hillary, clinton...|
|[barack, obamas, ...|
+--------------------+
only showing top 20 rows



In [232]:
refined_data = refined_df.dropna()
refined_data.select('_c9','clean','clean1','refined_tokens').show()


+--------------------+--------------------+--------------------+--------------------+
|                 _c9|               clean|              clean1|      refined_tokens|
+--------------------+--------------------+--------------------+--------------------+
|The son of a Loui...|The son of a Loui...|The son of a Loui...|[son, louisiana, ...|
|Debt: $20, 000, S...|Debt   Source Col...|Debt Source Colle...|[debt, source, co...|
|It was late. I wa...|It was late I was...|It was late I was...|[late, drunk, nea...|
|I have been battl...|I have been battl...|I have been battl...|[battling, depres...|
|Three flatmates o...|Three flatmates o...|Three flatmates o...|[three, flatmates...|
|, Most people tak...| Most people take...| Most people take...|[, people, take, ...|
|The FBI has arres...|The FBI has arres...|The FBI has arres...|[fbi, arrested, n...|
|Donald Trump anno...|Donald Trump anno...|Donald Trump anno...|[donald, trump, a...|
|Most Olympic rowe...|Most Olympic rowe...|Most Olympi

## Count Vectorizer (AQUI ESTA FALLANDO)

Toma la cuenta de el numero de palabras que aparecen en un documento 

In [250]:
count_vec=CountVectorizer(inputCol='refined_tokens',
outputCol='features')


In [267]:
cv_df=count_vec.fit(refined_data).transform(tokens)


AttributeError: 'DataFrame' object has no attribute 'isNotNull'