In [1]:
# Inicializando as bibliotecas 

from pyspark.sql.types import StructType, IntegerType, DateType, StringType,StructField, FloatType
from pyspark.sql.functions import col, isnan, when, count
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

Uma SparkSession pode ser usada para criar DataFrames, registrar DataFrames como tabelas, executar SQL em tabelas, armazenar tabelas em cache e ler arquivos Parquet. Para criar uma sessão Spark, siga o padrão de construção abaixo:

In [2]:
# Build (or reuse, if one already exists) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName('recnn')
    .getOrCreate()
)

In [3]:
# Schema applied when reading the CSV: three integer columns plus the item title.
# Fields added this way are nullable by default, same as a bare StructField.
schema = (
    StructType()
    .add('user_id', IntegerType())
    .add('item_id', IntegerType())
    .add('rating', IntegerType())
    .add('item', StringType())
)

In [4]:
# Load the dataset (Rafael's variant: path relative to the working directory).

data = (
    spark.read
    .format('csv')
    .option("header", True)
    .schema(schema)
    .load('recomendacao.csv')
)

In [5]:
# Load the dataset (Evancleide's variant). Kept commented out: it points to an
# absolute path on one specific machine and is not portable.

# data = spark.read \
#         .schema(schema)\
#         .format("csv")\
#         .option("header",True)\
#         .load("C:/Users/evanc/Documents/00_Pos_Big Data e Ciencia de Dados/014_Solucoes Proc Paralelo e Distribuido/Trabalho_Spark/recomendacao.csv")

In [6]:
# Check the Python type of the object returned by the reader.
type(data)

pyspark.sql.dataframe.DataFrame

In [7]:
# Print summary statistics (count, mean, stddev, min, max) for each column.
data.describe().show()

+-------+------------------+-----------------+------------------+--------------------+
|summary|           user_id|          item_id|            rating|                item|
+-------+------------------+-----------------+------------------+--------------------+
|  count|             99999|            99999|             99999|               99547|
|   mean| 462.4874148741487|425.5319653196532|3.5298652986529864|                null|
| stddev|266.61442141060337|330.7995012117287| 1.125677980540639|                null|
|    min|                 1|                1|                 1|'Til There Was Yo...|
|    max|               943|             1682|                 5|Á köldum klaka (C...|
+-------+------------------+-----------------+------------------+--------------------+



In [8]:
# Count, per column, how many rows hold a NaN or null value.

data.select(
    *[
        count(
            when(col(column_name).isNull() | isnan(column_name), column_name)
        ).alias(column_name)
        for column_name in data.columns
    ]
).show()

# Total number of rows, for comparison against the per-column counts above.
data.groupBy().count().show()

+-------+-------+------+----+
|user_id|item_id|rating|item|
+-------+-------+------+----+
|      0|      0|     0| 452|
+-------+-------+------+----+

+-----+
|count|
+-----+
|99999|
+-----+



In [9]:
# Preview the first 5 rows, then print summary statistics for the data as loaded.
data.show(5)
data.summary().show()

+-------+-------+------+--------------------+
|user_id|item_id|rating|                item|
+-------+-------+------+--------------------+
|    186|    302|     3|L.A. Confidential...|
|     22|    377|     1| Heavyweights (1994)|
|    244|     51|     2|Legends of the Fa...|
|    166|    346|     1| Jackie Brown (1997)|
|    298|    474|     4|Dr. Strangelove o...|
+-------+-------+------+--------------------+
only showing top 5 rows

+-------+------------------+-----------------+------------------+--------------------+
|summary|           user_id|          item_id|            rating|                item|
+-------+------------------+-----------------+------------------+--------------------+
|  count|             99999|            99999|             99999|               99547|
|   mean| 462.4874148741487|425.5319653196532|3.5298652986529864|                null|
| stddev|266.61442141060337|330.7995012117287| 1.125677980540639|                null|
|    min|                 1|           

In [10]:
# Without .show(), the cell displays only the repr of the summary DataFrame
# (its column names and types), not the statistics themselves.
data.describe()

DataFrame[summary: string, user_id: string, item_id: string, rating: string, item: string]

3) Remover os dados nulos de cada coluna.

In [11]:
# Keep only the rows where user_id is not null.
data = data.filter(data["user_id"].isNotNull())

In [12]:
# Keep only the rows where item_id is not null.
data = data.filter(data["item_id"].isNotNull())

In [13]:
# Keep only the rows where rating is not null.
data = data.filter(data["rating"].isNotNull())

In [14]:
# Keep only the rows where item (the title) is not null.
data = data.filter(data["item"].isNotNull())

Dados após limpeza

In [15]:
# Preview 5 rows and print summary statistics again, now after the null filtering.
data.show(5)
data.summary().show()

+-------+-------+------+--------------------+
|user_id|item_id|rating|                item|
+-------+-------+------+--------------------+
|    186|    302|     3|L.A. Confidential...|
|     22|    377|     1| Heavyweights (1994)|
|    244|     51|     2|Legends of the Fa...|
|    166|    346|     1| Jackie Brown (1997)|
|    298|    474|     4|Dr. Strangelove o...|
+-------+-------+------+--------------------+
only showing top 5 rows

+-------+-----------------+------------------+------------------+--------------------+
|summary|          user_id|           item_id|            rating|                item|
+-------+-----------------+------------------+------------------+--------------------+
|  count|            99547|             99547|             99547|               99547|
|   mean|462.4214692557284| 427.4595819060343|3.5282831225451297|                null|
| stddev| 266.581647089376|330.30761865180295|1.1262548341580603|                null|
|    min|                1|            

In [16]:
# Register the DataFrame as the temp view 'viewdata' so it can be queried with SQL.
data.createOrReplaceTempView('viewdata')

In [17]:
# Show the first 5 rows of the view that was just created.
spark.sql('select * from viewdata').show(5)

+-------+-------+------+--------------------+
|user_id|item_id|rating|                item|
+-------+-------+------+--------------------+
|    186|    302|     3|L.A. Confidential...|
|     22|    377|     1| Heavyweights (1994)|
|    244|     51|     2|Legends of the Fa...|
|    166|    346|     1| Jackie Brown (1997)|
|    298|    474|     4|Dr. Strangelove o...|
+-------+-------+------+--------------------+
only showing top 5 rows



Retorno das perguntas

In [18]:
# Count how many rows (views/ratings) each movie has, in descending order; show the top 10.

spark.sql('select item_id , item , count(item) as qtd_assistido \
          from viewdata \
          group by item_id, item \
          order by qtd_assistido desc').show(10)


+-------+--------------------+-------------+
|item_id|                item|qtd_assistido|
+-------+--------------------+-------------+
|     50|    Star Wars (1977)|          583|
|    258|      Contact (1997)|          509|
|    100|        Fargo (1996)|          508|
|    181|Return of the Jed...|          507|
|    294|    Liar Liar (1997)|          485|
|    286|English Patient, ...|          481|
|    288|       Scream (1996)|          478|
|    300|Air Force One (1997)|          431|
|    121|Independence Day ...|          429|
|    174|Raiders of the Lo...|          420|
+-------+--------------------+-------------+
only showing top 10 rows



4. 1) Qual o filme mais assistido?

In [19]:
# Most-watched movie via SQL.
# Inner derived table: row count per (item_id, item).
# Outer filter: keep only the row(s) whose count equals the maximum per-item_id count,
# so ties for first place would all be listed.
# Note: both derived tables are aliased `viewdata`, the same name as the temp view.
spark.sql('select * \
            from (select item as Filme_Mais_Assistido, item_id as ID, count(item_id) as qtd_assistido \
                  from viewdata \
                  group by ID, Filme_Mais_Assistido) viewdata \
            where qtd_assistido== \
                (select max(contador) \
                 from (select count(item_id) as contador\
                       from viewdata \
                       group by item_id ) viewdata) ').show(truncate=False)

+--------------------+---+-------------+
|Filme_Mais_Assistido|ID |qtd_assistido|
+--------------------+---+-------------+
|Star Wars (1977)    |50 |583          |
+--------------------+---+-------------+



In [20]:
# Which movie was watched the most? (DataFrame API version)

# Row count per (title, id) pair, highest count first.
cont_filme = (
    data.groupBy("item", "item_id")
    .count()
    .orderBy(col('count').desc())
)
cont_filme.limit(1).toPandas()

Unnamed: 0,item,item_id,count
0,Star Wars (1977),50,583


In [21]:
# Count the number of votes (rows) per user, in descending order; show the 10 users
# who rated the most movies.
# Grouping key: user_id

spark.sql('select user_id as ID_Usuario , count(user_id) as qtd_votos \
          from viewdata \
          group by ID_Usuario \
          order by qtd_votos desc').show(10)

+----------+---------+
|ID_Usuario|qtd_votos|
+----------+---------+
|       405|      737|
|       655|      684|
|        13|      635|
|       450|      539|
|       276|      517|
|       416|      492|
|       537|      489|
|       303|      483|
|       234|      479|
|       393|      447|
+----------+---------+
only showing top 10 rows



In [22]:
# Number of votes per user. (DataFrame API version)

# Row count per user, highest count first.
cont_user = (
    data.groupBy("user_id")
    .count()
    .orderBy(col('count').desc())
)
cont_user.limit(10).toPandas().set_index('user_id')

Unnamed: 0_level_0,count
user_id,Unnamed: 1_level_1
405,737
655,684
13,635
450,539
276,517
416,492
537,489
303,483
234,479
393,447


4. 2) Qual o usuário que mais pontuou os filmes?

In [23]:
# User who rated the most movies, via SQL.
# Inner derived table: row count per user_id.
# Outer filter: keep only the user(s) whose count equals the maximum per-user count.
# Note: both derived tables are aliased `viewdata`, the same name as the temp view.
spark.sql('select * \
            from (select user_id as ID_Usuario_Mais_Pontuou, count(user_id) as Total_Voto \
                  from viewdata \
                  group by ID_Usuario_Mais_Pontuou) viewdata \
            where Total_Voto== \
                (select max(contador) \
                 from (select count(user_id) as contador\
                       from viewdata \
                       group by user_id ) viewdata) ').show(truncate=False)

+-----------------------+----------+
|ID_Usuario_Mais_Pontuou|Total_Voto|
+-----------------------+----------+
|405                    |737       |
+-----------------------+----------+



4. 3) Quantos usuários deram nota(rating) = 5?

In [24]:
# One row per (user_id, rating) pair; despite its name, this frame holds every rating value.
rating_5 = data.groupBy('user_id', 'rating').count()

# Each user appears at most once per rating value, so counting the rating == 5 rows
# gives the number of distinct users who gave at least one 5.
total = rating_5.filter(col('rating') == 5).count()
print('O total de', total, 'usuarios deram notas(rating)=5.')

O total de 927 usuarios deram notas(rating)=5.


4. 4) Agrupe o Dataset por usuário e some todas as suas notas. Qual usuário possui a maior soma dos ratings?

In [25]:
# Sum of ratings per user, largest sum first.
user_max = (
    data.groupBy('user_id')
    .sum('rating')
    .orderBy(col('sum(rating)').desc())
)

# Show the top user as a small pandas table with a friendlier column label.
(
    user_max.limit(1)
    .toPandas()
    .set_index('user_id')
    .rename(columns={'sum(rating)': 'Total rating'})
)

Unnamed: 0_level_0,Total rating
user_id,Unnamed: 1_level_1
450,2083


5) Gerar Dataset tratado para o algoritmo de recomendação no formato csv com o schema user_id, item_id, rating.

In [26]:
# There are several ways to export; this notebook uses one of them: collect the
# three required columns to the driver with pandas and write a single CSV file.

# Alternative using the Spark writer (kept for reference):
# data[['user_id', 'item_id', 'rating']].write.csv('dataset.csv')

# index=False keeps the pandas row index out of the file. Without it, to_csv adds an
# extra unnamed first column and the output no longer matches the required schema
# (user_id, item_id, rating).
data[['user_id', 'item_id', 'rating']].toPandas().to_csv('dataset.csv', index=False)

In [27]:
# Total sum of ratings for each movie, sorted in descending order (kept commented out).
# data.select(data.rating, data.item, data.item_id)\
# .groupBy('item_id', 'item').sum('rating').withColumnRenamed("sum(rating)", "total_avaliacao")\
# .orderBy(col("total_avaliacao").desc()).show(truncate=False)