In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('IMDb - TPM2 IGTI DeCloud')
    .getOrCreate()
)

23/04/07 01:09:32 WARN Utils: Your hostname, wedivv-H110M-S2V resolves to a loopback address: 127.0.1.1; using 192.168.1.44 instead (on interface wlp5s0)
23/04/07 01:09:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/07 01:09:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the IMDb title basics dump: tab-separated with a header row.
# No schema is supplied or inferred, so every column is read as a string.
title_basics = spark.read.csv(
    path='data/title_basics.tsv',
    sep='\t',
    header=True,
)

In [4]:
# Preview the first five rows of the raw title data.
title_basics.show(n=5)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|        Comedy

In [5]:
from pyspark.sql.functions import split

# Turn the comma-separated "genres" string into an array column so that it
# can be exploded into one row per genre in the next step.
title_basics = title_basics.withColumn(
    colName="genres_split",
    col=split("genres", ","),
)

In [6]:
from pyspark.sql.functions import explode

# Emit one row per element of "genres_split" (exposed as "genre"), keeping all
# existing columns, then discard the helper array column.
title_basics = (
    title_basics
    .select("*", explode(title_basics["genres_split"]).alias("genre"))
    .drop("genres_split")
)


In [7]:
# Preview the exploded data: each title now appears once per genre.
title_basics.show(n=5)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+-----------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|      genre|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+-----------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|Documentary|
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|      Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|  Animation|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|      Short|
|tt0000003|    short|      Pauvre 

In [8]:
# Row count after the genre explode: one row per (title, genre) pair,
# not one row per title.
title_basics.count()

                                                                                

12998329

In [9]:
# Load the IMDb ratings dump (tab-separated, header row) and preview it.
title_ratings = spark.read.csv(
    path='data/title_ratings.tsv',
    sep='\t',
    header=True,
)
title_ratings.show(n=5)

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    1809|
|tt0000002|          6.0|     233|
|tt0000003|          6.5|    1560|
|tt0000004|          6.1|     152|
|tt0000005|          6.2|    2383|
+---------+-------------+--------+
only showing top 5 rows



In [10]:
# Inspect the column types of the ratings data as loaded from the TSV.
title_ratings.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- averageRating: string (nullable = true)
 |-- numVotes: string (nullable = true)



In [11]:
# Attach ratings to every exploded title row. The left outer join keeps titles
# that have no rating; joining on the column name leaves a single "tconst".
title = title_basics.join(title_ratings, on='tconst', how='left_outer')

In [12]:
# Inspect the schema of the joined titles + ratings DataFrame.
title.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- genre: string (nullable = false)
 |-- averageRating: string (nullable = true)
 |-- numVotes: string (nullable = true)



In [13]:
# Number of rows per genre, most frequent first. `title` holds one row per
# (title, genre) pair, so this is the number of titles tagged with each genre.
(
    title
    .groupBy('genre')
    .count()
    .orderBy('count', ascending=False)
    .show()
)



+-----------+-------+
|      genre|  count|
+-----------+-------+
|      Drama|2247995|
|     Comedy|1653725|
|      Short|1021850|
|  Talk-Show| 900198|
|Documentary| 764885|
|    Romance| 724729|
|         \N| 643012|
|     Family| 571470|
|       News| 524662|
| Reality-TV| 423455|
|  Animation| 406284|
|      Music| 394008|
|      Crime| 351447|
|     Action| 334580|
|  Adventure| 324325|
|  Game-Show| 252533|
|      Adult| 242704|
|      Sport| 178594|
|    Fantasy| 174119|
|    Mystery| 162448|
+-----------+-------+
only showing top 20 rows



                                                                                

In [23]:
from pyspark.sql.functions import countDistinct

# Count distinct title ids whose startYear is 2015. `title` has one row per
# genre of a title, hence the distinct count on tconst rather than a row count.
# NOTE(review): no titleType filter is applied, so every kind of title is
# counted, not only movies -- confirm that this is intended.
movies_2015 = (
    title
    .where(title.startYear == '2015')
    .agg(countDistinct('tconst').alias('filmes lançados em 2015'))
)
movies_2015.show()



+-----------------------+
|filmes lançados em 2015|
+-----------------------+
|                 358054|
+-----------------------+



                                                                                

In [15]:
from pyspark.sql.functions import avg, desc

# Mean rating per genre.
# - Rows whose genre is the IMDb missing-value marker "\N" are excluded, so the
#   placeholder is not ranked as if it were a real genre.
# - averageRating is loaded as a string; it is cast to double explicitly
#   instead of relying on Spark's implicit coercion inside avg().
# - Titles without a rating (kept by the left outer join) have a null
#   averageRating, which avg() ignores.
grouped = (
    title
    .filter(title['genre'] != r'\N')
    .groupBy('genre')
    .agg(avg(title['averageRating'].cast('double')).alias('avg_rating'))
)

# Sort by the mean rating in descending order.
sorted_grouped = grouped.sort(desc('avg_rating'))

sorted_grouped.show()


[Stage 20:>                                                         (0 + 4) / 4]

23/04/07 01:10:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/07 01:10:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/07 01:10:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/07 01:10:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/07 01:10:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/07 01:10:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/07 01:10:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/07 01:10:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
+-----------+-----------------+
|      genre|       avg_rating|


                                                                                