In [1]:
from pyspark.sql import SparkSession
import sys
import os

In [2]:
# Set both PySpark interpreter variables to the Python executable that is
# running this kernel, in a single update of the process environment.
os.environ.update({
    "PYSPARK_PYTHON": sys.executable,
    "PYSPARK_DRIVER_PYTHON": sys.executable,
})

In [3]:
# Intentionally left commented out: `spark` is not assigned until the
# session-building cell further down, so enabling this line on a fresh kernel
# would raise NameError. Presumably meant to be uncommented by hand to stop an
# already-running session before re-creating it -- TODO confirm or delete.
# spark.stop()

In [4]:
# Connection endpoints consumed by the SparkSession builder cell.
# The warehouse directory shares the address of the default filesystem, so it
# is derived from `defaultFS` instead of repeating the host and port.
defaultFS = "hdfs://m1.local.br:9000"
hive_metastore_uri = "thrift://m1.local.br:9083"
warehouse_location = defaultFS + "/user/hive/warehouse"

# 1 - Conectando ao Hive com PySpark

In [5]:
# Create the Hive-enabled Spark session (getOrCreate() hands back an already
# active session when there is one), configured with the warehouse directory,
# default filesystem and metastore URI defined in the configuration cell.
spark = (
    SparkSession.builder
    .appName("Exploratory Data Analysis - MovieLens dataset from Hive")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("spark.hadoop.fs.defaultFS", defaultFS)
    .config("hive.metastore.uris", hive_metastore_uri)
    .enableHiveSupport()
    .getOrCreate()
)

# Bare expression: render the session summary as the cell output.
spark

In [6]:
# Invalidate and refresh anything Spark has cached for data sources located
# under the warehouse path.
spark.catalog.refreshByPath(path=warehouse_location)

In [7]:
# Echo the connection settings as held by the session's runtime configuration,
# one "<label> <value>" line per setting.
for label, conf_key in (
    ("URI do Metastore do Hive:", "hive.metastore.uris"),
    ("Endereço do Hadoop (HDFS):", "spark.hadoop.fs.defaultFS"),
):
    print(label, spark.conf.get(conf_key))

URI do Metastore do Hive: thrift://m1.local.br:9083
Endereço do Hadoop (HDFS): hdfs://m1.local.br:9000


## 1.1 - Testando conectividade

In [8]:
# List the databases (namespaces) visible to this Spark session.
databases = spark.sql(sqlQuery="show databases")
databases.show()

+----------------+
|       namespace|
+----------------+
|         default|
|     flight_data|
|hpc_treinamentos|
|      myhivebook|
+----------------+



In [9]:
# Show the user name reported by the session.
# The result gets its own descriptive name rather than `tables`, which a later
# cell uses for the actual table listing; reusing one name for two unrelated
# results makes the notebook state harder to follow.
current_user_df = spark.sql("SELECT current_user()")
current_user_df.show()

+--------------+
|current_user()|
+--------------+
| Vinicius Luiz|
+--------------+



In [10]:
# Switch the session's current database to `default`, so that unqualified
# table names in the following queries resolve against it.
spark.sql(sqlQuery="USE default")

DataFrame[]

In [11]:
# List the tables of the current database (switched to `default` just before).
tables = spark.sql(sqlQuery="SHOW tables")
tables.show()

+---------+-----------------+-----------+
|namespace|        tableName|isTemporary|
+---------+-----------------+-----------+
|  default|    genome_scores|      false|
|  default|genome_scores_tmp|      false|
|  default|      genome_tags|      false|
|  default|  genome_tags_tmp|      false|
|  default|            links|      false|
|  default|        links_tmp|      false|
|  default|           movies|      false|
|  default|       movies_tmp|      false|
|  default|          ratings|      false|
|  default|      ratings_tmp|      false|
|  default|             tags|      false|
|  default|         tags_tmp|      false|
+---------+-----------------+-----------+



In [12]:
# Load the whole `movies` table and preview its first 10 rows without
# truncating long cell values.
df_movies = spark.sql(sqlQuery="SELECT * FROM movies")
df_movies.show(n=10, truncate=False)

+-------+----------------------------------+-------------------------------------------------+
|movieid|title                             |genres                                           |
+-------+----------------------------------+-------------------------------------------------+
|1      |Toy Story (1995)                  |[Adventure, Animation, Children, Comedy, Fantasy]|
|2      |Jumanji (1995)                    |[Adventure, Children, Fantasy]                   |
|3      |Grumpier Old Men (1995)           |[Comedy, Romance]                                |
|4      |Waiting to Exhale (1995)          |[Comedy, Drama, Romance]                         |
|5      |Father of the Bride Part II (1995)|[Comedy]                                         |
|6      |Heat (1995)                       |[Action, Crime, Thriller]                        |
|7      |Sabrina (1995)                    |[Comedy, Romance]                                |
|8      |Tom and Huck (1995)               |[Adven