# ETL
## Trabalho Final | Prática | Grupo G
Base de dados usada: https://www.kaggle.com/datasets/whenamancodes/popular-movies-datasets-58000-movies?select=tags.csv

Slide: [Canva](https://www.canva.com/design/DAFYb14LbAA/J_Kk7ndEoZM1m3Tw_glTIA/edit?utm_content=DAFYb14LbAA&utm_campaign=designshare&utm_medium=link2&utm_source=sharebutton)

|Alunos|Matrícula|
|--|--|
|Victor Buendia Cruz De Alvim|19/0020601|
|Lucas Ursulino Boaventura|18/0114093|
|Yudi Yamane de Azevedo|16/0149410|

##  ⚙️ Setup ===========================

In [1]:
pip install opendatasets pandas pandasql findspark pyspark --quiet

In [2]:
from pandasql import sqldf
import opendatasets as od
import pandas
import os
import re

In [3]:
# findspark.init() is called before the pyspark imports below, keeping the
# original statement order.
import findspark
findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

# getOrCreate() reuses an already-running session, so re-executing this cell
# does not fail with "Cannot run multiple SparkContexts at once" the way a
# second SparkContext('local') call would.
spark = SparkSession.builder.master('local').getOrCreate()
# Keep `sc` available for any code that uses the SparkContext directly.
sc = spark.sparkContext

In [4]:
from pyspark.sql import functions as F

## ⬇️ Extract =============================

In [5]:
# Download the Kaggle dataset only when its folder is not already present.
if not os.path.exists('./popular-movies-datasets-58000-movies/'):
    kaggleAPI = input('[TOKEN API] Insire seu Token API do Kaggle:')

    try:
        # The token is written to kaggle.json in the working directory right
        # before od.download is called and deleted afterwards.
        # The context manager guarantees the handle is flushed and closed
        # before the download starts.
        with open('kaggle.json', 'w') as fp:
            fp.write(kaggleAPI)

        od.download(
            'https://www.kaggle.com/datasets/whenamancodes/popular-movies-datasets-58000-movies?select=tags.csv')
    finally:
        # Always delete the credentials file, even if the download fails,
        # so the API token is never left behind on disk.
        if os.path.exists('kaggle.json'):
            os.remove('kaggle.json')

In [6]:
# Map every file name found under the dataset folder to its full path,
# printing each path as it is discovered.
csvs = {}
for root, _subdirs, names in os.walk('./popular-movies-datasets-58000-movies/'):
    for name in names:
        full_path = os.path.join(root, name)
        csvs[name] = full_path
        print(full_path)

In [7]:
# Load every CSV file into a Spark DataFrame, keep it in `dfs`, and register
# it as a temporary SQL view under the same name.
dfs = {}

for filename in csvs:
    stem, extension = os.path.splitext(filename)
    if extension.lower() != '.csv':
        # Anything that is not a CSV cannot be loaded here; skip it instead
        # of failing while deriving the table name.
        continue

    # Hyphens are replaced with underscores so the name can be used as a
    # plain identifier in SQL queries (e.g. "genome-scores" -> "genome_scores").
    tableName = stem.replace('-', '_')

    file = csvs[filename]
    # Only the header option is set, so no schema inference is requested.
    newData = spark.read.format("csv").option("header", "true").load(file)
    dfs[tableName] = newData
    newData.createOrReplaceTempView(tableName)

In [8]:
# Show each registered table name next to its DataFrame summary.
for table_name, frame in dfs.items():
    print(table_name, '\n |-->', frame, '\n')

# ✨ Transform =========================

In [9]:
def sqlR(sql):
    """Run a SQL statement on the Spark session and return a pandas DataFrame.

    Args:
        sql: The SQL text passed to ``spark.sql``.

    Returns:
        The query result converted with ``toPandas()``.
    """
    result = spark.sql(sql)
    return result.toPandas()

## Movies

In [10]:
# Replace the `movies` view with a version whose `genres` column is an array
# produced by splitting the original string on the pipe character.
# F.split takes a regular expression, so the pipe has to be escaped; the raw
# string keeps the backslash without triggering an invalid-escape warning.
# NOTE(review): this overwrites the view it reads from, so re-running the cell
# would apply split() to the already-split column — confirm before re-running.
(
    spark.table('movies')
    .withColumn('genres', F.split(F.col('genres'), r'\|'))
    .createOrReplaceTempView('movies')
)

In [11]:
# Per-movie rating summary: joins the `movies` view with the `ratings` view on
# movieId and, for each (movieId, title, genres) group, computes:
#   - average_rating:      SUM(rating) / COUNT(rating)
#   - user_review_amount:  number of distinct userId values
#   - review_amount:       number of non-null ratings
# `GROUP BY 1,2,3` refers to the first three SELECT columns by position.
# Only movies with at least one matching row in `ratings` appear in the
# result, because a plain JOIN is used.
# NOTE(review): assumes a `ratings` view was registered by the CSV loading
# step — confirm the dataset ships a ratings.csv file.
sql = """

WITH M AS (
    SELECT
        M.movieId
        , M.title
        , M.genres
        , SUM(R.rating)/COUNT(R.rating) AS average_rating
        , COUNT(DISTINCT R.userId) AS user_review_amount
        , COUNT(R.rating) AS review_amount
    FROM
        movies AS M
    JOIN
        ratings AS R
        ON TRUE
            AND M.movieId = R.movieId
    GROUP BY
        1,2,3
)

SELECT * FROM M


"""


# Run the query through the sqlR helper, which returns a pandas DataFrame.
sqlR(sql)