# ETL
## Trabalho Final | Prática | Grupo G
Base de dados usada: https://www.kaggle.com/datasets/whenamancodes/popular-movies-datasets-58000-movies?select=tags.csv'

Slide: [Canva](https://www.canva.com/design/DAFYb14LbAA/J_Kk7ndEoZM1m3Tw_glTIA/edit?utm_content=DAFYb14LbAA&utm_campaign=designshare&utm_medium=link2&utm_source=sharebutton)

|Alunos|Matrícula|
|--|--|
|Victor Buendia Cruz De Alvim|19/0020601|
|Lucas Ursulino Boaventura|18/0114093|
|Yudi Yamane de Azevedo|16/0149410|

##  ⚙️ Setup ===========================

In [1]:
pip install opendatasets pandas pandasql findspark pyspark --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pandasql import sqldf
import opendatasets as od
import pandas
import os
import re

In [3]:
import findspark
findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)

23/01/29 09:43:36 WARN Utils: Your hostname, Victors-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.8 instead (on interface en0)
23/01/29 09:43:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/29 09:43:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
from pyspark.sql import functions as F

## ⬇️ Extract =============================

In [5]:
if not os.path.exists('./popular-movies-datasets-58000-movies/'):
    kaggleAPI = input('[TOKEN API] Insire seu Token API do Kaggle:')

    fp = open('kaggle.json', 'w')
    fp.write(kaggleAPI)
    fp.close()

    od.download(
        'https://www.kaggle.com/datasets/whenamancodes/popular-movies-datasets-58000-movies?select=tags.csv')

    os.remove('kaggle.json')

In [6]:
csvs = {}
for dirname, _, filenames in os.walk('./popular-movies-datasets-58000-movies/'):
    for filename in filenames:
        csvs[filename] = os.path.join(dirname, filename)
        print(os.path.join(dirname, filename))

./popular-movies-datasets-58000-movies/links.csv
./popular-movies-datasets-58000-movies/tags.csv
./popular-movies-datasets-58000-movies/genome-tags.csv
./popular-movies-datasets-58000-movies/ratings.csv
./popular-movies-datasets-58000-movies/genome-scores.csv
./popular-movies-datasets-58000-movies/movies.csv


In [7]:
dfs = {}

for filename in csvs:
    file=(csvs[filename])
    newData = spark.read.format("csv").option("header","true").load(file)
    dfs[re.sub('-', '_', re.findall("(.+).csv", filename)[0])] = newData
    newData.createOrReplaceTempView(re.sub('-', '_', re.findall("(.+).csv", filename)[0]))

                                                                                

In [8]:
for df in dfs:
    print(df,'\n |-->', dfs[df], '\n')

links 
 |--> DataFrame[movieId: string, imdbId: string, tmdbId: string] 

tags 
 |--> DataFrame[userId: string, movieId: string, tag: string, timestamp: string] 

genome_tags 
 |--> DataFrame[tagId: string, tag: string] 

ratings 
 |--> DataFrame[userId: string, movieId: string, rating: string, timestamp: string] 

genome_scores 
 |--> DataFrame[movieId: string, tagId: string, relevance: string] 

movies 
 |--> DataFrame[movieId: string, title: string, genres: string] 



# ✨ Transform =========================

In [9]:
def sqlR(sql):
    return (spark.sql(sql).toPandas())

def tView(sql, name):
    spark.sql(sql).createOrReplaceTempView(name)

## Movies

In [10]:
spark.table('movies')\
.withColumn('genres', F.split(spark.table('movies')['genres'], '\|'))\
.withColumn('publish_year', F.regexp_extract(F.col('title'), '\((\d+)\)', 1))\
.withColumn('title', F.regexp_extract(F.col('title'), '(^.+)\(', 1))\
.createOrReplaceTempView('movies')

sql = """
    SELECT * FROM movies
"""

sqlR(sql)


                                                                                

Unnamed: 0,movieId,title,genres,publish_year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995
...,...,...,...,...
58093,193876,The Great Glinka,[(no genres listed)],1946
58094,193878,Les tribulations d'une caissière,[Comedy],2011
58095,193880,Her Name Was Mumu,[Drama],2016
58096,193882,Flora,"[Adventure, Drama, Horror, Sci-Fi]",2017


In [11]:
sql = """

WITH M AS (
    SELECT
        M.movieId
        , M.title
        , M.genres
        , M.publish_year
        , SUM(R.rating)/COUNT(R.rating) AS average_rating
        , COUNT(DISTINCT R.userId) AS user_review_amount
        , COUNT(R.rating) AS review_amount
    FROM
        movies AS M
    LEFT JOIN
        ratings AS R
        ON TRUE
            AND M.movieId = R.movieId
    GROUP BY
        1,2,3,4
)

SELECT * FROM M


"""

tView(sql, 'movie_reviews')
sqlR(sql)

                                                                                

Unnamed: 0,movieId,title,genres,publish_year,average_rating,user_review_amount,review_amount
0,100010,Battle of Los Angeles,"[Action, Sci-Fi]",2011,2.478261,46,46
1,100060,Sunny (Sseo-ni),[Drama],2011,3.645161,31,31
2,100062,My Way (Mai Wei),"[Action, Drama, War]",2011,3.621622,37,37
3,100068,Comme un chef,[Comedy],2012,3.568627,102,102
4,100070,Punching the Clown,[Comedy],2009,3.538462,13,13
...,...,...,...,...,...,...,...
58093,990,Maximum Risk,"[Action, Adventure, Thriller]",1996,2.731707,984,984
58094,99043,Trishna,[Drama],2011,3.153846,13,13
58095,99566,"True Meaning of Christmas Specials, The",[Comedy],2002,3.500000,1,1
58096,99600,"Man Who Haunted Himself, The","[Drama, Fantasy, Horror, Mystery, Thriller]",1970,3.300000,5,5


In [12]:
sql = """

WITH M AS (
    SELECT
        M.movieId
        , M.title
        , M.genres
        , M.publish_year
        , COUNT(DISTINCT T.userId) AS user_tag_amount
        , COUNT(T.tag) AS tag_amount
        , ARRAY_AGG(DISTINCT T.tag) AS tags
    FROM
        movies AS M
    LEFT JOIN
        tags AS T
        ON TRUE
            AND T.movieId = M.movieId
    GROUP BY
        1,2,3,4
)

SELECT * FROM M


"""

tView(sql, 'movie_tags')
sqlR(sql)

                                                                                

Unnamed: 0,movieId,title,genres,publish_year,user_tag_amount,tag_amount,tags
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,252,782,"[ss, Tumey's VHS, funny, bullying, avi, animat..."
1,10,GoldenEye,"[Action, Adventure, Thriller]",1995,64,152,"[boys with toys, secret base, btaege, I wanted..."
2,100,City Hall,"[Drama, Thriller]",1996,9,20,"[war on drugs, mayor, drugs, Al Pacino, own, i..."
3,1000,Curdled,[Crime],1996,2,10,"[suspense, cleaning lady, bakery, samurai swor..."
4,100001,"Comic, The","[Comedy, Drama]",1969,1,1,[silent film star]
...,...,...,...,...,...,...,...
58093,99989,Bonsái,[Drama],2011,2,2,"[Cristián Jiménez, nudity (topless)]"
58094,99992,Shadow Dancer,"[Crime, Drama, Thriller]",2012,4,7,"[prospect preferred, James Marsh, IRA, spy, Cl..."
58095,99994,Thale,"[Action, Drama, Horror, Mystery]",2012,4,13,"[Not Horror, Not Science Fiction, Extended Nud..."
58096,99996,It's a Disaster,"[Comedy, Drama]",2012,9,25,"[dark comedy, profanity, smart, group of frien..."


In [13]:
sql = """

SELECT
    M.movieId
    , M.title
    , M.genres
    , M.publish_year
    , MT.user_tag_amount
    , MT.tag_amount
    , MT.tags
    , MR.average_rating
    , MR.user_review_amount
    , MR.review_amount
FROM
    movies AS M
LEFT JOIN
    movie_reviews AS MR
    ON TRUE
        AND M.movieId = MR.movieId
LEFT JOIN
    movie_tags AS MT
    ON TRUE
        AND M.movieId = MT.movieId
ORDER BY
    M.title


"""



sqlR(sql)

                                                                                

Unnamed: 0,movieId,title,genres,publish_year,user_tag_amount,tag_amount,tags,average_rating,user_review_amount,review_amount
0,123619,,[(no genres listed)],,0,0,[],2.000000,1,1
1,125632,,[(no genres listed)],,0,0,[],4.500000,1,1
2,125958,,[(no genres listed)],,2,8,"[documentary, travel, Stephen Fry, road trip, ...",3.444444,18,18
3,128734,,"[Comedy, Musical]",,1,17,"[Do zassania, punk, Marian Dziedziel, Robert B...",3.250000,2,2
4,133276,,[Action],,4,19,"[revenge, maori, Warriors, Fight Scenes, Canni...",3.250000,16,16
...,...,...,...,...,...,...,...,...,...,...
58093,159678,…And the Fifth Horseman Is Fear,"[Drama, War]",1965,0,0,[],3.250000,2,2
58094,132604,キサラギ,"[Comedy, Mystery]",2007,1,2,"[comedy, small room]",3.666667,3,3
58095,132319,チェブラーシカ,"[Animation, Children]",2010,1,4,"[Russian, USSR, puppet, talking animals]",3.150000,10,10
58096,130640,貞子3D,[Horror],2012,2,7,"[online, ringu, vengeful ghost, japan, sadako ...",2.136364,11,11
