<a href="https://colab.research.google.com/github/PiotrMaciejKowalski/BigData2022-actors/blob/Poczenie-baz-danych-nagrd-z-main-data/colabs/Wczytanie_danych.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Setup sparka

# Załadowanie nowych baz

##Setup sparka

In [None]:
!pip install pyspark py4j
!pip install -q findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget https://dlcdn.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop2.tgz
!tar xf spark-3.3.1-bin-hadoop2.tgz

In [None]:
import pyspark
import findspark
from pyspark.sql import SparkSession
import os
import pandas as pd
from pyspark.sql.functions import split, explode, collect_list, first, array_distinct, min, max
from google.colab import drive

In [None]:
# Point the environment at the JDK and the Spark distribution set up by the
# installation cell.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.3.1-bin-hadoop2",
})

# Reuse an existing session if there is one, otherwise start one named 'Colab'.
session_builder = SparkSession.builder.appName('Colab')
spark = session_builder.getOrCreate()
spark

#Pobranie danych

##Import danych

In [None]:
!wget https://datasets.imdbws.com/name.basics.tsv.gz
!wget https://datasets.imdbws.com/title.akas.tsv.gz
!wget https://datasets.imdbws.com/title.basics.tsv.gz
!wget https://datasets.imdbws.com/title.crew.tsv.gz
!wget https://datasets.imdbws.com/title.episode.tsv.gz
!wget https://datasets.imdbws.com/title.principals.tsv.gz
!wget https://datasets.imdbws.com/title.ratings.tsv.gz

##Rozpakowanie danych

In [None]:
!gzip -dc /content/name.basics.tsv.gz > name.basics.csv
!gzip -dc /content/title.akas.tsv.gz > title.akas.csv
!gzip -dc /content/title.basics.tsv.gz > title.basic.csv
!gzip -dc /content/title.crew.tsv.gz > title.crew.csv
!gzip -dc /content/title.episode.tsv.gz > title.episode.csv
!gzip -dc /content/title.principals.tsv.gz > title.principals.csv
!gzip -dc /content/title.ratings.tsv.gz > title.ratings.csv

#Wczytanie danych

##Wczytajmy dane z rozpakowanych plików

In [None]:
def read_tsv(path):
    """Read a tab-separated file with a header row into a Spark DataFrame."""
    return spark.read.options(header="true", delimiter="\t").csv(path)


df_name_basics = read_tsv('name.basics.csv')
df_title_akas = read_tsv('title.akas.csv')
df_title_basic = read_tsv('title.basic.csv')
df_title_crew = read_tsv('title.crew.csv')
df_title_episode = read_tsv('title.episode.csv')
df_title_principals = read_tsv('title.principals.csv')
df_title_ratings = read_tsv('title.ratings.csv')

##Wyświetlmy dane

In [None]:
# Print the first three rows of every loaded table.
loaded_tables = (
    df_name_basics,
    df_title_akas,
    df_title_basic,
    df_title_crew,
    df_title_episode,
    df_title_principals,
    df_title_ratings,
)
for table in loaded_tables:
    table.show(3)

# Złączenie istotnych kolumn

## Wybierzmy z tabel tylko istotne rekordy:
- df_name_basics: wybierzmy tylko rekordy z primaryProfession zawierającym "actor" lub "actress",
- df_title_principals: wybierzmy rekordy z category zawierającym "actor" lub "actress".

In [None]:
# People whose primaryProfession mentions acting.
acting_profession = "primaryProfession like '%actor%' or primaryProfession like '%actress%'"
df_name_basics_selected = df_name_basics.filter(acting_profession)

# Credits whose category is exactly "actor" or "actress".
is_acting_credit = df_title_principals.category.isin("actor", "actress")
df_title_principals_selected = df_title_principals.filter(is_acting_credit)

## Wybierzmy następujące kolumny z tabel:
- df_title_basic:
  - tconst (unikalny numer tytułu - potrzebne do złączenia tabel)
  - titleType (rodzaj tytułu, np. film, serial - można porównywać podobieństwo aktorów na podstawie tego w jakich typach produkcji grali)
  - originalTitle (tytuł produkcji - być może przyda się do analizy podobnych filmów)
  - isAdult (czy produkcja jest dla dorosłych - można wykorzystać do porównywania aktorów na podstawie tego czy grają głównie w produkcjach dla dorosłych)
  - startYear (data wypuszczenia produkcji - można wykorzystać do porównania epok filmów, w których grali aktorzy)
  - endYear (data zakończenia serii, w innych przypadkach \N - można wykorzystać do porównania epok filmów, w których grali aktorzy)
  - genres (gatunki - może posłużyć do porównania aktorów na podstawie tego, że grali w produkcjach o podobnych gatunkach)

- df_title_principals:
  - tconst (unikalny numer tytułu - potrzebne do złączenia tabel)
  - nconst (unikalny numer aktora - potrzebne do złączenia tabel)
  - category (kategoria pracy (actor/actress) - może posłużyć do porównania aktorów na podstawie takiej samej płci)
  - characters (postacie zagrane w produkcji - może posłużyć do porównania aktorów na podstawie podobnych granych postaci)

- df_name_basics:
  - nconst(unikalny numer aktora - potrzebne do złączenia tabel)
  - primaryName (imię i nazwisko aktora - potrzebne do wyświetlania aktora lub do odnajdywania go w bazie po imieniu i nazwisku)
  - knownForTitles (tytuły produkcji, z których znany jest aktor - może posłużyć do porównania aktorów na podstawie tych samych lub podobnych produkcji, z których są znani)

In [None]:
# Keep only the columns listed in the description above.
df_title_basic_selected = df_title_basic.select(
    "tconst", "titleType", "originalTitle", "isAdult", "startYear", "endYear", "genres"
)
df_title_principals_selected = df_title_principals_selected.select(
    "tconst", "nconst", "category", "characters"
)
df_name_basics_selected = df_name_basics_selected.select(
    "nconst", "primaryName", "knownForTitles"
)

# Złączmy tabele df_name_basics, df_title_principals i df_title_basic

In [None]:
# Report (row count, column count) of each input before joining.
join_inputs = [
    ("df_name_basics_selected", df_name_basics_selected),
    ("df_title_principals_selected", df_title_principals_selected),
    ("df_title_basic_selected", df_title_basic_selected),
]
for label, frame in join_inputs:
    print(f"{label} dataframe size: ", (frame.count(), len(frame.columns)))

# Right join: every acting credit is kept, with title details attached on tconst.
data = df_title_basic_selected.join(df_title_principals_selected, on="tconst", how="right")
print("joined dataframe size: ", (data.count(), len(data.columns)))

# Inner join: only credits of people present in the filtered name table remain.
data = data.join(df_name_basics_selected, on="nconst", how="inner")
print("joined dataframe size: ", (data.count(), len(data.columns)))

## Łączymy dane na temat jednego aktora w jeden rekord

In [None]:
# Collapse the credit-level rows into one row per actor (nconst).
# Per-title attributes are gathered into lists; per-person attributes take the
# first value seen in the group.
# The year columns are read as strings and IMDb marks missing values with "\N".
# As a string "\N" sorts above every digit, so max('endYear') would return "\N"
# for any actor with at least one title without an end year. Casting to int
# turns "\N" into null, which min/max skip, and makes the comparison numeric.
# Note: min/max here are the pyspark.sql.functions versions imported at the top.
data = data.groupby('nconst').agg(
    collect_list('tconst').alias("tconst"),
    collect_list('titleType').alias("titleType"),
    collect_list('originalTitle').alias("originalTitle"),
    collect_list('isAdult').alias("isAdult"),
    min(data['startYear'].cast('int')).alias("startYear"),
    max(data['endYear'].cast('int')).alias("endYear"),
    collect_list('genres').alias("genres"),
    first('category').alias("category"),
    collect_list('characters').alias("characters"),
    first('primaryName').alias("primaryName"),
    first('knownForTitles').alias("knownForTitles"),
)


## Łączymy dane na temat jednego aktora w jeden rekord

In [None]:
# The per-actor aggregation is done once, in the cell above. Applying the same
# groupby/collect_list to the already-aggregated frame would wrap every
# collected column in another array (array -> array of arrays), so this cell
# only inspects the schema of the aggregated result.
data.printSchema()


# Wybierzmy przykładowe wiersze z końcowego dataframe'u

In [None]:
# Draw five random rows (without replacement) to the driver as a list of Rows.
sample = data.rdd.takeSample(withReplacement=False, num=5)

In [None]:
# Turn the sampled rows into a pandas frame for rich display, keeping the
# Spark column names.
column_names = data.columns
sample = pd.DataFrame(sample, columns=column_names)
sample

# Załadowanie nowych baz

   **W celu załadowania nowych danych należy:**


1. Pobrać token API:
  * Zalogować się na stronę *www.kaggle.com*
  * Kliknąć "Account"
  * Znaleźć zakładkę API i nacisnąć przycisk "*Create New API Token*" - w tym miejscu pobierany jest plik *kaggle.json* - API
2. Utworzyć na swoim Dysku Google folder o nazwie "Big Data"
3. Wrzucić pobrany token API (plik *kaggle.json*) do folderu "Big Data" na swoim dysku
4. Uruchomić kod! :)




## Wczytajmy bazę oskarów: "Best Movies Watchlist"


In [None]:
# Mount Google Drive, then tell the Kaggle client to look for its config in
# the "Big Data" folder on that drive.
drive.mount('/content/gdrive')
kaggle_config_dir = "/content/gdrive/MyDrive/Big Data"
os.environ['KAGGLE_CONFIG_DIR'] = kaggle_config_dir

Mounted at /content/gdrive


In [None]:
! pip install kaggle
! mkdir ~/.kaggle 
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [None]:
! kaggle datasets download unanimad/the-oscar-award #załadujmy dane
! unzip the-oscar-award.zip #rozpakujmy z zip'a
oscars=spark.read.option("header","true").csv('the_oscar_award.csv')
oscars.show(5)


Downloading the-oscar-award.zip to /content
  0% 0.00/191k [00:00<?, ?B/s]
100% 191k/191k [00:00<00:00, 11.4MB/s]
Archive:  the-oscar-award.zip
  inflating: the_oscar_award.csv     
+---------+-------------+--------+--------+-------------------+----------------+------+
|year_film|year_ceremony|ceremony|category|               name|            film|winner|
+---------+-------------+--------+--------+-------------------+----------------+------+
|     1927|         1928|       1|   ACTOR|Richard Barthelmess|       The Noose| False|
|     1927|         1928|       1|   ACTOR|      Emil Jannings|The Last Command|  True|
|     1927|         1928|       1| ACTRESS|     Louise Dresser| A Ship Comes In| False|
|     1927|         1928|       1| ACTRESS|       Janet Gaynor|      7th Heaven|  True|
|     1927|         1928|       1| ACTRESS|     Gloria Swanson|  Sadie Thompson| False|
+---------+-------------+--------+--------+-------------------+----------------+------+
only showing top 5 rows


Main Data:
* primaryName	- aktor
* originalTitle - nazwa filmu

W nowej bazie oskarów:
* name - aktor
* film - film

## Wczytajmy bazę złotych globów: "Golden Globe Awards, 1944 - 2020"

In [None]:
! kaggle datasets download unanimad/golden-globe-awards
! unzip golden-globe-awards.zip
globe=spark.read.option("header","true").csv('golden_globe_awards.csv')
globe.show(5)

Downloading golden-globe-awards.zip to /content
  0% 0.00/117k [00:00<?, ?B/s]
100% 117k/117k [00:00<00:00, 41.7MB/s]
Archive:  golden-globe-awards.zip
  inflating: golden_globe_awards.csv  
+---------+----------+--------+--------------------+--------------------+--------------------+----+
|year_film|year_award|ceremony|            category|             nominee|                film| win|
+---------+----------+--------+--------------------+--------------------+--------------------+----+
|     1943|      1944|       1|Best Performance ...|      Katina Paxinou|For Whom The Bell...|True|
|     1943|      1944|       1|Best Performance ...|       Akim Tamiroff|For Whom The Bell...|True|
|     1943|      1944|       1|Best Director - M...|          Henry King|The Song Of Berna...|True|
|     1943|      1944|       1|             Picture|The Song Of Berna...|                null|True|
|     1943|      1944|       1|Actress In A Lead...|      Jennifer Jones|The Song Of Berna...|True|
+------

## Wczytajmy bazę nagród Emmy: "Primetime Emmy Awards, 1949-2017"

In [None]:
! kaggle datasets download unanimad/emmy-awards
! unzip emmy-awards.zip 
emmy_awards_category=spark.read.option("header","true").csv('emmy_awards_categories.csv')
emmy_awards_category.show(5)


Downloading emmy-awards.zip to /content
  0% 0.00/904k [00:00<?, ?B/s]
100% 904k/904k [00:00<00:00, 45.2MB/s]
Archive:  emmy-awards.zip
  inflating: emmy_awards_categories.csv  
  inflating: the_emmy_awards.csv     
+------------------+--------+----+
|              name|category|role|
+------------------+--------+----+
|     Comedy Series|  Comedy|main|
|        Lead Actor|  Comedy|main|
|      Lead Actress|  Comedy|main|
|  Supporting Actor|  Comedy|main|
|Supporting Actress|  Comedy|main|
+------------------+--------+----+
only showing top 5 rows



In [None]:
# Load the Emmy nominations table unpacked in the previous cell and preview it.
emmy_awards = spark.read.csv('the_emmy_awards.csv', header=True)
emmy_awards.show(5)

+---+----+--------------------+--------------------+--------------------+--------+--------------------+-----+
| id|year|            category|             nominee|               staff| company|            producer|  win|
+---+----+--------------------+--------------------+--------------------+--------+--------------------+-----+
|  1|2019|Outstanding Chara...|        The Simpsons|Hank Azaria, as M...|     FOX|Gracie Films in a...|False|
|  2|2019|Outstanding Chara...|          Family Guy|Alex Borstein, as...|     FOX|20th Century Fox ...|False|
|  3|2019|Outstanding Chara...|When You Wish Upo...|Eric Jacobson, as...|     HBO|Sesame Street Wor...|False|
|  4|2019|Outstanding Chara...|     F Is For Family|Kevin Michael Ric...| Netflix|Wild West Televis...|False|
|  5|2019|Outstanding Produ...| Escape At Dannemora|Mark Ricker, Prod...|Showtime|Red Hour, Busyhan...|False|
+---+----+--------------------+--------------------+--------------------+--------+--------------------+-----+
only showi

## Zaczytanie danych TMDB 

   informacje o aktorach i osobach pracujących przy produkcji filmów oraz budżetach, gatunkach, popularności, wytwórniach filmowych, oryginalnych językach i tytułach


In [None]:
! kaggle datasets download tmdb/tmdb-movie-metadata
! unzip tmdb-movie-metadata.zip
tmdb_credits=spark.read.option("header","true").csv('tmdb_5000_credits.csv')
tmdb_movies=spark.read.option("header","true").csv('tmdb_5000_movies.csv')

Downloading tmdb-movie-metadata.zip to /content
  0% 0.00/8.89M [00:00<?, ?B/s] 56% 5.00M/8.89M [00:00<00:00, 29.8MB/s]
100% 8.89M/8.89M [00:00<00:00, 46.4MB/s]
Archive:  tmdb-movie-metadata.zip
  inflating: tmdb_5000_credits.csv   
  inflating: tmdb_5000_movies.csv    


In [None]:
# Preview the first five rows of the TMDB credits table.
tmdb_credits.show(n=5)

+--------+--------------------+-------------------+--------------------+
|movie_id|               title|               cast|                crew|
+--------+--------------------+-------------------+--------------------+
|   19995|              Avatar|"[{""cast_id"": 242| ""character"": "...|
|     285|Pirates of the Ca...|  "[{""cast_id"": 4| ""character"": "...|
|  206647|             Spectre|  "[{""cast_id"": 1| ""character"": "...|
|   49026|The Dark Knight R...|  "[{""cast_id"": 2| ""character"": "...|
|   49529|         John Carter|  "[{""cast_id"": 5| ""character"": "...|
+--------+--------------------+-------------------+--------------------+
only showing top 5 rows



In [None]:
# Preview the first five rows of the TMDB movies table.
tmdb_movies.show(n=5)

# Połączmy dane na temat nagród z głównymi danymi

Wybierzmy tylko rekordy z interesujących nas kategorii

In [None]:
# Oscar categories are upper-case, Golden Globe categories are title-case,
# hence the two different patterns.
is_oscar_acting = oscars.category.like('%ACTOR%') | oscars.category.like('%ACTRESS%')
oscars_selected = oscars.filter(is_oscar_acting)

is_globe_acting = globe.category.like('%Actor%') | globe.category.like('%Actress%')
globe_selected = globe.filter(is_globe_acting)

In [None]:
# Emmy categories treated as performer awards; matched exactly by name.
actor_categories = [
    'Outstanding Lead Actor in a Comedy Series',
    'Outstanding Lead Actor in a Drama Series',
    'Outstanding Lead Actor in a Limited or Anthology Series or Movie',
    'Outstanding Lead Actress in a Comedy Series',
    'Outstanding Lead Actress in a Drama Series',
    'Outstanding Lead Actress in a Limited or Anthology Series or Movie',
    'Outstanding Supporting Actor in a Comedy Series',
    'Outstanding Supporting Actor in a Drama Series',
    'Outstanding Supporting Actor in a Limited or Anthology Series or Movie',
    'Outstanding Supporting Actress in a Comedy Series',
    'Outstanding Supporting Actress in a Drama Series',
    'Outstanding Supporting Actress in a Limited or Anthology Series or Movie',
    'Outstanding Character Voice-Over Performance',
    'Outstanding Guest Actor in a Drama Series',
    'Outstanding Guest Actor in a Comedy Series',
    'Outstanding Guest Actress in a Drama Series',
    'Outstanding Guest Actress in a Comedy Series',
    'Outstanding Narrator',
    'Outstanding Actor in a Short Form Comedy or Drama Series',
    'Outstanding Actress in a Short Form Comedy or Drama Series',
    'Best Specialty Act – Single or Group',
    'Outstanding Voice-Over Performance',
    'Outstanding Sports Personality',
    'Most Outstanding Live Personality',
    'Most Outstanding Kinescoped Personality',
]
in_actor_category = emmy_awards.category.isin(actor_categories)
emmy_awards_selected = emmy_awards.filter(in_actor_category)

Przygotujmy dane z tabeli emmy_awards_selected do połączenia (wybieramy tylko imię i nazwisko aktora z kolumny staff)

In [None]:
# Split the comma-separated staff string and emit one row per piece, so that a
# piece holding just a person's name can later be matched against primaryName.
emmy_awards_selected = emmy_awards_selected.withColumn('staff', explode(split('staff', ', ')))
# Keep one row per (nominee, staff) pair. The grouping keys are already part of
# the result, so they are not aggregated again: re-emitting them as
# first(...).alias('nominee') / first(...).alias('staff') produced two columns
# with the same name, which makes any later by-name reference ambiguous.
emmy_awards_selected = emmy_awards_selected.groupby('nominee', 'staff').agg(
    first('id').alias('id'),
    first('year').alias('year'),
    first('category').alias('category'),
    first('company').alias('company'),
    first('producer').alias('producer'),
    first('win').alias('win'),
)

Wybierzmy tylko kolumny, które nas interesują z danych na temat nagród

In [None]:
# Narrow the *filtered* award tables. Selecting from the raw oscars / globe /
# emmy_awards frames here would throw away the acting-category filters (and the
# staff split for the Emmys) prepared in the cells above.
oscars_selected = oscars_selected.select(["year_ceremony", "category", "name", "film", "winner"])
globe_selected = globe_selected.select(["year_award", "category", "nominee", "film", "win"])
# The Emmy frame is rebuilt from emmy_awards in one chain so that this selection
# does not depend on the column layout of the intermediate grouped frame:
# filter to performer categories, one row per staff piece, one row per
# (nominee, staff) pair.
emmy_awards_selected = (
    emmy_awards
    .filter(emmy_awards.category.isin(actor_categories))
    .withColumn('staff', explode(split('staff', ', ')))
    .dropDuplicates(['nominee', 'staff'])
    .select(["year", "category", "nominee", "staff", "company", "producer", "win"])
)

Zmieńmy nazwy kolumn, aby po połączeniu było wiadomo, które dane odnoszą się do których nagród

In [None]:
def rename_columns(frame, renames):
    """Return `frame` with each column in `renames` (old -> new) renamed, in order."""
    for old_name, new_name in renames.items():
        frame = frame.withColumnRenamed(old_name, new_name)
    return frame


# Suffix the award columns with their source. The person-name columns
# (name / nominee / staff) keep their names; they are the join keys later on.
oscars_selected = rename_columns(oscars_selected, {
    "year_ceremony": "year_oscars",
    "category": "category_oscars",
    "film": "film_oscars",
    "winner": "winner_oscars",
})

globe_selected = rename_columns(globe_selected, {
    "year_award": "year_globes",
    "category": "category_globes",
    "film": "film_globes",
    "win": "win_globes",
})

emmy_awards_selected = rename_columns(emmy_awards_selected, {
    "year": "year_emmy",
    "category": "category_emmy",
    "nominee": "nominee_emmy",
    "company": "company_emmy",
    "producer": "producer_emmy",
    "win": "win_emmy",
})

Złączmy dane dotyczące nagród z głównymi danymi

In [None]:
# Report (row count, column count) of every input before joining.
frames_to_report = [
    ("data", data),
    ("oscars_selected", oscars_selected),
    ("globe_selected", globe_selected),
    ("emmy_awards_selected", emmy_awards_selected),
]
for label, frame in frames_to_report:
    print(f"{label} dataframe size: ", (frame.count(), len(frame.columns)))

# Left-join each award table on the person's name, so actors without any
# matching award row are kept. The size is printed after every join.
award_tables = [
    (oscars_selected, "name"),
    (globe_selected, "nominee"),
    (emmy_awards_selected, "staff"),
]
for awards, person_column in award_tables:
    data = data.join(awards, data.primaryName == awards[person_column], "left")
    print("joined dataframe size: ", (data.count(), len(data.columns)))

In [None]:
# Collapse the joined rows back to one row per actor (nconst).
# Columns that came from the main data take the first value in the group;
# award columns are gathered into lists.
actor_columns = [
    "tconst", "titleType", "originalTitle", "isAdult", "startYear", "endYear",
    "genres", "category", "characters", "primaryName", "knownForTitles",
]
award_columns = [
    "year_oscars", "category_oscars", "film_oscars", "winner_oscars",
    "year_globes", "category_globes", "film_globes", "win_globes",
    "year_emmy", "category_emmy", "nominee_emmy", "company_emmy", "producer_emmy", "win_emmy",
]
aggregations = [first(name).alias(name) for name in actor_columns]
aggregations += [collect_list(name).alias(name) for name in award_columns]
data = data.groupby('nconst').agg(*aggregations)

Pokażmy kilka przykładowych wierszy ostatecznej tabeli

In [None]:
# Draw five random rows (without replacement) from the final table.
sample = data.rdd.takeSample(withReplacement=False, num=5)

In [None]:
# Turn the sampled rows into a pandas frame for rich display, keeping the
# Spark column names.
column_names = data.columns
sample = pd.DataFrame(sample, columns=column_names)
sample