In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [4]:
spark = SparkSession\
    .builder\
    .master('local')\
    .appName("lesson_12")\
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/21 14:36:28 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [11]:
film_actor = spark.read\
    .option("header", True)\
    .option("inferSchema", True)\
    .option("sep", ",")\
    .csv("/datalake/bronze/samples/film_actor.csv")

In [12]:
film_actor.printSchema()

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- title: string (nullable = true)
 |-- release_year: integer (nullable = true)
 |-- rental_rate: double (nullable = true)
 |-- replacement_cost: double (nullable = true)
 |-- language_id: integer (nullable = true)



In [5]:
film_actor.columns

['first_name',
 'last_name',
 'title',
 'release_year',
 'rental_rate',
 'replacement_cost',
 'language_id']

In [6]:
film_actor = film_actor\
    .withColumnRenamed('first_name', 'actor_first_name')\
    .withColumnRenamed('last_name', 'actor_last_name')\
    .withColumnRenamed('title', 'film_name')\
    .withColumnRenamed('release_year', 'film_year')\
    .withColumn('film_year', F.col('film_year').cast('int'))\
    .drop('rental_rate')

# film_actor = film_actor\
#    .withColumn('film_year', film_actor.film_year.cast('int'))\
#    .drop('rental_rate')

In [7]:
film_actor.show()

+----------------+---------------+--------------------+---------+----------------+-----------+
|actor_first_name|actor_last_name|           film_name|film_year|replacement_cost|language_id|
+----------------+---------------+--------------------+---------+----------------+-----------+
|        PENELOPE|        GUINESS|    ACADEMY DINOSAUR|     2006|           20.99|          1|
|        PENELOPE|        GUINESS|ANACONDA CONFESSIONS|     2006|            9.99|          1|
|        PENELOPE|        GUINESS|         ANGELS LIFE|     2006|           15.99|          1|
|        PENELOPE|        GUINESS|BULWORTH COMMANDM...|     2006|           14.99|          1|
|        PENELOPE|        GUINESS|       CHEAPER CLYDE|     2006|           23.99|          1|
|        PENELOPE|        GUINESS|  COLOR PHILADELPHIA|     2006|           19.99|          1|
|        PENELOPE|        GUINESS|     ELEPHANT TROJAN|     2006|           24.99|          1|
|        PENELOPE|        GUINESS| GLEAMING JAWBRE

In [8]:
film_actor\
    .where(film_actor.film_name == 'ACADEMY DINOSAUR')\
    .sort(F.desc('actor_last_name'))\
    .select('actor_first_name','actor_last_name','film_name','film_year','replacement_cost')\
    .show()

+----------------+---------------+----------------+---------+----------------+
|actor_first_name|actor_last_name|       film_name|film_year|replacement_cost|
+----------------+---------------+----------------+---------+----------------+
|         LUCILLE|          TRACY|ACADEMY DINOSAUR|     2006|           20.99|
|            MENA|         TEMPLE|ACADEMY DINOSAUR|     2006|           20.99|
|          SANDRA|           PECK|ACADEMY DINOSAUR|     2006|           20.99|
|          WARREN|          NOLTE|ACADEMY DINOSAUR|     2006|           20.99|
|           OPRAH|         KILMER|ACADEMY DINOSAUR|     2006|           20.99|
|            MARY|         KEITEL|ACADEMY DINOSAUR|     2006|           20.99|
|        PENELOPE|        GUINESS|ACADEMY DINOSAUR|     2006|           20.99|
|       CHRISTIAN|          GABLE|ACADEMY DINOSAUR|     2006|           20.99|
|            ROCK|        DUKAKIS|ACADEMY DINOSAUR|     2006|           20.99|
|          JOHNNY|           CAGE|ACADEMY DINOSAUR| 

In [9]:
film_actor.select('film_year').distinct().sort('film_year').show()



+---------+
|film_year|
+---------+
|     2006|
+---------+



                                                                                

In [10]:
film_actor\
    .where(F.col('replacement_cost') > 20)\
    .sort(F.desc('actor_last_name'))\
    .select('actor_first_name','actor_last_name','film_name','film_year','replacement_cost')\
    .show()

+----------------+---------------+--------------------+---------+----------------+
|actor_first_name|actor_last_name|           film_name|film_year|replacement_cost|
+----------------+---------------+--------------------+---------+----------------+
|          MINNIE|      ZELLWEGER|      ALICE FANTASIA|     2006|           23.99|
|         CAMERON|      ZELLWEGER|     MALLRATS UNITED|     2006|           25.99|
|          MINNIE|      ZELLWEGER|    BONNIE HOLOCAUST|     2006|           29.99|
|          MINNIE|      ZELLWEGER|      EVERYONE CRAFT|     2006|           29.99|
|          MINNIE|      ZELLWEGER|      EXPRESS LONELY|     2006|           23.99|
|          MINNIE|      ZELLWEGER|EXTRAORDINARY CON...|     2006|           29.99|
|          MINNIE|      ZELLWEGER|        JAPANESE RUN|     2006|           29.99|
|          MINNIE|      ZELLWEGER|      MADIGAN DORADO|     2006|           20.99|
|          MINNIE|      ZELLWEGER|       MONSOON CAUSE|     2006|           20.99|
|   

In [11]:
film_actor.select('actor_last_name').distinct().sort('actor_last_name').show(truncate=False, n=20)

+---------------+
|actor_last_name|
+---------------+
|AKROYD         |
|ALLEN          |
|ASTAIRE        |
|BACALL         |
|BAILEY         |
|BALE           |
|BALL           |
|BARRYMORE      |
|BASINGER       |
|BENING         |
|BERGEN         |
|BERGMAN        |
|BERRY          |
|BIRCH          |
|BLOOM          |
|BOLGER         |
|BRIDGES        |
|BRODY          |
|BULLOCK        |
|CAGE           |
+---------------+
only showing top 20 rows



                                                                                

In [12]:
film_actor.groupby(film_actor.actor_last_name).count().sort(F.desc('count')).show(20)



+---------------+-----+
|actor_last_name|count|
+---------------+-----+
|         KILMER|  134|
|          NOLTE|  124|
|         TEMPLE|  104|
|           TORN|   96|
|         KEITEL|   95|
|      DEGENERES|   93|
|         AKROYD|   90|
|        HOFFMAN|   88|
|        GARLAND|   88|
|          BERRY|   86|
|      JOHANSSON|   83|
|         WILLIS|   82|
|        GUINESS|   81|
|        HOPKINS|   81|
|      ZELLWEGER|   80|
|         HARRIS|   79|
|       WILLIAMS|   78|
|          DAVIS|   76|
|          ALLEN|   75|
|         BOLGER|   65|
+---------------+-----+
only showing top 20 rows



                                                                                

In [13]:
film_actor.select('*').show()

+----------------+---------------+--------------------+---------+----------------+-----------+
|actor_first_name|actor_last_name|           film_name|film_year|replacement_cost|language_id|
+----------------+---------------+--------------------+---------+----------------+-----------+
|        PENELOPE|        GUINESS|    ACADEMY DINOSAUR|     2006|           20.99|          1|
|        PENELOPE|        GUINESS|ANACONDA CONFESSIONS|     2006|            9.99|          1|
|        PENELOPE|        GUINESS|         ANGELS LIFE|     2006|           15.99|          1|
|        PENELOPE|        GUINESS|BULWORTH COMMANDM...|     2006|           14.99|          1|
|        PENELOPE|        GUINESS|       CHEAPER CLYDE|     2006|           23.99|          1|
|        PENELOPE|        GUINESS|  COLOR PHILADELPHIA|     2006|           19.99|          1|
|        PENELOPE|        GUINESS|     ELEPHANT TROJAN|     2006|           24.99|          1|
|        PENELOPE|        GUINESS| GLEAMING JAWBRE

In [14]:
film_actor.select('film_name', F.col('film_name'), film_actor.film_name, film_actor['film_name']).show()

+--------------------+--------------------+--------------------+--------------------+
|           film_name|           film_name|           film_name|           film_name|
+--------------------+--------------------+--------------------+--------------------+
|    ACADEMY DINOSAUR|    ACADEMY DINOSAUR|    ACADEMY DINOSAUR|    ACADEMY DINOSAUR|
|ANACONDA CONFESSIONS|ANACONDA CONFESSIONS|ANACONDA CONFESSIONS|ANACONDA CONFESSIONS|
|         ANGELS LIFE|         ANGELS LIFE|         ANGELS LIFE|         ANGELS LIFE|
|BULWORTH COMMANDM...|BULWORTH COMMANDM...|BULWORTH COMMANDM...|BULWORTH COMMANDM...|
|       CHEAPER CLYDE|       CHEAPER CLYDE|       CHEAPER CLYDE|       CHEAPER CLYDE|
|  COLOR PHILADELPHIA|  COLOR PHILADELPHIA|  COLOR PHILADELPHIA|  COLOR PHILADELPHIA|
|     ELEPHANT TROJAN|     ELEPHANT TROJAN|     ELEPHANT TROJAN|     ELEPHANT TROJAN|
| GLEAMING JAWBREAKER| GLEAMING JAWBREAKER| GLEAMING JAWBREAKER| GLEAMING JAWBREAKER|
|      HUMAN GRAFFITI|      HUMAN GRAFFITI|      HUMAN

In [15]:
language = spark.read\
    .option("header", True)\
    .option("inferSchema", True)\
    .csv("/datalake/bronze/samples/language.csv")

In [16]:
language.printSchema()

root
 |-- language_id: integer (nullable = true)
 |-- name: string (nullable = true)



In [17]:
language = language\
    .withColumnRenamed('name', 'language')

In [18]:
language.show()

+-----------+--------------------+
|language_id|            language|
+-----------+--------------------+
|          1|English             |
|          2|Italian             |
|          3|Japanese            |
|          4|Mandarin            |
|          5|French              |
|          6|German              |
+-----------+--------------------+



In [19]:
fal = film_actor\
    .join(language, film_actor.language_id == language.language_id, 'left')\
    .select(film_actor.actor_first_name.alias('first_name')
            ,film_actor.actor_last_name.alias('last_name')
            ,'film_name','film_year','replacement_cost','language')\
    .show()

+----------+---------+--------------------+---------+----------------+--------------------+
|first_name|last_name|           film_name|film_year|replacement_cost|            language|
+----------+---------+--------------------+---------+----------------+--------------------+
|  PENELOPE|  GUINESS|    ACADEMY DINOSAUR|     2006|           20.99|English             |
|  PENELOPE|  GUINESS|ANACONDA CONFESSIONS|     2006|            9.99|English             |
|  PENELOPE|  GUINESS|         ANGELS LIFE|     2006|           15.99|English             |
|  PENELOPE|  GUINESS|BULWORTH COMMANDM...|     2006|           14.99|English             |
|  PENELOPE|  GUINESS|       CHEAPER CLYDE|     2006|           23.99|English             |
|  PENELOPE|  GUINESS|  COLOR PHILADELPHIA|     2006|           19.99|English             |
|  PENELOPE|  GUINESS|     ELEPHANT TROJAN|     2006|           24.99|English             |
|  PENELOPE|  GUINESS| GLEAMING JAWBREAKER|     2006|           25.99|English   

In [20]:
film_actor.createOrReplaceTempView('v_film_actor')

In [21]:
dfv = spark\
    .sql("SELECT * FROM v_film_actor WHERE film_name = 'ACADEMY DINOSAUR'")

In [22]:
dfv.show()

+----------------+---------------+----------------+---------+----------------+-----------+
|actor_first_name|actor_last_name|       film_name|film_year|replacement_cost|language_id|
+----------------+---------------+----------------+---------+----------------+-----------+
|        PENELOPE|        GUINESS|ACADEMY DINOSAUR|     2006|           20.99|          1|
|       CHRISTIAN|          GABLE|ACADEMY DINOSAUR|     2006|           20.99|          1|
|         LUCILLE|          TRACY|ACADEMY DINOSAUR|     2006|           20.99|          1|
|          SANDRA|           PECK|ACADEMY DINOSAUR|     2006|           20.99|          1|
|          JOHNNY|           CAGE|ACADEMY DINOSAUR|     2006|           20.99|          1|
|            MENA|         TEMPLE|ACADEMY DINOSAUR|     2006|           20.99|          1|
|          WARREN|          NOLTE|ACADEMY DINOSAUR|     2006|           20.99|          1|
|           OPRAH|         KILMER|ACADEMY DINOSAUR|     2006|           20.99|          1|

In [23]:
film_actor = film_actor.where(film_actor.film_name == 'ACADEMY DINOSAUR').show()

+----------------+---------------+----------------+---------+----------------+-----------+
|actor_first_name|actor_last_name|       film_name|film_year|replacement_cost|language_id|
+----------------+---------------+----------------+---------+----------------+-----------+
|        PENELOPE|        GUINESS|ACADEMY DINOSAUR|     2006|           20.99|          1|
|       CHRISTIAN|          GABLE|ACADEMY DINOSAUR|     2006|           20.99|          1|
|         LUCILLE|          TRACY|ACADEMY DINOSAUR|     2006|           20.99|          1|
|          SANDRA|           PECK|ACADEMY DINOSAUR|     2006|           20.99|          1|
|          JOHNNY|           CAGE|ACADEMY DINOSAUR|     2006|           20.99|          1|
|            MENA|         TEMPLE|ACADEMY DINOSAUR|     2006|           20.99|          1|
|          WARREN|          NOLTE|ACADEMY DINOSAUR|     2006|           20.99|          1|
|           OPRAH|         KILMER|ACADEMY DINOSAUR|     2006|           20.99|          1|