In [51]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import os
import sys

In [52]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

#### Criar sessão PySpark

In [53]:
spark = (
  SparkSession.builder
  .master('local')
  .appName('PySpark_01')
  .getOrCreate()
)

#### Criar DF / ler arquivo

In [54]:
df = spark.read.csv('./netflix_titles.csv', header=True, inferSchema=True)

#### Exibir DF

In [55]:
df.show(5)

+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|       director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                NULL|United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|           NULL|Ama Qamata, Khosi...| South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglands|Julien Leclercq|Sami Bouajila, Tr...|         NULL|Septem

#### Verificar tipos de colunas

In [56]:
df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



#### Verificando dados nulos

In [57]:
for coluna in df.columns:
  print(coluna, df.filter(df[coluna].isNull()).count())

show_id 0
type 1
title 2
director 2636
cast 826
country 832
date_added 13
release_year 2
rating 6
duration 5
listed_in 3
description 3


#### Renomeando Colunas

In [None]:
portDf = df.withColumnRenamed('show_id', 'id')\
.withColumnRenamed('type', 'tipo')\
.withColumnRenamed('title', 'titulo')\
.withColumnRenamed('director', 'diretor')\
.withColumnRenamed('cast' ,'atores_principais')\
.withColumnRenamed('country', 'pais')\
.withColumnRenamed('date_added', 'data_dicionado')\
.withColumnRenamed('release_year', 'ano_lancamento')\
.withColumnRenamed('rating', 'classificacao')\
.withColumnRenamed('duration', 'duracao')\
.withColumnRenamed('listed_in', 'categoria')\
.withColumnRenamed('description', 'descricao').show(5)

#### Selecionar Colunas

In [None]:
df.select('title', 'description').show(5)

In [None]:
df.select(col('title'), col('description')).show(5)

In [None]:
df.select(df['title']).show(5)

#### Selecionar Colunas com ALIAS
**Pelo primeiro método de select, o alias não funciona por se tratar de uma string**

In [None]:
df.select(col('title').alias('titulo')).show(5)

In [None]:
df.select('title director cast'.split()).show(5)

#### Organizar Select

In [None]:
df.select('title', 'type', 'country').show(5)

#### Filtrar DF
**Se tiver espaços, você deve utilizar a função col no nome da coluna**

In [87]:
df.filter('country = "Brazil"').show(10)

+-------+-------+--------------------+--------------------+--------------------+-------+-----------------+------------+------+--------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|country|       date_added|release_year|rating|duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+-------+-----------------+------------+------+--------+--------------------+--------------------+
|   s303|  Movie|City of God: 10 Y...|Cavi Borges, Luci...|                NULL| Brazil|   August 5, 2021|        2013| TV-MA|  69 min|Documentaries, In...|"A decade after t...|
|   s651|  Movie|O Vendedor de Sonhos|     Jayme Monjardim|César Troncoso, D...| Brazil|    June 22, 2021|        2016| TV-14|  96 min|Dramas, Internati...|A disillusioned p...|
|   s784|  Movie|            Carnaval|        Leandro Neri|Giovana Cordeiro,...| Brazil|     June 2, 2021|    