In [3]:
import boto3
import tempfile
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("NetflixAnalysisBoto3") \
    .master("local[*]") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "2g") \
    .config("spark.executor.cores", "2") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.sql.files.maxPartitionBytes", "128MB") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.sql.execution.arrow.enabled", "true") \
    .getOrCreate()

In [4]:
spark.conf.get("spark.executor.cores")

'2'

In [5]:
# Carga del archivo CSV a Spark
netflix_titles = "netflix_titles.csv"
df_netflix=spark.read.csv(netflix_titles,header=True,inferSchema=True)

In [6]:
df_netflix.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|     Kirsten Johnson|                NULL|       United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|                NULL|Ama Qamata, Khosi...|        South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglan

In [7]:
df_netflix.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [8]:
from pyspark.sql.functions import corr, col,count,when


In [9]:
df_netflix.groupBy("cast").count().show()


+--------------------+-----+
|                cast|count|
+--------------------+-----+
|Suliane Brahim, S...|    1|
|Fatih Şahin, Ece ...|    1|
|Chris O'Dowd, Kat...|    1|
|Nkem Owoh, Mara D...|    2|
|Nancy Isime, Jide...|    1|
|Katharine McPhee,...|    1|
|Shun Oguri, Mirai...|    1|
|Miranda Cosgrove,...|    1|
|Pierre-Alain de G...|    1|
|Millie Bobby Brow...|    1|
|Ronwaldo Martin, ...|    1|
|Paris Berelc, Isa...|    1|
|Amber Stevens Wes...|    1|
|Megumi Ogata, Kot...|    1|
|Madeleine Sami, J...|    1|
|Donnie Yen, Nicho...|    1|
|Will Friedle, Dar...|    1|
|Sofía Niño de Rivera|    2|
|          Snoop Dogg|    1|
|Zeenat Aman, Pran...|    1|
+--------------------+-----+
only showing top 20 rows



In [10]:
df_netflix.groupBy("duration").count().show()

+-----------------+-----+
|         duration|count|
+-----------------+-----+
|          100 min|  108|
|          153 min|   11|
|           71 min|   27|
|           56 min|   12|
| Donnell Rawlings|    1|
|           13 min|    3|
|          119 min|   63|
|           33 min|    6|
|          165 min|    8|
|       10 Seasons|    7|
|           12 min|    3|
|          204 min|    2|
|          142 min|   13|
|          173 min|    6|
|           27 min|    3|
|          157 min|    6|
|           30 min|    6|
|           39 min|    2|
|        8 Seasons|   17|
|           82 min|   52|
+-----------------+-----+
only showing top 20 rows



In [14]:
df_netflix.groupBy("country").agg(count("*").alias("cantidad")).show()

+--------------------+--------+
|             country|cantidad|
+--------------------+--------+
|Peru, United Stat...|       1|
|India, United Kin...|       1|
|Japan, Canada, Un...|       1|
|United Kingdom, C...|       1|
|      India, Germany|       2|
|South Africa, Uni...|       1|
|              Russia|      15|
|United Kingdom, G...|       1|
|Chile, United Sta...|       1|
|South Africa, Angola|       1|
|United States, Po...|       1|
|  Philippines, Qatar|       1|
|Hong Kong, China,...|       1|
|  Germany, Sri Lanka|       1|
|Denmark, France, ...|       1|
|United States, Fr...|       1|
|United Kingdom, N...|       2|
|Australia, United...|       2|
|Brazil, France, G...|       1|
|United States, Ir...|       3|
+--------------------+--------+
only showing top 20 rows

