## 2021486 Assignment-3 CE408

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, desc, avg, max, min, split, length, collect_list

### Function to create spark session

In [4]:
def create_spark_session():
    return SparkSession.builder \
        .appName("Netflix Dataset EDA") \
        .getOrCreate()

### Function to Load Dataset

In [5]:
def load_netflix_dataset(spark, file_path):
    return spark.read.csv(file_path, header=True, inferSchema=True)

### Function for Data Analysis

In [6]:
def perform_eda(netflix_df):
    print("1. Total Content by Content Type:")
    content_by_type = netflix_df.groupBy("type") \
        .agg(count("*").alias("total_count")) \
        .orderBy(desc("total_count"))
    content_by_type.show()

    print("\n2. Most Frequent Actors in the Dataset:")
    frequent_actors = netflix_df.select(split(col("cast"), ", ").alias("actors")) \
        .withColumn("actor", col("actors").getItem(0)) \
        .groupBy("actor") \
        .agg(count("*").alias("appearance_count")) \
        .orderBy(desc("appearance_count")) \
        .limit(10)
    frequent_actors.show()

    print("\n3. Yearly Trend of Content Addition:")
    yearly_trend = netflix_df.groupBy("release_year") \
        .agg(count("*").alias("content_count")) \
        .orderBy("release_year")
    yearly_trend.show()

    print("\n4. Top 10 Countries with the Most Content:")
    country_content = netflix_df.groupBy("country") \
        .agg(count("*").alias("content_count")) \
        .orderBy(desc("content_count")) \
        .limit(10)
    country_content.show()

    print("\n5. Analysis of Ratings and Their Distribution:")
    rating_distribution = netflix_df.groupBy("rating") \
        .agg(count("*").alias("rating_count")) \
        .orderBy(desc("rating_count"))
    rating_distribution.show()

    print("\n6. Top 10 Titles with the Shortest Names:")
    shortest_titles = netflix_df \
        .withColumn("title_length", length(col("title"))) \
        .orderBy("title_length") \
        .select("title", "title_length") \
        .limit(10)
    shortest_titles.show()

    print("\n7. Correlation Between Release Year and Content Count:")
    yearly_correlation = netflix_df.groupBy("release_year") \
        .agg(count("*").alias("content_count")) \
        .orderBy("release_year")
    yearly_correlation.show()


### Creating Spark Session

In [7]:
spark = create_spark_session()

### Performing the Analysis

In [8]:
 # Get the current directory
dataset_path = 'netflix_titles.csv' 

# Load Netflix Dataset
netflix_df = load_netflix_dataset(spark, dataset_path)

# Perform EDA
perform_eda(netflix_df)

1. Total Content by Content Type:
+-------------+-----------+
|         type|total_count|
+-------------+-----------+
|        Movie|       6131|
|      TV Show|       2676|
|         NULL|          1|
|William Wyler|          1|
+-------------+-----------+


2. Most Frequent Actors in the Dataset:
+------------------+----------------+
|             actor|appearance_count|
+------------------+----------------+
|              NULL|             826|
|    Shah Rukh Khan|              26|
|      Akshay Kumar|              23|
|  Amitabh Bachchan|              20|
|      Adam Sandler|              20|
|David Attenborough|              20|
|      Vatsal Dubey|              18|
|        Ajay Devgn|              16|
|      Nicolas Cage|              16|
|      Michela Luci|              14|
+------------------+----------------+


3. Yearly Trend of Content Addition:
+-----------------+-------------+
|     release_year|content_count|
+-----------------+-------------+
|             NULL|        

In [23]:
spark.stop()