In [7]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType

# Create Spark session
spark = SparkSession.builder \
    .appName("Load CSV Example") \
    .getOrCreate()

# Define schema
schema = StructType([
    StructField("Film", StringType(), True),
    StructField("Genre", StringType(), True),
    StructField("Lead Studio", StringType(), True),
    StructField("Audience score %", FloatType(), True),
    StructField("Profitability", FloatType(), True),
    StructField("Rotten Tomatoes %", FloatType(), True),
    StructField("Worldwide Gross", FloatType(), True),
    StructField("Year", IntegerType(), True)
])

# Load CSV file into DataFrame
df = spark.read.csv("movies.csv", header=True, schema=schema)

# # Show DataFrame content
# df.show()

# # Print DataFrame schema
# df.printSchema()

# Function to count word frequencies in film names
def count_words(df):
    # Split the film names into words and explode into separate rows
    words_df = df.select(F.explode(F.split(F.col("Film"), " ")).alias("word"))

    # Group by word and count occurrences
    word_count_df = words_df.groupBy("word").count()

    # Order by count in descending order and get the most frequent word
    most_frequent_word_df = word_count_df.orderBy(F.desc("count"))

    return most_frequent_word_df

# Count word frequencies and show the results
most_frequent_words = count_words(df)
most_frequent_words.show()

+--------+-----+
|    word|count|
+--------+-----+
|     and|    9|
|     The|    9|
|      of|    6|
|     the|    5|
|      in|    4|
|    Love|    4|
|     You|    4|
|  Juliet|    3|
|       I|    3|
|     Day|    3|
|       a|    3|
|     Sex|    3|
|    City|    3|
| Married|    2|
|      Me|    2|
|     For|    2|
|Twilight|    2|
|     New|    2|
|   Mamma|    2|
|  Gnomeo|    2|
+--------+-----+
only showing top 20 rows

