In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, IntegerType, DateType, StructField, FloatType
import pyspark.sql.functions as f

In [None]:
# Location of the movies CSV read by the cells below.
# Defaults to the original hard-coded path; set the MOVIES_CSV_PATH
# environment variable to run the notebook against a file stored elsewhere.
data_path = os.environ.get('MOVIES_CSV_PATH', '/content/movies.csv')

In [None]:
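# Optional SparkSession builder settings, currently disabled.
# Add them to the builder chain before .getOrCreate() if they are needed:
#     .config("spark.jars", "postgresql-42.7.4.jar") \
#     .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.4.1") \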

In [None]:
# Build (or reuse, via getOrCreate) the SparkSession for this notebook,
# registered under the application name "movies_app" with master "local".
spark = (
    SparkSession.builder
    .appName("movies_app")
    .master("local")
    .getOrCreate()
)


In [None]:
# Initial read attempt: ';' as the separator, '*' as the quote character,
# and schema inference enabled. As the output of the following cells shows,
# every row ends up in a single string column with these settings; the file
# is read again further down with ',' as the separator.
first_attempt_options = {
    "header": "true",
    "multiline": "true",
    "sep": ";",
    "quote": "*",
    "dateFormat": "M/d/y",
    "inferSchema": "true",
}
df = spark.read.options(**first_attempt_options).csv(data_path)

In [None]:
# Preview the first 10 rows, truncating each cell to 20 characters.
df.show(n=10, truncate=20)

+--------------------------------------------------------------------------------------------+
|Film,Genre,Lead Studio,Audience score %,Profitability,Rotten Tomatoes %,Worldwide Gross,Year|
+--------------------------------------------------------------------------------------------+
|                                                                        Zack and Miri Mak...|
|                                                                        Youth in Revolt,C...|
|                                                                        You Will Meet a T...|
|                                                                        When in Rome,Come...|
|                                                                        What Happens in V...|
|                                                                        Water For Elephan...|
|                                                                        WALL-E,Animation,...|
|                                                 

In [None]:
# Print the column names and types Spark assigned to the frame read above.
df.printSchema()

root
 |-- Film,Genre,Lead Studio,Audience score %,Profitability,Rotten Tomatoes %,Worldwide Gross,Year: string (nullable = true)



In [None]:
# Explicit schema for the movies CSV: one nullable field per header column.
# "Worldwide Gross" is kept as a string because its values carry a '$'
# prefix (see the preview output); it is converted to a number later.
schema = (
    StructType()
    .add("Film", StringType(), nullable=True)
    .add("Genre", StringType(), nullable=True)
    .add("Lead Studio", StringType(), nullable=True)
    .add("Audience score %", IntegerType(), nullable=True)
    .add("Profitability", FloatType(), nullable=True)
    .add("Rotten Tomatoes %", IntegerType(), nullable=True)
    .add("Worldwide Gross", StringType(), nullable=True)
    .add("Year", IntegerType(), nullable=True)
)

In [None]:
# Read the CSV again, this time comma-separated and with the explicit
# `schema`, so each header column becomes its own typed column.
# `inferSchema` is intentionally not set: a user-supplied schema takes
# precedence over inference, so the option had no effect here and only
# obscured where the column types come from.
df = (
    spark.read
    .option("header", "true")
    .option("multiline", "true")
    .option("sep", ",")
    .schema(schema)
    .csv(data_path)
)

In [None]:
# Preview the first 10 rows, truncating each cell to 20 characters.
df.show(n=10, truncate=20)

+--------------------+---------+--------------------+----------------+-------------+-----------------+---------------+----+
|                Film|    Genre|         Lead Studio|Audience score %|Profitability|Rotten Tomatoes %|Worldwide Gross|Year|
+--------------------+---------+--------------------+----------------+-------------+-----------------+---------------+----+
|Zack and Miri Mak...|  Romance|The Weinstein Com...|              70|    1.7475417|               64|        $41.94 |2008|
|     Youth in Revolt|   Comedy|The Weinstein Com...|              52|         1.09|               68|        $19.62 |2010|
|You Will Meet a T...|   Comedy|         Independent|              35|    1.2118182|               43|        $26.66 |2010|
|        When in Rome|   Comedy|              Disney|              44|          0.0|               15|        $43.04 |2010|
|What Happens in V...|   Comedy|                 Fox|              72|    6.2676473|               28|       $219.37 |2008|
| Water 

In [None]:
# Add a numeric version of "Worldwide Gross": remove every '$' and ','
# character from the text, then cast what remains to double.
gross_without_symbols = f.regexp_replace('Worldwide Gross', '[$,]', '')
df2 = df.withColumn('world_gross_f', gross_without_symbols.cast('double'))
df2.show()

+--------------------+---------+--------------------+----------------+-------------+-----------------+---------------+----+-------------+
|                Film|    Genre|         Lead Studio|Audience score %|Profitability|Rotten Tomatoes %|Worldwide Gross|Year|world_gross_f|
+--------------------+---------+--------------------+----------------+-------------+-----------------+---------------+----+-------------+
|Zack and Miri Mak...|  Romance|The Weinstein Com...|              70|    1.7475417|               64|        $41.94 |2008|        41.94|
|     Youth in Revolt|   Comedy|The Weinstein Com...|              52|         1.09|               68|        $19.62 |2010|        19.62|
|You Will Meet a T...|   Comedy|         Independent|              35|    1.2118182|               43|        $26.66 |2010|        26.66|
|        When in Rome|   Comedy|              Disney|              44|          0.0|               15|        $43.04 |2010|        43.04|
|What Happens in V...|   Comedy|  

In [None]:
# Word frequency over film titles: lower-case each title, split it on single
# spaces, emit one row per resulting token, then count rows per token and
# show the 100 most frequent. Punctuation stays attached to its token
# (e.g. "mia!" in the output) because only ' ' is used as the delimiter.
title_tokens = f.explode(f.split(f.lower(f.col('Film')), ' '))
token_counts = df2.withColumn('words', title_tokens).groupBy('words').count()
token_counts.orderBy(f.desc('count')).show(100)

+-----------+-----+
|      words|count|
+-----------+-----+
|        the|   14|
|        and|    9|
|         of|    6|
|          a|    5|
|        you|    4|
|       love|    4|
|         in|    4|
|        day|    3|
|        for|    3|
|       city|    3|
|          i|    3|
|     juliet|    3|
|        sex|    3|
|       mia!|    2|
|        not|    2|
|        new|    2|
|     gnomeo|    2|
|      mamma|    2|
|         me|    2|
|    married|    2|
|       year|    2|
|       just|    2|
|         my|    2|
|    happens|    2|
|         to|    2|
|   twilight|    2|
|      monte|    1|
|      carlo|    1|
|     marley|    1|
|  fireproof|    1|
|        did|    1|
|        two|    1|
|       miri|    1|
|     wright|    1|
|   borrowed|    1|
|       eyre|    1|
|   infinite|    1|
|     wall-e|    1|
|       moon|    1|
|       luck|    1|
|       will|    1|
|    serious|    1|
|       wife|    1|
|christmases|    1|
|   waitress|    1|
|     rachel|    1|
|    letters|    1|
