In [1]:
import os
import sys

# Point PySpark at the interpreter that is running this kernel instead of a
# hard-coded system path, so the driver and the workers cannot end up on
# different Python versions (e.g. when the notebook runs inside a venv or
# conda environment). Falls back to the previous fixed path if the running
# interpreter's location cannot be determined.
_pyspark_python = sys.executable or '/usr/bin/python3'
os.environ['PYSPARK_PYTHON'] = _pyspark_python
os.environ['PYSPARK_DRIVER_PYTHON'] = _pyspark_python
# NOTE(review): this option looks like it is only consumed by the `pyspark`
# launcher script, not by a session created from inside a notebook; it is
# kept unchanged here -- confirm whether it is still needed.
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'notebook'

from pyspark.sql import SparkSession

# Reuse the active session if one exists; otherwise create one with the
# "local[*]" master and this application name.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("movie_analysis_with_names")
    .getOrCreate()
)

In [2]:
import codecs

def loadMovieNames(path="./ml-100k/u.item"):
    """Build a ``{movie_id: title}`` lookup from a pipe-delimited item file.

    Each non-blank line is split on ``|``; the first field is parsed as the
    integer movie id and the second field is stored as its title.

    Args:
        path: Location of the item file. Defaults to the path this notebook
            has always read, so existing no-argument calls are unchanged.

    Returns:
        dict mapping ``int`` movie id to ``str`` title.

    Raises:
        OSError: if ``path`` cannot be opened.
        ValueError: if a non-blank line does not start with an integer id.
        IndexError: if a non-blank line has no second ``|``-separated field.
    """
    movieNames = {}
    # Built-in open() only ends a line on \n, \r or \r\n. Iterating a
    # codecs.open() reader splits with str.splitlines(), which also breaks on
    # characters such as \x0c and \x85 and can cut a record in half.
    # ISO-8859-1 maps every byte to a character, so no error handler is needed.
    with open(path, encoding="ISO-8859-1") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on int('').
            if not line.strip():
                continue
            # Drop the line terminator so a two-field record does not keep it
            # as part of the title.
            fields = line.rstrip("\r\n").split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

# Broadcast the id -> title dictionary through the SparkContext; it is read
# back later via `nameDict.value`.
nameDict = spark.sparkContext.broadcast(
    loadMovieNames()
)


In [3]:
from pyspark.sql.types import IntegerType, LongType, StructField, StructType

# Explicit schema for the ratings file: four nullable columns, built field by
# field instead of from a list of StructField objects.
schema = (
    StructType()
    .add("user_id", IntegerType(), True)
    .add("movie_id", IntegerType(), True)
    .add("rating", IntegerType(), True)
    .add("timestamp", LongType(), True)
)

# Read the tab-separated ratings file using the explicit schema defined above.
movies_data = spark.read.csv(
    path="./ml-100k/u.data",
    schema=schema,
    sep="\t",
)

In [4]:
from pyspark.sql.functions import count, udf, desc

# Number of rating rows per movie. count("movie_id") only counts non-null ids.
movie_counts = (
    movies_data
    .groupBy("movie_id")
    .agg(count("movie_id").alias("count"))
)
movie_counts.show()

+--------+-----+
|movie_id|count|
+--------+-----+
|     496|  231|
|     471|  221|
|     463|   71|
|     148|  128|
|    1342|    2|
|     833|   49|
|    1088|   13|
|    1591|    6|
|    1238|    8|
|    1580|    1|
|    1645|    1|
|     392|   68|
|     623|   39|
|     540|   43|
|     858|    3|
|     737|   59|
|     243|  132|
|    1025|   44|
|    1084|   21|
|    1127|   11|
+--------+-----+
only showing top 20 rows



In [5]:
def lookupName(movieId):
    """Return the title stored for ``movieId`` in the broadcast ``nameDict``.

    Args:
        movieId: Key to look up in ``nameDict.value``.

    Returns:
        The stored title, or ``None`` when ``movieId`` is not a key of the
        dictionary (including when ``movieId`` itself is ``None``). Returning
        ``None`` instead of raising ``KeyError`` keeps one unknown id from
        failing every row when this function is wrapped as a UDF.
    """
    return nameDict.value.get(movieId)

# Wrap the Python lookup as a Spark UDF so it can be applied to a column.
lookupNameUDF = udf(f=lookupName)

In [6]:
# Attach a "Title" column via the lookup UDF, then sort by descending count.
movie_with_names = (
    movie_counts
    .withColumn("Title", lookupNameUDF(movie_counts.movie_id))
    .orderBy(desc("count"))
)

In [7]:
# Print the first 20 rows, truncating long cell values (the defaults of show()).
movie_with_names.show(n=20, truncate=True)

+--------+-----+--------------------+
|movie_id|count|               Title|
+--------+-----+--------------------+
|      50|  583|    Star Wars (1977)|
|     258|  509|      Contact (1997)|
|     100|  508|        Fargo (1996)|
|     181|  507|Return of the Jed...|
|     294|  485|    Liar Liar (1997)|
|     286|  481|English Patient, ...|
|     288|  478|       Scream (1996)|
|       1|  452|    Toy Story (1995)|
|     300|  431|Air Force One (1997)|
|     121|  429|Independence Day ...|
|     174|  420|Raiders of the Lo...|
|     127|  413|Godfather, The (1...|
|      56|  394| Pulp Fiction (1994)|
|       7|  392|Twelve Monkeys (1...|
|      98|  390|Silence of the La...|
|     237|  384|Jerry Maguire (1996)|
|     117|  378|    Rock, The (1996)|
|     172|  367|Empire Strikes Ba...|
|     222|  365|Star Trek: First ...|
|     204|  350|Back to the Futur...|
+--------+-----+--------------------+
only showing top 20 rows

