In [1]:
# findspark locates the local Spark installation and makes pyspark importable,
# so init() has to run before the pyspark import in the next cell.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession, Row, functions

In [24]:
def loadMovieNames(path="../../SparkData/ml-100k/u.item", encoding="ISO-8859-1"):
    '''Map movie IDs to titles for ease of reading.

    Each line of the item file is pipe-delimited: the first field is the
    numeric movie ID and the second field is the title. Any further fields
    are ignored.

    Args:
        path: Location of the item file. Defaults to the path this notebook
            has always read from, so existing no-argument calls are unchanged.
        encoding: Text encoding used to decode the file. The MovieLens 100k
            u.item file is ISO-8859-1 (some titles contain accented
            characters), so relying on the platform default encoding raises
            UnicodeDecodeError on systems whose default is UTF-8.

    Returns:
        dict mapping int movie ID -> str title.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-blank line does not start with an integer ID.
        IndexError: if a non-blank line has no '|'-separated title field.
    '''
    movieNames = {}
    with open(path, encoding=encoding) as f:
        for line in f:
            # tolerate blank lines (e.g. a stray newline at the end of the file)
            if not line.strip():
                continue
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

In [4]:
# Build (or reuse) the Spark session for this notebook.
# The warehouse dir below is a Windows-style path (C: drive).
spark = (
    SparkSession.builder
    .master("local")
    .appName("PopularMoviesV2")
    .config("spark.sql.warehouse.dir", "file:///C:/temp")
    .getOrCreate()
)

In [26]:
# build the movieID -> title lookup that the final cell uses when printing results
nameDict = loadMovieNames()

In [6]:
# grab raw data: load u.data through the SparkContext's textFile()
# NOTE(review): loadMovieNames reads u.item from "../../SparkData/..." while this
# cell reads from "../SparkData/..." -- confirm which relative path is correct.
lines = spark.sparkContext.textFile("../SparkData/ml-100k/u.data")

In [7]:
# Turn each raw line into a Row carrying only the movie ID, which is the
# second whitespace-separated field of the line.
movies = lines.map(
    lambda raw_line: Row(movieID=int(raw_line.split()[1]))
)

In [8]:
# convert RDD of row objects to a DataFrame
# (the Row objects only carry a movieID field, so that is the single column)
movieDataset = spark.createDataFrame(movies)

In [9]:
# Rank movies by popularity: count the rows per movieID, sort by that count
# in descending order, and cache the result because later cells reuse it.
topMovieIDs = (
    movieDataset
    .groupBy("movieID")
    .count()
    .orderBy("count", ascending=False)
    .cache()
)

In [10]:
# preview the ranking; with no arguments show() prints only the first 20 rows
topMovieIDs.show()

+-------+-----+
|movieID|count|
+-------+-----+
|     50|  583|
|    258|  509|
|    100|  508|
|    181|  507|
|    294|  485|
|    286|  481|
|    288|  478|
|      1|  452|
|    300|  431|
|    121|  429|
|    174|  420|
|    127|  413|
|     56|  394|
|      7|  392|
|     98|  390|
|    237|  384|
|    117|  378|
|    172|  367|
|    222|  365|
|    313|  350|
+-------+-----+
only showing top 20 rows



In [11]:
# grab the top 10 rows of the sorted ranking; the final cell iterates over them
top10 = topMovieIDs.take(10)

In [28]:
# Report each of the top 10 as "<title>: <count>".
for result in top10:
    # every row is a (movieID, count) pair; translate the ID to a title
    # through the dictionary built earlier
    movie_id, row_count = result
    print("{}: {}".format(nameDict[movie_id], row_count))

Star Wars (1977): 583
Contact (1997): 509
Fargo (1996): 508
Return of the Jedi (1983): 507
Liar Liar (1997): 485
English Patient, The (1996): 481
Scream (1996): 478
Toy Story (1995): 452
Air Force One (1997): 431
Independence Day (ID4) (1996): 429
