In [1]:
from pathlib import Path

from pyspark import SparkConf, SparkContext


In [2]:
# This function builds a Python dictionary that we later use to
# convert movie IDs to movie names while printing out the final
# results.
def loadMovieNames(path="ml-100k/u.item"):
    """Build a lookup table from movie ID to movie title.

    Each non-blank line of the file is split on '|'; the first field is
    parsed as the integer movie ID and the second field is used as the title.

    Args:
        path: Location of the pipe-delimited item file. Defaults to
            "ml-100k/u.item", relative to the current working directory.

    Returns:
        dict mapping int movie ID -> str movie title.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the first field of a non-blank line is not an integer.
        IndexError: if a non-blank line has no '|' separator.
    """
    movieNames = {}
    # Decode explicitly as ISO-8859-1: the file contains accented titles stored
    # as single bytes, which fail to decode when the platform default is UTF-8.
    # ISO-8859-1 maps every byte value, so decoding itself can never raise.
    with open(path, encoding="ISO-8859-1") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

In [3]:
# Take each line of u.data and convert it to (movieID, (rating, 1.0))
# This way we can then add up all the ratings for each movie, and
# the total number of ratings for each movie (which lets us compute the average)
def parseInput(line):
    """Turn one whitespace-separated ratings line into a keyed pair.

    The second token is read as the integer movie ID and the third as the
    float rating. The trailing 1.0 is a per-record count so that sums of
    these pairs carry both the rating total and the number of ratings.

    Returns:
        (movieID, (rating, 1.0))
    """
    tokens = line.split()
    movieID = int(tokens[1])
    rating = float(tokens[2])
    return (movieID, (rating, 1.0))


In [4]:
# The main script - create (or reuse) our SparkContext.
# getOrCreate() keeps this cell re-runnable: constructing a second
# SparkContext while one is already active in the kernel raises an error.
conf = SparkConf().setAppName("WorstMovies")
sc = SparkContext.getOrCreate(conf = conf)

# Load up our movie ID -> movie name lookup table
movieNames = loadMovieNames()

# Load up the raw u.data file.
# The location is derived from the same relative "ml-100k" directory that
# loadMovieNames() reads from, instead of a user-specific absolute path, so
# the notebook runs from any checkout as long as it is started next to the
# ml-100k folder. as_uri() yields an explicit file:// URI for Spark.
# NOTE(review): as_uri() percent-encodes characters such as spaces; if the
# working directory contains any, confirm Spark resolves the path correctly.
ratingsPath = Path("ml-100k/u.data").resolve()
lines = sc.textFile(ratingsPath.as_uri())

In [5]:
# Tunable thresholds for this analysis (previously bare literals).
MIN_RATING_COUNT = 10  # a movie must have MORE than this many ratings to qualify
NUM_RESULTS = 10       # how many of the lowest-rated movies to report

# Convert to (movieID, (rating, 1.0))
movieRatings = lines.map(parseInput)

# Reduce to (movieID, (sumOfRatings, totalRatings)).
# Both lambda arguments are (ratingSum, ratingCount) pairs for the same movie.
ratingTotalsAndCount = movieRatings.reduceByKey(lambda left, right: ( left[0] + right[0], left[1] + right[1] ) )

# Filter out movies rated MIN_RATING_COUNT or fewer times
popularTotalsAndCount = ratingTotalsAndCount.filter(lambda x: x[1][1] > MIN_RATING_COUNT)

# Map to (movieID, averageRating)
averageRatings = popularTotalsAndCount.mapValues(lambda totalAndCount : totalAndCount[0] / totalAndCount[1])

# Sort ascending by average rating, so the worst-rated movies come first
sortedMovies = averageRatings.sortBy(lambda x: x[1])

# Take the NUM_RESULTS lowest-rated movies
results = sortedMovies.take(NUM_RESULTS)


In [6]:
# Print each (movieID, averageRating) pair as "<movie title> <average rating>":
for movieID, averageRating in results:
    title = movieNames[movieID]
    print(title, averageRating)


Children of the Corn: The Gathering (1996) 1.3157894736842106
Body Parts (1991) 1.6153846153846154
Amityville II: The Possession (1982) 1.6428571428571428
Lawnmower Man 2: Beyond Cyberspace (1996) 1.7142857142857142
Robocop 3 (1993) 1.7272727272727273
Free Willy 3: The Rescue (1997) 1.7407407407407407
Gone Fishin' (1997) 1.8181818181818181
Ready to Wear (Pret-A-Porter) (1994) 1.8333333333333333
Solo (1996) 1.8333333333333333
Vampire in Brooklyn (1995) 1.8333333333333333
