## Spark

### Find the lowest-rated movies in the MovieLens ml-100k dataset

In [1]:
from pyspark import SparkConf, SparkContext


In [2]:
# This function just creates a Python "dictionary" we can later
# use to convert movie ID's to movie names while printing out
# the final results.

def loadMovieNames(path="ml-100k/u.item", encoding="ISO-8859-1"):
    """Build a {movieID: title} lookup table from a pipe-delimited item file.

    Each usable line has the numeric movie ID in its first field and the
    movie title in its second field; any further fields are ignored.

    Args:
        path: Location of the item file. Defaults to the relative path
            "ml-100k/u.item".
        encoding: Text encoding used to decode the file. ml-100k's u.item
            is ISO-8859-1 and contains accented titles, so relying on the
            platform default raises UnicodeDecodeError wherever that
            default is UTF-8.

    Returns:
        dict mapping int movie ID to str movie title.
    """
    movieNames = {}
    with open(path, encoding=encoding) as f:
        for line in f:
            fields = line.rstrip('\n').split('|')
            # A blank (or otherwise field-less) line has no ID/title pair;
            # skip it instead of crashing in int() or on the index below.
            if len(fields) < 2:
                continue
            movieNames[int(fields[0])] = fields[1]
    return movieNames


In [3]:
# Take each line of u.data and convert it to (movieID, (rating, 1.0))
# This way we can then add up all the ratings for each movie, and
# the total number of ratings for each movie (which lets us compute the average)

def parseInput(line):
    """Convert one whitespace-separated ratings record into a key/value pair.

    The second column is read as the integer movie ID and the third as the
    float rating. The rating is paired with a count of 1.0 so that sums and
    counts can be accumulated together per movie.

    Returns:
        (movieID, (rating, 1.0))
    """
    columns = line.split()
    movieID = int(columns[1])
    rating = float(columns[2])
    return (movieID, (rating, 1.0))


In [4]:
# The main script - create (or reuse) our SparkContext.
# getOrCreate() returns the already-running context when this cell is
# executed again in the same kernel; constructing SparkContext directly a
# second time fails with "Cannot run multiple SparkContexts at once".
conf = SparkConf().setAppName("WorstMovies")
sc = SparkContext.getOrCreate(conf=conf)

In [5]:
# Build the movie ID -> movie name dictionary used when printing results
movieNames = loadMovieNames()

# Read the raw u.data ratings file through the SparkContext.
# To read from HDFS instead, use a URI such as "hdfs:///user/maria_dev/u.data".
# NOTE(review): the path below is absolute and machine-specific; adjust it
# before running this notebook anywhere else.
ratingsFile = "file:///C:/Users/SanmiLee/Desktop/SanmiLeeAI/spark/ml-100k/u.data"
lines = sc.textFile(ratingsFile)


In [6]:
# Convert every raw line to a (movieID, (rating, 1.0)) pair using parseInput
movieRatings = lines.map(parseInput)

In [7]:
# Reduce to (movieID, (sumOfRatings, totalRatings)).
# Both lambda arguments are (ratingSum, ratingCount) pairs belonging to the
# same movie; they are combined element-wise.
ratingTotalsAndCount = movieRatings.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)


In [8]:
# Map to (movieID, averageRating): divide each movie's rating sum by its count.
# mapValues leaves the movieID key untouched.
averageRatings = ratingTotalsAndCount.mapValues(
    lambda sumAndCount: sumAndCount[0] / sumAndCount[1]
)

In [9]:
# Order the (movieID, averageRating) pairs by their average rating
# (sortBy sorts ascending by default, so the lowest averages come first)
sortedMovies = averageRatings.sortBy(lambda idAndAverage: idAndAverage[1])

In [10]:
# Take the first 10 results of the sorted RDD, i.e. the 10 lowest average ratings
results = sortedMovies.take(10)

In [11]:
# Print each movie's name followed by its average rating
for movieID, averageRating in results:
    print(movieNames[movieID], averageRating)

Amityville: Dollhouse (1996) 1.0
Somebody to Love (1994) 1.0
Every Other Weekend (1990) 1.0
Homage (1995) 1.0
3 Ninjas: High Noon At Mega Mountain (1998) 1.0
Bird of Prey (1996) 1.0
Power 98 (1995) 1.0
Beyond Bedlam (1993) 1.0
Falling in Love Again (1980) 1.0
T-Men (1947) 1.0


In [None]:

# Stop the SparkContext. This notebook only ever creates `sc`; no `spark`
# session object is defined, so spark.stop() would raise NameError.
sc.stop()