In [1]:
!pip install pyspark

from pyspark.sql import SparkSession
from pyspark.sql import Row
import collections




In [2]:
# Echo movies.dat line by line (surrounding whitespace removed) to inspect
# the raw "::"-delimited records before parsing them.
with open("movies.dat", encoding="ISO-8859-1") as movies_file:
  for stripped in map(str.strip, movies_file):
    print(stripped)


1::Movie 1 (2001)::Action
2::Movie 2 (2002)::Action
3::Movie 3 (2003)::Romance
4::Movie 4 (2004)::Drama
5::Movie 5 (2005)::Drama
6::Movie 6 (2006)::Thriller
7::Movie 7 (2007)::Comedy
8::Movie 8 (2008)::Comedy
9::Movie 9 (2009)::Sci-Fi
10::Movie 10 (2000)::Thriller
11::Movie 11 (2001)::Sci-Fi
12::Movie 12 (2002)::Comedy
13::Movie 13 (2003)::Sci-Fi
14::Movie 14 (2004)::Action
15::Movie 15 (2005)::Comedy
16::Movie 16 (2006)::Comedy
17::Movie 17 (2007)::Action
18::Movie 18 (2008)::Comedy
19::Movie 19 (2009)::Action
20::Movie 20 (2000)::Sci-Fi
21::Movie 21 (2001)::Thriller
22::Movie 22 (2002)::Sci-Fi
23::Movie 23 (2003)::Thriller
24::Movie 24 (2004)::Sci-Fi
25::Movie 25 (2005)::Sci-Fi
26::Movie 26 (2006)::Romance
27::Movie 27 (2007)::Drama
28::Movie 28 (2008)::Romance
29::Movie 29 (2009)::Thriller
30::Movie 30 (2000)::Drama
31::Movie 31 (2001)::Drama
32::Movie 32 (2002)::Sci-Fi
33::Movie 33 (2003)::Comedy
34::Movie 34 (2004)::Thriller
35::Movie 35 (2005)::Drama
36::Movie 36 (2006)::Thriller

In [3]:
# Echo ratings.dat line by line (surrounding whitespace removed) to inspect
# the raw "::"-delimited records before parsing them.
with open("ratings.dat") as ratings_file:
  for stripped in map(str.strip, ratings_file):
    print(stripped)

6::36::5::1131283453
14::42::2::1467785159
22::30::4::1155486604
11::83::1::1196651871
20::91::3::1225595582
17::62::1::1315983126
28::85::1::1038001970
15::20::1::1356307144
13::87::2::1235367254
26::78::5::1053997241
23::95::2::1152275754
10::87::5::1114159216
15::100::3::1330050989
7::41::2::1456277037
28::21::1::1461972621
27::11::1::1426963347
15::38::5::1114132855
4::40::5::1300272852
4::78::2::1050320375
2::51::2::1270291967
18::1::4::1301663549
13::89::5::1204800860
25::58::3::1156153431
3::41::4::1100611986
1::28::5::1195339058
29::25::3::1371078711
6::55::3::1368162769
7::74::4::1063981795
27::36::4::1227795889
7::46::1::1163463566
6::15::2::1243084148
3::53::2::1129081451
3::47::5::1444649600
14::46::4::1209273991
7::12::4::1241725768
25::8::3::1181016630
15::81::4::1392406578
14::25::4::1300761211
19::75::5::1364232418
17::61::4::1454560905
22::18::2::1403143400
24::50::4::1388957954
9::65::2::1108637428
14::22::4::1498050940
9::53::1::1358251615
21::60::2::1069562658
18::1

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Start Spark: obtain the session for this notebook (getOrCreate() returns an
# already-running session if there is one) and keep a handle on its
# SparkContext for the RDD API used below.
spark = (
    SparkSession.builder
    .appName("BroadcastMovieNames")
    .getOrCreate()
)
sc = spark.sparkContext

In [8]:
# ==========================
# Load Movie Names Function
# ==========================
def loadMovieNames(path="movies.dat", encoding="ISO-8859-1"):
    """Build a ``{movieId: movieName}`` lookup from a ``::``-delimited file.

    Every line is stripped and split on ``"::"``. The first field is converted
    to ``int`` and used as the key; the second field is stored as the value.
    Lines with fewer than two fields (for example blank lines) are skipped.
    If the same ID appears more than once, the last occurrence wins.

    Args:
        path: File to read. Defaults to ``"movies.dat"`` in the working
            directory, so existing zero-argument calls behave as before.
        encoding: Text encoding used to open the file. Defaults to
            ``"ISO-8859-1"``.

    Returns:
        dict: integer movie ID mapped to the movie name string.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the first field of a line with at least two fields is
            not a valid integer.
    """
    movieNames = {}
    with open(path, encoding=encoding) as f:
        for line in f:
            fields = line.strip().split("::")
            # Skip blank or malformed lines that carry no "id::name" pair.
            if len(fields) >= 2:
                movieNames[int(fields[0])] = fields[1]
    return movieNames


In [9]:
# Broadcast movie names.
# Step 1: read the id -> name table once, on the driver.
movieNamesDict = loadMovieNames()

# Step 2: wrap it in a broadcast variable so that the functions mapped over
# the ratings RDD can look names up through `.value`.
movieNamesBroadcast = sc.broadcast(movieNamesDict)


In [10]:
# ==========================
# Load Ratings Dataset
# ==========================
ratings = sc.textFile("ratings.dat")

# Two-stage parse: split each line on "::", then keep the first three fields
# as (int, int, float). Any fields after the third are dropped.
ratingsParsed = ratings.map(lambda line: line.split("::")).map(
    lambda parts: (int(parts[0]), int(parts[1]), float(parts[2]))
)



In [11]:
# ==========================
# Replace movieId with Names
# ==========================
def _attachMovieName(record):
    """Return `record` with its movie-id slot (index 1) swapped for the name.

    The lookup goes through the broadcast dictionary; IDs that are missing
    from it are reported as "Unknown".
    """
    names = movieNamesBroadcast.value
    return (record[0], names.get(record[1], "Unknown"), record[2])


ratingsWithNames = ratingsParsed.map(_attachMovieName)

# Show sample results: pull the first ten records to the driver and print them
sampleResults = ratingsWithNames.take(10)
for result in sampleResults:
    print(result)

# Stop Spark
spark.stop()

(6, 'Movie 36 (2006)', 5.0)
(14, 'Movie 42 (2002)', 2.0)
(22, 'Movie 30 (2000)', 4.0)
(11, 'Movie 83 (2003)', 1.0)
(20, 'Movie 91 (2001)', 3.0)
(17, 'Movie 62 (2002)', 1.0)
(28, 'Movie 85 (2005)', 1.0)
(15, 'Movie 20 (2000)', 1.0)
(13, 'Movie 87 (2007)', 2.0)
(26, 'Movie 78 (2008)', 5.0)
