# Spark Movie Recommendation

Disciplina de programação paralela CEFET-RJ  

Discentes: Nadinne Guimarães Holanda e Rafael Assis Mello Pereira Dias

## Import libs

In [42]:
import os
import sys
# Put the parent of the current working directory on the import search path,
# appending it only when it is not listed yet.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)
from enum import Enum
import numpy as np

In [43]:
import pandas as pd
# Import SparkSession
from pyspark.sql import SparkSession
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
import string
import itertools
from notebooks.data import text_mining

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
# Start (or reuse) a Spark session running locally on all available cores.
# NOTE(review): the app name looks carried over from another notebook — confirm.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Linear Regression Model")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)

# Low-level context used by the RDD calls (sc.parallelize) further down.
sc = spark.sparkContext

## Dataset

See more at: https://www.kaggle.com/shivamb/netflix-shows

In [45]:
# Load the Netflix titles CSV, taking the column names from its first line.
df = spark.read.csv(
    path="./notebooks/data/netflix_titles.csv",
    header=True,
)

In [46]:
# Bring only the first 1000 descriptions to the driver. `take` returns the same
# leading rows as `collect()[:1000]` but does not ship the whole column first.
# Rows with no description come through as None.
descriptions = df.select("description").rdd.flatMap(lambda row: row).take(1000)

In [47]:
# Sanity check: how many descriptions were pulled to the driver.
len(descriptions)

1000

## Functions to clean text data

In [48]:
def custom_tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Falsy input (``None`` or an empty string) is reported on stdout and then
    tokenized as an empty string instead.
    """
    if text:
        return word_tokenize(text)
    print('The text to be tokenized is a None type. Defaulting to blank string.')
    return word_tokenize('')

In [49]:
# Translation table that deletes every character in string.punctuation.
# It never changes, so it is built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use by _english_stop_words(); None means "not loaded yet".
_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def pipeline_cleaning(text):
    """Turn a raw description into a list of cleaned, lower-case words.

    Each token from ``custom_tokenize`` is lower-cased and stripped of
    punctuation; it is kept only if what remains is purely alphabetic and is
    not an English stop word.

    Args:
        text: The raw description; falsy values are handled by
            ``custom_tokenize``.

    Returns:
        The surviving words, in their original order.
    """
    stop_words = _english_stop_words()
    cleaned = []
    for token in custom_tokenize(text):
        word = token.lower().translate(_PUNCTUATION_TABLE)
        # isalpha() also rejects tokens that became empty after stripping.
        if word.isalpha() and word not in stop_words:
            cleaned.append(word)
    return cleaned

In [50]:
# Distribute the descriptions and attach the cleaning step to each one.
# Nothing runs yet; the work happens when an action such as collect() is called.
rdd = sc.parallelize(descriptions).map(pipeline_cleaning)

## Computing time for pipeline cleaning

In [51]:
%%time
# collect() is the action that actually runs the cleaning map and returns one
# list of words per description to the driver, so this is what gets timed.
tokenized_documents = rdd.collect()

CPU times: user 9.23 ms, sys: 7.75 ms, total: 17 ms
Wall time: 1.05 s


In [52]:
# Sanity check: one cleaned token list per description.
len(tokenized_documents)

1000

In [53]:
# Vocabulary: every distinct word that appears in any cleaned document.
vocab = {word for document in tokenized_documents for word in document}

In [54]:
# Number of distinct words in the vocabulary.
len(vocab)

5754

In [55]:
# Compute TF-IDF weights with the project's text_mining helpers.
# NOTE(review): the helpers' return types are not visible in this notebook. The
# next cell treats TF_IDF as a mapping whose values are themselves mappings —
# presumably document -> {term: weight}; confirm against text_mining.
DF = text_mining.calculate_df(vocab=vocab, corpus=tokenized_documents)
IDF = text_mining.calculate_idf(VOCAB=vocab, DF=DF, corpus_size=len(tokenized_documents))
TF = text_mining.calculate_tf(corpus=tokenized_documents, VOCAB=vocab)
TF_IDF = text_mining.calculate_tf_idf(TF=TF, IDF=IDF)

In [56]:
# One numpy vector per TF_IDF entry, built from that entry's values in order.
vectors = [np.array(list(TF_IDF[doc_key].values())) for doc_key in TF_IDF]

In [57]:
# Sanity check: number of document vectors built from TF_IDF.
len(vectors)

1000

In [58]:
import random

# Pick a random valid position inside `vectors`. randrange excludes the upper
# bound, unlike randint(0, 1000), which can return 1000 and make the later
# vectors.pop(index) raise IndexError on a 1000-element list.
index = random.randrange(len(vectors))
print(index)

459


In [59]:
# Remove the chosen movie's vector from `vectors` and keep it as the query —
# presumably so the movie is not compared with itself.
# pop() mutates the list: every vector after `index` moves down one position,
# and re-running this cell removes a further vector.
movie = vectors.pop(index)

In [60]:
# Row for the randomly chosen title.
# NOTE(review): this collects the whole DataFrame to the driver to read a single
# row, and assumes row positions in `df` match positions in `vectors` — confirm.
choosed_movie = df.collect()[index]

In [61]:
def get_similarity_items(n=1, similarity=None):
    """Return the ``n`` entries of ``similarity`` with the highest score.

    Args:
        n: Maximum number of entries to return.
        similarity: Iterable of indexable entries whose element ``[0]`` is the
            score to rank by. ``None`` (the default) is treated as empty.

    Returns:
        A list of at most ``n`` entries, highest score first. Entries with
        equal scores keep their input order.
    """
    # The default used to reach sorted(None) and raise TypeError.
    if similarity is None:
        return []
    return sorted(similarity, key=lambda entry: entry[0], reverse=True)[:n]

## Computing time for cosine similarity calculation

In [62]:
%%time
# Score every remaining vector against the chosen movie with the project's
# cosine-similarity helper, distributed over the Spark workers.
rdd = sc.parallelize(vectors).map(lambda vector: text_mining.calc_cosine_similarity([movie], [vector]))
# zipWithIndex pairs each score with its position, giving (score, position).
similarity = rdd.zipWithIndex().collect()
# Keep the three highest-scoring entries.
items = get_similarity_items(n=3, similarity=similarity)
# Positions of the recommendations. They index `vectors`, from which the chosen
# movie has already been popped.
movie_items_index = [item[1] for item in items]

CPU times: user 28.5 ms, sys: 67.7 ms, total: 96.3 ms
Wall time: 673 ms


In [63]:
# Show the title the recommendations are based on.
print(f"Given a movie: {choosed_movie['title']}")

Given a movie: American Experience: Ruby Ridge


In [64]:
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Movie and Tv Shows you may also like"]

# Fetch the titles once instead of re-collecting the DataFrame for every row.
title_rows = df.select("title").collect()

# movie_items_index holds positions in `vectors` after the chosen movie was
# popped at `index`, so every position at or past `index` sits one row before
# its DataFrame row. Shift those forward before looking up the title.
# The loop variable is deliberately not called `index`, so the chosen position
# is no longer overwritten.
for position in movie_items_index:
    row_number = position if position < index else position + 1
    x.add_row([title_rows[row_number]["title"]])

In [65]:
# Render the recommendation table as plain text.
print(x)

+--------------------------------------+
| Movie and Tv Shows you may also like |
+--------------------------------------+
|             Arctic Heart             |
|            Aagey Se Right            |
|                6 Days                |
+--------------------------------------+
