In [1]:
# Set JAVA_HOME for this kernel's environment through the %env line magic.
# NOTE(review): presumably this is so PySpark starts its JVM from this JDK — confirm.
# NOTE(review): absolute, machine-specific macOS path; it must be edited on any
# other machine before the notebook can run.
%env JAVA_HOME = /Library/Java/JavaVirtualMachines/jdk-14.0.1.jdk/Contents/Home

env: JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-14.0.1.jdk/Contents/Home


In [2]:
#Setting Spark Context
from pyspark import SparkContext

# getOrCreate() returns the already-running context when there is one, so this
# cell can be re-run safely. A second bare SparkContext() call raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

# Register the GraphFrames jar with the context's Python file list.
# NOTE(review): presumably this is what makes `import graphframes` resolve
# later in the notebook — confirm.
# NOTE(review): absolute, machine-specific path; adjust per environment.
sc.addPyFile('/Users/meghashishodia/532/spark/jars/graphframes-0.8.2-spark3.2-s_2.12.jar')

In [3]:
from pyspark.sql import SparkSession

# Build (or fetch) the SparkSession used for all DataFrame and SQL work below.
# NOTE(review): a SparkContext was already created in an earlier cell, so
# getOrCreate() presumably reuses it; master and spark.driver.memory may then
# be ignored because the driver JVM is already running — confirm.
# NOTE(review): the two timeout values carry no unit suffix; verify which
# default unit Spark applies to each key.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("Python Spark SQL")
    .config("spark.executor.heartbeatInterval", "200000")
    .config("spark.network.timeout", "300000")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

In [4]:
#Loading data produced after preprocessing as a dataframe
path = "data/graphxdata.csv"
# .csv() selects the CSV data source on its own, so the reader needs no separate
# .format(...) call; the previous .format("true") named a non-existent source
# and only worked because .csv() overrode it.
# No schema is supplied or inferred, so every column is read as a string.
df = spark.read.option("header", "true").csv(path)
df

DataFrame[User_Id: string, Movie_Title: string, Movie_Id: string, Genres: string, Gender: string, Age: string, Occupation: string, Rating_num: string, count_movieRatedTotal: string, count_userRatedTotal: string]

In [5]:
#Prepare the vertex id column; also builds a frame of random ids (see note below)
from numpy.random import default_rng
import pandas as pd

# Row count of the loaded data; count() triggers a Spark job.
df_size = df.count()

# Draw df_size distinct integers from [0, 2*df_size) without replacement.
# The generator is unseeded, so the values differ on every run.
rng = default_rng()
numbers = rng.choice(df_size*2, size=df_size, replace=False)

# Wrap the random integers in a one-column pandas frame named 'id' ...
df1 = pd.DataFrame(numbers, columns=['id'])

# ... and convert that pandas frame into a Spark DataFrame.
# NOTE(review): numbers, df1 and df2 are not referenced by any later cell
# visible here; if nothing else uses them, the count() and the random draw
# are wasted work — confirm before removing.
df2 = spark.createDataFrame(df1)

# The vertex id is a plain copy of User_Id (the random ids above are not used here).
df = df.withColumn("VertexId", df.User_Id)

In [6]:
#Vertices DF
# Project the vertex attribute columns, then rename positionally so that
# VertexId becomes the "id" column; all other names are kept as they are.
vertices = df.select(
    'VertexId', 'User_Id', 'Movie_Title', 'Movie_Id', 'Genres',
    'Gender', 'Age', 'Occupation', 'count_movieRatedTotal',
).toDF(
    "id", 'User_Id', 'Movie_Title', 'Movie_Id', 'Genres',
    'Gender', 'Age', 'Occupation', 'count_movieRatedTotal',
)
vertices

DataFrame[id: string, User_Id: string, Movie_Title: string, Movie_Id: string, Genres: string, Gender: string, Age: string, Occupation: string, count_movieRatedTotal: string]

In [7]:
#Edges DF
# One edge per row of df: user id -> movie title, with the rating value
# carried in the "relationship" column.
edges = df.select('User_Id', 'Movie_Title', 'Rating_num').toDF(
    "src", "dst", "relationship"
)
edges

DataFrame[src: string, dst: string, relationship: string]

In [8]:
#create graph from vertices and edges DF
# NOTE(review): vertex ids are user ids while edge "dst" values are movie
# titles, so destination endpoints presumably have no matching vertex row.
# Vertex ids also repeat, one row per rated movie. Confirm this denormalized
# layout is intended.
import graphframes
from graphframes import GraphFrame
graph = GraphFrame(vertices, edges)

In [9]:
#Vertices
# Print a preview of the graph's vertex DataFrame (show() prints the first rows only).
graph.vertices.show()

+----+-------+--------------------+--------+--------------------+------+---+----------+---------------------+
|  id|User_Id|         Movie_Title|Movie_Id|              Genres|Gender|Age|Occupation|count_movieRatedTotal|
+----+-------+--------------------+--------+--------------------+------+---+----------+---------------------+
| 296|    296|Snow White and th...|     594|Animation|Childre...|     M| 50|         5|                  763|
| 296|    296|       Psycho (1960)|    1219|     Horror|Thriller|     M| 50|         5|                 1263|
| 829|    829|Snow White and th...|     594|Animation|Childre...|     M|  1|        19|                  763|
| 829|    829|In the Heat of th...|    1950|       Drama|Mystery|     M|  1|        19|                  348|
| 829|    829|       Psycho (1960)|    1219|     Horror|Thriller|     M|  1|        19|                 1263|
| 829|    829|   Annie Hall (1977)|    1230|      Comedy|Romance|     M|  1|        19|                 1334|
|1436|   1

In [10]:
#Edges
# Print a preview of the graph's edge DataFrame (src, dst, relationship).
graph.edges.show()

+----+--------------------+------------+
| src|                 dst|relationship|
+----+--------------------+------------+
| 296|Snow White and th...|           3|
| 296|       Psycho (1960)|           5|
| 829|Snow White and th...|           3|
| 829|In the Heat of th...|           4|
| 829|       Psycho (1960)|           5|
| 829|   Annie Hall (1977)|           5|
|1436|    Elizabeth (1998)|           5|
|2069|       Psycho (1960)|           4|
|2088|Night of the Livi...|           4|
|2088|In the Heat of th...|           4|
|2088|       Psycho (1960)|           5|
|2088|   Annie Hall (1977)|           4|
|2162|       Psycho (1960)|           3|
|2294|Snow White and th...|           3|
|4032|7th Voyage of Sin...|           5|
|4821|Odd Couple II, Th...|           1|
|4937|    Elizabeth (1998)|           5|
|5645|Night of the Livi...|           4|
|5925|Snow White and th...|           3|
|5925|   Annie Hall (1977)|           4|
+----+--------------------+------------+
only showing top 20 rows

In [11]:
#InDegree of Movies
# inDegree is the number of incoming edges per destination id; destinations
# are movie titles here, so this is the number of ratings each movie received.
results = graph.inDegrees
# Ascending order: the least-rated movies are listed first.
results.orderBy("inDegree").show()

+--------------------+--------+
|                  id|inDegree|
+--------------------+--------+
|In the Realm of t...|       1|
|       Gabbeh (1996)|       1|
|Monument Ave. (1998)|       1|
|Draughtsman's Con...|       1|
|    Firelight (1997)|       1|
|In the Bleak Midw...|       1|
|Adrenalin: Fear t...|       1|
|Swept from the Se...|       1|
|         Fall (1997)|       1|
|   Miss Julie (1999)|       1|
|Adventures of Elm...|       1|
|Brother, Can You ...|       1|
|  Light It Up (1999)|       1|
|Dreaming of Josep...|       1|
|Little Nemo: Adve...|       1|
|Time Regained (Le...|       1|
|  Hollow Reed (1996)|       1|
| Secret Agent (1936)|       1|
| Queens Logic (1991)|       1|
|American Dream (1...|       1|
+--------------------+--------+
only showing top 20 rows



In [12]:
#Register the in-degree result as a temp view so it can be queried with SQL
results.createOrReplaceTempView("INDEGREE")
#Top 5 movies by inDegree, i.e. by number of incoming rating edges.
#This ranks the most-rated movies, not the highest-rated ones: the rating value
#itself (the edge "relationship" column) is not used.
spark.sql("select id, inDegree from INDEGREE ORDER BY inDegree desc limit 5").show(truncate=False)

+-----------------------------------------------------+--------+
|id                                                   |inDegree|
+-----------------------------------------------------+--------+
|American Beauty (1999)                               |464     |
|Star Wars: Episode V - The Empire Strikes Back (1980)|445     |
|Star Wars: Episode IV - A New Hope (1977)            |424     |
|Star Wars: Episode VI - Return of the Jedi (1983)    |421     |
|Terminator 2: Judgment Day (1991)                    |395     |
+-----------------------------------------------------+--------+



In [14]:
#Bottom 5 movies by inDegree, i.e. the least-rated ones (not the lowest-rated:
#the rating value itself is not used). Many movies tie at the minimum, so which
#five are shown is not guaranteed to be stable between runs.
import time
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock and can jump if the system clock changes.
start_time = time.perf_counter()
spark.sql("select id, inDegree from INDEGREE ORDER BY inDegree asc limit 5").show(truncate=False)

# Elapsed seconds for the query plus show(); displayed as the cell's output.
time.perf_counter() - start_time

+-----------------------------------+--------+
|id                                 |inDegree|
+-----------------------------------+--------+
|Bitter Sugar (Azucar Amargo) (1996)|1       |
|X: The Unknown (1956)              |1       |
|In the Bleak Midwinter (1995)      |1       |
|American Dream (1990)              |1       |
|Secret Agent (1936)                |1       |
+-----------------------------------+--------+



0.3879051208496094

In [16]:
# Expose the graph's vertex DataFrame to Spark SQL under the name "Vertices".
graph.vertices.createOrReplaceTempView("Vertices")

In [17]:
#Genres ranked by number of distinct movie titles among rows with 20 < Age < 30
#(both bounds are exclusive). Watch_Count counts distinct titles per genre
#string, not individual ratings or views.
#NOTE(review): Age was loaded as a string column; the numeric comparison relies
#on Spark's implicit cast — confirm this is intended.
import time
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock and can jump if the system clock changes.
start_time = time.perf_counter()

spark.sql("select Genres, count(distinct(Movie_Title)) AS Watch_Count FROM Vertices WHERE Age<30 And Age>20 GROUP BY Genres ORDER BY Watch_Count desc").show(truncate=False)

# Elapsed seconds for the query plus show(); displayed as the cell's output.
time.perf_counter() - start_time

+--------------------+-----------+
|Genres              |Watch_Count|
+--------------------+-----------+
|Drama               |502        |
|Comedy              |399        |
|Horror              |135        |
|Comedy|Romance      |122        |
|Comedy|Drama        |121        |
|Drama|Romance       |111        |
|Thriller            |80         |
|Documentary         |54         |
|Action              |53         |
|Drama|Thriller      |52         |
|Children's|Comedy   |44         |
|Action|Thriller     |43         |
|Drama|War           |35         |
|Comedy|Drama|Romance|31         |
|Animation|Children's|31         |
|Crime|Drama         |30         |
|Action|Drama        |30         |
|Horror|Thriller     |30         |
|Comedy|Horror       |30         |
|Action|Sci-Fi       |27         |
+--------------------+-----------+
only showing top 20 rows



0.6950337886810303