# Clustering of different nodes

We use the power of Neo4j to cluster a selected range of different features in the dataset.

In [1]:
#Imports
from neo4j import GraphDatabase, basic_auth
from py2neo import Graph
import neo4jupyter
import pandas as pd
import sys  
# An empty string at the front of sys.path makes Python search the current
# working directory first, so the local vis_class module below can be imported.
sys.path.insert(0, '')
from vis_class import vis_class

In [2]:
#Create db connector
# Login.txt is read line by line: first line is the connection URI,
# second line is the user name.
with open("Login.txt", "r") as login:
    # readline() keeps the trailing newline; strip it so the driver and the
    # auth token receive clean values. The context manager closes the file
    # even if a read fails.
    uri = login.readline().strip()
    user = login.readline().strip()
# NOTE(review): the password is an empty literal here - presumably redacted
# before sharing. Supply it from a secure source instead of hardcoding it.
password = ""

# One driver and one session against the "neo4j" database are shared by all
# cells below; both are closed in the last cell of the notebook.
driver = GraphDatabase.driver(uri, auth=basic_auth(user, password))
session = driver.session(database="neo4j")
visualizer = vis_class()

<IPython.core.display.Javascript object>

Vis class was created


## Starting clustering

We start simple and cluster each character by the movies they appear in. The query returns each character's name together with a list of their movies.

In [3]:
# One row per character name, with the collected names of the movies the
# character is linked to via APPEARS_IN.
cluster_each = """MATCH (p:Person)-[:APPEARS_IN]->(m:Movie) 
                Return DISTINCT p.name AS `Name character`, collect(m.name) As Movies"""

# The query runs inside a read transaction; .data() is evaluated inside the
# transaction function and its result feeds the DataFrame.
df = pd.DataFrame(
    session.read_transaction(lambda txn: txn.run(cluster_each).data()),
    columns=['Name character', 'Movies'],
)
df.head()

Unnamed: 0,Name character,Movies
0,JAR JAR,"[Episode I: The Phantom Menace, Episode II: At..."
1,BRAVO THREE,[Episode I: The Phantom Menace]
2,FODE/BEED,[Episode I: The Phantom Menace]
3,GREEDO,"[Episode I: The Phantom Menace, Episode IV: A ..."
4,OBI-WAN,"[Episode I: The Phantom Menace, Episode II: At..."


In [4]:
# Graph variant of the query above: returns the Person node, the APPEARS_IN
# relationship and the Movie node (plus both internal ids) for the first 20
# rows, under the source_*/target_* aliases passed on to the visualizer.
cluster_each = """MATCH (p:Person)-[a:APPEARS_IN]->(m:Movie) 
                Return DISTINCT p AS source_node, 
                                id(p) AS source_id,
                                a,
                                m AS target_node,
                                id(m) As target_id 
                LIMIT 20"""

In [5]:
# Draw the Person -> Movie query result.
# NOTE(review): the dict presumably maps a node label to the property used as
# its caption - confirm against vis_class.drawGraph.
vis_class.drawGraph(
    session,
    {'Person': 'name', 'Movie': 'name'},
    cluster_each,
)

Here we count the characters in each movie and keep only the movies that have more than 20 characters.

In [6]:
# Count the matched APPEARS_IN paths per movie and keep only the movies with
# more than 20 of them.
cluster_by_count = """MATCH pattern=((m:Movie)-[a:APPEARS_IN]-(p:Person)) 
                    with count(pattern) as connections, m
                    Where connections > 20
                    Return m.name AS Movie, connections As Characters"""

# Run the query in a read transaction and load the records into a DataFrame.
df = pd.DataFrame(
    session.read_transaction(lambda txn: txn.run(cluster_by_count).data()),
    columns=['Movie', 'Characters'],
)

df.head()

Unnamed: 0,Movie,Characters
0,Episode I: The Phantom Menace,36
1,Episode II: Attack of the Clones,31
2,Episode III: Revenge of the Sith,23
3,Episode VII: The Force Awakens,24


We group the movies into three different groups depending on their number of characters.

In [7]:
#With cases
# Bucket every movie by its number of APPEARS_IN connections:
#   <= 20 -> group 1, 21..30 -> group 2, > 30 -> group 3.
# count(pattern) counts the matched paths per movie, exactly like the
# "> 20 characters" query earlier in this notebook. Re-stating the whole
# pattern inside count() compared the path against a pattern expression and
# only produced the row count by accident.
# NOTE(review): newer Neo4j versions appear to reject pattern expressions in
# that position - confirm for the target server version.
# NOTE(review): the label "Groupe 2: " is spelled differently from
# "Group 1: " / "Group 3: "; it is kept unchanged so the recorded output stays
# in sync - fix it when the notebook is re-run.
cluster_by_count_categories = """MATCH pattern=((m:Movie)-[a:APPEARS_IN]-(p:Person)) 
                                with count(pattern) as connections, m
                                Return 
                                CASE
                                  WHEN connections <= 20 THEN "Group 1: " + m.name
                                  WHEN connections <= 30      THEN "Groupe 2: " + m.name
                                  ELSE "Group 3: " + m.name
                                END AS Clusters
                                Order by connections"""

# Run the query in a read transaction and load the labels into a DataFrame.
df = pd.DataFrame(
    session.read_transaction(lambda tx: tx.run(cluster_by_count_categories).data()),
    columns=['Clusters'],
)

df.head()

Unnamed: 0,Clusters
0,Group 1: Episode VI: Return of the Jedi
1,Group 1: Episode V: The Empire Strikes Back
2,Group 1: Episode IV: A New Hope
3,Groupe 2: Episode III: Revenge of the Sith
4,Groupe 2: Episode VII: The Force Awakens


We cluster characters that appear in two different movies together with the other characters they share a movie with.

In [8]:
# p2 is linked to two movies with different names (m1 and m2); p1 is another
# Person linked to m1. Each row names p2, the connected p1 and the shared m1.
cluster_by_characters_interact_in_diffrent_movies = """MATCH (p1:Person)-[a1:APPEARS_IN]-(m1:Movie)-[a2:APPEARS_IN]-(p2:Person)-[a3:APPEARS_IN]-(m2:Movie)
                                WHERE m1.name <> m2.name
                                RETURN p2.name as Characters, p1.name as CharacterConnected, m1.name as ViaMovie"""

# Run the query in a read transaction and load the records into a DataFrame.
df = pd.DataFrame(
    session.read_transaction(
        lambda txn: txn.run(cluster_by_characters_interact_in_diffrent_movies).data()
    ),
    columns=['Characters', 'CharacterConnected', 'ViaMovie'],
)

df.head()

Unnamed: 0,Characters,CharacterConnected,ViaMovie
0,JAR JAR,MACE WINDU,Episode III: Revenge of the Sith
1,JAR JAR,ODD BALL,Episode III: Revenge of the Sith
2,JAR JAR,DARTH VADER,Episode III: Revenge of the Sith
3,JAR JAR,PLO KOON,Episode III: Revenge of the Sith
4,JAR JAR,GIDDEAN DANU,Episode III: Revenge of the Sith


In [9]:
# Graph variant of the query above: same MATCH/WHERE, but it returns the two
# Person nodes (p1 as source, p2 as target), their internal ids and the a1
# relationship, limited to 20 rows.
# NOTE(review): a1 is the relationship between p1 and m1, not a direct
# p1-p2 relationship - confirm this is what the visualizer expects.
cluster_by_characters_interact_in_diffrent_movies = """MATCH (p1:Person)-[a1:APPEARS_IN]-(m1:Movie)-[a2:APPEARS_IN]-(p2:Person)-[a3:APPEARS_IN]-(m2:Movie)
                                WHERE m1.name <> m2.name
                                RETURN p1 AS source_node, 
                                id(p1) AS source_id,
                                a1,
                                p2 AS target_node,
                                id(p2) As target_id 
                                LIMIT 20"""

In [10]:
# Draw the character-to-character query defined in the previous cell.
# Passing cluster_each here would redraw the Person -> Movie graph instead of
# the connected-characters graph this section is about.
# NOTE(review): 'target_node' is a result alias rather than a node label -
# confirm that vis_class.drawGraph accepts it as an options key.
vis_class.drawGraph(
    session,
    {'Person':'name', 'target_node':'connected'},
    cluster_by_characters_interact_in_diffrent_movies,
)

![title](img/ConnectedWithOthers.png)

## Close the connection

In [11]:
#Close all connections
# Close the session before the driver that created it, so the session can
# release its resources while the driver is still open.
session.close()
driver.close()
# Drop both references so later cells cannot reuse a closed handle.
# (The previous `drive = None` left `driver` bound to the closed driver.)
driver = None
session = None