In [None]:
import pandas as pd

# Load the dataset into a Pandas DataFrame
df = pd.read_csv("/Users/zhangmanlin/dataset_director.csv", header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,264,tt1232829,21 Jump Street,"Action, Comedy, Crime",2012,7.2,5.0,1543391407,Channing Tatum,0.988,"Phil Lord, Christopher Miller"
1,647,tt2965466,Last Shift,"Horror, Mystery",2014,5.8,3.0,1551382166,satanism,0.9925,Anthony DiBlasi
2,653,tt1568911,War Horse,"Adventure, Drama, War",2011,7.2,3.5,1538936562,animals - live action,0.992,Steven Spielberg
3,2165,tt1961175,American Assassin,"Action, Thriller",2017,6.2,1.0,1525740032,propaganda,0.80725,Michael Cuesta
4,2691,tt1232829,21 Jump Street,"Action, Comedy, Crime",2012,7.2,4.5,1442367427,meta,0.988,"Phil Lord, Christopher Miller"


After you run the datacleaning.sql. Load the data by putting the csv file in your Neo4j import file folder.

### **Neo4j Cypher Query for Importing Data**
```cypher
MERGE (u:User {userId: $userId})

MERGE (m:Movie {imdbId: $imdb_id})
ON CREATE SET 
    m.title = $title,
    m.releaseYear = toInteger($releaseYear),
    m.averageRating = toFloat($averageRating)

MERGE (u)-[r:RATED]->(m)
SET r.rating = toFloat($rating), 
    r.timestamp = toInteger($timestamp)

// Handle Genres
WITH m, split($genres, '|') AS genres
UNWIND genres AS genre_name
MERGE (g:Genre {name: trim(genre_name)})
MERGE (m)-[:OF_GENRE]->(g)

// Handle Director(s)
WITH m, split($directors, ',') AS directors
UNWIND directors AS director_name
MERGE (d:Director {name: trim(director_name)})
MERGE (m)-[:DIRECTED_BY]->(d)

// Handle Tags (if available)
WITH m
WHERE $tag IS NOT NULL
MERGE (t:Tag {name: trim($tag)})
MERGE (m)-[:HAS_TAG]->(t)

// Handle Keywords (if relevance >= 0.5)
WITH m
WHERE $tag IS NOT NULL AND $relevance IS NOT NULL AND toFloat($relevance) >= 0.5
MERGE (k:Keyword {name: trim($tag)})
MERGE (m)-[:RELATED_TO {relevance: toFloat($relevance)}]->(k);

### **Neo4j Index Creation for Performance Optimization**
```cypher
CREATE INDEX movie_imdbId IF NOT EXISTS FOR (m:Movie) ON (m.imdbId);
CREATE INDEX director_name IF NOT EXISTS FOR (d:Director) ON (d.name);
CREATE INDEX user_userId IF NOT EXISTS FOR (u:User) ON (u.userId);
CREATE INDEX genre_name IF NOT EXISTS FOR (g:Genre) ON (g.name);
CREATE INDEX tag_name IF NOT EXISTS FOR (t:Tag) ON (t.name);
CREATE INDEX keyword_name IF NOT EXISTS FOR (k:Keyword) ON (k.name);

In [7]:
from neo4j import GraphDatabase

# Define Neo4j connection parameters
NEO4J_URI = "neo4j://localhost:7687"  # Update if needed
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "zml+1908"

# Create a connection to Neo4j
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))


In [8]:
def run_query(query, params=None):
    with driver.session() as session:
        result = session.run(query, parameters=params)
        return pd.DataFrame([dict(record) for record in result])

In [None]:
recommendation_query = """
MATCH (u:User {userId: $userId})-[r1:RATED]->(m:Movie)
WHERE r1.rating >= 4.0

MATCH (u2:User)-[r2:RATED]->(m)
WHERE u2 <> u AND r2.rating >= 4.0

WITH u, u2, COUNT(DISTINCT m) AS similarity
ORDER BY similarity DESC
LIMIT 5 // Top 5 similar users

MATCH (u2)-[r3:RATED]->(candidate:Movie)
WHERE r3.rating >= 4.0 AND NOT EXISTS((u)-[:RATED]->(candidate))

WITH DISTINCT u, candidate

OPTIONAL MATCH (u)-[:RATED|TAGGED]->(likedMovie:Movie)
WHERE EXISTS((u)-[:RATED {rating: 4.0}]->(likedMovie)) OR EXISTS((u)-[:TAGGED]->(likedMovie))

MATCH (likedMovie)-[:OF_GENRE|DIRECTED_BY|HAS_TAG|RELATED_TO]->(commonEntity)
      <-[:OF_GENRE|DIRECTED_BY|HAS_TAG|RELATED_TO]-(candidate)

WITH candidate, COUNT(DISTINCT commonEntity) AS contentScore, 
     COLLECT(DISTINCT commonEntity.name) AS matchedContent

OPTIONAL MATCH (u)-[:INTERESTED_IN]->(interest:Keyword)<-[:RELATED_TO]-(candidate)
WITH candidate, contentScore, matchedContent,
     COUNT(DISTINCT interest) AS interestScore,
     COLLECT(DISTINCT interest.name) AS matchedInterests

WITH candidate, matchedContent, matchedInterests,
     (contentScore + interestScore) AS finalRecommendationScore

RETURN candidate.title AS RecommendedMovie,
       matchedContent AS MatchedContent,
       matchedInterests AS MatchedInterests,
       finalRecommendationScore AS TotalScore
ORDER BY TotalScore DESC
LIMIT 10;
"""

# Run query with a specific userId
user_id = "6038"
recommendations_df = run_query(recommendation_query, {"userId": user_id})
recommendations_df

      RecommendedMovie                                     MatchedContent  \
0  Slumdog Millionaire  [social commentary, violence, emotional, roman...   
1                   Up  [funny, rainy day watchlist, comedy, emotional...   
2              Arrival  [slow, boring, cinematography, thought-provoki...   
3   Mad Max: Fury Road  [surreal, special effects, great soundtrack, c...   
4   The Social Network  [funny, witty, soundtrack, friendship, adapted...   
5           Prometheus  [Watched, Michael Fassbender, philosophical, h...   
6     Moonrise Kingdom  [funny, slow, surreal, fantasy, small town, ro...   
7          Pacific Rim  [romance, silly, cinematography, ending, sci-f...   
8   500 Days of Summer  [Funny, slow, romance, great soundtrack, quirk...   
9   The Imitation Game  [romance, boring, blu-ray, England, history, L...   

                                    MatchedInterests  TotalScore  
0  [music, romance, predictable, cinematography, ...         139  
1  [predictable, f

In [11]:
underrated_query = """
MATCH (u:User {userId: $userId})-[r:RATED]->(m:Movie)

// Convert 10-scale rating explicitly to 5-scale
WITH m, r, (m.averageRating / 2.0) AS convertedAvgRating

// Explicitly round converted rating to nearest 0.5 increment
WITH m, r, 
     round(convertedAvgRating * 2.0) / 2.0 AS roundedAvgRating

// Find movies rated significantly higher by user (≥1.0 point above rounded avg)
WHERE (r.rating - roundedAvgRating) >= 1.0

RETURN m.title AS Movie,
       roundedAvgRating AS RoundedAverageRating,
       r.rating AS UserRating,
       (r.rating - roundedAvgRating) AS RatingDifference
ORDER BY RatingDifference DESC;
"""

# Run query with a specific userId
user_id = "264"
underrated_df = run_query(underrated_query, {"userId": user_id})

underrated_df

Unnamed: 0,Movie,RoundedAverageRating,UserRating,RatingDifference
0,They Came Together,3.0,5.0,2.0
1,Get Hard,3.0,5.0,2.0
2,21 Jump Street,3.5,5.0,1.5
3,That's My Boy,3.0,4.5,1.5
4,Unfinished Business,2.5,4.0,1.5
5,The Campaign,3.0,4.5,1.5
6,Bad Teacher,3.0,4.0,1.0
7,The Interview,3.5,4.5,1.0


In [16]:
oscar_query = """
// Step 1: Calculate content-based match score
MATCH (u:User {userId: $userId})-[r:RATED]->(m:Movie)
WHERE r.rating >= 4.0

MATCH (m)-[:OF_GENRE|DIRECTED_BY|HAS_TAG|RELATED_TO]->(commonEntity)
      <-[:OF_GENRE|DIRECTED_BY|HAS_TAG|RELATED_TO]-(recMovie:Movie)
WHERE NOT EXISTS((u)-[:RATED]->(recMovie))

WITH recMovie, COUNT(DISTINCT commonEntity) AS matchScore, 
     COLLECT(DISTINCT commonEntity.name) AS matchedEntities

// Step 2: Calculate interest-based match score
OPTIONAL MATCH (u)-[:INTERESTED_IN]->(interest:Keyword)<-[:RELATED_TO]-(recMovie)
WITH recMovie, matchScore, matchedEntities,
     COUNT(DISTINCT interest) AS interestScore,
     COLLECT(DISTINCT interest.name) AS matchedKeywords

// Step 3: Check for Oscar-related tags
OPTIONAL MATCH (recMovie)-[:HAS_TAG]->(oscarTag:Tag)
WHERE oscarTag.name CONTAINS 'Oscar' OR oscarTag.name CONTAINS 'Nominee' OR oscarTag.name CONTAINS 'Best Picture'

// Step 4: Assign Oscar bonus points
WITH recMovie, matchScore, matchedEntities, interestScore, matchedKeywords,
     CASE WHEN oscarTag IS NOT NULL THEN 10 ELSE 0 END AS oscarBonus

// Step 5: Calculate final recommendation score
WITH recMovie, matchedEntities, matchedKeywords,
     (matchScore + interestScore + oscarBonus) AS totalScore

RETURN recMovie.title AS RecommendedMovie,
       matchedEntities AS MatchedEntities,
       matchedKeywords AS MatchedKeywords,
       totalScore AS RecommendationScore
ORDER BY RecommendationScore DESC
LIMIT 10;
"""

# Run the corrected query
user_id = "6038"  # Replace with the target user's ID
oscar_df = run_query(oscar_query, {"userId": user_id})

oscar_df

Unnamed: 0,RecommendedMovie,MatchedEntities,MatchedKeywords,RecommendationScore
0,Arrival,"[touching, slow, boring, plot twist, plot hole...","[predictable, cinematography, obvious, overrat...",131
1,Slumdog Millionaire,"[romance, heartwarming, great soundtrack, unre...","[music, romance, predictable, cinematography, ...",121
2,Slumdog Millionaire,"[romance, heartwarming, great soundtrack, unre...","[music, romance, predictable, cinematography, ...",121
3,Slumdog Millionaire,"[romance, heartwarming, great soundtrack, unre...","[music, romance, predictable, cinematography, ...",121
4,Slumdog Millionaire,"[romance, heartwarming, great soundtrack, unre...","[music, romance, predictable, cinematography, ...",121
5,Slumdog Millionaire,"[romance, heartwarming, great soundtrack, unre...","[music, romance, predictable, cinematography, ...",121
6,Slumdog Millionaire,"[romance, heartwarming, great soundtrack, unre...","[music, romance, predictable, cinematography, ...",121
7,Slumdog Millionaire,"[romance, heartwarming, great soundtrack, unre...","[music, romance, predictable, cinematography, ...",121
8,Slumdog Millionaire,"[romance, heartwarming, great soundtrack, unre...","[music, romance, predictable, cinematography, ...",121
9,Slumdog Millionaire,"[romance, heartwarming, great soundtrack, unre...","[music, romance, predictable, cinematography, ...",121
