In [8]:
import os
from getpass import getpass

import pandas as pd
import torch
from neo4j import GraphDatabase
from tqdm import tqdm
from transformers import pipeline

# Choose the device index handed to the transformers pipeline below:
# 0 when CUDA is usable, -1 otherwise (reported as "CPU").
has_cuda = torch.cuda.is_available()
if has_cuda:
    device = 0
else:
    device = -1

device_label = 'GPU' if device == 0 else 'CPU'
print(f"Using device: {device_label}")

Using device: GPU


In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Instantiate the MNLI checkpoint's model and tokenizer once. Neither object is
# bound to a name; the tokenizer, being the cell's last expression, is what the
# notebook displays.
# NOTE(review): presumably this only warms the local Hugging Face cache for the
# zero-shot classifier that is commented out further down -- confirm it is still needed.
mnli_checkpoint = "facebook/bart-large-mnli"
AutoModelForSequenceClassification.from_pretrained(mnli_checkpoint)
AutoTokenizer.from_pretrained(mnli_checkpoint)

BartTokenizerFast(name_or_path='facebook/bart-large-mnli', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}
)

In [10]:
# Movie metadata table, read from the notebook's working directory.
# Later cells read these columns from it: original_title, original_language,
# release_date, runtime, vote_count, revenue, overview, genres, director.
df = pd.read_csv(filepath_or_buffer='lang.csv')

# Two-letter language code -> display name, used to label Language nodes.
# Codes missing from this table are reported as 'Unknown' by create_graph.
# Both 'zh' and 'cn' map to 'Chinese', so queries that group by name merge them.
# NOTE(review): 'cn' is not a standard two-letter language code; confirm what
# the dataset means by it before relying on the merged 'Chinese' count.
language_dict = dict([
    ('en', 'English'), ('ja', 'Japanese'), ('fr', 'French'), ('zh', 'Chinese'),
    ('es', 'Spanish'), ('de', 'German'), ('hi', 'Hindi'), ('ru', 'Russian'),
    ('ko', 'Korean'), ('te', 'Telugu'), ('cn', 'Chinese'), ('it', 'Italian'),
    ('nl', 'Dutch'), ('ta', 'Tamil'), ('sv', 'Swedish'), ('th', 'Thai'),
    ('da', 'Danish'), ('xx', 'Unknown'), ('hu', 'Hungarian'), ('cs', 'Czech'),
    ('pt', 'Portuguese'), ('is', 'Icelandic'), ('tr', 'Turkish'),
    ('nb', 'Norwegian Bokmål'), ('af', 'Afrikaans'), ('pl', 'Polish'),
    ('he', 'Hebrew'), ('ar', 'Arabic'), ('vi', 'Vietnamese'), ('ky', 'Kyrgyz'),
    ('id', 'Indonesian'), ('ro', 'Romanian'), ('fa', 'Persian'),
    ('no', 'Norwegian'), ('sl', 'Slovenian'), ('ps', 'Pashto'), ('el', 'Greek'),
])

In [11]:
# Load the summarizer on the device selected earlier (GPU when available).
# NOTE(review): the only visible use of `summarizer` is in commented-out code;
# confirm it is still needed before paying for the model load.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

# Load zero-shot classifier (for mood classification)
# classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

# Connect to Neo4j.
# Credentials are read from the environment so that no secret is stored in the
# notebook: set NEO4J_URI, NEO4J_USERNAME and NEO4J_PASSWORD before starting the
# kernel. When NEO4J_PASSWORD is unset the password is prompted for instead.
# SECURITY: a real password was previously committed in this cell; it must be
# rotated, because removing it here does not remove it from version history.
uri = os.environ.get("NEO4J_URI", "neo4j+s://0d57704b.databases.neo4j.io")
neo4j_user = os.environ.get("NEO4J_USERNAME", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
driver = GraphDatabase.driver(uri, auth=(neo4j_user, neo4j_password))

Device set to use cuda:0


In [12]:
# def summarize_text(text):
#     input_length = len(text.split())
#     max_len = min(50, input_length)  # avoid warning
#     min_len = min(25, max_len-1) if max_len > 25 else 5
#     summary = summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)
#     return summary[0]['summary_text']

# def classify_director(text):
#     return text

def create_graph(tx, movie_data):
    """Upsert one movie, its language and its director through transaction ``tx``.

    Args:
        tx: Object exposing ``run(query, **parameters)``; here the transaction
            that ``session.execute_write`` passes in.
        movie_data: Mapping with the keys ``language``, ``title``, ``year``,
            ``runtime``, ``vote_count``, ``revenue``, ``overview``, ``genres``
            and ``director``.

    Returns:
        None. The result of ``tx.run`` is not inspected.

    Notes:
        * The language display name is looked up in ``language_dict``; codes
          that are not listed there are stored as ``'Unknown'``.
        * The Movie node is merged on ``title`` alone, so two rows sharing a
          title update the same node.
    """
    code = movie_data['language']

    # Single statement: MERGE the three nodes, overwrite the movie's
    # properties, then MERGE both relationships.
    cypher = """
        // Create Language node
        MERGE (l:Language {name: $language_name, code: $language_code})
        
        // Create Movie node with all properties
        MERGE (m:Movie {title: $title})
        SET m.year = $year,
            m.runtime = $runtime,
            m.vote_count = $vote_count,
            m.revenue = $revenue,
            m.overview = $overview,
            m.genres = $genres
        
        // Create Director node
        MERGE (d:Director {name: $director})
        
        // Create relationships
        MERGE (l)-[:IN_LANGUAGE]->(m)
        MERGE (d)-[:DIRECTED]->(m)
        """

    params = dict(
        language_name=language_dict.get(code, 'Unknown'),
        language_code=code,
        title=movie_data['title'],
        year=movie_data['year'],
        runtime=movie_data['runtime'],
        vote_count=movie_data['vote_count'],
        revenue=movie_data['revenue'],
        overview=movie_data['overview'],
        genres=movie_data['genres'],
        director=movie_data['director'],
    )
    tx.run(cypher, **params)


In [13]:
# Process the dataset: write every CSV row to Neo4j, one write transaction per row.
# Rows whose write fails are reported and skipped so one bad record does not
# abort the whole load; their titles are collected for the final summary.
failed_titles = []

with driver.session() as session:
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # errors='coerce' yields NaT for a missing or unparseable date instead
        # of raising outside the try block and stopping the load.
        release_date = pd.to_datetime(row['release_date'], errors='coerce')

        movie_data = {
            'title': row['original_title'],
            'language': row['original_language'],
            # None rather than a float NaN when the release date is unknown.
            'year': int(release_date.year) if pd.notna(release_date) else None,
            'runtime': int(row['runtime']) if pd.notna(row['runtime']) else 0,
            'vote_count': int(row['vote_count']) if pd.notna(row['vote_count']) else 0,
            'revenue': float(row['revenue']) if pd.notna(row['revenue']) else 0.0,
            'overview': str(row['overview']) if pd.notna(row['overview']) else '',
            'genres': str(row['genres']) if pd.notna(row['genres']) else '',
            'director': str(row['director']) if pd.notna(row['director']) else 'Unknown'
        }

        try:
            session.execute_write(create_graph, movie_data)
        except Exception as e:
            failed_titles.append(movie_data['title'])
            print(f"Error processing {movie_data['title']}: {e}")

# The driver is intentionally NOT closed here: the following cell opens another
# session on it. Call driver.close() once the last query cell has run.
if failed_titles:
    print(f"⚠️ Movie knowledge graph built with errors: {len(failed_titles)} of {len(df)} rows failed.")
else:
    print("✅ Movie knowledge graph built successfully!")

100%|██████████| 4803/4803 [24:18<00:00,  3.29it/s]

✅ Movie knowledge graph built successfully!





In [14]:
# Report the five language names attached to the most movies, largest first.
# Counts are grouped by Language.name, so codes sharing a name are added together.
top_languages_query = """
        MATCH (l:Language)-[r:IN_LANGUAGE]->(m:Movie)
        RETURN l.name as Language, count(m) as MovieCount
        ORDER BY MovieCount DESC
        LIMIT 5
    """

with driver.session() as session:
    for row in session.run(top_languages_query):
        print(f"{row['Language']}: {row['MovieCount']} movies")

  with driver.session() as session:


English: 4503 movies
French: 70 movies
Chinese: 39 movies
Spanish: 32 movies
German: 27 movies
