In [9]:
from neo4j import GraphDatabase
from transformers import pipeline
from tqdm import tqdm
import pandas as pd
import torch

# Check GPU availability
device = 0 if torch.cuda.is_available() else -1
print(f"Using device: {'GPU' if device == 0 else 'CPU'}")

Using device: CPU


In [11]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
AutoTokenizer.from_pretrained("facebook/bart-large-mnli")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


BartTokenizerFast(name_or_path='facebook/bart-large-mnli', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}
)

In [12]:
# Load summarizer (optimized for GPU if available)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

# Load zero-shot classifier (for mood classification)
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

# Candidate moods (expand as needed)
candidate_moods = ["Happy", "Sad", "Thrilling", "Romantic", "Adventurous", "Dark", "Inspiring"]

# Connect to Neo4j (replace with your credentials)
uri = "neo4j+s://fea8a723.databases.neo4j.io"
driver = GraphDatabase.driver(uri, auth=("neo4j", "Qp3U5o9HjMkHPuLjj9M4vL91doNcq3Hj4fGFpZV7-XI"))

Device set to use cpu
Device set to use cpu


In [13]:
def summarize_text(text):
    input_length = len(text.split())
    max_len = min(50, input_length)  # avoid warning
    min_len = min(25, max_len-1) if max_len > 25 else 5
    summary = summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)
    return summary[0]['summary_text']

def classify_mood(text):
    result = classifier(text, candidate_moods)
    return result['labels'][0]

def create_graph(tx, title, director, year, rating, mood):
    tx.run("""
        MERGE (mo:Mood {name: $mood})
        MERGE (m:Movie {title: $title})
        SET m.director = $director, m.year = $year, m.rating = $rating
        MERGE (mo)-[:RECOMMENDS]->(m)
        """, mood=mood, title=title, director=director, year=year, rating=rating)

# Load dataset
df = pd.read_csv("eng.csv")


In [None]:
# Process movies with progress bar
with driver.session() as session:
    for _, row in tqdm(df.iterrows(), total=len(df)):
        title = row['Title']
        director = row['Director']
        year = int(row['Year'])
        rating = float(row['Rating'])
        description = str(row['Description'])

        if pd.isna(description) or description.strip() == "":
            continue

        try:
            summary = summarize_text(description)
            mood = classify_mood(summary)

            session.execute_write(create_graph, title, director, year, rating, mood)
        except Exception as e:
            print(f"Error processing {title}: {e}")

driver.close()
print("✅ Knowledge graph built successfully!")

  session.write_transaction(create_graph, title, director, year, rating, mood)
 96%|█████████▌| 960/1000 [1:08:46<05:32,  8.32s/it]

Error processing Lucky Number Slevin: Cannot resolve address si-fea8a723-5c71.production-orch-0068.neo4j.io:7687


 96%|█████████▌| 961/1000 [1:09:00<06:36, 10.17s/it]

Error processing Trance: Cannot resolve address si-fea8a723-5c71.production-orch-0068.neo4j.io:7687


 96%|█████████▌| 962/1000 [1:09:04<05:07,  8.09s/it]

Error processing Into the Forest: Cannot resolve address fea8a723.databases.neo4j.io:7687


 96%|█████████▋| 963/1000 [1:09:06<03:55,  6.36s/it]

Error processing The Other Boleyn Girl: Cannot resolve address fea8a723.databases.neo4j.io:7687


100%|██████████| 1000/1000 [1:11:41<00:00,  4.30s/it]

✅ Knowledge graph built successfully!



