# Prepare Dataset

## Load Graph

In [None]:
import random
import csv
import json
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, pipeline
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score


In [1]:
import pickle
import rdflib

graph = rdflib.Graph()
serialized_path = "../Dataset/graph.pkl"
with open(serialized_path, 'rb') as f:
    print("Loading serialized graph")
    graph = pickle.load(f)



Loading serialized graph


In [2]:
import rdflib
RDFS = rdflib.namespace.RDFS
uri = 'http://www.wikidata.org/entity/Q457180'

ent2lbl = {ent: str(lbl) for ent, lbl in graph.subject_objects(RDFS.label)}
ans = ent2lbl[uri]
print(ans)

KeyError: 'http://www.wikidata.org/entity/Q457180'

In [None]:
# graph

In [None]:
# query_template = '''
#             PREFIX ddis: <http://ddis.ch/atai/>
#             PREFIX wd: <http://www.wikidata.org/entity/>
#             PREFIX wdt: <http://www.wikidata.org/prop/direct/>
#             PREFIX schema: <http://schema.org/>
#             PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            
#             SELECT ?movie ?movieLabel ?predicateLabel ?object ?objectLabel WHERE {{
#                 # Find the movie entity based on an exact match for the label
#                 ?movie rdfs:label "{0}"@en .
                
#                 # Retrieve all predicates and objects related to the movie entity
#                 ?movie ?predicate ?object .

#                 FILTER(?predicate IN (
#                       wdt:P31,   # instance of
#                       wdt:P57,   # director
#                       wdt:P162,  # producer
#                       wdt:P364,  # original language
#                       wdt:P272,  # production company
#                       wdt:P58,   # screenwriter
#                       wdt:P166,  # award received
#                       wdt:P2047, # duration
#                       wdt:P577 # release date
#                   ))

#                 # Optionally retrieve labels for predicates and objects
#                 OPTIONAL {{ ?predicate rdfs:label ?predicateLabel . FILTER(LANG(?predicateLabel) = "en") }}
#                 OPTIONAL {{ ?object rdfs:label ?objectLabel . FILTER(LANG(?objectLabel) = "en") }}
#                 OPTIONAL {{ ?movie rdfs:label ?movieLabel . FILTER(LANG(?movieLabel) = "en") }}
#             }}
#             ORDER BY ?movie
#         '''

# movie_name = "The Godfather"
# query = query_template.format(movie_name)

# result = graph.query(query)
# res = [str(row) for row in result]
# print(res)

## Query All Movies

In [9]:
# Define the SPARQL query
# Define the SPARQL query to select all movie genres from the knowledge graph
query = '''
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?genreLabel WHERE {
  ?movie wdt:P31 wd:Q11424 .
  ?movie wdt:P136 ?genre .
  ?genre rdfs:label ?genreLabel .
  FILTER(LANG(?genreLabel) = "en")
}
'''


# Execute the query
result = graph.query(query)

# Extract movie genres into a list
genres = [str(row.genreLabel) for row in result]
print("Total genres found:", len(genres))
print(genres) 



Total genres found: 357
['LGBT-related film', 'romance film', 'comedy film', 'Western film', 'parody film', 'drama', 'action film', 'thriller film', 'kung fu film', 'war film', 'documentary film', 'gangster film', 'historical film', 'biographical film', 'crime thriller', 'film based on books', 'spy film', 'teen film', 'rape and revenge film', 'independent film', 'crime film', 'comedy-drama', 'horror film', 'fantasy film', 'historical drama', 'family film', 'Christmas film', 'romantic comedy', "children's film", 'black comedy film', 'musical film', 'road movie', 'neo-noir', 'martial arts film', 'girls with guns', 'erotic thriller', 'coming-of-age story', 'hood film', 'heist film', 'monster film', 'film based on literature', 'slasher film', 'buddy cop film', 'buddy film', 'police film', 'political thriller', 'suspense', 'adventure film', 'science fiction film', 'concert film', 'trial film', 'splatter film', 'musical comedy', 'cannibal film', 'found footage', 'psychological thriller', 'se

In [None]:
# with open("../Dataset/MovieTitles", 'wb') as f:
#     pickle.dump(movie_names, f)


### Query all person

In [None]:
# # Define the SPARQL query for extracting persons
# query = '''
# PREFIX wd: <http://www.wikidata.org/entity/>
# PREFIX wdt: <http://www.wikidata.org/prop/direct/>
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

# SELECT ?personLabel WHERE {
#   ?person wdt:P31 wd:Q5 .  # Q5 represents humans on Wikidata
#   ?person rdfs:label ?personLabel .
#   FILTER(LANG(?personLabel) = "en")
# }
# '''

# # Execute the query and extract person names
# result = graph.query(query)
# person_names = [str(row.personLabel) for row in result]
# print("Total persons found:", len(person_names))


In [None]:
# with open("../Dataset/MovieTitles", 'wb') as f:
#     pickle.dump(person_names, f)

In [None]:
import pickle

# Read movie titles from the text file and save as a pickle file
with open("../Dataset/MovieTitles.txt", 'r') as txt_file:
    # Read the content of the text file
    content = txt_file.read()
    
    # Convert the string representation of a list into a Python list
    movie_titles = eval(content)

# Save the movie titles as a pickle file
with open("../Dataset/MovieTitles.pickle", 'wb') as pickle_file:
    pickle.dump(movie_titles, pickle_file)

# Load movie titles from the pickle file
with open("../Dataset/MovieTitles.pickle", 'rb') as pickle_file:
    movie_titles = pickle.load(pickle_file)

# Convert movie titles to a list (in case it is not already)
movie_titles = list(movie_titles)


In [None]:
templates = [
    # Existing templates
    "When was {MOVIE} released?",
    "Who directed the movie {MOVIE}?",
    "Who directed {MOVIE}?",
    "Who is the director of {MOVIE}?",
    "Did James Cameron direct {MOVIE}?",
    "Does James Cameron direct {MOVIE}?",
    "Did {MOVIE} have the same director as {MOVIE}?",
    "Is {MOVIE} set in the French Renaissance period?",
    "Is {MOVIE} directed by Stanley Kubrick?",
    "Is {MOVIE} a sequel?",
    "Recommend movies similar to {MOVIE} and {MOVIE}.",
    "Recommend movies like {MOVIE}.",
    "Recommend movies like {MOVIE}, {MOVIE}, and {MOVIE}.",
    "Given that I like {MOVIE}, {MOVIE}, and {MOVIE}, recommend some movies.",

    # Additional templates
    "What is the release date of {MOVIE}?",
    "What was {MOVIE} released?",
    "Who played the lead role in {MOVIE}?",
    "Did {MOVIE} win any Academy Awards?",
    "What is the IMDb rating of {MOVIE}?",
    "What is the genre of {MOVIE}?",
    "Who wrote the screenplay for {MOVIE}?",
    "Who is the screenwriter of {MOVIE}?",
    "What is the box office collection of {MOVIE}?",
    "Who was the cinematographer for {MOVIE}?",
    "What is the runtime of {MOVIE}?",
    "Did {MOVIE} feature any award-winning performances?",
    "Who composed the music for {MOVIE}?",
    "What is the main theme of {MOVIE}?",
    "What awards did {MOVIE} receive?",
    "What is the production company of {MOVIE}?",
    "What languages does {MOVIE} support?",
    "Is {MOVIE} based on a true story?",
    "Who is the writer of {MOVIE}?",
    "Is {MOVIE} set in a specific historical period?",
    "Who is the executive producer of {MOVIE}?",
    "I loved {MOVIE} and {MOVIE}. What else should I watch?",
    "What movies will I like if I like {MOVIE}?",
    "Can you recommend movies similar to {MOVIE}?",
    "Given that I like {MOVIE}, {MOVIE}, and {MOVIE}, recommend some movies.",
    "Suggest films with similar vibes to {MOVIE}.",
    "Recommend movies with time travel themes, for example {MOVIE}.",
    "Suggest movies related to {MOVIE}.",
    "What movies are like {MOVIE}?",
    "Any recommendations for animated movies like {MOVIE}?",
    "Recommend movies that are similar to {MOVIE}.",
    "What movies should I watch if I enjoyed {MOVIE}?",
    "Recommend animated movies to watch if I liked {MOVIE}.",
    # Factual questions converted into templates
    "When was {MOVIE} released?",
    "Who is the director of {MOVIE}?",
    "Who directed {MOVIE}?",
    "What is the MPAA film rating of {MOVIE}?",
    "What is the genre of {MOVIE}?",
    "What is the box office of {MOVIE}?",
    "Can you tell me the publication date of {MOVIE}?",
    "Who is the executive producer of {MOVIE}?",
    "Which director is known for {MOVIE}?",
    "Is {MOVIE} set in the French Renaissance period?",
    # Recommendation questions converted into templates
    "Recommend movies similar to {MOVIE} and {MOVIE}.",
    "Given that I like {MOVIE}, {MOVIE}, and {MOVIE}, can you recommend some movies?",
    "Recommend movies like {MOVIE}, {MOVIE}, and {MOVIE}.",
    "What movies will I like if I like {MOVIE}?",
    "Give me movies like {MOVIE}.",
]

additional_templates = [
    # General queries
    "Tell me about {MOVIE}.",
    "What is {MOVIE} about?",
    "Can you provide a summary of {MOVIE}?",
    "Is {MOVIE} worth watching?",
    "What are some reviews of {MOVIE}?",
    # Awards and nominations
    "How many awards has {MOVIE} won?",
    "Was {MOVIE} nominated for any Oscars?",
    "Did {MOVIE} win any Golden Globe awards?",
    # Cast and crew
    "Who starred in {MOVIE}?",
    "Who are the main actors in {MOVIE}?",
    "List the cast of {MOVIE}.",
    "Who produced {MOVIE}?",
    # Sequels and series
    "Is {MOVIE} part of a series?",
    "What is the sequel to {MOVIE}?",
    "What movies are prequels to {MOVIE}?",
    # Release information
    "When did {MOVIE} come out?",
    "What year was {MOVIE} released?",
    # Genre and style
    "Is {MOVIE} a comedy or a drama?",
    "What style of film is {MOVIE}?",
    "Is {MOVIE} a horror movie?",
    # Recommendations based on mood or theme
    "I'm looking for movies like {MOVIE}. Any suggestions?",
    "What should I watch if I enjoyed {MOVIE}?",
    "Movies similar in theme to {MOVIE}?",
    # Box office and ratings
    "How successful was {MOVIE} at the box office?",
    "What ratings did {MOVIE} receive?",
    "Is {MOVIE} critically acclaimed?",
    # Availability
    "Where can I watch {MOVIE}?",
    "Is {MOVIE} available on Netflix?",
    # Personal opinions
    "Do you think {MOVIE} is a good film?",
    "Would you recommend {MOVIE}?",
    # Comparisons
    "Which is better, {MOVIE} or {MOVIE}?",
    "How does {MOVIE} compare to {MOVIE}?",
    # Behind the scenes
    "Are there any interesting facts about {MOVIE}?",
    "Tell me some trivia about {MOVIE}.",
    # Soundtracks
    "Who composed the soundtrack for {MOVIE}?",
    "Is the music in {MOVIE} noteworthy?",
    # Technical details
    "What camera was used to film {MOVIE}?",
    "Was {MOVIE} shot in digital or on film?",
    # Language and subtitles
    "Is {MOVIE} in English?",
    "Does {MOVIE} have subtitles?",
    # Cultural impact
    "How did {MOVIE} influence cinema?",
    "What is the cultural significance of {MOVIE}?",
    # Audience
    "Is {MOVIE} suitable for children?",
    "Can kids watch {MOVIE}?",
    # Plot specifics
    "Does {MOVIE} have a happy ending?",
    "What happens at the end of {MOVIE}?",
    # Release formats
    "Is there a 3D version of {MOVIE}?",
    "Was {MOVIE} released in IMAX?",
    # Miscellaneous
    "Did {MOVIE} face any controversies?",
    "What are the themes explored in {MOVIE}?",
    "Is {MOVIE} based on a book?",
    "Who wrote the original story for {MOVIE}?",
    "Are there any spin-offs from {MOVIE}?",
    "What inspired the creation of {MOVIE}?",
]

templates.extend(additional_templates)


In [None]:
import json
import random
import csv
import re

DATASET_FILE = "movie_title_dataset.json"

def process_template(template, selected_movies):
    # Use regex to split the template by {MOVIE} placeholders
    pattern = re.compile(r'(\{MOVIE\})')
    parts = pattern.split(template)
    sentence_words = []
    labels = []
    movie_idx = 0
    for part in parts:
        if part == "{MOVIE}":
            # Replace with movie title and label it
            movie = selected_movies[movie_idx]
            movie_idx += 1
            movie_words = movie.split()
            sentence_words.extend(movie_words)
            labels.extend(["B-MOVIE"] + ["I-MOVIE"] * (len(movie_words) - 1))
        else:
            # Split the non-placeholder parts into words and label as 'O'
            words = part.split()
            sentence_words.extend(words)
            labels.extend(["O"] * len(words))
    return sentence_words, labels

def generate_dataset(movie_titles, templates, size=60000, output_file=DATASET_FILE):
    dataset = []
    SENTENCE_LEN_THRESHOLD = 100

    # Step 1: Ensure each movie title is included at least once
    for movie in movie_titles:
        # Choose a template with at least one {MOVIE} placeholder
        valid_templates = [t for t in templates if "{MOVIE}" in t]
        template = random.choice(valid_templates)

        # Count the number of {MOVIE} placeholders
        movie_placeholders = template.count("{MOVIE}")

        # Select movies to replace placeholders
        selected_movies = [movie]

        # If more placeholders, fill with random movies excluding the current one
        if movie_placeholders > 1:
            remaining_movies = list(set(movie_titles) - set([movie]))
            additional_movies = random.sample(remaining_movies, min(movie_placeholders - 1, len(remaining_movies)))
            selected_movies.extend(additional_movies)

        # Process the template to get words and labels
        sentence_words, labels = process_template(template, selected_movies)

        if len(sentence_words) > SENTENCE_LEN_THRESHOLD:
            continue

        dataset.append({"sentence": sentence_words, "labels": labels})

    # Step 2: Generate additional sentences to reach the desired size
    while len(dataset) < size:
        template = random.choice(templates)
        movie_placeholders = template.count("{MOVIE}")

        # Randomly select movie titles for each placeholder
        if movie_placeholders > 0:
            selected_movies = random.sample(movie_titles, min(movie_placeholders, len(movie_titles)))
        else:
            selected_movies = []

        # Process the template to get words and labels
        sentence_words, labels = process_template(template, selected_movies)

        if len(sentence_words) > SENTENCE_LEN_THRESHOLD:
            continue

        dataset.append({"sentence": sentence_words, "labels": labels})

    # Save dataset as JSON Lines
    with open(output_file, 'w', encoding='utf-8') as f:
        for example in dataset:
            f.write(json.dumps(example) + '\n')

    # Convert the data to CSV for visual inspection
    with open("movie_title_dataset.csv", mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["sentence", "labels"])
        for example in dataset:
            # Convert lists to space-separated strings
            sentence_str = ' '.join(example['sentence'])
            labels_str = ' '.join(example['labels'])
            writer.writerow([sentence_str, labels_str])

    print(f"Dataset saved to {output_file} and 'movie_title_dataset.csv'")

# Generate the training data



In [None]:
# TRAIN_DATA = generate_dataset(movie_titles, templates)

## Prepare the Dataset for Fine-Tuning

In [None]:
from datasets import load_dataset

# Load dataset from JSON Lines file
dataset = load_dataset('json', data_files=DATASET_FILE, split='train')

# Split the dataset into train, validation, and test sets
train_testvalid = dataset.train_test_split(test_size=0.3, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

train_dataset = train_testvalid['train']
eval_dataset = test_valid['train']
test_dataset = test_valid['test']


tokenize the sentences in the dataset using the `BertTokenizer`

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

label_list = ["O", "B-MOVIE", "I-MOVIE"]
label_map = {label: i for i, label in enumerate(label_list)}

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["sentence"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_map[label[word_idx]])
            else:
                # Set the label for sub-tokens to -100 to ignore during loss computation
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs




Tokenization to all datasets

In [None]:
# Apply the tokenization to all datasets
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

Tune the BERT-base-NER

In [None]:
# %pip install -U accelerate
# %pip install -U transformers

Check GPU

In [None]:
import torch

print(torch.cuda.is_available())  # Should return True
print(torch.cuda.get_device_name(0))  # Should display your GPU name, e.g., 'NVIDIA GeForce RTX 4060'


In [None]:
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

label_list = ["O", "B-MOVIE", "I-MOVIE"]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [[label_list[pred] for pred, lab in zip(prediction, label) if lab != -100]
                        for prediction, label in zip(predictions, labels)]

    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


In [None]:
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# Load the model
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(label_list))

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=300,
    load_best_model_at_end=True,
)

# Define the compute_metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [label_list[l] for l in label if l != -100]
        for label in labels
    ]
    true_predictions = [
        [label_list[pred] for pred, lab in zip(prediction, label) if lab != -100]
        for prediction, label in zip(predictions, labels)
    ]

    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)



trainer.train()


## Test result

In [None]:
test_results = trainer.evaluate(test_dataset)
print(test_results)

Save the model

In [None]:
# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_BERT_base_uncased")
tokenizer.save_pretrained("./fine_tuned_BERT_base_uncased")

print("Model and tokenizer saved to './fine_tuned_BERT_base_uncased'")


## Inference

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_BERT_base_uncased")
model = AutoModelForTokenClassification.from_pretrained("./fine_tuned_BERT_base_uncased")

# Create the pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example inference
sentences = [
    "Is Wildcats directed by Stanley Kubrick?",
    "Who played the lead role in Avatar",
    "Who directed Night Across the Street",
    "When was The Godfather released?",
    "When was 'The Godfather' released?",
    "Who is the screenwriter of The Masked Gang: Cyprus?",
    "Who is the director of Star Wars: Episode VI - Return of the Jedi?",
]

sentences_1 = [
        "Did Christopher Nolan direct Inception?",
    "Is GoldenEye 007 a James Bond movie?",
"Is Following a black and white film?",
"Does the lord of the Rings Trilogy consist of three movies?",
"Does First Man depict the life of Neil Armstrong?",
"Is La Princesse de Clèves set in the French Renaissance period?",
"Is 2001: A Space Odyssey directed by Stanley Kubrick?",
"Is Devil in the Flesh 2 a sequel?",
"Did James Cameron direct Titanic?",
]

for s in sentences_1:
    ner_results = ner_pipeline(s)

    print(f"\nSentence: \"{s}\"")
    if ner_results:
        for entity in ner_results:
            label = entity["entity_group"]
            word = entity["word"]
            score = entity["score"]
            if label in ('LABEL_1', 'LABEL_2'):
                print(f"  - Entity: '{word}', Label: '{label}', Confidence: {score:.2f}")
    else:
        print("No entities found.")


In [None]:
from transformers import pipeline

# Load the fine-tuned model
ner_pipeline = pipeline("ner", model="./Tuned_BERT_NER_movie-60000", tokenizer="./Tuned_BERT_NER_movie-60000", aggregation_strategy="simple", device="cuda")

# Example inference
sentences = [
    "Let's talk about Avatar.",
    "When was The Godfather released?",
    "When was vampire assassin released?",
    "Who is the screenwriter of The Masked Gang: Cyprus?",
    "Who is the director of Star Wars: Episode VI - Return of the Jedi?",
]

sentences_1 = [
        "Did Christopher Nolan direct Inception?",
    "Is GoldenEye 007 a James Bond movie?",
"Is Following a black and white film?",
"Does the lord of the Rings Trilogy consist of three movies?",
"Does First Man depict the life of Neil Armstrong?",
"Is La Princesse de Clèves set in the French Renaissance period?",
"Is 2001: A Space Odyssey directed by Stanley Kubrick?",
"Is Devil in the Flesh 2 a sequel?",
"Did James Cameron direct Titanic?",
]

recommendation_sentence = [
    "Given that I like Inception, The Godfather can you recommend me some movies"
]

for s in sentences:
    ner_results = ner_pipeline(s)

    print(f"\nSentence: \"{s}\"")
    if ner_results:
        for entity in ner_results:
            label = entity["entity_group"]
            word = entity["word"]
            score = entity["score"]
            if label in ('LABEL_1', 'LABEL_2'):
                print(f"  - Entity: '{word}', Label: '{label}', Confidence: {score:.2f}")
    else:
        print("No entities found.")
