This notebook extracts potential props and actors from the text of each scene.

#### Get Prop in Scene Text

##### Fine Tune Classification Model

In [None]:
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import torch

# Manually labeled prop data.
# Label encoding: 1 = prop, 0 = not a prop.
# The `object` column holds the candidate phrase (it is the column that gets
# tokenized further down). The label column is presumably named
# `label`/`labels` so that Trainer can pick it up — TODO confirm against
# prop_data.csv.
df = pd.read_csv("prop_data.csv")

# Convert DataFrame to Dataset
dataset = Dataset.from_pandas(df)

# Split the dataset into training and evaluation sets (80% / 20%).
# A fixed seed keeps the split identical across notebook runs, so evaluation
# metrics from different runs are comparable.
SPLIT_SEED = 42
train_test_split = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Tokenizer for the same 'bert-base-uncased' checkpoint that is fine-tuned below
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize a batch of `object` strings, padded/truncated to 32 tokens."""
    return tokenizer(
        examples['object'],
        truncation=True,
        padding='max_length',
        max_length=32,
    )


def _tokenize_split(split):
    """Tokenize one split, drop the raw text column and expose torch tensors."""
    encoded = split.map(tokenize_function, batched=True)
    encoded = encoded.remove_columns(['object'])
    return encoded.with_format("torch")


# Apply the identical preparation to both splits
tokenized_train_dataset = _tokenize_split(train_dataset)
tokenized_eval_dataset = _tokenize_split(eval_dataset)

# Pre-trained BERT with a two-class classification head (prop / not a prop)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# Fine-tuning hyper-parameters, gathered in one place before being handed to
# TrainingArguments.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed
# version before upgrading.
training_config = dict(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)
training_args = TrainingArguments(**training_config)

# Wire the model, the training arguments and both tokenized splits into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_eval_dataset,
    train_dataset=tokenized_train_dataset,
)

# Run fine-tuning
trainer.train()

# Persist the model and the tokenizer side by side in the same directory
fine_tuned_dir = "./fine_tuned_classification_model"
trainer.save_model(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)

##### Load Scenes Data

In [2]:
import pandas as pd
# Load the scenes table. From here on `df` refers to the scenes (it replaces
# the labeled prop data loaded earlier under the same name). The cells below
# read its `text` column and add the prop/person columns to it.
df = pd.read_csv("scenes.csv")

##### Get Props from Noun Chunks Identified by the Pre-trained spaCy Pipeline

In [3]:
import spacy
from spacy import displacy

# Load the pre-trained small English spaCy pipeline; the next cell uses its
# noun chunks as candidate objects for the prop classifier.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Classify every noun chunk of every scene with the fine-tuned model and
# record the chunks predicted as props.
#
# Output columns written to `df`:
#   prop     - the predicted props, each followed by a comma (e.g. "a gun,the door,")
#   num_prop - how many chunks were predicted as props

# Inference must run with dropout disabled, and inputs must live on the same
# device as the model (Trainer may have moved it to an accelerator).
model.eval()
device = model.device

prop_strings = []
prop_counts = []

for text in df['text']:
    # Missing scene text (e.g. NaN from the CSV) has no noun chunks to offer.
    if isinstance(text, str):
        objects = [chunk.text for chunk in nlp(text).noun_chunks]
    else:
        objects = []

    # Nothing to classify: skip the tokenizer/model, which cannot handle an
    # empty batch.
    if not objects:
        prop_strings.append('')
        prop_counts.append(0)
        continue

    # Tokenize the candidate objects exactly as during fine-tuning
    objects_encodings = tokenizer(
        objects,
        truncation=True,
        padding='max_length',
        max_length=32,
        return_tensors="pt",
    ).to(device)

    # Get predictions
    with torch.no_grad():
        outputs = model(**objects_encodings)
        predictions = torch.argmax(outputs.logits, dim=-1)

    # Bring predictions back to the CPU before converting to NumPy
    predicted_labels = predictions.cpu().numpy()

    # Label 1 means "prop"
    props = [obj for obj, label in zip(objects, predicted_labels) if label == 1]

    # Keep the original trailing-comma format so downstream parsing is unaffected
    prop_strings.append(''.join(obj + ',' for obj in props))
    prop_counts.append(len(props))

# Assign whole columns at once instead of mutating cells one by one
df['prop'] = pd.Series(prop_strings, index=df.index, dtype=object)
df['num_prop'] = pd.Series(prop_counts, index=df.index, dtype='int64')

#### Get Person in Scene Text

In [None]:
# NER model in Flair
from flair.data import Sentence
from flair.models import SequenceTagger

# Load the Flair sequence tagger used below to find PERSON entities in each
# scene's text.
tagger = SequenceTagger.load("flair/ner-english-ontonotes-large")

In [None]:
# Output columns: `person` holds the distinct upper-cased person names of a
# scene, each followed by a comma; `num_person` holds how many there are.
df['person'] = ''
df['num_person'] = 0

for row in range(len(df)):
    sentence = Sentence(df.loc[row, 'text'])
    tagger.predict(sentence)

    # dict.fromkeys drops repeated names while keeping first-seen order
    unique_names = list(dict.fromkeys(
        span.text.upper()
        for span in sentence.get_spans('ner')
        if span.tag == "PERSON"
    ))

    df.loc[row, 'person'] = ''.join(f"{name}," for name in unique_names)
    df.loc[row, 'num_person'] = len(unique_names)

#### Output

In [None]:
# Write the scenes table, including the prop/person columns added above,
# to CSV without the DataFrame index.
df.to_csv('scenes_metadata.csv', index=False)