In [28]:
import pandas as pd
from tqdm import tqdm
import os
import ast
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import torch
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.filterwarnings('ignore')

In [53]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


**Loading The Data**

In [89]:
current_directory = os.getcwd()
file_path = os.path.join(current_directory,"data", "raw", "final_movie_data.csv")

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"File not found: {file_path}")

In [90]:
## Converting cast and genres columns to list
df['cast'] = df['cast'].apply(lambda x: ast.literal_eval(x) if isinstance(x,str) else [])
df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x,str) else [])


In [91]:
## Combining text fields for NER
df['text_for_ner'] = df['overview'] + ' ' + df['genres'].apply(lambda x: ' '.join(x)) + ' ' + df['cast'].apply(lambda x: ' '.join(x))

**Defining the Dataset**

In [83]:
#class MovieDataset(Dataset):
#    def __init__(self, texts, tokenizer, max_length=128):
#        self.texts = texts
#        self.tokenizer = tokenizer
#        self.max_length = max_length
        
        
#    def __len__(self):
#        return len(self.texts)
    
    
#    def __getitem__(self, idx):
#        text = self.texts[idx]
        
#        encoding = self.tokenizer(
#            text,
#            return_tensors="pt",
#            max_length=self.max_length,
#            truncation=True,
#            padding="max_length"
#        )
#        return encoding

**Model Setup**

In [92]:
# Pre-trained Model
model_name = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


**NER Pipeine**

In [93]:
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


**Batch Processing**

In [107]:
def extract_entities(text):
    # Check if the text is valid (not NaN and not just whitespace)
    if pd.notna(text) and not (isinstance(text, str) and text.isspace()): 
        try:
            entities = ner_pipeline(text)
            return {ent['entity_group']: ent['word'] for ent in entities}
        except Exception as e:
            print(f"Error processing text: {e}")
            return {}  # Return an empty dictionary in case of an error
    else:
        return {} 

In [109]:
example_text = df['text_for_ner'].iloc[7]
print(extract_entities(example_text))

{'MISC': 'Animation Science Fiction Action', 'ORG': 'Justice League', 'PER': '##ke Amadi'}


In [111]:
df['extracted_entities'] = df['text_for_ner'].apply(extract_entities)

directory = "E:\\Git Uploads\\CineMatch\\notebook\\data\\processed"
df.to_csv(os.path.join(directory, "movies_with_entities.csv"), index=False)

print("NER completed and data saved!")

NER completed and data saved!
