In [29]:
import pandas as pd
from tqdm import tqdm
import os
import ast
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import torch
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.filterwarnings('ignore')

In [30]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


**Loading The Data**

In [18]:
current_directory = os.getcwd()
file_path = os.path.join(current_directory,"data", "raw", "final_movie_data.csv")

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"File not found: {file_path}")

In [19]:
df.head()

Unnamed: 0,title,overview,release_date,genres,runtime,rating,cast
0,Deadpool & Wolverine,A listless Wade Wilson toils away in civilian ...,2024-07-24,"['Action', 'Comedy', 'Science Fiction']",128,7.982,"['Ryan Reynolds', 'Hugh Jackman', 'Emma Corrin..."
1,Bad Boys: Ride or Die,"After their late former Captain is framed, Low...",2024-06-05,"['Action', 'Crime', 'Thriller', 'Comedy']",115,7.67,"['Will Smith', 'Martin Lawrence', 'Vanessa Hud..."
2,Inside Out 2,Teenager Riley's mind headquarters is undergoi...,2024-06-11,"['Animation', 'Family', 'Adventure', 'Comedy']",97,7.637,"[""Lupita Nyong'o"", 'Joseph Quinn', 'Alex Wolff..."
3,Despicable Me 4,"Gru and Lucy and their girls — Margo, Edith an...",2024-06-20,"['Animation', 'Family', 'Comedy', 'Action']",94,7.212,"['Amy Poehler', 'Maya Hawke', 'Kensington Tall..."
4,A Quiet Place: Day One,As New York City is invaded by alien creatures...,2024-06-26,"['Horror', 'Science Fiction', 'Thriller']",99,7.017,"['Steve Carell', 'Kristen Wiig', 'Joey King', ..."


In [124]:
df.isna().sum()

title                  0
overview              25
release_date           7
genres                 0
runtime                0
rating                 0
cast                   0
text_for_ner           0
extracted_entities     0
dtype: int64

In [20]:
## Converting cast and genres columns to list
df['cast'] = df['cast'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])


In [100]:
df['cast'][0]

['Ryan Reynolds',
 'Hugh Jackman',
 'Emma Corrin',
 'Matthew Macfadyen',
 'Dafne Keen']

In [101]:
df['genres'].head()

0         [Action, Comedy, Science Fiction]
1         [Action, Crime, Thriller, Comedy]
2    [Animation, Family, Adventure, Comedy]
3       [Animation, Family, Comedy, Action]
4       [Horror, Science Fiction, Thriller]
Name: genres, dtype: object

In [108]:
## Combining text fields for NER
#df['text_for_ner'] = df['genres'].apply(lambda x: ' '.join(x)) + ' ' + df['cast'].apply(lambda x: ' '.join(x))

**Defining the Dataset**

In [83]:
#class MovieDataset(Dataset):
#    def __init__(self, texts, tokenizer, max_length=128):
#        self.texts = texts
#        self.tokenizer = tokenizer
#        self.max_length = max_length
        
        
#    def __len__(self):
#        return len(self.texts)
    
    
#    def __getitem__(self, idx):
#        text = self.texts[idx]
        
#        encoding = self.tokenizer(
#            text,
#            return_tensors="pt",
#            max_length=self.max_length,
#            truncation=True,
#            padding="max_length"
#        )
#        return encoding

**Model Setup**

In [103]:
# Pre-trained Model
model_name = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


**NER Pipeline**

In [104]:
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


**NER Inference**

In [105]:
def extract_entities(genres, cast, title):
    extracted_entities = {'PER': [], 'GEN': [], 'MOV': []} 

    # Map genres to 'GEN'
    for genre in genres:
        extracted_entities['GEN'].append(genre)

    # Map cast members to 'PER'
    for actor in cast:
        extracted_entities['PER'].append(actor)

    # Add movie title to 'MOV'
    extracted_entities['MOV'].append(title)

    return extracted_entities

**Apply NER to DataFrame**

In [106]:
df['extracted_entities'] = df.apply(lambda row: extract_entities(row['genres'], row['cast'], row['title']), axis=1)

In [107]:
df['extracted_entities'][4]

{'PER': ['Steve Carell',
  'Kristen Wiig',
  'Joey King',
  'Will Ferrell',
  'Sofía Vergara'],
 'GEN': ['Horror', 'Science Fiction', 'Thriller'],
 'MOV': ['A Quiet Place: Day One']}

In [121]:
df.sample()

Unnamed: 0,title,overview,release_date,genres,runtime,rating,cast,text_for_ner,extracted_entities
4025,The Philadelphia Experiment,A secret government research project tries rev...,2012-07-27,"[Adventure, Science Fiction, Thriller, Mystery]",85,4.863,"[Otto Fahlgren, Alexandra Gjerpen, Mads Sjøgår...",Adventure Science Fiction Thriller Mystery Ott...,"{'PER': ['Otto Fahlgren', 'Alexandra Gjerpen',..."


In [122]:
directory = "E:\\Git Uploads\\CineMatch\\notebook\\data\\processed"
df.to_csv(os.path.join(directory, "movies_with_entities.csv"), index=False)

print("NER completed and data saved!")

NER completed and data saved!


## Feature Engineering

**OHE For Genres**

In [109]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(df['genres'])
genre_df = pd.DataFrame(genre_features, columns=mlb.classes_)

In [110]:
genre_df.sample()

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
1245,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


**Encoding for Cast**

In [111]:
mlb_cast = MultiLabelBinarizer()
cast_features = mlb_cast.fit_transform(df['cast'])
cast_df = pd.DataFrame(cast_features, columns=mlb_cast.classes_)

In [113]:
cast_df.sample()

Unnamed: 0,'Weird Al' Yankovic,50 Cent,A.J. Cook,A.J. Cutler,A.J. Del Cueto,A.J. Langer,A.J. Lister,AJ Raval,Aaliyah,Aamir Khan,...,刘校妤,拉宏,李尚恩,楊奉琛,楊英風,神林恆道,蕭瓊瑞,釋寬謙,권성혁,유호한
1074,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Combining Features**

In [114]:
features_matrix = pd.concat([genre_df, cast_df], axis=1)

In [118]:
df['release_date'] = pd.to_datetime(df['release_date'])

df['release_date'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 5000 entries, 0 to 4999
Series name: release_date
Non-Null Count  Dtype         
--------------  -----         
4993 non-null   datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 39.2 KB


In [119]:
features_matrix['release_year'] = df['release_date'].dt.year 
features_matrix['rating'] = df['rating']

In [126]:
features_matrix.isna().sum()

Action          0
Adventure       0
Animation       0
Comedy          0
Crime           0
               ..
釋寬謙             0
권성혁             0
유호한             0
release_year    7
rating          0
Length: 12219, dtype: int64

In [128]:
features_matrix.dropna(inplace=True)

**Metrics Calculation**

In [129]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(features_matrix, features_matrix)

**Content Based Filtering**

In [130]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """
    Recommends movies based on content similarity (genres and cast).

    Args:
        title: Title of the movie for which to generate recommendations.
        cosine_sim: The cosine similarity matrix.
        top_n: Number of recommendations to return.

    Returns:
        A list of recommended movie titles.
    """

    # Get the index of the movie that matches the title
    idx = df[df['title'] == title].index[0]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top 'top_n' most similar movies (excluding the movie itself)
    sim_scores = sim_scores[1:top_n + 1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 'top_n' most similar movies
    return df['title'].iloc[movie_indices]


Test on Movies

In [131]:
movie_titles_to_test = ["Deadpool & Wolverine", "Inside Out 2", "The Garfield Movie"]

# Get Recommendations for each movie
for title in movie_titles_to_test:
    recommendations = get_recommendations(title, cosine_sim, top_n=5)
    print(f"\nRecommendations for {title}:")
    print(recommendations.to_markdown(numalign="left", stralign="left"))


Recommendations for Deadpool & Wolverine:
|      | title                |
|:-----|:---------------------|
| 4445 | The Sentinel         |
| 889  | Snowpiercer          |
| 77   | Godzilla Minus One   |
| 254  | Sonic the Hedgehog 2 |
| 4842 | Five Blind Dates     |

Recommendations for Inside Out 2:
|      | title                |
|:-----|:---------------------|
| 254  | Sonic the Hedgehog 2 |
| 852  | My Spy               |
| 2187 | Les Misérables       |
| 2931 | Last Looks           |
| 718  | Gods of Egypt        |

Recommendations for The Garfield Movie:
|      | title                                             |
|:-----|:--------------------------------------------------|
| 254  | Sonic the Hedgehog 2                              |
| 2187 | Les Misérables                                    |
| 943  | Nowhere                                           |
| 852  | My Spy                                            |
| 4408 | Naruto the Movie: Ninja Clash in the Land of Snow |


**TFIDF Weighting for Cast**

In [132]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
cast_tfidf = vectorizer.fit_transform(df['cast'].astype(str))

cast_tfidf_df = pd.DataFrame(cast_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

In [134]:
cast_tfidf_df.sample()

Unnamed: 0,50,aaliyah,aamir,aarif,aaron,aarons,aaryan,aarón,aavi,aayam,...,刘校妤,拉宏,李尚恩,楊奉琛,楊英風,神林恆道,蕭瓊瑞,釋寬謙,권성혁,유호한
1879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Combining Features**

In [135]:
feature_matrix = pd.concat([genre_df, cast_tfidf_df], axis=1) 

In [136]:
cosine_sim = cosine_similarity(features_matrix, features_matrix)

In [137]:
movie_titles_to_test = ["Deadpool & Wolverine", "Inside Out 2", "The Garfield Movie"]

# Get Recommendations for each movie
for title in movie_titles_to_test:
    recommendations = get_recommendations(title, cosine_sim, top_n=5)
    print(f"\nRecommendations for {title}:")
    print(recommendations.to_markdown(numalign="left", stralign="left"))


Recommendations for Deadpool & Wolverine:
|      | title                |
|:-----|:---------------------|
| 4445 | The Sentinel         |
| 889  | Snowpiercer          |
| 77   | Godzilla Minus One   |
| 254  | Sonic the Hedgehog 2 |
| 4842 | Five Blind Dates     |

Recommendations for Inside Out 2:
|      | title                |
|:-----|:---------------------|
| 254  | Sonic the Hedgehog 2 |
| 852  | My Spy               |
| 2187 | Les Misérables       |
| 2931 | Last Looks           |
| 718  | Gods of Egypt        |

Recommendations for The Garfield Movie:
|      | title                                             |
|:-----|:--------------------------------------------------|
| 254  | Sonic the Hedgehog 2                              |
| 2187 | Les Misérables                                    |
| 943  | Nowhere                                           |
| 852  | My Spy                                            |
| 4408 | Naruto the Movie: Ninja Clash in the Land of Snow |


**Evaluation Metrics**

In [141]:
import numpy as np 
from sklearn.metrics import precision_score, recall_score

def evaluate_recommendations(df, cosine_sim):
    """
    Evaluates content-based recommendations using precision@k and recall@k.

    Args:
        df: The DataFrame containing movie data.
        cosine_sim: The cosine similarity matrix.
        top_n: Number of recommendations to consider for evaluation.

    Returns:
        Average precision@k and recall@k across all movies in the DataFrame.
    """

    precisions = []
    recalls = []

    for i in range(cosine_sim.shape[0]):
        # Get the movie title
        title = df.iloc[i]['title']

        # Get top 'top_n' recommendations for this movie
        recommendations = get_recommendations(title, cosine_sim, top_n)

        # Get the actual genres of the movie
        actual_genres = set(df.iloc[i]['genres'])

        # Get the genres of the recommended movies
        recommended_genres = set([genre for movie_title in recommendations 
                                  for genre in df[df['title'] == movie_title]['genres'].iloc[0]])

        # Calculate relevance: at least one genre overlap
        relevant_recommendations = len(actual_genres.intersection(recommended_genres))

        # Calculate precision@k and recall@k
        precision = relevant_recommendations / top_n if top_n > 0 else 0
        recall = relevant_recommendations / len(actual_genres) if len(actual_genres) > 0 else 0

        precisions.append(precision)
        recalls.append(recall)

    # Calculate average precision@k and recall@k
    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)

    return avg_precision, avg_recall

top_n = 10

# Evaluate recommendations
avg_precision, avg_recall = evaluate_recommendations(df, cosine_sim)
print(f"Average Precision@{top_n}: {avg_precision:.4f}")
print(f"Average Recall@{top_n}: {avg_recall:.4f}")

Average Precision@10: 0.2318
Average Recall@10: 0.8690
