# Movie Data Processor
**Created by:** Ramji-Purwar  
**Date:** 2025-05-19 15:42:57

In [None]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict

In [None]:
# Load the movie table from disk. The processing functions defined in later
# cells read this module-level DataFrame directly instead of taking it as an
# argument.
file_path = 'filtered_data.csv'
movies_df = pd.read_csv(file_path)
# Last expression of the cell: previews the first rows as the cell output.
movies_df.head()

## 1. Process Actors

In [None]:
def process_actors():
    """Build the actor lookup table and the actor-to-movie link table.

    Reads the module-level ``movies_df`` and splits every non-missing
    ``top_5_actors`` value on commas. Each distinct, whitespace-stripped name
    receives an ID of the form ``act_<n>``, numbered in order of first
    appearance. Blank names (for example from a trailing comma) are ignored.

    Returns:
        tuple: ``(actors_df, actor_movies_df)``.

        * ``actors_df`` has the columns ``actor_id`` and ``actor_name``, one
          row per distinct actor, in order of first appearance.
        * ``actor_movies_df`` has the columns ``actor_id`` and ``movie_id``,
          one row per actor listed on a movie.

        Both frames keep their columns even when no actor is found.
    """
    actor_ids = {}
    actor_movie_relations = []

    # Iterate the two needed columns directly: cheaper than iterrows() and
    # keeps each movie_id in its own column dtype.
    for movie_id, raw_actors in zip(movies_df['movie_id'], movies_df['top_5_actors']):
        if pd.isna(raw_actors):
            continue

        # str() guards against non-string cells (e.g. a purely numeric value).
        for actor in (name.strip() for name in str(raw_actors).split(',')):
            if not actor:
                # "A, B," or "A,,B" would otherwise create an empty-name actor.
                continue

            if actor not in actor_ids:
                actor_ids[actor] = f"act_{len(actor_ids) + 1}"

            actor_movie_relations.append((actor_ids[actor], movie_id))

    # Dicts preserve insertion order, so the row order is stable across runs
    # (the previous set() round-trip shuffled it).
    actors_df = pd.DataFrame(
        [(actor_id, name) for name, actor_id in actor_ids.items()],
        columns=['actor_id', 'actor_name'],
    )

    # Passing columns= explicitly keeps the schema when there are no rows;
    # selecting columns from an empty DataFrame would raise KeyError.
    actor_movies_df = pd.DataFrame(
        actor_movie_relations, columns=['actor_id', 'movie_id']
    )

    return actors_df, actor_movies_df

In [None]:
# Build the actor lookup table and the actor-to-movie link table; both are
# kept as notebook globals (the recommender cell reads actor_movies_df).
actors_df, actor_movies_df = process_actors()
# index=False: write only the data columns, not the DataFrame index.
actor_movies_df.to_csv('actor_movies.csv', index=False)

actors_df.to_csv('actors.csv', index=False)
# Last expression of the cell: previews the link table as the cell output.
actor_movies_df.head()

## 2. Process Genres

In [None]:
def process_genres():
    """Build the genre lookup table and the movie-to-genre link table.

    Reads the module-level ``movies_df`` and splits every non-missing
    ``genre`` value on commas. Each distinct, whitespace-stripped genre
    receives an ID of the form ``gen_<n>``, numbered in order of first
    appearance, so the same input always yields the same IDs. Blank genre
    names (for example from a trailing comma) are ignored.

    Returns:
        tuple: ``(genres_df, movie_genres_df)``.

        * ``genres_df`` has the columns ``genre_id`` and ``genre_name``, one
          row per distinct genre, in order of first appearance.
        * ``movie_genres_df`` has the columns ``movie_id`` and ``genre_id``,
          one row per genre listed on a movie.

        Both frames keep their columns even when no genre is found.
    """
    genre_to_id = {}
    genre_movie_relations = []

    for movie_id, raw_genres in zip(movies_df['movie_id'], movies_df['genre']):
        if pd.isna(raw_genres):
            continue

        # str() guards against non-string cells.
        for genre in (name.strip() for name in str(raw_genres).split(',')):
            if not genre:
                continue

            # Number genres as they are first seen. Numbering them from a
            # set's iteration order made the IDs change between runs.
            if genre not in genre_to_id:
                genre_to_id[genre] = f"gen_{len(genre_to_id) + 1}"

            genre_movie_relations.append((movie_id, genre_to_id[genre]))

    genres_df = pd.DataFrame({
        'genre_id': list(genre_to_id.values()),
        'genre_name': list(genre_to_id),
    })

    # Explicit columns keep the schema intact when there are no relations.
    movie_genres_df = pd.DataFrame(
        genre_movie_relations, columns=['movie_id', 'genre_id']
    )

    return genres_df, movie_genres_df

In [None]:
# Build the genre lookup table and the movie-to-genre link table; both are
# kept as notebook globals (the recommender cell reads movie_genres_df).
genres_df, movie_genres_df = process_genres()
# index=False: write only the data columns, not the DataFrame index.
genres_df.to_csv('genres.csv', index=False)

movie_genres_df.to_csv('movie_genres.csv', index=False)
# Last expression of the cell: previews the genre table as the cell output.
genres_df.head()

## 3. Process Directors

In [None]:
def process_directors():
    """Build the director lookup table and the director-to-movie link table.

    Reads the module-level ``movies_df`` and splits every non-missing
    ``director`` value on commas, so a movie may list several directors.
    Each distinct, whitespace-stripped name receives an ID of the form
    ``dir_<n>``, numbered in order of first appearance. Blank names (for
    example from a trailing comma) are ignored.

    Returns:
        tuple: ``(directors_df, director_movies_df)``.

        * ``directors_df`` has the columns ``director_id`` and
          ``director_name``, one row per distinct director, in order of
          first appearance.
        * ``director_movies_df`` has the columns ``director_id`` and
          ``movie_id``, one row per director listed on a movie.

        Both frames keep their columns even when no director is found.
    """
    director_ids = {}
    director_movie_relations = []

    # Iterate the two needed columns directly: cheaper than iterrows() and
    # keeps each movie_id in its own column dtype.
    for movie_id, raw_directors in zip(movies_df['movie_id'], movies_df['director']):
        if pd.isna(raw_directors):
            continue

        # str() guards against non-string cells.
        for director in (name.strip() for name in str(raw_directors).split(',')):
            if not director:
                # "A, B," or "A,,B" would otherwise create an empty-name director.
                continue

            if director not in director_ids:
                director_ids[director] = f"dir_{len(director_ids) + 1}"

            director_movie_relations.append((director_ids[director], movie_id))

    # Dicts preserve insertion order, so the row order is stable across runs
    # (the previous set() round-trip shuffled it).
    directors_df = pd.DataFrame(
        [(director_id, name) for name, director_id in director_ids.items()],
        columns=['director_id', 'director_name'],
    )

    # Passing columns= explicitly keeps the schema when there are no rows;
    # selecting columns from an empty DataFrame would raise KeyError.
    director_movies_df = pd.DataFrame(
        director_movie_relations, columns=['director_id', 'movie_id']
    )

    return directors_df, director_movies_df

In [None]:
# Build the director lookup table and the director-to-movie link table; both
# are kept as notebook globals (the recommender cell reads director_movies_df).
directors_df, director_movies_df = process_directors()
# index=False: write only the data columns, not the DataFrame index.
director_movies_df.to_csv('director_movies.csv', index=False)

directors_df.to_csv('directors.csv', index=False)
# Last expression of the cell: previews the link table as the cell output.
director_movies_df.head()

## 4. Process Keywords

In [None]:
def extract_keywords(text, max_keywords=10):
    """Return the most frequent non-stopword words found in ``text``.

    The text is lower-cased and scanned for runs of three or more ASCII
    letters delimited by word boundaries. Words in the built-in stopword
    list are dropped and the remainder are ranked by frequency; words with
    equal counts keep the order in which they first occur.

    Args:
        text: Value to analyse. Missing values (as judged by ``pd.isna``)
            yield an empty list; anything else is converted with ``str``.
        max_keywords: Maximum number of keywords to return.

    Returns:
        list[str]: Keywords ordered from most to least frequent.
    """
    from collections import Counter

    if pd.isna(text):
        return []

    # Common English words that carry no topical signal.
    ignored_words = {
        'about', 'after', 'again', 'all', 'also', 'and', 'any', 'are',
        'before', 'between', 'both', 'but', 'can', 'each', 'ever', 'few',
        'for', 'from', 'further', 'had', 'has', 'have', 'her', 'here',
        'his', 'how', 'into', 'just', 'more', 'most', 'not', 'once',
        'only', 'onto', 'other', 'our', 'over', 'own', 'same', 'saw',
        'some', 'such', 'than', 'that', 'the', 'their', 'then', 'there',
        'they', 'this', 'too', 'under', 'very', 'was', 'well', 'were',
        'what', 'when', 'where', 'which', 'who', 'why', 'will', 'with',
    }

    tokens = re.findall(r'\b[a-z]{3,}\b', str(text).lower())
    frequencies = Counter(token for token in tokens if token not in ignored_words)

    return [token for token, _count in frequencies.most_common(max_keywords)]

In [None]:
def process_keywords():
    """Build the movie-to-keyword link table from movie descriptions.

    Walks the module-level ``movies_df`` and, for every row with a
    non-missing ``description``, calls ``extract_keywords`` on it and records
    one ``(movie_id, keyword)`` pair per keyword returned.

    Returns:
        pandas.DataFrame: Columns ``movie_id`` and ``keyword``; an empty
        frame with those two columns when no keyword was produced.
    """
    pairs = []

    for _, row in movies_df.iterrows():
        current_id = row['movie_id']
        description = row['description']

        # Nothing to extract from a missing description.
        if pd.isna(description):
            continue

        pairs.extend(
            {'movie_id': current_id, 'keyword': term}
            for term in extract_keywords(description)
        )

    # An empty record list would give a frame without columns, so spell
    # the schema out for that case.
    if not pairs:
        return pd.DataFrame(columns=['movie_id', 'keyword'])

    return pd.DataFrame(pairs)

In [None]:
# Generate the movie_keywords dataframe (kept as a notebook global; the
# recommender cell reads it)
movie_keywords_df = process_keywords()

# Save movie_keywords.csv (index=False: data columns only, no index column)
movie_keywords_df.to_csv('movie_keywords.csv', index=False)

# Report how many (movie, keyword) rows were written
print(f"Created movie_keywords.csv with {len(movie_keywords_df)} entries")

# Display samples (last expression of the cell, rendered as the cell output)
movie_keywords_df.head()

## 5. Simple Recommendation System

In [None]:
def _add_shared_feature_scores(scores, relations_df, feature_column, movie_id, weight):
    """Add ``weight`` to every other movie sharing a feature with ``movie_id``.

    ``relations_df`` is a link table with a ``movie_id`` column and a
    ``feature_column`` column; ``scores`` is updated in place.
    """
    own_values = relations_df[relations_df['movie_id'] == movie_id][feature_column].tolist()
    for value in own_values:
        sharing_movies = relations_df[relations_df[feature_column] == value]['movie_id'].tolist()
        for other_movie in sharing_movies:
            if other_movie != movie_id:  # Never recommend the movie to itself
                scores[other_movie] += weight


def get_similar_movies(movie_id, top_n=5):
    """Find movies similar to ``movie_id`` from shared metadata.

    Every other movie earns points for each feature it shares with the
    target movie: 2 per shared genre, 3 per shared actor, 4 per shared
    director and 1 per shared keyword. The link tables are read from the
    module-level ``movie_genres_df``, ``actor_movies_df``,
    ``director_movies_df`` and ``movie_keywords_df``; names come from the
    module-level ``movies_df``.

    Args:
        movie_id: ID of the movie to find recommendations for.
        top_n: Maximum number of recommendations to return. ``None`` returns
            every scored movie; zero or a negative number returns none.

    Returns:
        list[dict]: One dict per recommendation with the keys ``movie_id``,
        ``movie_name`` and ``similarity_score``, ordered from highest to
        lowest score (ties keep the order in which the movies were first
        scored). Scored movies that are absent from ``movies_df`` are
        skipped without using up one of the ``top_n`` slots. The list is
        empty when the movie is unknown or shares nothing with any other
        movie.
    """
    similarity_scores = defaultdict(int)

    # (link table, feature column, weight) in the order the scores are added.
    feature_sources = (
        (movie_genres_df, 'genre_id', 2),
        (actor_movies_df, 'actor_id', 3),
        (director_movies_df, 'director_id', 4),
        (movie_keywords_df, 'keyword', 1),
    )
    for relations_df, feature_column, weight in feature_sources:
        _add_shared_feature_scores(
            similarity_scores, relations_df, feature_column, movie_id, weight
        )

    # sorted() is stable, so equal scores keep their first-scored order.
    ranked = sorted(similarity_scores.items(), key=lambda item: item[1], reverse=True)

    # One pass over movies_df builds the name lookup; setdefault keeps the
    # first row's name when a movie_id appears more than once.
    movie_names = {}
    for known_id, known_name in zip(movies_df['movie_id'].tolist(),
                                    movies_df['movie_name'].tolist()):
        movie_names.setdefault(known_id, known_name)

    recommended_movies = []
    for similar_movie_id, score in ranked:
        # Stop only once enough *resolvable* movies were collected; cutting
        # the ranking first could return fewer than top_n results.
        if top_n is not None and len(recommended_movies) >= top_n:
            break
        if similar_movie_id not in movie_names:
            # Skip if movie details not found
            continue
        recommended_movies.append({
            'movie_id': similar_movie_id,
            'movie_name': movie_names[similar_movie_id],
            'similarity_score': score
        })

    return recommended_movies

In [None]:
# Example usage - test with a sample movie
# Guarded so the cell is a no-op when movies_df has no rows (iloc[0] would
# raise IndexError on an empty frame).
if len(movies_df) > 0:
    test_movie_id = movies_df.iloc[0]['movie_id']  # Using the first movie in the dataset
    test_movie_name = movies_df.iloc[0]['movie_name']
    
    print(f"Finding movies similar to: {test_movie_name} ({test_movie_id})")
    # Uses the default top_n, so at most five recommendations are listed.
    recommendations = get_similar_movies(test_movie_id)
    
    print("\nRecommended Movies:")
    # Number the results from 1 for display.
    for i, movie in enumerate(recommendations, 1):
        print(f"{i}. {movie['movie_name']} (Score: {movie['similarity_score']})")

## Summary

This notebook has created the following files for our movie recommendation website:

1. **actor_movies.csv**: Links actors to movies
2. **actors.csv**: List of all actors with IDs
3. **genres.csv**: List of all genres with IDs
4. **movie_genres.csv**: Maps movies to genres
5. **director_movies.csv**: Maps directors to movies
6. **directors.csv**: List of all directors with IDs
7. **movie_keywords.csv**: Up to 10 keywords per movie, extracted from movie descriptions

These files provide the foundation for our movie recommendation system.