In [None]:
pip install deepface openai

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models, transforms
from PIL import Image
import numpy as np
from deepface import DeepFace
import requests
import json
import os
from pathlib import Path
from datetime import datetime
from openai import OpenAI
import time

In [None]:
# Facial-feature extraction network built on top of a ResNet backbone
class FacialFeatureExtractor(nn.Module):
    """Pretrained ResNet-50 backbone followed by a small two-layer scoring head.

    Args:
        num_features: Width of the output vector, i.e. how many sigmoid
            scores are produced for each input sample.
    """

    def __init__(self, num_features=10):
        super().__init__()
        # Pretrained ResNet-50; every child module except the last one
        # (the final fully connected layer) is kept as the feature extractor.
        backbone = models.resnet50(pretrained=True)
        backbone_layers = list(backbone.children())
        self.features = nn.Sequential(*backbone_layers[:-1])
        # Custom head mapping the 2048 backbone outputs to the requested scores
        self.fc1 = nn.Linear(2048, 512)
        self.fc2 = nn.Linear(512, num_features)

    def forward(self, x):
        """Return one sigmoid score per feature for every sample in ``x``."""
        pooled = self.features(x)
        # Collapse everything after the batch axis into a single vector
        flat = pooled.view(pooled.size(0), -1)
        hidden = F.relu(self.fc1(flat))
        return torch.sigmoid(self.fc2(hidden))

In [None]:
# Analyze a single image with DeepFace and return its key facial attributes
def analyze_facial_features(image_path):
    """Run DeepFace on one image and summarize the first detected face.

    Args:
        image_path: Path of the image passed to ``DeepFace.analyze``.

    Returns:
        A dict with the keys ``age``, ``gender``, ``dominant_race``,
        ``dominant_emotion`` and ``emotion_scores``, or ``None`` when the
        analysis raises or yields no face entry. ``gender`` is the dominant
        label when DeepFace reports ``dominant_gender``; otherwise the raw
        ``gender`` value is passed through unchanged.
    """
    try:
        analysis = DeepFace.analyze(img_path=image_path,
                                    actions=['age', 'gender', 'race', 'emotion'],
                                    enforce_detection=False,
                                    detector_backend='retinaface')

        # Recent DeepFace releases return a list with one dict per face;
        # accept a bare dict as well so an older release does not break this.
        faces = analysis if isinstance(analysis, list) else [analysis]
        if not faces:
            print(f"Error analyzing {image_path}: no face data returned")
            return None
        face = faces[0]

        # In recent DeepFace releases 'gender' is a dict of per-label scores and
        # the winning label lives in 'dominant_gender'. Downstream code compares
        # gender as a plain string, so prefer the label when it is available.
        gender = face.get('dominant_gender', face['gender'])

        return {
            'age': face['age'],
            'gender': gender,
            'dominant_race': face['dominant_race'],
            'dominant_emotion': face['dominant_emotion'],
            'emotion_scores': face['emotion'],
        }

    except Exception as e:
        # Best effort: one unreadable image must not abort the whole batch.
        print(f"Error analyzing {image_path}: {str(e)}")
        return None

In [None]:
# Ask the LLM for the character traits an actor is known for, based on their notable roles
def get_actor_role_traits(actor_name):
    """Query the GitHub Models chat endpoint for an actor's typical role traits.

    The API token is read from the ``GITHUB_TOKEN`` environment variable so no
    credential has to be written into the notebook.

    Args:
        actor_name: Name of the actor, interpolated into the prompt.

    Returns:
        The raw text of the model's reply, or the sentinel string
        ``"API_LIMIT_REACHED"`` when the token is missing or the request
        raises for any reason. Callers use the sentinel to stop a batch run.
    """
    prompt = f"""
    Analyze actor {actor_name}'s most notable roles and identify 20 common character traits 
    they often portray in movies. Focus on personality traits, physical characteristics they're 
    known for, and types of roles they excel in. Format the response as a JSON list of traits.
    """

    endpoint = "https://models.github.ai/inference"
    model_name = "openai/gpt-4o"

    # Never hard-code the key: take it from the environment and fail fast
    # (with the same sentinel callers already handle) when it is absent.
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        print(f"Error getting traits for {actor_name}: GITHUB_TOKEN environment variable is not set")
        return "API_LIMIT_REACHED"

    try:
        client = OpenAI(
            base_url=endpoint,
            api_key=token,
        )

        response = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            temperature=1.0,
            top_p=1.0,
            max_tokens=5000,
            model=model_name
        )

        return response.choices[0].message.content

    except Exception as e:
        # Any failure (rate limit, auth, network) is reported through the same
        # sentinel; the printed message carries the actual cause.
        print(f"Error getting traits for {actor_name}: {str(e)}")
        return "API_LIMIT_REACHED"

In [None]:
# Discover the actor folders that live directly under the base directory
def extract_actor_names(base_dir):
    """List the sub-directories of ``base_dir``, one per actor.

    Args:
        base_dir: Directory whose immediate sub-directories are named after actors.

    Returns:
        A tuple ``(actor_names, actor_paths)``: the folder names in listing
        order, and a dict mapping each folder name to its joined path.
        Plain files inside ``base_dir`` are ignored.
    """
    candidates = ((entry, os.path.join(base_dir, entry)) for entry in os.listdir(base_dir))
    actor_paths = {
        entry: full_path
        for entry, full_path in candidates
        if os.path.isdir(full_path)
    }
    # Dict keys keep insertion order, so this matches the listing order
    return list(actor_paths), actor_paths

In [None]:
# Analyze every image in one actor's folder with DeepFace
def process_actor_images(actor_name, actor_path):
    """Collect the facial-feature dicts of all images found in ``actor_path``.

    Args:
        actor_name: Name of the actor (not used by the function body).
        actor_path: Directory holding that actor's image files.

    Returns:
        A list with one feature dict per ``.png``/``.jpg``/``.jpeg`` file
        (extension matched case-insensitively) for which
        ``analyze_facial_features`` returned a truthy result.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    image_paths = [
        os.path.join(actor_path, entry)
        for entry in os.listdir(actor_path)
        if entry.lower().endswith(image_suffixes)
    ]

    # Lazily analyze each image and drop the ones whose analysis failed
    analyses = (analyze_facial_features(path) for path in image_paths)
    return [features for features in analyses if features]

In [None]:
# Combine the per-image DeepFace results of one actor into a single cumulative feature dict

def aggregate_features(facial_features):
    """Aggregate a list of per-image feature dicts into one profile.

    A feature is aggregated only when it is present in every item.

    Args:
        facial_features: List of dicts as produced per image (keys such as
            ``age``, ``gender``, ``dominant_race``, ``dominant_emotion``,
            ``emotion_scores``).

    Returns:
        A dict containing, where available: the mean of each numeric feature,
        the most common value of each categorical feature, a majority vote for
        each boolean feature, and the mean of every emotion score. An empty
        dict is returned for empty input.
    """
    if not facial_features:
        return {}

    sample_count = len(facial_features)
    aggregated = {}

    # Numeric features to average
    numeric_features = ['age', 'eye_intensity', 'face_seriousness', 'strong_look']

    # Categorical features to take most common
    categorical_features = ['gender', 'dominant_race', 'dominant_emotion']

    # Boolean features (majority vote)
    boolean_features = ['has_beard']

    def present_everywhere(feature):
        """True when every item carries ``feature``."""
        return all(feature in item for item in facial_features)

    # Process numeric features
    for feature in numeric_features:
        if present_everywhere(feature):
            aggregated[feature] = sum(item[feature] for item in facial_features) / sample_count

    # Process categorical features
    for feature in categorical_features:
        if not present_everywhere(feature):
            continue

        counts = {}
        for item in facial_features:
            value = item[feature]
            if isinstance(value, dict):
                # A dict here is a per-label score table (as DeepFace reports
                # for gender); it is unhashable, so vote with its top label.
                # Previously such values were skipped and the feature was
                # silently missing from the result.
                if not value:
                    continue
                value = max(value, key=value.get)
            counts[value] = counts.get(value, 0) + 1

        if counts:
            # Ties resolve to the value seen first, as dicts keep insertion order
            aggregated[feature] = max(counts.items(), key=lambda x: x[1])[0]

    # Process boolean features
    for feature in boolean_features:
        if present_everywhere(feature):
            true_count = sum(1 for item in facial_features if item[feature])
            # True only when a strict majority of images say so
            aggregated[feature] = true_count > sample_count / 2

    # Emotion scores: average each emotion listed for the first image;
    # an emotion missing from a later image counts as 0 for that image.
    if present_everywhere('emotion_scores'):
        emotions = facial_features[0]['emotion_scores'].keys()
        aggregated['emotion_scores'] = {
            emotion: sum(item['emotion_scores'].get(emotion, 0) for item in facial_features) / sample_count
            for emotion in emotions
        }

    return aggregated

In [None]:
# Persist the current progress to disk
def save_progress(processed_actors):
    """Write the processed-actor data plus a small checkpoint to the working directory.

    Two files are (over)written: ``processed_actors.json`` with the full
    mapping, and ``actor_processing_checkpoint.json`` with the current
    timestamp and the number of processed entries.

    Args:
        processed_actors: JSON-serializable collection of processed actors.
    """
    def write_json(filename, payload):
        with open(filename, "w") as handle:
            json.dump(payload, handle, indent=2)

    # Full results first, then the lightweight checkpoint describing them
    write_json("processed_actors.json", processed_actors)

    write_json("actor_processing_checkpoint.json", {
        "timestamp": datetime.now().isoformat(),
        "processed_count": len(processed_actors),
    })

In [None]:
# Build profiles for every listed actor that has not been processed yet
def process_actors_batch(base_dir,
                         progress_path="/kaggle/input/actorsfinal/cleaned_data.json",
                         actor_list_path="/kaggle/input/indian-actor-images-dataset/List of Actors.txt"):
    """Process the remaining actors until the list is exhausted or the API gives up.

    For each actor that is in the actor list but not yet in the previously
    saved results, the role traits are requested from the LLM, the actor's
    images are analyzed and aggregated, and progress is saved.

    Args:
        base_dir: Directory containing one image folder per actor.
        progress_path: JSON file with previously processed actors; used as the
            starting point when it exists.
        actor_list_path: Text file with one actor name per line.

    Returns:
        Dict mapping actor name to ``{'role_traits': ..., 'aggregated_features': ...}``,
        including the entries loaded from ``progress_path``.
    """
    _, actor_paths = extract_actor_names(base_dir)

    processed_actors = {}
    if os.path.exists(progress_path):
        with open(progress_path, "r") as f:
            processed_actors = json.load(f)

    # Load the complete list of actors (blank lines ignored)
    with open(actor_list_path, "r") as f:
        all_actors = [line.strip() for line in f if line.strip()]

    processed_actor_names = set(processed_actors.keys())
    remaining_actors = [actor for actor in all_actors if actor not in processed_actor_names]

    print(f"Total actors in list: {len(all_actors)}")
    print(f"Already processed: {len(processed_actor_names)}")
    print(f"Remaining to process: {len(remaining_actors)}")

    for actor in remaining_actors:
        # A listed actor without an image folder used to raise KeyError after
        # the API call had already been spent; skip it up front instead.
        actor_path = actor_paths.get(actor)
        if actor_path is None:
            print(f"Skipping {actor}: no image folder found in {base_dir}")
            continue

        print(f"Processing actor: {actor}")

        # Get actor's common role traits
        role_traits = get_actor_role_traits(actor)

        # The helper returns this sentinel on any API failure; stop so the
        # remaining actors can be picked up by a later run.
        if role_traits == "API_LIMIT_REACHED":
            print("Stopping processing due to API limit.")
            break

        # Process all images and extract facial features
        facial_features = process_actor_images(actor, actor_path)

        # Create comprehensive profile
        processed_actors[actor] = {
            'role_traits': role_traits,
            'aggregated_features': aggregate_features(facial_features)
        }

        # Save progress after each actor so an interruption loses at most one
        save_progress(processed_actors)

        print(f"Completed processing {actor}")

        # Add a small delay to avoid hitting rate limits too quickly
        time.sleep(1)

    return processed_actors

In [None]:
def main():
    """Run one batch over the Kaggle actor image dataset and report the database size."""
    dataset_root = "/kaggle/input/indian-actor-images-dataset/Bollywood Actor Images/Bollywood Actor Images"

    # Process actors until the list is done or the API limit is reached
    database = process_actors_batch(dataset_root)

    # Report the current database status
    print(f"\nCurrent database contains {len(database)} actors")
    print("Remaining actors will be processed in the next run")
    

In [None]:
# Entry point: start the batch run only when this code executes as the main module.
if __name__ == "__main__":
    main()

In [None]:
# Role traits are not in clean format and are in String format, therefore converting them to JSON format

import json
import re

# Load the raw JSON file
with open("/kaggle/input/actorsfinal/processed_actors.json", "r") as file:
    data = json.load(file)

# Matches a JSON array of objects embedded in free text: a '[' followed by '{',
# up to the first '}' that is directly followed (whitespace aside) by ']'.
trait_array_pattern = re.compile(r'\[\s*{.*?}\s*\]', re.DOTALL)

# Replace each actor's raw string reply with the parsed list it contains
for name, details in data.items():
    raw_traits = details.get("role_traits", None)

    # Only raw string replies need cleaning
    if not isinstance(raw_traits, str):
        print(f"[Info] Skipped {name} (already parsed or not a string)")
        continue

    match = trait_array_pattern.search(raw_traits)
    if match is None:
        print(f"[Warning] No JSON array found in 'role_traits' for {name}")
        continue

    try:
        data[name]["role_traits"] = json.loads(match.group(0))
    except json.JSONDecodeError as e:
        # Leave the raw string in place when the extracted text is not valid JSON
        print(f"[JSONDecodeError] Could not parse 'role_traits' for {name}: {e}")

# Save the cleaned data
with open("cleaned_data.json", "w") as out_file:
    json.dump(data, out_file, indent=4)

print("Cleaning complete. Output written to 'cleaned_data.json'")


In [None]:
pip install transformers

**Here we generate a `summary` field for each actor in the JSON file, summarizing that actor's role traits and aggregated facial features.**

In [None]:
import json
from transformers import pipeline

# Load the summarizer model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Load the input data
with open("/kaggle/input/actorsfinal/cleaned_data.json", "r") as f:
    data = json.load(f)

# Build a text block per actor and attach its summary under the "summary" key
for name, details in data.items():
    summary_input_parts = []

    # Role traits: only parsed lists are used; a raw (unparsed) string is ignored.
    role_traits = details.get("role_traits", [])
    if isinstance(role_traits, list):
        trait_sentences = []
        for trait in role_traits:
            # Entries originate from LLM output, so tolerate a malformed one
            # instead of raising KeyError/TypeError and aborting the whole run.
            if not isinstance(trait, dict):
                continue
            label = trait.get("trait")
            description = trait.get("description")
            if label and description:
                trait_sentences.append(f"{label}: {description}")
            elif label or description:
                trait_sentences.append(str(label or description))
        if trait_sentences:
            summary_input_parts.append(" ".join(trait_sentences))

    # Aggregated facial features
    agg = details.get("aggregated_features", {})
    if agg:
        age = agg.get("age", None)
        dom_race = agg.get("dominant_race", None)
        dom_emotion = agg.get("dominant_emotion", None)

        # Basic summary of features
        feature_text = f"Age: {age}. Dominant Race: {dom_race}. Dominant Emotion: {dom_emotion}."
        summary_input_parts.append(feature_text)

        # Optional: add the full emotion scores
        scores = agg.get("emotion_scores", {})
        if scores:
            score_text = "Emotion Scores - " + ", ".join([f"{k}: {round(v, 1)}%" for k, v in scores.items()])
            summary_input_parts.append(score_text)

    # Combine all into one summarizable block
    final_text = " ".join(summary_input_parts)
    final_text = final_text[:3000]  # cap the input size by characters
    print(f"For {name}:  {final_text}")

    # Nothing to summarize: skip the model call rather than summarizing an empty string
    if not final_text.strip():
        print(f"[✗] Failed to summarize for {name}: no traits or features available")
        continue

    try:
        summary = summarizer(final_text, max_length=300, min_length=150, do_sample=False)[0]["summary_text"]
        data[name]["summary"] = summary
        print(f"[✓] Summary added for {name}")
        print(summary)
    except Exception as e:
        # Best effort: the actor is written out without a "summary" key
        print(f"[✗] Failed to summarize for {name}: {e}")

# Save to file
with open("summarized_full_data.json", "w") as f:
    json.dump(data, f, indent=4)

print(" Summarization complete. Output saved to 'summarized_full_data.json'")

In [None]:
pip install sentence-transformers


In [None]:
# In this we are embedding the summary and user query and finding the best cosine similarity.

import json
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Load summaries
with open("/kaggle/working/summarized_full_data.json", "r") as f:
    actor_data = json.load(f)

# Load sentence embedding model
model = SentenceTransformer("all-mpnet-base-v2")

# Extract summaries and corresponding actor names from the file loaded above
# (not from a `data` variable left over from an earlier cell). Actors without
# a summary are left out instead of raising KeyError.
actor_summaries = {
    actor: details['summary']
    for actor, details in actor_data.items()
    if details.get('summary')
}

# Summary embeddings do not depend on the query, so compute them once here
# instead of re-encoding every summary on each recommendation call.
summary_actor_names = list(actor_summaries)
summary_embeddings = (
    model.encode([actor_summaries[actor] for actor in summary_actor_names], convert_to_tensor=True)
    if summary_actor_names else None
)


# Function to recommend actors
# NOTE: a later cell defines another function that is also named
# `recommend_actors` (with a different signature) and replaces this one.
def recommend_actors(user_description, top_k=5):
    """Rank actors by cosine similarity between the query and their summary.

    Args:
        user_description: Free-text description of the desired actor/role.
        top_k: Maximum number of matches to return.

    Returns:
        List of ``(actor, score)`` tuples, best match first.
    """
    if not summary_actor_names:
        return []

    # Get embedding for user query
    query_embedding = model.encode(user_description, convert_to_tensor=True)

    # One similarity row: the query against every precomputed summary embedding
    scores = util.cos_sim(query_embedding, summary_embeddings)[0].tolist()

    # Sort by similarity score and return top matches
    ranked = sorted(zip(summary_actor_names, scores), key=lambda x: x[1], reverse=True)
    return ranked[:top_k]


query = "Army look, strong and bold personality and voice, Have aggressive eyes"

top_actors = recommend_actors(query)
print("Recommended Actors:")
for actor, score in top_actors:
    print(f"{actor}: similarity = {score:.4f}")


In [None]:
# 2nd WAY OF RECOMMENDATION i.e. just matching from JSON format instead of summaries

def recommend_actors(actor_profiles, role_requirements, top_n=3):
    """Score actors against structured role requirements and return the best ones.

    NOTE: an earlier cell defines a summary-based function with the same name;
    running this cell replaces it.

    Args:
        actor_profiles: Dict mapping actor name to a profile with
            ``role_traits`` (list of ``{'trait': ...}`` objects or plain
            strings) and ``aggregated_features`` (dict).
        role_requirements: Dict with optional ``traits`` (list of strings) and
            ``facial_features`` (dict of requested values).
        top_n: Maximum number of actors to return.

    Returns:
        List of ``(actor_name, score)`` tuples, highest score first.
    """
    # DeepFace labels gender as 'Man'/'Woman'; accept 'Male'/'Female' in requests too.
    gender_aliases = {'male': 'man', 'female': 'woman'}

    def gender_key(value):
        """Normalized gender label, or None when the value is not a string."""
        if not isinstance(value, str):
            return None
        lowered = value.strip().lower()
        return gender_aliases.get(lowered, lowered)

    def trait_names(raw_traits):
        """Lower-cased trait names; malformed or unparsed trait data yields none."""
        if not isinstance(raw_traits, (list, tuple)):
            # e.g. a raw LLM reply that could not be parsed into a list
            return []
        names = []
        for entry in raw_traits:
            name = entry.get('trait') if isinstance(entry, dict) else entry
            if isinstance(name, str):
                names.append(name.lower())
        return names

    scores = {}

    # Parse role requirements
    req_traits = role_requirements.get('traits', [])
    req_facial = role_requirements.get('facial_features', {})

    for actor_name, profile in actor_profiles.items():
        score = 0

        # Score based on role traits: +1 per requested trait found in any actor trait
        actor_trait_names = trait_names(profile.get('role_traits'))
        for trait in req_traits:
            if isinstance(trait, str) and any(trait.lower() in name for name in actor_trait_names):
                score += 1

        # Score based on facial features
        agg_features = profile.get('aggregated_features') or {}

        # Age proximity (if specified)
        if 'age' in req_facial and 'age' in agg_features:
            age_diff = abs(req_facial['age'] - agg_features['age'])
            # Convert age difference to a score (closer is better)
            age_score = max(0, 1 - (age_diff / 50))  # Normalize by 50 years
            score += age_score * 2  # Weight age more heavily

        # Gender match (if specified)
        if 'gender' in req_facial and 'gender' in agg_features:
            wanted_gender = gender_key(req_facial['gender'])
            if wanted_gender is not None and wanted_gender == gender_key(agg_features['gender']):
                score += 2

        # Emotion match (if specified)
        if 'dominant_emotion' in req_facial and 'dominant_emotion' in agg_features:
            wanted_emotion = req_facial['dominant_emotion']
            actor_emotion = agg_features['dominant_emotion']
            if (isinstance(wanted_emotion, str) and isinstance(actor_emotion, str)
                    and wanted_emotion.lower() == actor_emotion.lower()):
                score += 1

        # Other facial features
        for feature in ['has_beard', 'eye_intensity', 'face_seriousness', 'strong_look']:
            if feature in req_facial and feature in agg_features:
                if isinstance(agg_features[feature], bool):
                    # Boolean feature
                    if req_facial[feature] == agg_features[feature]:
                        score += 1
                else:
                    # Numeric feature - calculate proximity (closer is better)
                    diff = abs(req_facial[feature] - agg_features[feature])
                    feature_score = max(0, 1 - diff)  # Normalize to 0-1
                    score += feature_score

        scores[actor_name] = score

    # Sort actors by score and return top N (ties keep profile order)
    top_actors = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]

    return top_actors

In [None]:
import os 

actor_profiles = {}

# Load actor profiles if the cleaned data file exists
profiles_file = "/kaggle/input/actorsfinal/cleaned_data.json"
if os.path.exists(profiles_file):
    with open(profiles_file, "r") as f:
        actor_profiles = json.load(f)

# Example role requirement for testing recommendations
if len(actor_profiles) > 0:
    example_facial_features = {
        'age': 40,
        'gender': 'Male',
        'has_beard': False,
        'eye_intensity': 0.8,
        'face_seriousness': 0.7,
        'strong_look': 0.9,
        'dominant_emotion': 'angry'
    }
    example_role = {
        'traits': ['intense', 'authoritative', 'Strong and bold voice', 'Rajputana Look'],
        'facial_features': example_facial_features
    }

    # Get recommendations
    print("\nFinding best actors for the role...")
    recommendations = recommend_actors(actor_profiles, example_role, top_n=3)

    # Display recommendations, ranked from 1
    print("\nTop Recommended Actors:")
    for i, (actor, score) in enumerate(recommendations, start=1):
        print(f"{i}. {actor} (Score: {score:.2f})")
    

**Since the two recommendation approaches above give different results, we combine them into a hybrid: each actor's structured-match score and summary-similarity score are merged into one weighted score, and the top-n actors by that combined score are returned.**

In [None]:
# Load structured profiles
with open("/kaggle/input/actorsfinal/cleaned_data.json", "r") as f:
    actor_profiles = json.load(f)

# Load summarized text data
with open("/kaggle/working/summarized_full_data.json", "r") as f:
    actor_data = json.load(f)

# Actors whose summarization failed have no 'summary' key; leave them out here
# (the similarity function scores a missing summary as 0) instead of raising KeyError.
actor_summaries = {
    actor: details['summary']
    for actor, details in actor_data.items()
    if details.get('summary')
}

# Load sentence transformer model
model = SentenceTransformer("all-mpnet-base-v2")


# ------------------------------
# Structured Matching Function
# ------------------------------
def compute_trait_score(actor_traits, query_traits):
    """Fraction of requested traits that appear in the actor's trait names.

    A requested trait counts as matched when it is a case-insensitive
    substring of any actor trait name.

    Args:
        actor_traits: List of ``{'trait': ...}`` objects or plain strings.
            Anything else (e.g. an unparsed raw string) is treated as having
            no traits, and entries without a usable name are ignored.
        query_traits: List of requested trait strings.

    Returns:
        A value between 0 and 1; 0 when ``query_traits`` is empty.
    """
    if not query_traits:
        return 0
    if not isinstance(actor_traits, (list, tuple)):
        return 0

    # Collect lower-cased trait names, tolerating malformed entries
    actor_trait_names = []
    for entry in actor_traits:
        name = entry.get('trait') if isinstance(entry, dict) else entry
        if isinstance(name, str):
            actor_trait_names.append(name.lower())

    match_count = sum(
        1 for qt in query_traits
        if isinstance(qt, str) and any(qt.lower() in name for name in actor_trait_names)
    )
    return match_count / len(query_traits)


def compute_feature_score(actor_features, query_features):
    """Average similarity over the facial features present on both sides.

    Only ``age`` and ``dominant_emotion`` are compared.

    Args:
        actor_features: Aggregated feature dict of the actor (may be empty or None).
        query_features: Requested feature dict (may be empty or None).

    Returns:
        A value between 0 and 1; 0 when no feature can be compared.
    """
    actor_features = actor_features or {}
    query_features = query_features or {}

    score = 0
    max_score = 0

    # Age similarity: falls linearly to 0 at a 30-year difference
    if 'age' in actor_features and 'age' in query_features:
        age_diff = abs(actor_features['age'] - query_features['age'])
        age_score = max(0, 1 - age_diff / 30)  # normalize
        score += age_score
        max_score += 1

    # Dominant emotion match; strings are compared case-insensitively so that
    # this agrees with the case-insensitive check in recommend_actors.
    if 'dominant_emotion' in actor_features and 'dominant_emotion' in query_features:
        actor_emotion = actor_features['dominant_emotion']
        query_emotion = query_features['dominant_emotion']
        if isinstance(actor_emotion, str) and isinstance(query_emotion, str):
            matched = actor_emotion.strip().lower() == query_emotion.strip().lower()
        else:
            matched = actor_emotion == query_emotion
        score += int(matched)
        max_score += 1

    return score / max_score if max_score else 0


def get_structured_score(actor_name, role):
    """Mean of the trait score and the facial-feature score for one actor.

    Args:
        actor_name: Key into the module-level ``actor_profiles`` dict.
        role: Dict with ``traits`` and ``facial_features`` entries.

    Returns:
        The average of ``compute_trait_score`` and ``compute_feature_score``.
    """
    trait_part = compute_trait_score(
        actor_profiles[actor_name]['role_traits'],
        role['traits'],
    )
    feature_part = compute_feature_score(
        actor_profiles[actor_name]['aggregated_features'],
        role['facial_features'],
    )
    # Both parts carry equal weight
    return (trait_part + feature_part) / 2


# ------------------------------
# Embedding Similarity Function
# ------------------------------
def get_summary_similarity(actor_name, role_text):
    """Cosine similarity between the role description and the actor's summary.

    Args:
        actor_name: Key into the module-level ``actor_summaries`` dict.
        role_text: Free-text description of the role.

    Returns:
        The similarity as a Python number, or 0 when the actor has no summary.
    """
    summary_text = actor_summaries.get(actor_name, "")
    if summary_text:
        role_vector = model.encode(role_text, convert_to_tensor=True)
        summary_vector = model.encode(summary_text, convert_to_tensor=True)
        return util.cos_sim(role_vector, summary_vector).item()
    # No summary available for this actor
    return 0


# ------------------------------
# Final Recommender
# ------------------------------
def hybrid_recommend_actors(role, role_text_description, top_n=5, w_structured=0.6, w_text=0.4):
    """Rank all profiled actors by a weighted mix of structured and text scores.

    Args:
        role: Structured role requirements passed to ``get_structured_score``.
        role_text_description: Free-text role description passed to
            ``get_summary_similarity``.
        top_n: Maximum number of actors to return.
        w_structured: Weight of the structured score (60% by default).
        w_text: Weight of the summary-similarity score (40% by default).

    Returns:
        List of ``(actor_name, final_score)`` tuples, best first.
    """
    def combined_score(actor_name):
        structured_part = get_structured_score(actor_name, role)
        text_part = get_summary_similarity(actor_name, role_text_description)
        return w_structured * structured_part + w_text * text_part

    ranking = [(actor_name, combined_score(actor_name)) for actor_name in actor_profiles.keys()]
    # Stable sort: actors with equal scores keep their profile order
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking[:top_n]


# Example request: structured requirements plus a free-text description of the same role
example_facial_features = {
    'age': 40,
    'gender': 'Male',
    'has_beard': False,
    'eye_intensity': 0.8,
    'face_seriousness': 0.7,
    'strong_look': 0.9,
    'dominant_emotion': 'angry'
}
example_role = {
    'traits': ['intense', 'authoritative', 'Strong and bold voice', 'Rajputana Look'],
    'facial_features': example_facial_features
}
role_text = "Army look, strong and bold personality and voice, aggressive eyes, Rajputana heritage"

# Rank the actors and print them, numbered from 1
print("\n Top Hybrid Recommended Actors:")
top_actors = hybrid_recommend_actors(example_role, role_text)
for i, (actor, score) in enumerate(top_actors, start=1):
    print(f"{i}. {actor} — Score: {score:.4f}")