# Approach 1
## Group faces that look alike across all the videos, then average the performance scores from the dataset for each group, using OpenCV, DBSCAN, and the Python face_recognition library

Note: This was executed on the dataset before the new videos were added. This approach took 4 hours to execute, and only a few faces were recognised.

In [4]:
import pandas as pd
import requests
import cv2
import face_recognition
import os
from sklearn.cluster import DBSCAN
import numpy as np

In [3]:
pip install face_recognition


Collecting face_recognition
  Using cached face_recognition-1.3.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting face-recognition-models>=0.3.0 (from face_recognition)
  Downloading face_recognition_models-0.3.0.tar.gz (100.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.1/100.1 MB 17.2 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting dlib>=19.7 (from face_recognition)
  Downloading dlib-19.24.6.tar.gz (3.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.4/3.4 MB 82.8 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Downloading face_recognition-1.3.0-py2.py3-none-any.whl (15 kB)
Building wheels for collected packages: dlib, face-recognition-models
  Building wheel for dlib (setup.py) ... done
  Created wheel for dlib: filename=dlib-19.24.6-cp310-cp310-linux_x86_64.whl size=3590656 sha256=cd1a726f670e74d4

In [6]:

# Output folders: downloaded videos go to "videos", cropped faces to "frames".
for folder in ("videos", "frames"):
    os.makedirs(folder, exist_ok=True)

# Step 1: Load the Dataset
# The later steps read the 'Video URL' and 'Performance' columns of this sheet.
data = pd.read_csv("/kaggle/input/assignment/Assignment Data - Sheet1.csv")

# Step 2: Download Videos
def download_videos(data, timeout=30, chunk_size=64 * 1024):
    """Download every URL in ``data['Video URL']`` to ``videos/video_<idx>.mp4``.

    Args:
        data: Mapping/DataFrame whose ``'Video URL'`` entry is an iterable of
            URLs. ``<idx>`` in the file name is the position of the URL in
            that iterable.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the whole run.
        chunk_size: Number of bytes read from the response per write.

    Returns:
        A list with one entry per URL, in input order: the local file path on
        success, or ``None`` when the URL is missing or the download failed.
    """
    video_paths = []
    for idx, url in enumerate(data['Video URL']):
        # Missing cells (None / NaN) cannot be fetched; keep the slot so the
        # result stays aligned with the input rows.
        if not isinstance(url, str) or not url.strip():
            print(f"Error downloading video {url}: missing or invalid URL")
            video_paths.append(None)
            continue

        video_path = f"videos/video_{idx}.mp4"
        try:
            # The context manager closes the streamed connection even when
            # writing fails part-way through.
            with requests.get(url, stream=True, timeout=timeout) as response:
                # Without this, a 404/500 error page would be stored as an
                # ".mp4" and reported as a successful download.
                response.raise_for_status()
                with open(video_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        if chunk:  # skip empty keep-alive chunks
                            f.write(chunk)
        except Exception as e:
            # Deliberately broad: one bad video must not abort a long batch.
            print(f"Error downloading video {url}: {e}")
            # Do not leave a truncated file behind for later steps to pick up.
            try:
                if os.path.exists(video_path):
                    os.remove(video_path)
            except OSError:
                pass
            video_paths.append(None)
        else:
            video_paths.append(video_path)
    return video_paths

# Store the local path of each downloaded video next to its row; rows whose
# download failed get None.
data['Video Path'] = download_videos(data)

# Step 3: Extract Faces from Videos
def extract_faces_from_videos(video_paths, frame_interval=10):
    """Detect faces in sampled frames of each video and encode them.

    Every ``frame_interval``-th frame is scanned. Each detected face is
    cropped and written to ``frames/<video>_frame<n>_face<i>.jpg``, and its
    encoding is kept for clustering.

    Args:
        video_paths: Iterable of local video file paths. Entries that are not
            strings (``None`` / NaN from failed downloads) are skipped.
        frame_interval: Process one frame out of this many, for efficiency.

    Returns:
        Dict mapping each saved face image path to its face encoding.

    Raises:
        ValueError: If ``frame_interval`` is smaller than 1.
    """
    if frame_interval < 1:
        raise ValueError("frame_interval must be >= 1")

    influencer_faces = {}
    for video_path in video_paths:
        if not isinstance(video_path, str):
            continue  # Skip failed downloads

        video_name = os.path.splitext(os.path.basename(video_path))[0]
        video = cv2.VideoCapture(video_path)
        try:
            if not video.isOpened():
                print(f"Could not open video {video_path}")
                continue

            frame_count = 0
            while True:
                ret, frame = video.read()
                if not ret:
                    break
                frame_count += 1

                # Extract faces every `frame_interval` frames for efficiency
                if frame_count % frame_interval != 0:
                    continue

                # OpenCV decodes frames as BGR, while face_recognition expects
                # RGB; feeding BGR directly hurts detection and encodings.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                face_locations = face_recognition.face_locations(rgb_frame)
                face_encodings = face_recognition.face_encodings(rgb_frame, face_locations)

                for i, (location, encoding) in enumerate(zip(face_locations, face_encodings)):
                    top, right, bottom, left = location
                    face_image_path = f"frames/{video_name}_frame{frame_count}_face{i}.jpg"
                    # Crop from the original BGR frame: cv2.imwrite expects BGR.
                    face_image = frame[top:bottom, left:right]
                    cv2.imwrite(face_image_path, face_image)

                    # Save face encoding for clustering
                    influencer_faces[face_image_path] = encoding
        finally:
            # Release the capture even if detection raises mid-video.
            video.release()
    return influencer_faces

# Run face extraction over the downloaded videos; the result maps each saved
# face crop path to its encoding.
influencer_faces = extract_faces_from_videos(data['Video Path'])

# Step 4: Cluster Faces (Identify Unique Influencers)
def cluster_faces(face_encodings, eps=0.6, min_samples=1):
    """Group face encodings with DBSCAN so each cluster stands for one person.

    Args:
        face_encodings: Dict mapping face image path -> encoding vector.
        eps: DBSCAN neighbourhood radius (Euclidean distance between
            encodings).
        min_samples: DBSCAN ``min_samples``. NOTE(review): with the default of
            1 every face should end up in some cluster (no noise label) —
            confirm against the scikit-learn documentation.

    Returns:
        Dict mapping cluster label (int) -> list of face image paths in that
        cluster. Empty when ``face_encodings`` is empty.
    """
    # DBSCAN rejects an empty array, so return early when no face was found.
    if not face_encodings:
        return {}

    face_paths = list(face_encodings)
    encodings_array = np.array([face_encodings[path] for path in face_paths])

    clustering_model = DBSCAN(metric='euclidean', eps=eps, min_samples=min_samples)
    labels = clustering_model.fit_predict(encodings_array)

    clustered_faces = {}
    for label, face_path in zip(labels, face_paths):
        # Plain ints make friendlier dict keys than numpy integer scalars.
        clustered_faces.setdefault(int(label), []).append(face_path)

    return clustered_faces

# Group the extracted faces; the result maps each cluster label to the face
# crop paths assigned to it.
clustered_faces = cluster_faces(influencer_faces)

# Step 5: Calculate Average Performance per Influencer
def calculate_average_performance(data, clustered_faces):
    """Average the ``'Performance'`` of the videos behind each face cluster.

    Face crops are named ``<video>_frame<n>_face<i>.jpg``; the ``<video>``
    part is matched exactly against the file name (without extension) of
    ``data['Video Path']``. Each face crop contributes one sample, so a video
    with more detected faces weighs more in its cluster's mean.

    Args:
        data: DataFrame/mapping with ``'Video Path'`` and ``'Performance'``
            columns. Rows without a string path are ignored.
        clustered_faces: Dict mapping cluster label -> list of face image
            paths.

    Returns:
        Dict mapping cluster label -> mean performance, or NaN for a cluster
        none of whose faces could be matched to a video.
    """
    # Build the video -> performance lookup once instead of scanning the
    # whole table for every face.
    performance_by_video = {}
    for video_path, performance in zip(data['Video Path'], data['Performance']):
        if not isinstance(video_path, str):
            continue
        video_stem = os.path.splitext(os.path.basename(video_path))[0]
        # Keep the first row if a path appears more than once.
        performance_by_video.setdefault(video_stem, performance)

    average_performance = {}
    for cluster, face_paths in clustered_faces.items():
        scores = []
        for face_path in face_paths:
            # Strip the "_frame<n>_face<i>.jpg" suffix to recover the full
            # video name. Splitting on the first "_" would leave just "video",
            # which matches every row, and substring matching would confuse
            # "video_1" with "video_10".
            video_name = os.path.basename(face_path).rsplit('_frame', 1)[0]
            if video_name in performance_by_video:
                scores.append(performance_by_video[video_name])
        # Avoid numpy's empty-mean warning; NaN marks "no matching video".
        average_performance[cluster] = np.mean(scores) if scores else np.nan
    return average_performance

# Compute one mean performance value per face cluster, keyed by cluster label.
average_performance = calculate_average_performance(data, clustered_faces)

# Step 6: Save Results as Table
# One row per cluster: the first face crop in the cluster stands in for the
# influencer, paired with that cluster's mean performance.
results = [
    (members[0], average_performance[cluster_id])
    for cluster_id, members in clustered_faces.items()
]

results_df = pd.DataFrame(
    results,
    columns=["Representative Face", "Average Performance"],
)
results_df.to_csv("influencer_performance.csv", index=False)

print("Results saved to influencer_performance.csv")

Results saved to influencer_performance.csv


In [15]:
results_df.head(1)

Unnamed: 0,Representative Face,Average Performance
0,frames/video_0_frame110_face0.jpg,1.106
