## Tech Challenge - Reconhecimento facial
**Aluno:** Vinícius Oliveira Litran Andrade

## 📦 Importações:

In [1]:
import cv2  # OpenCV: manipula√ß√£o de v√≠deos e imagens
import numpy as np  # Numpy: opera√ß√µes matem√°ticas e manipula√ß√£o de arrays
import math  # Math: fun√ß√µes matem√°ticas como dist√¢ncia euclidiana
from deepface import DeepFace  # Biblioteca de reconhecimento e an√°lise facial
from collections import Counter, deque  # Contador e fila circular (deque) para hist√≥rico
import mediapipe as mp  # MediaPipe: detec√ß√£o de poses corporais
from tqdm import tqdm  # Barra de progresso elegante
import torch  # PyTorch: para checar disponibilidade de GPU
from transformers import pipeline  # HuggingFace Transformers: sumariza√ß√£o textual
from PIL import ImageFont, ImageDraw, Image  # Pillow: texto com suporte a acentua√ß√£o
import os  # M√≥dulo para opera√ß√µes no sistema de arquivos




In [2]:
# Path to a TrueType font with accented-character support (Windows-specific location).
FONT_PATH = r"C:\Windows\Fonts\arial.ttf"

In [3]:
# Lookup table from English emotion labels (lower-case keys) to the
# Portuguese labels used for on-frame text and in the summary.
EMOTION_TRANSLATION = {
    "happy": "Feliz",
    "sad": "Triste",
    "angry": "Bravo",
    "surprise": "Surpreso",
    "fear": "Assustado",
    "disgust": "Com nojo",
    "neutral": "Neutro",
}

In [4]:
def put_text_accented(frame, text_data, position=None, font_path=r"C:\Windows\Fonts\arial.ttf", font_size=24, color=(0, 255, 0)):
    """
    Render text (including accented characters) onto an OpenCV frame via Pillow.

    The frame is converted from BGR to RGB, drawn on with Pillow using a
    TrueType font, and converted back to BGR.

    Args:
        frame: Image in BGR channel order, as used by OpenCV.
        text_data: Either a single string, or a list of ``(text, (x, y))``
            tuples to draw several labels in one pass.
        position: ``(x, y)`` for the text when ``text_data`` is a string;
            defaults to ``(10, 10)``. Ignored when ``text_data`` is a list.
        font_path: Path to a ``.ttf`` font file.
        font_size: Font size passed to Pillow.
        color: Text colour in BGR order.

    Returns:
        A new BGR frame with the text drawn on it.
    """
    # The default font_path above is a raw string on purpose: in a plain
    # literal "\a" is the BEL control character, which corrupted the path.
    rgb_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    canvas = ImageDraw.Draw(rgb_image)

    try:
        font = ImageFont.truetype(font_path, font_size)
    except Exception:
        # Best effort: fall back to Pillow's built-in font when the requested
        # one cannot be loaded. Unlike a bare ``except``, this does not
        # swallow KeyboardInterrupt / SystemExit.
        font = ImageFont.load_default()

    if isinstance(text_data, list):
        entries = text_data
    else:
        entries = [(text_data, (10, 10) if position is None else position)]

    # Pillow works in RGB, so the BGR colour is reversed once up front.
    fill = color[::-1]
    for text, pos in entries:
        canvas.text(pos, text, font=font, fill=fill)

    return cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2BGR)

## 🧠 Classe Principal: VideoAnalyzer
Responsável por processar o vídeo, detectar emoções, atividades e anomalias, além de salvar os resultados e gerar um resumo textual.

In [5]:
class VideoAnalyzer:
    def __init__(self,
                 video_path="Unlocking Facial Recognition_ Diverse Activities Analysis.mp4",
                 output_video_path="output_video.mp4",
                 summary_path="video_summary.txt",
                 emotion_backend="retinaface",
                 anomaly_threshold_ratio=0.15,
                 frame_skip=1,
                 resize_scale=1,
                 emotion_history_length=20,
                 anomaly_movement_threshold=100,
                 missing_face_tolerance=3,
                 stability_threshold=4
                ):
        # Par√¢metros configur√°veis
        self.VIDEO_PATH = video_path  # Caminho do v√≠deo de entrada
        self.OUTPUT_VIDEO_PATH = output_video_path  # Caminho do v√≠deo anotado
        self.SUMMARY_PATH = summary_path  # Caminho do arquivo de resumo (.txt)
        self.EMOTION_BACKEND = emotion_backend  # Backend do DeepFace
        self.ANOMALY_THRESHOLD_RATIO = anomaly_threshold_ratio  # √Årea m√≠nima para considerar anomalia facial
        self.FRAME_SKIP = frame_skip  # Pula frames para melhorar desempenho
        self.RESIZE_SCALE = resize_scale  # Redimensionamento do frame (ajuda no DeepFace)
        self.EMOTION_HISTORY_LENGTH = emotion_history_length  # Tamanho do buffer de emo√ß√µes para suaviza√ß√£o
        self.ANOMALY_MOVEMENT_THRESHOLD = anomaly_movement_threshold  # Threshold para detectar movimento an√¥malo
        self.MISSING_FACE_TOLERANCE = missing_face_tolerance  # N√∫mero de frames sem face antes de considerar "perdido"
        self.STABILITY_THRESHOLD = stability_threshold  # N√∫mero de frames iguais para considerar emo√ß√£o est√°vel

        # Inicializa√ß√£o dos m√≥dulos
        self.mp_pose = mp.solutions.pose  # M√≥dulo de pose corporal do MediaPipe
        self.mp_drawing = mp.solutions.drawing_utils  # Utilit√°rio para desenhar conex√µes entre landmarks
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # Carrega modelo pr√©-treinado para sumariza√ß√£o de texto

        # Checagem de hardware
        if torch.cuda.is_available():
            print("‚úÖ GPU dispon√≠vel...")
        else:
            print("‚ö†Ô∏è GPU n√£o dispon√≠vel...")

        # Estado interno
        self.emotion_history = deque(maxlen=self.EMOTION_HISTORY_LENGTH)  # Hist√≥rico recente de emo√ß√µes
        self.missing_face_counter = 0  # Contador de frames sem face detectada
        self.pose = self.mp_pose.Pose()  # Inicializa detec√ß√£o de pose corporal



    def extract_landmarks(self, results, frame_shape):
        """
        Extrai e retorna landmarks relevantes da pose detectada no frame.
        """
        if not results.pose_landmarks:
            return None
        landmarks = {}
        for idx, lm in enumerate(results.pose_landmarks.landmark):
            if lm.visibility > 0.6:
                name = self.mp_pose.PoseLandmark(idx).name
                landmarks[name] = (int(lm.x * frame_shape[1]), int(lm.y * frame_shape[0]))
        return landmarks

    def clamp_coords(self, x, y, w, h, frame_shape):
        """
        Garante que coordenadas n√£o extrapolem os limites da imagem.
        """
        max_x = frame_shape[1] - 1
        max_y = frame_shape[0] - 1
        x = int(np.clip(x, 0, max_x))
        y = int(np.clip(y, 0, max_y))
        w = int(np.clip(w, 1, max_x - x))
        h = int(np.clip(h, 1, max_y - y))
        return x, y, w, h

    def analyze_emotions(self, small_frame, full_frame, emotion_summary, anomalies_detected, frame_area):
        """
        Run DeepFace emotion analysis and annotate the full-size frame.

        For each result that has both a region and a dominant emotion, the
        region is divided by ``RESIZE_SCALE``, clamped to the frame, drawn as
        a rectangle and labelled with the translated emotion.
        ``emotion_summary`` is updated in place. A face whose box area exceeds
        ``ANOMALY_THRESHOLD_RATIO * frame_area`` increments the anomaly count.
        ``missing_face_counter`` is reset when any result has a region and
        incremented otherwise.

        Any exception is reported with ``print`` and whatever was accumulated
        up to that point is returned.

        Returns:
            ``(anomalies_detected, full_frame)``, the updated count and the
            (possibly re-created) annotated frame.
        """
        try:
            faces = DeepFace.analyze(
                small_frame,
                actions=['emotion'],
                enforce_detection=False,
                detector_backend=self.EMOTION_BACKEND
            )

            # A lone dict result is wrapped so it can be iterated like a list.
            if isinstance(faces, dict):
                faces = [faces]

            labels = []

            for face in faces:
                box = face.get("region")
                dominant = face.get("dominant_emotion")
                if not (box and dominant):
                    continue

                # Region coordinates are divided by the resize scale to map
                # them onto the full-size frame, then clamped to it.
                left, top, box_w, box_h = self.clamp_coords(
                    *(int(box[key] / self.RESIZE_SCALE) for key in ('x', 'y', 'w', 'h')),
                    full_frame.shape
                )

                if box_w * box_h > self.ANOMALY_THRESHOLD_RATIO * frame_area:
                    anomalies_detected += 1

                label = EMOTION_TRANSLATION.get(dominant.lower(), dominant)
                emotion_summary[label] += 1

                cv2.rectangle(full_frame, (left, top), (left + box_w, top + box_h), (255, 0, 0), 2)

                # Label goes 30 px above the box, but never above the frame.
                labels.append((label, (left, max(top - 30, 0))))

            if labels:
                full_frame = put_text_accented(full_frame, labels, font_path=FONT_PATH, font_size=24, color=(0,255,0))

            if any(face.get("region") is not None for face in faces):
                self.missing_face_counter = 0
            else:
                self.missing_face_counter += 1

        except Exception as e:
            print(f"‚ö†Ô∏è Falha na an√°lise emocional: {str(e)}")

        return anomalies_detected, full_frame



    def detect_activity_from_landmarks(self, landmarks, image_shape):
        """
        Detecta atividades b√°sicas a partir dos landmarks do corpo.
        """
        activities = []
        h, w = image_shape

        def get_landmark(name):
            return landmarks.get(name, None)

        def distance(p1, p2):
            if p1 and p2:
                return math.hypot(p1[0] - p2[0], p1[1] - p2[1])
            return float('inf')

        def is_near(p1, p2, threshold=0.05):
            return distance(p1, p2) < threshold * w

        def is_above(p1, p2, offset=0.1):
            return p1 and p2 and p1[1] < p2[1] - offset * h

        def is_aligned_vertically(p1, p2, tolerance=0.05):
            return p1 and p2 and abs(p1[0] - p2[0]) < tolerance * w

        left_shoulder = get_landmark('LEFT_SHOULDER')
        right_shoulder = get_landmark('RIGHT_SHOULDER')
        left_hip = get_landmark('LEFT_HIP')
        right_hip = get_landmark('RIGHT_HIP')
        left_wrist = get_landmark('LEFT_WRIST')
        right_wrist = get_landmark('RIGHT_WRIST')
        left_hand = get_landmark('LEFT_INDEX')
        right_hand = get_landmark('RIGHT_INDEX')
        nose = get_landmark('NOSE')
        mouth_left = get_landmark('MOUTH_LEFT')
        mouth_right = get_landmark('MOUTH_RIGHT')
        left_eye = get_landmark('LEFT_EYE')
        right_eye = get_landmark('RIGHT_EYE')

        # Postura: sentado / em p√©
        if left_shoulder and right_shoulder and left_hip and right_hip:
            shoulder_y = (left_shoulder[1] + right_shoulder[1]) / 2
            hip_y = (left_hip[1] + right_hip[1]) / 2
            torso_length = abs(hip_y - shoulder_y)
            if hip_y > shoulder_y + 0.12 * h and torso_length > 0.25 * h:
                activities.append("sentado")
            elif torso_length > 0.35 * h and abs(hip_y - shoulder_y) < 0.15 * h:
                activities.append("em p√©")

        # Gestos manuais
        if is_above(left_wrist, left_shoulder) and left_wrist[1] < 0.5 * h:
            activities.append("acenando com a m√£o esquerda")
        if is_above(right_wrist, right_shoulder) and right_wrist[1] < 0.5 * h:
            activities.append("acenando com a m√£o direita")

        if left_wrist and right_shoulder and abs(left_wrist[1] - right_shoulder[1]) < 0.07 * h and left_wrist[1] > 0.4 * h:
            activities.append("gesticulando com a m√£o esquerda")
        if right_wrist and left_shoulder and abs(right_wrist[1] - left_shoulder[1]) < 0.07 * h and right_wrist[1] > 0.4 * h:
            activities.append("gesticulando com a m√£o direita")

        if left_wrist and right_wrist and abs(left_wrist[0] - right_wrist[0]) > 0.35 * w and abs(left_wrist[1] - right_wrist[1]) < 0.15 * h:
            activities.append("dan√ßando")

        if left_hand and right_hand and is_near(left_hand, right_hand):
            if abs(left_hand[1] - right_hand[1]) < 0.05 * h:
                activities.append("apertando as m√£os")

        # Bocejo: boca aberta verticalmente e boca estreita horizontalmente
        if nose and mouth_left and mouth_right:
            mouth_center_y = (mouth_left[1] + mouth_right[1]) / 2
            mouth_width = distance(mouth_left, mouth_right)
            mouth_height = abs(nose[1] - mouth_center_y)
            aspect_ratio = mouth_height / (mouth_width + 1e-6)
            if mouth_height > 0.12 * h and aspect_ratio > 0.65:
                activities.append("bocejando")

        # Riso: boca larga e olhos levemente mais baixos que nariz
        if nose and mouth_left and mouth_right and left_eye and right_eye:
            mouth_width = distance(mouth_left, mouth_right)
            eye_avg_y = (left_eye[1] + right_eye[1]) / 2
            mouth_center_y = (mouth_left[1] + mouth_right[1]) / 2
            eye_to_nose = eye_avg_y - nose[1]
            if mouth_width > 0.18 * w and 0.01 * h < eye_to_nose < 0.06 * h:
                activities.append("rindo")

        # Escrevendo: m√£os alinhadas verticalmente e pr√≥ximas abaixo do nariz
        if nose and left_wrist and right_wrist:
            if (abs(left_wrist[1] - right_wrist[1]) < 0.1 * h and
                is_aligned_vertically(left_wrist, right_wrist) and
                nose[1] + 0.05 * h < min(left_wrist[1], right_wrist[1])):
                activities.append("escrevendo")

        return activities

    def calculate_landmark_movement(self, current_landmarks, previous_landmarks):
        """
        Calcula movimento m√©dio ao quadrado entre dois conjuntos de landmarks.
        """
        if not current_landmarks or not previous_landmarks:
            return 0
        total_movement = 0
        count = 0
        for key in current_landmarks:
            if key in previous_landmarks:
                x1, y1 = current_landmarks[key]
                x2, y2 = previous_landmarks[key]
                dx = x1 - x2
                dy = y1 - y2
                total_movement += dx * dx + dy * dy
                count += 1
        return total_movement / count if count > 0 else 0

    def generate_summary(self, summary_path, video_name, processed_frames, anomalies_detected, emotion_summary, activity_summary):
        """
        Gera arquivo de resumo com estat√≠sticas e resumo textual.
        """
        with open(summary_path, "w", encoding="utf-8") as f:
            f.write("=== Resumo da An√°lise de V√≠deo ===\n")
            f.write(f"V√≠deo analisado: {video_name}\n")
            f.write(f"Total de frames processados: {processed_frames}\n")
            f.write(f"N√∫mero de anomalias detectadas: {anomalies_detected}\n\n")

            f.write("Distribui√ß√£o de emo√ß√µes:\n")
            for emotion, count in emotion_summary.most_common():
                f.write(f"  {emotion}: {count}\n")

            f.write("\nAtividades detectadas:\n")
            for activity, count in activity_summary.most_common():
                f.write(f"  {activity}: {count}\n")

            # Gera prompt para resumo
            emotion_list = ", ".join(f"{k} ({v})" for k, v in emotion_summary.most_common(5))
            activity_list = ", ".join(f"{k} ({v})" for k, v in activity_summary.most_common(5))
            prompt = f"Durante o v√≠deo foram detectadas as seguintes emo√ß√µes: {emotion_list}. As atividades mais comuns foram: {activity_list}."

            try:
                resumo = self.summarizer(prompt, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
            except Exception as e:
                print(f"Erro ao gerar resumo autom√°tico: {e}")
                resumo = "Resumo autom√°tico n√£o dispon√≠vel."

            f.write("\nResumo em linguagem natural:\n")
            f.write(resumo + "\n")

    def process_video(self):
        """
        Main processing loop for the video.

        Reads the input frame by frame until the capture is exhausted and, for
        each frame:
        - runs pose detection and extracts the visible landmarks;
        - runs emotion analysis on frames whose index is a multiple of
          ``FRAME_SKIP``;
        - derives activities from the landmarks and scores the landmark
          movement against the previous frame;
        - counts an anomaly when the movement exceeds
          ``ANOMALY_MOVEMENT_THRESHOLD`` and no activity explains it;
        - writes the annotated frame to the output video.

        Afterwards the summary file is generated using the number of frames
        that were actually processed.

        Raises:
            IOError: If the input video cannot be opened.

        The capture, the writer and the pose detector are always released,
        including when an error interrupts processing.
        """
        cap = None
        out = None
        processed_frames = 0
        anomalies_detected = 0
        activity_summary = Counter()
        emotion_summary = Counter()

        try:
            cap = cv2.VideoCapture(self.VIDEO_PATH)
            if not cap.isOpened():
                raise IOError(f"Erro ao abrir o v√≠deo: {self.VIDEO_PATH}")

            # Keep the FPS as reported: truncating e.g. 29.97 to 29 would
            # change the duration of the output video.
            fps = cap.get(cv2.CAP_PROP_FPS)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(self.OUTPUT_VIDEO_PATH, fourcc, fps, (width, height))

            # The container's frame count is used only to size the progress
            # bar; the loop itself ends when no more frames can be read.
            reported_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            last_landmarks = None
            frame_area = width * height

            with tqdm(total=reported_frames if reported_frames > 0 else None,
                      desc="üîç Processando v√≠deo") as progress:
                while True:
                    ret, frame = cap.read()
                    if not ret:
                        break

                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    results = self.pose.process(rgb_frame)

                    current_landmarks = self.extract_landmarks(results, frame.shape)

                    overlay_lines = []
                    activities = []

                    # processed_frames doubles as the zero-based frame index.
                    if processed_frames % self.FRAME_SKIP == 0:
                        small_frame = cv2.resize(frame, (0, 0), fx=self.RESIZE_SCALE, fy=self.RESIZE_SCALE)
                        anomalies_detected, frame = self.analyze_emotions(
                            small_frame, frame, emotion_summary, anomalies_detected, frame_area)

                    if current_landmarks:
                        activities = self.detect_activity_from_landmarks(current_landmarks, frame.shape[:2])
                        activity_summary.update(activities)

                    movement_score = 0
                    if current_landmarks and last_landmarks:
                        movement_score = self.calculate_landmark_movement(current_landmarks, last_landmarks)

                    large_movement = movement_score > self.ANOMALY_MOVEMENT_THRESHOLD
                    if large_movement and not activities:
                        # Strong movement that no recognised activity explains.
                        anomalies_detected += 1
                        overlay_lines.append("‚ö†Ô∏è Movimento an√¥malo detectado")
                    elif activities:
                        overlay_lines.append("Atividades: " + ", ".join(activities))

                    if (large_movement or activities) and results.pose_landmarks:
                        self.mp_drawing.draw_landmarks(frame, results.pose_landmarks, self.mp_pose.POSE_CONNECTIONS)

                    last_landmarks = current_landmarks

                    if overlay_lines:
                        # One call draws every line (30 px apart), so the
                        # frame is converted to Pillow and back only once.
                        placed = [(line, (30, 30 + idx * 30)) for idx, line in enumerate(overlay_lines)]
                        frame = put_text_accented(frame, placed, font_path=FONT_PATH, font_size=22, color=(0, 0, 255))

                    out.write(frame)
                    processed_frames += 1
                    progress.update(1)
        finally:
            if cap is not None:
                cap.release()
            if out is not None:
                out.release()
            self.pose.close()

        self.generate_summary(self.SUMMARY_PATH, self.VIDEO_PATH, processed_frames, anomalies_detected, emotion_summary, activity_summary)

        print(f"‚úÖ An√°lise conclu√≠da. {anomalies_detected} anomalias detectadas.")
        print(f"Resumo salvo em '{self.SUMMARY_PATH}', v√≠deo anotado em '{self.OUTPUT_VIDEO_PATH}'.")

## 🚀 Execução principal
Instancia a classe e inicia o processamento completo do vídeo.

In [6]:
# Script entry point: build an analyzer with the default settings and run
# the full processing pipeline.
if __name__ == "__main__":
    analyzer = VideoAnalyzer()
    analyzer.process_video()

Device set to use cuda:0


‚úÖ GPU dispon√≠vel...


üîç Processando v√≠deo: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3326/3326 [56:51<00:00,  1.03s/it] 


‚úÖ An√°lise conclu√≠da. 1430 anomalias detectadas.
Resumo salvo em 'video_summary.txt', v√≠deo anotado em 'output_video.mp4'.
