In [1]:
import io
import os
import av
import cv2
import torch
import tempfile
import requests
import torchaudio
import numpy as np
import pandas as pd
from deepface import DeepFace
from typing import List, Dict
from pydub import AudioSegment
from dotenv import load_dotenv
import torch.nn.functional as f
from supabase import create_client, Client
from transformers import (AutoModelForAudioClassification, Wav2Vec2FeatureExtractor)




In [2]:
load_dotenv()

True

CONFIG VARIABLES

In [3]:
session_id = 14
interview_id = 24
bucket_name = 'interviews'

supabase_client = create_client('https://kglmfklezrjwfvtcolgb.supabase.co', os.environ.get('SUPABASE_KEY'))
supabase: Client = create_client('https://kglmfklezrjwfvtcolgb.supabase.co', os.environ.get('SUPABASE_KEY'))
supabase_connection = supabase_client.storage.from_('interviews')

DIARIZATION_API_URL = 'https://transcribe.whisperapi.com'
STT_API_URL = 'https://api.lemonfox.ai/v1/audio/transcriptions'

headers = {
  'Authorization': 'Bearer {}'.format(os.environ.get('WHISPER_API_KEY'))
}

Current method save_results_to_bd

In [127]:
def save_results_to_bd(results: pd.DataFrame) -> None:
    try:
        response = supabase.table('interviews').select('user_id').eq('id', interview_id).execute()
        user_id = response.data[0]['user_id']

        results['interview_id'] = interview_id
        results['user_id'] = user_id
        results = results.fillna('')

        data_to_insert = results.to_dict(orient='records')

        response = supabase.table('results').insert(data_to_insert).execute()
        print('{} lines saved to the database successfully'. format(len(response.data)))
    except Exception as e:
        print('Error saving results to the database', str(e))

Current method save df to bd

In [128]:
def save_results_to_bd(results: pd.DataFrame) -> None:
    try:
        response = supabase.table('interviews').select('user_id').eq('id', interview_id).execute()
        user_id = response.data[0]['user_id']

        results['interview_id'] = interview_id
        results['user_id'] = user_id
        results = results.fillna('')

        data_to_insert = results.to_dict(orient='records')

        response = supabase.table('results').insert(data_to_insert).execute()
        print('{} lines saved to the database successfully'. format(len(response.data)))
    except Exception as e:
        print('Error saving results to the database', str(e))

Current method open_input_file (to modify)

In [129]:
def diarize(audio: bytes) -> pd.DataFrame:
    file = {'file': io.BytesIO(audio)}
    data = {'num_speakers': '2',
            'language': 'french',
            'diarization': 'true',
            'task': 'transcribe',
            }
    headers = {'Authorization': 'Bearer {}'.format(os.environ.get('WHISPER_API_KEY'))}

    response = requests.post(DIARIZATION_API_URL, headers=headers, data=data, files=file)
    df = pd.DataFrame(response.json()['diarization'])
    
    df.rename(columns={'startTime': 'start', 'stopTime': 'end'}, inplace=True)
    df['start'] = df['start'].map(lambda x: int(x * 1000))
    df['end'] = df['end'].map(lambda x: int(x * 1000))
    df['speaker'] = df['speaker'].map(lambda x: int(x.split('_')[1]))
    return df

In [130]:
def speech_to_text(audio_bytes: bytes, diarization: pd.DataFrame) -> pd.DataFrame:
    
    audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format="mp3")
    headers = {'Authorization': 'Bearer {}'.format(os.environ.get('WHISPER_API_KEY'))}
    data = {'model': 'whisper-1',
            'language': 'fr',
            'response_format': 'text'
            }
    
    for row in diarization.itertuples():
        audio_segment = audio[row.start:row.end]
        audio_segment_bytes = io.BytesIO()
        audio_segment.export(audio_segment_bytes, format="mp3")
        audio_segment_bytes.seek(0)
        
        file = {'file': audio_segment_bytes}
        response = requests.post(STT_API_URL, headers=headers, files=file, data=data)
        if response.status_code == 200:
            diarization.at[row.Index, 'text'] = response.json()
            
    return diarization

In [131]:
s3_path = '{}/{}/raw/raw.mp3'.format(session_id, interview_id)
audio_bytes = supabase_connection.download(s3_path)

diarization = diarize(audio_bytes)
df_full = speech_to_text(audio_bytes, diarization)


In [132]:
df_full

Unnamed: 0,start,end,speaker,text
0,314,4083,1,Première question qu'on nous a énormément posé...
1,4694,6833,1,c'est comment on s'est rencontrés ?
2,7359,8904,1,où on s'est rencontrés.
3,9821,12368,0,"Alors moi, je vivais à Montréal."
4,12792,19516,0,"Donc on était en 2016, donc de 2016 à 2017 je ..."
5,20093,21825,0,Et j'ai eu une opportunité.
6,22351,25899,0,de revenir en France pour travailler.
7,27003,31519,0,"Donc je quitte Montréal, on est en août 2017."
8,32113,35747,0,et je quitte Montréal pour venir travailler à ...
9,36239,38225,0,"dans une entreprise, dans une start-up,"


In [None]:
save_results_to_bd(df_full)

In [12]:
res = supabase.table('results').select('text', 'id').eq('interview_id', interview_id).eq('speaker', 0).execute()
results = pd.DataFrame(res.data)
results.set_index('id', inplace=True)
results

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
1052,"Alors moi, je vivais à Montréal."
1053,"Donc on était en 2016, donc de 2016 à 2017 je ..."
1054,Et j'ai eu une opportunité.
1055,de revenir en France pour travailler.
1056,"Donc je quitte Montréal, on est en août 2017."
1057,et je quitte Montréal pour venir travailler à ...
1058,"dans une entreprise, dans une start-up,"
1059,à Paris.
1060,"Donc, mon premier jour de travail arrive et j'..."
1061,entre plusieurs personnes mais


## AUDIO

In [4]:
def open_input_file(s3_path: str, file_name: str) -> bytes | None:
    try:
        print('Getting file {} from the S3 bucket'.format(file_name))
        file_bytes = supabase_connection.download(s3_path)
        return file_bytes
    except Exception as e:
        message = ('Error downloading the file {} from the S3 bucket. '.
                    format(file_name), str(e))
        print(message)
        raise e

In [28]:
def get_segments_from_db() -> pd.DataFrame:
    res = (supabase.table('results').select('id', 'start', 'end')
            .eq('interview_id', interview_id)
            .eq('speaker', 0)
            .execute())
    results = pd.DataFrame(res.data)
    results.set_index('id', inplace=True)
    return results

In [5]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [15]:
ate_model = AutoModelForAudioClassification.from_pretrained('Lajavaness/wav2vec2-lg-xlsr-fr-speech-emotion-recognition')
ate_model.to(device)
ate_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('Lajavaness/wav2vec2-lg-xlsr-fr-speech-emotion-recognition')
ate_sampling_rate = ate_feature_extractor.sampling_rate

In [18]:
filename = 'raw.mp3'
s3_path = '{}/{}/raw/{}'.format(session_id, interview_id, filename)
print(s3_path)
audio_bytes = open_input_file(s3_path, filename)
audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format="mp3")

14/31/raw/raw.mp3
Getting file raw.mp3 from the S3 bucket


In [23]:
segments = get_segments_from_db()

In [24]:
sentiments = list()

for row in segments.itertuples():
    audio_segment = audio[row.start:row.end]
    audio_segment_bytes = io.BytesIO()
    audio_segment.export(audio_segment_bytes, format="mp3")
    audio_segment_bytes.seek(0)
    
    speech_array, sample_rate = torchaudio.load(audio_segment_bytes)
    resampler = torchaudio.transforms.Resample(sample_rate, ate_sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()

    inputs = ate_feature_extractor(speech, sampling_rate=ate_sampling_rate,
                                                return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}
    
    with torch.no_grad():
        logits = ate_model(**inputs).logits

    scores = f.softmax(logits, dim=1).detach().cpu().numpy()[0]

    # Get the percentage scores and round them to 5 decimal places
    scores = [round(num * 100, 5) for num in scores]

    # Get a dictionary with the labels for each emotion and its values
    values_dict = dict(zip(ate_model.config.id2label.values(), scores))

    # Sort the dictionary by values in descending order
    sorted_values = {k: v for k, v in sorted(values_dict.items(), key=lambda x: x[1], reverse=True)}
    
    sentiments.append(sorted_values)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [27]:
segments['audio_emotions'] = sentiments
segments

Unnamed: 0_level_0,start,end,audio_emotions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1052,9821,12368,"{'Neutral': 99.97324, 'Relaxed': 0.01292, 'Ple..."
1053,12792,19516,"{'Neutral': 99.99914, 'Relaxed': 0.00026, 'Ple..."
1054,20093,21825,"{'Neutral': 99.99955, 'Sad': 0.00025, 'Tension..."
1055,22351,25899,"{'Neutral': 99.99902, 'Pleased': 0.00031, 'Rel..."
1056,27003,31519,"{'Neutral': 99.99893, 'Sad': 0.00056, 'Relaxed..."
1057,32113,35747,"{'Neutral': 99.99925, 'Sad': 0.00028, 'Relaxed..."
1058,36239,38225,"{'Neutral': 99.99923, 'Sad': 0.00032, 'Tension..."
1059,38650,39278,"{'Neutral': 99.99903, 'Sad': 0.00039, 'Relaxed..."
1060,40382,46918,"{'Neutral': 99.99934, 'Sad': 0.00019, 'Relaxed..."
1061,47869,51044,"{'Neutral': 99.99785, 'Relaxed': 0.00116, 'Sad..."


## VIDEO

In [9]:
def predict(image: np.ndarray) -> Dict[str, float]:
    try:
        objs = DeepFace.analyze(image, actions=['emotion'])
        results = objs[0]['emotion']
        emotions = {k: v for k, v in sorted(results.items(), key=lambda x: x[1], reverse=True)}
    except ValueError:
        emotions = {'No face detected': 0.0}
    return emotions

In [6]:
def process(segments: pd.DataFrame) -> List[List[Dict[str, float]]]:
    all_sentiments = list()
    
    print('Processing sentiments from video')

    filename = 'raw.mp4'
    s3_path = '{}/{}/raw/{}'.format(session_id, interview_id, filename)
    print(s3_path)
    video_bytes = open_input_file(s3_path, filename)

    with (tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as temp_file):
        temp_file_path = temp_file.name
        try:
            temp_file.write(video_bytes)
            clip = cv2.VideoCapture(temp_file_path)

            # Get video fps
            fps = clip.get(cv2.CAP_PROP_FPS)

            # Set the interval for extracting frames
            timing = 2.0
            interval = int(fps) * timing

            for row in segments.itertuples():
                sentiments = list()

                # Calculate frame indices for starting and ending times
                start_frame = int(row.start / 1000 * fps)
                end_frame = int(row.end / 1000 * fps)

                # Set starting frame
                clip.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

                # Read the video frame by frame and send respective frames to prediction
                frame_count = 0
                image_count = 0
                while clip.isOpened() and frame_count <= (end_frame - start_frame):
                    ret, frame = clip.read()

                    # If there are no more frames, break the loop
                    if not ret:
                        break

                    # Detect emotions from the frame if it's the first one or if it's a multiple of the interval
                    if frame_count == 0 or frame_count % interval == 0:
                        image_name = 'image_{:05d}'.format(image_count)
                        image_count += 1

                        sentiments.append({image_name: predict(frame)})

                    # Save the last frame
                    elif start_frame + frame_count == end_frame:
                        image_name = 'image_{:05d}'.format(image_count)
                        image_count += 1

                        sentiments.append({image_name: predict(frame)})

                    frame_count += 1

                all_sentiments.append(sentiments)
            
            # Release the video capture object
            clip.release()
        except Exception as e:
            message = ('Error processing video emotions.', str(e))
            print(message)
        finally:
            temp_file.close()
            # Clean up the temporary file
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

        print('Emotions extraction from video have finished')
        return all_sentiments

In [7]:
segments = get_segments_from_db()
res = process(segments)

segments['video_emotions'] = res
segments

NameError: name 'get_segments_from_db' is not defined

In [6]:
filename = 'raw.mp4'
s3_path = '{}/{}/raw/{}'.format(session_id, interview_id, filename)
print(s3_path)
video_bytes = open_input_file(s3_path, filename)

14/31/raw/raw.mp4
Getting file raw.mp4 from the S3 bucket


In [10]:
import av

# Assuming `video_bytes` is your array of bytes
container = av.open(io.BytesIO(video_bytes))

In [16]:
stream = container.streams.video[0]
fps = stream.average_rate
print(fps)

30


In [17]:
start_time = 10  # start at 10 seconds
end_time = 20    # end at 20 seconds

# Seek to the start time
container.seek(int(start_time * av.time_base))

## NEW TEST VIDEO

In [125]:
timing = 2.0

In [135]:
def new_process(segments: pd.DataFrame) -> List[List[Dict[str, float]]]:
    all_sentiments = list()
    
    print('Processing sentiments from video')

    # filename = 'raw.mp4'
    # s3_path = '{}/{}/raw/{}'.format(session_id, interview_id, filename)
    # video_bytes = open_input_file(s3_path, filename)
    
    container = av.open(io.BytesIO(video_bytes))
    
    for row in segments.itertuples():
        sentiments = list()
        image_count = 0
        
        start_time = row.start / 1000
        end_time = row.end / 1000
        
        container.seek(int(start_time * av.time_base))
        # print('Clip start time:', start_time, 'Clip end time:', end_time)
        
        # Flag to ensure end_time frame is captured
        end_frame_captured = False
        
        # Initialize variables to track frame extraction
        last_extracted_time = start_time - timing  # ensures the first frame is extracted at start_time
        
        for frame in container.decode(video=0):
            frame_time = frame.time
        
            if frame_time < start_time:
                continue
            
            # Extract the last frame at end_time
            if frame_time >= end_time and not end_frame_captured:                
                img = frame.to_image()
                img_array = np.array(img)
                sentiments.append({'frame_{:05d}'.format(image_count): predict(img_array)})
                image_count += 1
                
                end_frame_captured = True
                # print('Last frame captured: ', frame_time)
                break
            
            if frame_time >= start_time and frame_time >= last_extracted_time + timing and frame_time < end_time:
                img = frame.to_image()
                img_array = np.array(img)
                sentiments.append({'frame_{:05d}'.format(image_count): predict(img_array)})
                image_count += 1
                # print('Capturing frame ', frame_time)
                
                # Update the last extracted time
                last_extracted_time = frame_time
                   
        all_sentiments.append(sentiments)

    container.close()
    # cv2.destroyAllWindows()
    
    return all_sentiments


In [133]:
segments = get_segments_from_db()

In [136]:
res = new_process(segments)
res

Processing sentiments from video


[[{'frame_00000': {'neutral': 78.19947004318237,
    'happy': 15.301547944545746,
    'fear': 3.052571602165699,
    'angry': 1.927490159869194,
    'sad': 1.5153182670474052,
    'surprise': 0.003532637856551446,
    'disgust': 7.304662403839757e-05}},
  {'frame_00001': {'neutral': 85.57081818580627,
    'sad': 8.659198135137558,
    'fear': 3.148382157087326,
    'angry': 1.8813705071806908,
    'happy': 0.4544760100543499,
    'surprise': 0.2855287166312337,
    'disgust': 0.0002256660991406534}}],
 [{'frame_00000': {'sad': 79.05328273773193,
    'fear': 14.143165946006775,
    'neutral': 6.61165714263916,
    'angry': 0.14851256273686886,
    'happy': 0.043335300870239735,
    'surprise': 4.749843753870664e-05,
    'disgust': 7.455372430520413e-08}},
  {'frame_00001': {'fear': 75.58532357215881,
    'neutral': 19.58324909210205,
    'sad': 4.5829176902771,
    'angry': 0.23604626767337322,
    'surprise': 0.012302663526497781,
    'happy': 0.0001535157139187504,
    'disgust': 5.57

# TEST GCLOUD 

In [None]:
import base64
from google.cloud import aiplatform
from google.cloud import storage
from ultralytics import YOLO

In [3]:
def download_model_file():
    bucket_name = 'last_model'
    model_name = 'model.pt'
    """
    Download the model .pt weights file.
    """
    # bucket_name, artifacts_path = extract_from_uri(uri)
    # logger.info(f"Downloading model file from bucket: {bucket_name} and artifact path: {artifacts_path}")
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(model_name)
    blob.download_to_filename(model_name)
    
    model = YOLO(model_name)
    return model
    

In [4]:
model = download_model_file()

In [5]:
type(model)

ultralytics.models.yolo.model.YOLO

In [50]:
def predict(image: np.ndarray) -> Dict[str, float]:
    try:
        objs = DeepFace.analyze(image, actions=['emotion'])
        results = objs[0]['emotion']
        emotions = {k: v for k, v in sorted(results.items(), key=lambda x: x[1], reverse=True)}
    except ValueError:
        emotions = {'No face detected': 0.0}
    return emotions

def predict2(image: np.ndarray, predict_endpoint: aiplatform.Endpoint) -> Dict[str, float] | None:
    try:
        _, buffer = cv2.imencode('.jpg', image)
        image_bytes = base64.b64encode(buffer).decode('utf-8')

        # objs = DeepFace.analyze(image, actions=['emotion'])
        response = predict_endpoint.predict(instances=[
            {
                "image": image_bytes
            }
        ])
        results = response.predictions[0]
        
        
        # results = objs[0]['emotion']
        emotions = {k: v*100 for k, v in sorted(results.items(), key=lambda x: x[1], reverse=True)}
        return emotions
    except Exception as e:
        print(e)
        # emotions = {'No face detected': 0.0}
    return emotions

In [20]:
aiplatform.init(project="annual-project-427112", location="europe-west1")
endpoint_name = None
for endpoint in aiplatform.Endpoint.list():
    if endpoint.display_name == "yolo_predict":
        endpoint_name = endpoint.name
predict_endpoint = aiplatform.Endpoint(endpoint_name=endpoint_name)

In [51]:
segments = get_segments_from_db()
timing = 2.0

all_sentiments = list()

print('Processing sentiments from video')

filename = 'raw.mp4'
s3_path = '{}/{}/raw/{}'.format(session_id, interview_id, filename)
video_bytes = open_input_file(s3_path, filename)

container = av.open(io.BytesIO(video_bytes))

for row in segments.itertuples():
    sentiments = list()
    image_count = 0
    
    start_time = row.start / 1000
    end_time = row.end / 1000
    
    container.seek(int(start_time * av.time_base))
    
    # Flag to ensure end_time frame is captured
    end_frame_captured = False
    
    # Initialize variables to track frame extraction
    last_extracted_time = start_time - timing  # ensures the first frame is extracted at start_time
    
    for frame in container.decode(video=0):
        frame_time = frame.time
    
        if frame_time < start_time:
            continue
        
        # Extract the last frame at end_time
        if frame_time >= end_time and not end_frame_captured:
            img = frame.to_image()
            img_array = np.array(img)
            sentiments.append({'frame_{:05d}'.format(image_count): predict(img_array)})
            image_count += 1
            
            end_frame_captured = True
            # print('Last frame captured: ', frame_time)
            break
        
        if frame_time >= start_time and frame_time >= last_extracted_time + timing and frame_time < end_time:
            img = frame.to_image()
            img_array = np.array(img)
            res1 = predict(img_array)
            res2 = predict2(img_array, predict_endpoint)
            print(res1)
            print(res2)
            break
            # sentiments.append({'frame_{:05d}'.format(image_count): predict(img_array)})
            # image_count += 1
            # print('Capturing frame ', frame_time)
            
            # Update the last extracted time
            last_extracted_time = frame_time
                
    # all_sentiments.append(sentiments)

container.close()
# cv2.destroyAllWindows()


Processing sentiments from video
Getting file raw.mp4 from the S3 bucket
400 Endpoint projects/492916107091/locations/europe-west1/endpoints/7866772600272191488 misconfigured, "traffic_split" not set.  Verify if any models are deployed to the endpoint and traffic split is configured for them.


UnboundLocalError: cannot access local variable 'emotions' where it is not associated with a value