In [None]:
# Standard library imports
import os
import gc

# Third-party imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from transformers import BertModel, BertTokenizer, CamembertModel, CamembertTokenizer, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import numpy as np
import cv2
from matplotlib import pyplot as plt
from pydub import AudioSegment
import librosa
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.models import Model

# Local application imports
import load_data

In [None]:
# Folder holding the transcription files; it is handed to the loader as `folder_path`
transcr_path = 'paco-cheese/transcr'

# Load and preprocess all the Inter-Pausal Units (IPUs) from the transcription files.
# `load_words=True` is forwarded to the project-local loader.
# NOTE(review): later cells call `pd.DataFrame(data)`, `data['text_words']` and `.iterrows()` on this
# object, so it is presumably a DataFrame with one row per IPU - confirm in load_data.
data = load_data.load_all_ipus(folder_path=transcr_path, load_words=True)

In [None]:
# Load the data into a DataFrame
df = pd.DataFrame(data)

# Binary target: y[k] is 1 when the speaker of row k differs from the speaker of row k-1.
# The first row has no previous speaker to compare with, so it is always 0.
# The comparison runs over a plain list (positional) rather than `df['speaker'][i]` label lookups:
# this is much faster on large frames and does not require a default 0..n-1 index.
speakers = df['speaker'].tolist()
y = [0] + [int(previous != current) for previous, current in zip(speakers, speakers[1:])]

# Positions (row numbers) at which the speaker changes
indices = [position for position, changed in enumerate(y) if changed == 1]

# Print the number of speaker changes and their positions
print(len(indices))
print(indices)

# Print the speaker who takes over at each change
for i in indices:
    print(speakers[i])

# Display the last 5 rows of the DataFrame.
# (The slice used to start at len(indices)-5, which depends on the number of speaker changes
# and not on the number of rows.) `display` is used because only the final expression of a
# cell is rendered automatically.
display(df.tail(5))

# Filter the DataFrame for rows where the speaker is 'LS' and show them
ls = df[df['speaker'] == 'LS']
display(ls)

# Rows where the speaker is missing (rendered as the cell output)
df[df['speaker'].isnull()]

In [None]:
# Keep an unfiltered snapshot of `df`. The next cell filters from `df_save`, so the name must
# exist when the notebook is run top to bottom on a fresh kernel.
df_save = df.copy()

In [None]:
# Keep only the rows of the snapshot `df_save` that belong to the dyad 'transcr\AAOR'
is_aaor = df_save['dyad'] == 'transcr\\AAOR'
df = df_save[is_aaor]

# (rows, columns) of the filtered DataFrame
df.shape

# Process videos

### Obtenir les chemins des vidéos

In [None]:
# Define a function to find the video file for a given dyad and first speaker
def find_video_file(dyad, first_speaker, video_root='paco-cheese/video/video'):
    """Return the path of the video whose file name mentions both speakers of a dyad.

    Parameters
    ----------
    dyad : str
        Dyad identifier in the form '<folder><separator><dyad name>', e.g. 'transcr\\AAOR'.
        Both backslash and forward slash are accepted as separator; a bare dyad name
        (no separator) is used as is.
    first_speaker : str, float or None
        Name of one speaker of the dyad. A float (NaN) or None is replaced by "NA".
    video_root : str, optional
        Folder containing the 'cheese' and 'paco' sub-folders, which are searched in that order.

    Returns
    -------
    str or None
        Path of the first matching file (file names are scanned in sorted order so the
        result is deterministic), or None when nothing matches.
    """
    # Extract the dyad name: second path component when there is one, otherwise the whole string.
    # Normalising the separator first keeps Windows-style values working and adds POSIX support.
    parts = dyad.replace('\\', '/').split('/')
    dyad_name = parts[1] if len(parts) > 1 else parts[0]

    # If the first speaker is missing (NaN/None), replace it with "NA"
    if first_speaker is None or isinstance(first_speaker, float):
        first_speaker = "NA"

    # Determine the second speaker by removing the first speaker from the dyad name
    second_speaker = dyad_name.replace(first_speaker, "")

    for subdir in ('cheese', 'paco'):
        media_files_path = f'{video_root}/{subdir}/'

        # A missing sub-folder must not prevent the search in the other one
        if not os.path.isdir(media_files_path):
            continue

        for file_name in sorted(os.listdir(media_files_path)):
            # A match is a file name containing both speaker names
            if first_speaker in file_name and second_speaker in file_name:
                return os.path.join(media_files_path, file_name)

    # If no file is found, return None
    return None

# Print the video path resolved for every row of the DataFrame
for _, ipu in df.iterrows():
    speaker = ipu['speaker']
    # Rows without a speaker are looked up under the placeholder "NA"
    first_speaker = "NA" if pd.isna(speaker) else str(speaker)

    video_path = find_video_file(ipu['dyad'], first_speaker)
    print(video_path)

### Extraire les différentes features des vidéos :

In [None]:
# Define a function to extract segments from a video
def extract_video_segments(video_file_path, start_ms, end_ms):
    """Return the frames of a video between two timestamps (both bounds inclusive).

    Parameters
    ----------
    video_file_path : str
        Path of the video file to open with OpenCV.
    start_ms, end_ms : int or float
        Start and end of the segment in milliseconds; they are converted to frame numbers
        with the frame rate reported by the capture.

    Returns
    -------
    list
        The decoded frames, in order. Empty when the file cannot be opened or the segment
        lies beyond the end of the video.
    """
    cap = cv2.VideoCapture(video_file_path)
    frames = []
    try:
        if not cap.isOpened():
            return frames

        # Convert the time bounds to frame numbers
        fps = cap.get(cv2.CAP_PROP_FPS)
        start_frame = max(int(start_ms * fps / 1000), 0)
        end_frame = int(end_ms * fps / 1000)

        # Advance to the first wanted frame with grab(): it moves to the next frame without
        # decoding it, so the frames before the segment are no longer fully decoded.
        for _ in range(start_frame):
            if not cap.grab():
                return frames

        # Decode the frames start_frame..end_frame
        for _ in range(start_frame, end_frame + 1):
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)
    finally:
        # Always release the capture, even if decoding raised
        cap.release()
    return frames

# Define a function to preprocess a frame
def preprocess_frame(frame):
    """Resize an OpenCV frame to 224x224 and convert it from BGR to RGB.

    Frames decoded by OpenCV are in BGR channel order, whereas the Keras VGG16
    `preprocess_input` applied afterwards expects RGB input (it performs the RGB->BGR
    conversion itself). Without this conversion the channels reach the network swapped.

    Parameters
    ----------
    frame : numpy.ndarray
        A 3-channel BGR image as returned by `cv2.VideoCapture.read`.

    Returns
    -------
    numpy.ndarray
        The 224x224 RGB image.
    """
    # Resize the frame to 224x224 pixels for VGG16
    frame_resized = cv2.resize(frame, (224, 224))
    return cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)

# Define a class for feature extraction
class FeatureExtractor:
    """Wraps a pre-trained VGG16 truncated at its 'fc1' layer to turn a frame into a feature vector."""

    def __init__(self):
        # ImageNet-pretrained VGG16; the wrapped model stops at the 'fc1' layer output
        vgg = VGG16(weights='imagenet')
        self.model = Model(inputs=vgg.input, outputs=vgg.get_layer('fc1').output)

    def extract(self, frame):
        """Return the model output for a single frame, with singleton dimensions removed."""
        # Add the batch axis, then apply the VGG16 input preprocessing
        batch = preprocess_input(np.expand_dims(frame, axis=0))
        return self.model.predict(batch).squeeze()

# Define a function to fuse the features
def fuse_features(features, feature_dim=512):
    """Average a collection of per-frame feature vectors into a single vector.

    Parameters
    ----------
    features : sequence of array-like, numpy.ndarray or None
        One feature vector per frame (a list of vectors or a 2-D array).
    feature_dim : int, optional
        Length of the zero vector returned when there is nothing to average. Pass the
        feature extractor's output size so placeholders match real features.

    Returns
    -------
    numpy.ndarray
        Element-wise mean over the first axis, or `np.zeros((feature_dim,))` when
        `features` is None or empty.
    """
    # `len(...) == 0` instead of truthiness: `if features` raises for multi-element ndarrays
    if features is None or len(features) == 0:
        return np.zeros((feature_dim,))
    return np.mean(features, axis=0)

# Initialize the feature extractor
feature_extractor = FeatureExtractor()

# Length of one feature vector, read from the extractor's model so that the zero placeholders
# used for missing videos/frames have the same length as real features. With mismatched
# lengths `np.array(X_video)` cannot form a regular 2-D array.
video_feature_dim = feature_extractor.model.output_shape[-1]

# Set to True to render every extracted frame. Debugging aid only: one figure per frame
# makes the loop very slow and floods the notebook output.
SHOW_FRAMES = False

# Initialize a list to store the video features (one vector per DataFrame row)
X_video = []

# Loop over the rows in the DataFrame
for index, row in df.iterrows():
    # Get the first speaker from the row, or "NA" if it's NaN
    first_speaker = str(row['speaker']) if not pd.isna(row['speaker']) else "NA"
    # Find the video file for the dyad and first speaker
    video_file_path = find_video_file(row['dyad'], first_speaker)

    # No video for this row: keep the row aligned with a zero vector
    if not video_file_path:
        print(f"No video file path found for {row['dyad']} and {first_speaker}")
        X_video.append(np.zeros((video_feature_dim,)))
        continue

    print(f"Processing video file: {video_file_path}")
    # Calculate the start and end times in milliseconds
    start_ms = int(row['start_ipu'] * 1000)
    end_ms = int(row['stop_ipu'] * 1000)
    # Extract the frames from the video segment
    frames = extract_video_segments(video_file_path, start_ms, end_ms)

    # If no frames were extracted, fall back to a zero vector as well
    if not frames:
        print("No frames extracted from video file.")
        X_video.append(np.zeros((video_feature_dim,)))
        continue

    # Optionally display each frame (OpenCV frames are BGR, matplotlib expects RGB)
    if SHOW_FRAMES:
        for frame in frames:
            plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            plt.show()

    print(f"Number of frames extracted: {len(frames)}")
    # Preprocess each frame
    frames = [preprocess_frame(frame) for frame in frames]
    # Extract the features from each frame. This replaces a call to `extract_video_features`,
    # which is not defined anywhere in the notebook.
    features = [feature_extractor.extract(frame) for frame in frames]
    # Fuse the per-frame features into one vector for the row
    video_features = fuse_features(features)
    X_video.append(video_features)
    print(f"Video features extracted: {video_features}")

# Convert the list of video features to a NumPy array
X_video = np.array(X_video)

# Process audio

In [None]:
# Define the path to the audio files
audio_files_path = 'paco-cheese/audio/2_channels/'

# Define a function to find the audio file for a given dyad and first speaker
def find_audio_file(dyad, first_speaker, search_dir=None):
    """Return the path of the audio file whose name mentions both speakers of a dyad.

    Parameters
    ----------
    dyad : str
        Dyad identifier in the form '<folder><separator><dyad name>', e.g. 'transcr\\AAOR'.
        Both backslash and forward slash are accepted as separator; a bare dyad name
        (no separator) is used as is.
    first_speaker : str, float or None
        Name of one speaker of the dyad. A float (NaN) or None is replaced by "NA".
    search_dir : str, optional
        Folder to search. Defaults to the module-level `audio_files_path`.

    Returns
    -------
    str or None
        Path of the first matching file (file names are scanned in sorted order so the
        result is deterministic), or None when the folder is missing or nothing matches.
    """
    folder = audio_files_path if search_dir is None else search_dir

    # Extract the dyad name: second path component when there is one, otherwise the whole string.
    # Normalising the separator first keeps Windows-style values working and adds POSIX support.
    parts = dyad.replace('\\', '/').split('/')
    dyad_name = parts[1] if len(parts) > 1 else parts[0]

    # If the first speaker is missing (NaN/None), replace it with "NA"
    if first_speaker is None or isinstance(first_speaker, float):
        first_speaker = "NA"

    # Determine the second speaker by removing the first speaker from the dyad name
    second_speaker = dyad_name.replace(first_speaker, "")

    # A missing folder is reported like "no match" so callers can handle it uniformly
    if not os.path.isdir(folder):
        return None

    for file_name in sorted(os.listdir(folder)):
        # A match is a file name containing both speaker names
        if first_speaker in file_name and second_speaker in file_name:
            return os.path.join(folder, file_name)

    # If no file is found, return None
    return None

# Define a function to extract audio segments from a DataFrame
def extract_audio_segments(df):
    """Cut one audio segment per row out of the recording of the row's dyad.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'speaker', 'dyad', 'start_ipu' and 'stop_ipu'; the two
        time columns are multiplied by 1000 to obtain milliseconds.

    Returns
    -------
    list
        One slice of the loaded recording per row for which an audio file was found.
        Rows without an audio file are skipped (a message is printed), so the result can
        be shorter than `df`.
    """
    audio_segments = []
    # (dyad, speaker) -> resolved file path, so each pair is looked up on disk only once
    resolved_paths = {}
    # Recording currently held in memory and the path it was loaded from
    audio = None
    loaded_path = None

    for _, row in df.iterrows():
        # Get the first speaker from the row, or "NA" if it's NaN
        first_speaker = str(row['speaker']) if not pd.isna(row['speaker']) else "NA"

        key = (row['dyad'], first_speaker)
        if key not in resolved_paths:
            resolved_paths[key] = find_audio_file(row['dyad'], first_speaker)
        audio_file_path = resolved_paths[key]

        if audio_file_path is None:
            print("Audio file not found for dyad {}".format(row['dyad']))
            continue

        # Reload whenever the row needs a different file than the one in memory. Comparing the
        # resolved paths (rather than testing whether the speaker name occurs in the previous
        # path) also reloads when the dyad changes but the speaker name still matches.
        if audio_file_path != loaded_path:
            # Drop the previous recording before loading the next one to limit peak memory
            audio = None
            gc.collect()
            audio = AudioSegment.from_file(audio_file_path)
            loaded_path = audio_file_path

        # Calculate the start and end times in milliseconds and slice the recording
        start_ms = int(row['start_ipu'] * 1000)
        end_ms = int(row['stop_ipu'] * 1000)
        audio_segments.append(audio[start_ms:end_ms])

    return audio_segments

# Extract one audio segment per IPU. Note that the full `data` object is passed here,
# not the filtered `df`.
audio_segments = extract_audio_segments(data)

# Define a function to extract features from an audio segment
def extract_features(audio_segment, n_mfcc=13):
    """Return the time-averaged MFCCs of a PyDub audio segment.

    Parameters
    ----------
    audio_segment : pydub.AudioSegment
        Segment to describe. Its `sample_width`, `frame_rate` and `channels` are read.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute.

    Returns
    -------
    numpy.ndarray
        Vector of length `n_mfcc` (mean of each coefficient over time). An all-zero
        float32 vector is returned for an empty segment.
    """
    # Convert the PyDub audio segment to a numpy array
    samples = np.array(audio_segment.get_array_of_samples())

    # Nothing to analyse (e.g. the slice fell outside the recording)
    if samples.size == 0:
        return np.zeros(n_mfcc, dtype=np.float32)

    # PyDub returns multi-channel audio interleaved (L, R, L, R, ...). Regroup the samples
    # per frame and average the channels so a mono signal is analysed.
    channels = getattr(audio_segment, 'channels', 1)
    if channels > 1:
        samples = samples.reshape((-1, channels)).mean(axis=1)

    # Normalize to floating point using the full scale of the sample width
    # (32768 for 2-byte samples, 2147483648 for 4-byte samples)
    full_scale = float(1 << (8 * audio_segment.sample_width - 1))
    samples = samples.astype(np.float32) / full_scale

    # Use librosa to extract MFCCs, then average them over time
    mfccs = librosa.feature.mfcc(y=samples, sr=audio_segment.frame_rate, n_mfcc=n_mfcc)
    return np.mean(mfccs, axis=1)

In [None]:
# Sanity check: print the number of audio segments, then the number of labels
for collection in (audio_segments, y):
    print(len(collection))

In [None]:
#************************************** TO BE CHANGED AT THE END

# Compute the MFCC feature vector of every audio segment and stack them into one numpy array
X_audio = np.array(list(map(extract_features, audio_segments)))

In [None]:
#************************************** TO BE CHANGED AT THE END

# Keep references to the current DataFrame, audio feature array and target list so that the
# next cell can re-slice them with a different limit. These are aliases, not copies.
df_save, X_audio_save, y_save = df, X_audio, y

In [None]:
#************************************** TO BE CHANGED AT THE END

# Number of leading samples kept for this test run
limit = 1000  #107603  #110544

# Truncate the saved features, targets and DataFrame to the first `limit` entries
X_audio, y, df = X_audio_save[:limit], y_save[:limit], df_save[:limit]

# Nouveau modèle, test

In [None]:
# Take the text of the first `limit` IPUs from the loaded `data`
text_features = data['text_words'][:limit]

# Use CamemBERT tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Replace missing transcriptions with the tokenizer's own unknown token.
# The previous placeholder '[UNK]' is BERT's spelling; it is not CamemBERT's unknown token,
# so it was tokenized like ordinary text.
text_features = text_features.fillna(tokenizer.unk_token)

# Define a function to tokenize and pad the text to a maximum length of 256
def tokenize_and_pad(text, max_len=256):
    """Encode `text` into exactly `max_len` token ids (padded or truncated)."""
    return tokenizer.encode(text, max_length=max_len, padding='max_length', truncation=True)

# Apply the tokenizer to all textual data
text_features = text_features.apply(lambda x: tokenize_and_pad(str(x)))

# Audio feature matrix as a tensor
audio_features = torch.tensor(X_audio)

# Convert the textual features (one fixed-length list of token ids per row) into a tensor
text_features = torch.tensor(np.array(text_features.tolist()))

# Merge the audio and textual features. The integer token ids are cast to the dtype of the
# audio features explicitly rather than relying on implicit dtype promotion in torch.cat.
combined_features = torch.cat((audio_features, text_features.to(audio_features.dtype)), dim=1)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, y, test_size=0.3, random_state=42)

In [None]:
# Report how many samples of each class ended up in the training and test sets
train_class_distribution = pd.Series(y_train).value_counts()
test_class_distribution = pd.Series(y_test).value_counts()

reports = [
    ("Distribution of classes in the training set:", train_class_distribution),
    ("Distribution of classes in the test set:", test_class_distribution),
]
for heading, distribution in reports:
    print(heading)
    print(distribution)

In [None]:
# Define an early fusion model for audio and text features
class EarlyFusionModel(nn.Module):
    """Binary classifier that fuses an audio feature vector with a CamemBERT text embedding.

    The audio vector goes through two linear layers (-> 128 -> 64). The token ids go through
    CamemBERT, whose second output is used as the text embedding. Both are concatenated and
    fed to a three-layer head ending in a single output unit.

    `forward` returns raw logits (no sigmoid). They are meant to be used with
    `nn.BCEWithLogitsLoss`, which applies the sigmoid internally; apply `torch.sigmoid`
    to them to obtain probabilities.
    """

    def __init__(self, audio_feature_size, text_feature_size, dropout_rate=0.5):
        """
        Parameters
        ----------
        audio_feature_size : int
            Length of one audio feature vector.
        text_feature_size : int
            Length of the text embedding produced by CamemBERT (its hidden size).
        dropout_rate : float, optional
            Dropout probability used after each hidden layer of the fusion head.
        """
        super(EarlyFusionModel, self).__init__()
        # CamemBERT for text features
        self.camembert = CamembertModel.from_pretrained('camembert-base')
        # Layers for audio features
        self.audio_fc1 = nn.Linear(audio_feature_size, 128)
        self.audio_fc2 = nn.Linear(128, 64)
        # Fusion and final layers
        self.fc1 = nn.Linear(64 + text_feature_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, audio_features, text_features):
        """Return one logit per sample, shape (batch, 1).

        Parameters
        ----------
        audio_features : torch.Tensor
            Batch of audio feature vectors, shape (batch, audio_feature_size).
        text_features : torch.Tensor
            Batch of padded token-id sequences, shape (batch, sequence_length).
        """
        # Process audio features
        audio_x = F.relu(self.audio_fc1(audio_features))
        audio_x = F.relu(self.audio_fc2(audio_x))

        # The sequences are padded to a fixed length, so mask the padding positions;
        # otherwise the encoder attends to the padding as if it were text.
        pad_id = self.camembert.config.pad_token_id
        attention_mask = None if pad_id is None else (text_features != pad_id).long()

        # Process text features (index 1 is the second output of the encoder)
        text_x = self.camembert(input_ids=text_features, attention_mask=attention_mask)[1]

        # Fuse audio and text features
        combined = torch.cat((audio_x, text_x), dim=1)

        # Final layers
        x = self.dropout(F.relu(self.fc1(combined)))
        x = self.dropout(F.relu(self.fc2(x)))
        # Raw logits: the sigmoid is applied by the loss during training and explicitly at
        # evaluation time, so applying it here as well would squash the outputs twice.
        return self.fc3(x)

# Define a dataset for combined audio and text features
class CombinedDataset(Dataset):
    """Index-aligned triples of (audio features, text features, label)."""

    def __init__(self, audio_features, text_features, labels):
        # The three collections are stored as given and indexed with the same position
        self.audio_features, self.text_features, self.labels = audio_features, text_features, labels

    def __len__(self):
        # The number of samples is taken from the audio features
        return len(self.audio_features)

    def __getitem__(self, idx):
        audio = self.audio_features[idx]
        text = self.text_features[idx]
        label = self.labels[idx]
        return audio, text, label

# Determine the size of the audio and text features
audio_feature_size = X_audio.shape[1]
temp_camembert = CamembertModel.from_pretrained('camembert-base')
text_feature_size = temp_camembert.config.hidden_size
# The temporary model was only needed for its hidden size: release it so that a second
# full CamemBERT does not stay in memory next to the one created inside the fusion model.
del temp_camembert
gc.collect()

# Split the data into training and test sets (same 70/30 split and seed for all three inputs)
X_audio_train, X_audio_test, text_features_train, text_features_test, y_train, y_test = train_test_split(
    X_audio, text_features, y, test_size=0.3, random_state=42
)

# Create DataLoaders for the training and test sets.
# torch.as_tensor converts arrays/lists and passes existing tensors through unchanged, which
# avoids the copy-construct warning torch.tensor() emits when given a tensor.
train_dataset = CombinedDataset(
    torch.as_tensor(X_audio_train), torch.as_tensor(text_features_train), torch.as_tensor(y_train)
)
test_dataset = CombinedDataset(
    torch.as_tensor(X_audio_test), torch.as_tensor(text_features_test), torch.as_tensor(y_test)
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Select the GPU when one is available, otherwise the CPU.
# (`device` was used below without ever being defined in the notebook.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the early fusion model and move it to the selected device
model = EarlyFusionModel(audio_feature_size, text_feature_size).to(device)

# Define the weights for the positive and negative classes for the loss function
# Currently, the weight for the positive class is set to 1 for testing purposes
w_pos = 1
class_weights = torch.tensor([1.0, w_pos]).to(device)

# Binary cross entropy on logits, with the positive-class weight applied.
# NOTE(review): BCEWithLogitsLoss applies the sigmoid itself and therefore expects raw
# logits - make sure EarlyFusionModel.forward does not already apply a sigmoid.
criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights[1])

# Adam with a learning rate of 0.001 over all model parameters.
# NOTE(review): this rate also applies to the pretrained CamemBERT weights, since
# model.parameters() includes them - confirm this is intended.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Set the number of training epochs
num_epochs = 10

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (audio_inputs, text_inputs, labels) in enumerate(train_loader):
        # Move the inputs and labels to the selected device
        audio_inputs, text_inputs, labels = audio_inputs.to(device), text_inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(audio_inputs, text_inputs)

        # Compute the loss. squeeze(-1) drops only the trailing output dimension, so a final
        # batch holding a single sample keeps shape (1,) and still matches `labels`
        # (a bare squeeze() would collapse it to a 0-d tensor).
        loss = criterion(outputs.squeeze(-1), labels.float())

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate the loss over the epoch
        running_loss += loss.item()

    # Calculate and print the average loss over the epoch
    avg_loss = running_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}')

# Evaluation
# Initialize counters and lists for accuracy, F1 score, and confusion matrix calculation
correct = 0
total = 0
all_preds = []
all_labels = []

# Switch the model to evaluation mode (disables dropout)
model.eval()

# No gradient calculation during evaluation
with torch.no_grad():
    for audio_inputs, text_inputs, labels in test_loader:
        # Move the inputs and labels to the same device as the model
        audio_inputs, text_inputs, labels = audio_inputs.to(device), text_inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(audio_inputs, text_inputs)

        # Apply a sigmoid activation to the outputs and threshold at 0.5 for binary classification.
        # squeeze(-1) drops only the trailing output dimension, so a batch holding a single
        # sample stays 1-D and can still be iterated below.
        predicted = torch.sigmoid(outputs).squeeze(-1) > 0.5

        # Update the counters for accuracy calculation
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Collect predictions and labels as plain Python ints for the sklearn metrics
        all_preds.extend(predicted.long().cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

# Calculate and print the accuracy (0.0 when the test set is empty)
accuracy = correct / total if total else 0.0
print(f'Accuracy: {accuracy}')

# Calculate and print the F1 score
f1 = f1_score(all_labels, all_preds)
print(f'F1 Score: {f1}')

# Calculate and print the confusion matrix
conf_matrix = confusion_matrix(all_labels, all_preds)
print('Confusion Matrix:')
print(conf_matrix)