In [None]:
# Standard library imports
import gc
import os

# Third-party imports
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from pydub import AudioSegment
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, BertModel, BertTokenizer, CamembertModel, CamembertTokenizer, get_linear_schedule_with_warmup

# Local application imports
import load_data
from training_audio_model import *
from training_text_model import *
from utils import *

# Choose the compute device: CUDA when PyTorch can reach a GPU, the CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Report whether the GPU was selected (prints True/False, i.e. CUDA availability)
print(device.type == 'cuda')

In [None]:
# Folder that holds the transcription files
transcr_path = 'paco-cheese/transcr'

# Read all Inter-Pausal Units (IPUs) found in that folder via the project loader,
# with word-level loading switched on (load_words=True)
data = load_data.load_all_ipus(
    folder_path=transcr_path,
    load_words=True,
)

In [None]:
# Build a DataFrame from the loaded IPU records
df = pd.DataFrame(data)

# Create the target variable 'y' from the DataFrame
y = create_y(df)

# Positions of the positive samples (label == 1) in 'y'
indices = [i for i, x in enumerate(y) if x == 1]

# Print the number of positive samples and their positions
print(len(indices))
print(indices)

# Print the speaker corresponding to each positive sample.
# `indices` are positions produced by enumerate(), so rows are selected
# positionally rather than by index label.
for speaker in df['speaker'].iloc[indices]:
    print(speaker)

# Display the last 5 entries of the DataFrame.
# Only the final expression of a cell is rendered automatically, so the
# intermediate tables are shown explicitly with the notebook's display().
display(df.tail(5))

# Look for entries in the DataFrame where the speaker is 'LS'
ls = df[df['speaker'] == 'LS']
display(ls)

# Check if there are any entries in the DataFrame where the speaker is missing
df[df['speaker'].isnull()]

In [None]:
# Directory where the audio recordings are stored
audio_files_path = 'paco-cheese/audio/2_channels/'

# Let the project helper cut the audio segments for the loaded data
audio_segments = extract_audio_segments(
    data,
    audio_files_path,
)

In [None]:
# Sanity check: number of audio segments, then number of labels (one per line)
print(len(audio_segments), len(y), sep='\n')

In [None]:
#************************************** TO CHANGE AT THE END

# Run extract_features on every audio segment (MFCC features according to the
# original note) and stack the per-segment results into a single numpy array
X_audio = np.array(list(map(extract_features, audio_segments)))

In [None]:
#************************************** TO CHANGE AT THE END

# Keep handles on the full-size DataFrame, audio feature array and target array
# so they can be re-sliced later. These are plain references, not copies.
df_save, X_audio_save, y_save = df, X_audio, y

In [None]:
#************************************** TO CHANGE AT THE END

# Cap on the number of samples used while testing
limit = 1000  #107603  #110544

# Keep only the first `limit` entries of the saved features, targets and DataFrame
X_audio, y, df = X_audio_save[:limit], y_save[:limit], df_save[:limit]

# New model, test

In [None]:
# Use CamemBERT tokenizer (loaded first so its own special tokens can be used below)
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Load the textual data and extract the features
# Make sure the 'data' variable is loaded as in your previous script.
# `data` is wrapped in a DataFrame so that 'text_words' is a pandas Series
# (required for .fillna) whatever container the loader returned.
text_features = pd.DataFrame(data)['text_words'].iloc[:limit]

# Replace NaN values with the tokenizer's own unknown token. CamemBERT uses
# '<unk>'; the BERT-style '[UNK]' literal is not a special token for it and
# would be split into ordinary sub-word pieces.
text_features = text_features.fillna(tokenizer.unk_token)

# Tokenization helper: encode a text and pad/truncate it to a fixed length (256 by default)
def tokenize_and_pad(text, max_len=256):
    """Encode `text` with the module-level `tokenizer`.

    The encoding is padded up to, and truncated down to, `max_len` entries.
    Returns whatever `tokenizer.encode` returns for these options.
    """
    encode_options = {
        'max_length': max_len,
        'padding': 'max_length',
        'truncation': True,
    }
    return tokenizer.encode(text, **encode_options)

# Tokenize every text entry; each value is cast to str before encoding
text_features = text_features.apply(
    lambda raw_text: tokenize_and_pad(str(raw_text))
)

# Wrap the extracted audio feature matrix in a tensor
audio_features = torch.tensor(X_audio)

# Stack the per-sample token lists into one array and wrap it in a tensor
text_features = torch.tensor(
    np.array(text_features.tolist())
)

# Concatenate audio and textual features along dimension 1
combined_features = torch.cat(
    (audio_features, text_features),
    dim=1,
)

# Hold out 30% of the samples as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    combined_features,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Show how the classes are distributed in the training and test sets

# Per-class sample counts for y_train and y_test
train_class_distribution, test_class_distribution = (
    pd.Series(split_labels).value_counts() for split_labels in (y_train, y_test)
)

# Display: a header line followed by the counts, for each split
print("Distribution of classes in the training set:", train_class_distribution, sep="\n")
print("Distribution of classes in the test set:", test_class_distribution, sep="\n")

In [None]:
# Define the Early Fusion Model
class EarlyFusionModel(nn.Module):
    """Two-class classifier fusing a GRU audio encoding with CamemBERT text features.

    The audio features go through a GRU, the token ids go through CamemBERT,
    both per-sample vectors are concatenated and a single linear layer maps
    the fused vector to 2 logits.

    Args:
        audio_feature_size: size of the last dimension of the audio input.
        text_feature_size: size of the text vector produced by CamemBERT
            (its hidden size).
        hidden_size: GRU hidden size.
        dropout_rate: probability used to build ``self.dropout``.
    """

    def __init__(self, audio_feature_size, text_feature_size, hidden_size=64, dropout_rate=0.5):
        super().__init__()
        # CamemBERT for text features
        self.camembert = CamembertModel.from_pretrained('camembert-base')

        # RNN for audio features
        self.audio_rnn = nn.GRU(audio_feature_size, hidden_size, batch_first=True)

        # Fusion layer actually used by forward(): fused vector -> 2 logits
        self.fc1 = nn.Linear(hidden_size + text_feature_size, 2)
        # The layers below are not used by forward(); they are kept so that the
        # module's parameter set (and any saved state) stays unchanged.
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, audio_features, text_features):
        """Return logits of shape (batch, 2).

        Args:
            audio_features: tensor of shape (batch, audio_feature_size) or
                (batch, seq_len, audio_feature_size).
            text_features: integer tensor of token ids, shape (batch, text_len).
        """
        # Align the audio dtype with the GRU weights (features built from numpy
        # arrays may arrive as float64).
        audio_features = audio_features.to(self.audio_rnn.weight_ih_l0.dtype)

        # nn.GRU interprets a 2-D input as ONE unbatched sequence, which would
        # turn the batch axis into the time axis and let every sample depend on
        # the samples before it in the batch. Give each sample its own
        # length-1 sequence instead.
        if audio_features.dim() == 2:
            audio_features = audio_features.unsqueeze(1)

        # final_hidden: (num_layers, batch, hidden_size); keep the last layer
        _, final_hidden = self.audio_rnn(audio_features)
        audio_x = final_hidden[-1]

        # Mask padding positions so CamemBERT does not attend to them
        pad_token_id = self.camembert.config.pad_token_id
        attention_mask = None if pad_token_id is None else (text_features != pad_token_id).long()

        # Second element of the model output (the pooled representation in the
        # Hugging Face API), one vector per sample
        text_x = self.camembert(text_features, attention_mask=attention_mask)[1]

        # Fuse audio and text features, then project to the 2 logits
        combined = torch.cat((audio_x, text_x), dim=1)
        return self.fc1(combined)
# Define the dataset
class CombinedDataset(Dataset):
    """Dataset yielding (audio_feature, text_feature, label) triples.

    Args:
        audio_features: indexable collection of per-sample audio features.
        text_features: indexable collection of per-sample text features.
        labels: indexable collection of per-sample labels.

    Raises:
        ValueError: if the three collections do not have the same length.
    """

    def __init__(self, audio_features, text_features, labels):
        # The dataset length is taken from the audio features, so a mismatch
        # would otherwise surface later as an IndexError or silently drop samples.
        if not (len(audio_features) == len(text_features) == len(labels)):
            raise ValueError(
                "audio_features, text_features and labels must have the same length, got "
                f"{len(audio_features)}, {len(text_features)} and {len(labels)}"
            )
        self.audio_features = audio_features
        self.text_features = text_features
        self.labels = labels

    def __len__(self):
        """Number of samples (length of the audio feature collection)."""
        return len(self.audio_features)

    def __getitem__(self, idx):
        """Return the audio feature, text feature and long-typed label at `idx`."""
        audio_feature = self.audio_features[idx]
        text_feature = self.text_features[idx]

        # Convert the label to a long tensor. as_tensor accepts plain numbers as
        # well as existing tensors without the copy-construct warning that
        # torch.tensor() emits when handed a tensor.
        label = torch.as_tensor(self.labels[idx]).long()

        return audio_feature, text_feature, label


# Assuming X_audio is a 2-D numpy array and text_features / y hold one entry per sample
# Get the size of the audio and text features
audio_feature_size = X_audio.shape[1]
print(f"audio_feature_size: {audio_feature_size}")
# NOTE(review): this loads a full CamemBERT model only to read config.hidden_size;
# the object stays alive in the kernel under the name `temp_camembert`.
temp_camembert = CamembertModel.from_pretrained('camembert-base')
text_feature_size = temp_camembert.config.hidden_size

# Split the data into training and testing sets
X_audio_train, X_audio_test, text_features_train, text_features_test, y_train, y_test = train_test_split(
    X_audio, text_features, y, test_size=0.3, random_state=42
)

# Create DataLoaders for the training and testing sets.
# torch.as_tensor is used instead of torch.tensor because some of the splits may
# already be tensors (re-wrapping a tensor with torch.tensor emits a warning).
# Audio features are forced to float32 to match the model's default parameter
# dtype, and token ids to int64 as expected by an embedding lookup.
train_dataset = CombinedDataset(
    torch.as_tensor(X_audio_train, dtype=torch.float32),
    torch.as_tensor(text_features_train, dtype=torch.long),
    torch.as_tensor(y_train),
)
test_dataset = CombinedDataset(
    torch.as_tensor(X_audio_test, dtype=torch.float32),
    torch.as_tensor(text_features_test, dtype=torch.long),
    torch.as_tensor(y_test),
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Initialize the model
model = EarlyFusionModel(audio_feature_size, text_feature_size).to(device)

# Define class weights: the positive class gets a larger weight to handle class imbalance
w_pos = 10
print("Weight for positive class:", w_pos)
class_weights = torch.tensor([1.0, w_pos]).to(device)

# The model returns two logits per sample (one per class). BCEWithLogitsLoss
# requires logits and targets of identical shape and therefore fails on
# (batch, 2) logits vs (batch,) labels; CrossEntropyLoss is the matching loss
# for this head and takes the per-class weights directly.
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Set up the optimizer
# NOTE(review): lr=0.001 is applied to the CamemBERT weights as well — confirm this is intended.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Set the number of training epochs
num_epochs = 2

# Make sure the checkpoint directory exists before the first torch.save
os.makedirs('modele', exist_ok=True)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for audio_inputs, text_inputs, labels in train_loader:
        # Move inputs and labels to the device
        audio_inputs, text_inputs, labels = audio_inputs.to(device), text_inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass: logits of shape (batch, 2)
        outputs = model(audio_inputs, text_inputs)

        # Compute loss against integer class indices of shape (batch,)
        loss = criterion(outputs, labels.long())

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate loss
        running_loss += loss.item()

    # Checkpoint the whole model object after each epoch
    torch.save(model, 'modele/early_model_epoch_'+str(epoch))
    # Calculate and print the average loss over the epoch
    avg_loss = running_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}')

# Initialize counters for accuracy calculation
correct = 0
total = 0

# Initialize lists to store all predictions and labels for F1 score and confusion matrix
all_preds = []
all_labels = []

# Switch model to evaluation mode
model.eval()

# Evaluation loop
with torch.no_grad():
    for audio_inputs, text_inputs, labels in test_loader:
        # Move inputs and labels to the device
        audio_inputs, text_inputs, labels = audio_inputs.to(device), text_inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(audio_inputs, text_inputs)

        # Predicted class = index of the largest of the two logits, shape (batch,)
        predicted = outputs.argmax(dim=1)

        # Update counters for accuracy calculation
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Store predictions and labels for F1 score and confusion matrix
        all_preds.extend(predicted.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

# Calculate and print accuracy
accuracy = correct / total
print(f'Accuracy: {accuracy}')

# Calculate and print F1 score
f1 = f1_score(all_labels, all_preds)
print(f'F1 Score: {f1}')

# Calculate and print confusion matrix
conf_matrix = confusion_matrix(all_labels, all_preds)
print('Confusion Matrix:')
print(conf_matrix)