# Importation de bibliothèque et  fonctions

In [5]:
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score,  classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
import seaborn as sns
import csv
import os
import random
import numpy as np
from sklearn.utils import resample



In [7]:
def feature_extractor_1(audio_file_dir):

    #load the audio files
    x,freq = librosa.load(audio_file_dir,sr=16000)
    #extract 20 MFCCs
    mfcc=librosa.feature.mfcc(y=x,sr=freq,n_mfcc=20)
    #calculate the mean and variance of each MFFC 
    mean_mfccs=np.mean(mfcc,axis=1)
    var_mfccs=np.var(mfcc,axis=1)
    #return mean and variance as the audio file feature 
    return list(mean_mfccs)+list(var_mfccs)

In [3]:
def feature_extractor_2(audio_file_dir):

    #load the audio files
    x,freq = librosa.load(audio_file_dir,sr=16000)
    # trim the first 5 seconds (Sequence Truncation)
    length_of_5seconds=5*16000
    x_5sec=x[:length_of_5seconds]
    # extract 20 MFCCs
    mfccs_5sec=librosa.feature.mfcc(y=x_5sec,sr=freq,n_mfcc=20)
    # return mfcc of the first 5 sec as the audio file feature
    return mfccs_5sec

In [8]:
#set data_dir to the directory of your data files
data_dir= "Dataset/"

# Read file info file to get the list of audio files and their labels
file_list=[]
label_list=[]
with open(data_dir+"Info.txt", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # The first column contains the file name
        file_list.append(row[0])
        # The last column contains the lable (language)
        label_list.append(row[-1]) 
        
        
# create a dictionary for labels
lang_dic={'EN':0,'FR':1,'AR':2,'JP':3}

# create a list of extracted feature (MFCC) for files
x_data=[]

for audio_file in file_list:
    file_feature = feature_extractor_1(data_dir+audio_file)
    #add extracted feature to dataset 
    x_data.append(file_feature)

# create a list of labels for files
y_data=[]
for lang_label in label_list:
    #convert the label to a value in {0,1,2,3} as the class label
    y_data.append(lang_dic[lang_label])

  "cipher": algorithms.TripleDES,
  "class": algorithms.Blowfish,
  "class": algorithms.TripleDES,


In [9]:
# shuffle two lists
temp_list = list(zip(x_data, y_data))
random.shuffle(temp_list)
x_data, y_data = zip(*temp_list)

In [10]:
# Train-test split
x_train, x_dev, y_train, y_dev = train_test_split(x_data, y_data, test_size=0.2, random_state=42)


# Modèles

## RandomForest

In [7]:
# Grid Search pour Random Forest
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_rf = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
grid_rf.fit(x_train, y_train)
print("Best parameters for RF:", grid_rf.best_params_)
print("Best cross-validated score:", grid_rf.best_score_)

# Meilleur modèle
best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(x_dev)


Best parameters for RF: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validated score: 0.6135274356103023


**Utilisation de la fonction feature 1**

In [8]:
#set data_dir to the directory of your data files
data_dir= "Test_Set/"

#Change below file name:
ourputfile_name="BETCHEM_WARREN_RandomForest_2.0v1"

# Read file info file to get the list of audio files and their labels
file_list=[]
label_list=[]
with open(data_dir+"Info.csv", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # The first column contains the file name
        file_list.append(row[0])

lang_dic={'EN':0,'FR':1,'AR':2,'JP':3}
class2lang_dic={0:"EN",1:"FR",2:"AR",3:"JP"}
with open(data_dir+f"{ourputfile_name}.csv",'w') as file:
    file.write(f"ID,Label\n")
for test_sample in file_list[1:]:
    test_sample_feature=feature_extractor_1(data_dir+test_sample)
    predicted=class2lang_dic[best_rf.predict([test_sample_feature])[0]]
    print(f'{test_sample}:{predicted}')
    # save the predicted output in Output_evaluation.txt
    with open(data_dir+f"{ourputfile_name}.csv",'a+') as file:
        file.write(f"{test_sample},{predicted}\n")

0000.wav:JP
0001.wav:FR
0002.wav:FR
0003.wav:AR
0004.wav:FR
0005.wav:AR
0006.wav:FR
0007.wav:JP
0008.wav:EN
0009.wav:JP
0010.wav:EN
0011.wav:AR
0012.wav:JP
0013.wav:JP
0014.wav:FR
0015.wav:JP
0016.wav:EN
0017.wav:FR
0018.wav:FR
0019.wav:AR
0020.wav:FR
0021.wav:JP
0022.wav:JP
0023.wav:EN
0024.wav:EN
0025.wav:AR
0026.wav:EN
0027.wav:AR
0028.wav:AR
0029.wav:AR
0030.wav:EN
0031.wav:EN
0032.wav:EN
0033.wav:FR
0034.wav:AR
0035.wav:FR
0036.wav:FR
0037.wav:JP
0038.wav:JP
0039.wav:AR
0040.wav:FR
0041.wav:AR
0042.wav:EN
0043.wav:FR
0044.wav:EN
0045.wav:JP
0046.wav:JP
0047.wav:JP
0048.wav:JP
0049.wav:JP
0050.wav:AR
0051.wav:FR
0052.wav:AR
0053.wav:EN
0054.wav:AR
0055.wav:FR
0056.wav:AR
0057.wav:EN
0058.wav:FR
0059.wav:EN
0060.wav:EN
0061.wav:JP
0062.wav:JP
0063.wav:AR
0064.wav:FR
0065.wav:FR
0066.wav:FR
0067.wav:AR
0068.wav:JP
0069.wav:EN
0070.wav:JP
0071.wav:AR
0072.wav:EN
0073.wav:AR
0074.wav:FR
0075.wav:EN
0076.wav:FR
0077.wav:FR
0078.wav:JP
0079.wav:EN
0080.wav:JP
0081.wav:FR
0082.wav:AR
0083

## GNB

In [9]:
from sklearn.model_selection import GridSearchCV

# Grid Search pour GNB
gnb = GaussianNB()
param_grid = {
    'var_smoothing': np.logspace(-9, 0, 10)
}
grid_gnb = GridSearchCV(gnb, param_grid, cv=5, scoring='accuracy')
grid_gnb.fit(x_train, y_train)
print("Best parameters for GNB:", grid_gnb.best_params_)
print("Best cross-validated score:", grid_gnb.best_score_)

# Meilleur modèle
best_gnb = grid_gnb.best_estimator_
y_pred_gnb = best_gnb.predict(x_dev)


Best parameters for GNB: {'var_smoothing': 1e-06}
Best cross-validated score: 0.41825307950727886


In [18]:
#set data_dir to the directory of your data files
data_dir= "Test_Set/"

#Change below file name:
ourputfile_name="BETCHEM_WARREN_GNB_2.0v1"

# Read file info file to get the list of audio files and their labels
file_list=[]
label_list=[]
with open(data_dir+"Info.csv", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # The first column contains the file name
        file_list.append(row[0])

lang_dic={'EN':0,'FR':1,'AR':2,'JP':3}
class2lang_dic={0:"EN",1:"FR",2:"AR",3:"JP"}
with open(data_dir+f"{ourputfile_name}.csv",'w') as file:
    file.write(f"ID,Label\n")
for test_sample in file_list[1:]:
    test_sample_feature=feature_extractor_1(data_dir+test_sample)
    predicted=class2lang_dic[best_gnb.predict([test_sample_feature])[0]]
    print(f'{test_sample}:{predicted}')
    # save the predicted output in Output_evaluation.txt
    with open(data_dir+f"{ourputfile_name}.csv",'a+') as file:
        file.write(f"{test_sample},{predicted}\n")

0000.wav:JP
0001.wav:FR
0002.wav:FR
0003.wav:FR
0004.wav:FR
0005.wav:JP
0006.wav:FR
0007.wav:FR
0008.wav:EN
0009.wav:FR
0010.wav:EN
0011.wav:FR
0012.wav:FR
0013.wav:JP
0014.wav:FR
0015.wav:JP
0016.wav:FR
0017.wav:FR
0018.wav:FR
0019.wav:FR
0020.wav:FR
0021.wav:AR
0022.wav:JP
0023.wav:FR
0024.wav:JP
0025.wav:AR
0026.wav:FR
0027.wav:FR
0028.wav:FR
0029.wav:FR
0030.wav:FR
0031.wav:FR
0032.wav:FR
0033.wav:FR
0034.wav:EN
0035.wav:FR
0036.wav:FR
0037.wav:AR
0038.wav:EN
0039.wav:FR
0040.wav:FR
0041.wav:FR
0042.wav:EN
0043.wav:FR
0044.wav:FR
0045.wav:JP
0046.wav:FR
0047.wav:EN
0048.wav:AR
0049.wav:JP
0050.wav:FR
0051.wav:EN
0052.wav:FR
0053.wav:FR
0054.wav:FR
0055.wav:FR
0056.wav:AR
0057.wav:EN
0058.wav:FR
0059.wav:EN
0060.wav:FR
0061.wav:JP
0062.wav:AR
0063.wav:AR
0064.wav:FR
0065.wav:FR
0066.wav:EN
0067.wav:FR
0068.wav:EN
0069.wav:AR
0070.wav:FR
0071.wav:FR
0072.wav:AR
0073.wav:AR
0074.wav:FR
0075.wav:EN
0076.wav:FR
0077.wav:FR
0078.wav:AR
0079.wav:FR
0080.wav:AR
0081.wav:FR
0082.wav:FR
0083

## SVM 

In [40]:
# Grid Search pour SVM
svm = SVC()
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto']
}
grid_svm = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')
grid_svm.fit(x_train, y_train)
print("Best parameters for SVM:", grid_svm.best_params_)
print("Best cross-validated score:", grid_svm.best_score_)

# Meilleur modèle
best_svm = grid_svm.best_estimator_
y_pred_svm = best_svm.predict(x_dev)

KeyboardInterrupt: 

In [None]:
#set data_dir to the directory of your data files
data_dir= "Test_Set/"

#Change below file name:
ourputfile_name="BETCHEM_WARREN_SVM_2.0v1"

# Read file info file to get the list of audio files and their labels
file_list=[]
label_list=[]
with open(data_dir+"Info.csv", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # The first column contains the file name
        file_list.append(row[0])

lang_dic={'EN':0,'FR':1,'AR':2,'JP':3}
class2lang_dic={0:"EN",1:"FR",2:"AR",3:"JP"}
with open(data_dir+f"{ourputfile_name}.csv",'w') as file:
    file.write(f"ID,Label\n")
for test_sample in file_list[1:]:
    test_sample_feature=feature_extractor_1(data_dir+test_sample)
    predicted=class2lang_dic[best_sv.predict([test_sample_feature])[0]]
    print(f'{test_sample}:{predicted}')
    # save the predicted output in Output_evaluation.txt
    with open(data_dir+f"{ourputfile_name}.csv",'a+') as file:
        file.write(f"{test_sample},{predicted}\n")

## MLP

In [12]:
# Grid Search pour MLP
mlp = MLPClassifier(max_iter=500, random_state=42)
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive']
}
grid_mlp = GridSearchCV(mlp, param_grid, cv=5, scoring='accuracy')
grid_mlp.fit(x_train, y_train)
print("Best parameters for MLP:", grid_mlp.best_params_)
print("Best cross-validated score:", grid_mlp.best_score_)

# Meilleur modèle
best_mlp = grid_mlp.best_estimator_
y_pred_mlp = best_mlp.predict(x_dev)


Best parameters for MLP: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'solver': 'adam'}
Best cross-validated score: 0.4055095184770437


In [13]:
#set data_dir to the directory of your data files
data_dir= "Test_Set/"

#Change below file name:
ourputfile_name="BETCHEM_WARREN_MLP_2.0v1"

# Read file info file to get the list of audio files and their labels
file_list=[]
label_list=[]
with open(data_dir+"Info.csv", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # The first column contains the file name
        file_list.append(row[0])

lang_dic={'EN':0,'FR':1,'AR':2,'JP':3}
class2lang_dic={0:"EN",1:"FR",2:"AR",3:"JP"}
with open(data_dir+f"{ourputfile_name}.csv",'w') as file:
    file.write(f"ID,Label\n")
for test_sample in file_list[1:]:
    test_sample_feature=feature_extractor_1(data_dir+test_sample)
    predicted=class2lang_dic[best_mlp.predict([test_sample_feature])[0]]
    print(f'{test_sample}:{predicted}')
    # save the predicted output in Output_evaluation.txt
    with open(data_dir+f"{ourputfile_name}.csv",'a+') as file:
        file.write(f"{test_sample},{predicted}\n")

0000.wav:JP
0001.wav:FR
0002.wav:FR
0003.wav:AR
0004.wav:AR
0005.wav:AR
0006.wav:AR
0007.wav:AR
0008.wav:AR
0009.wav:FR
0010.wav:AR
0011.wav:AR
0012.wav:AR
0013.wav:FR
0014.wav:FR
0015.wav:AR
0016.wav:AR
0017.wav:FR
0018.wav:AR
0019.wav:AR
0020.wav:FR
0021.wav:AR
0022.wav:AR
0023.wav:FR
0024.wav:FR
0025.wav:AR
0026.wav:AR
0027.wav:AR
0028.wav:AR
0029.wav:AR
0030.wav:AR
0031.wav:AR
0032.wav:AR
0033.wav:AR
0034.wav:AR
0035.wav:EN
0036.wav:AR
0037.wav:AR
0038.wav:AR
0039.wav:AR
0040.wav:AR
0041.wav:AR
0042.wav:EN
0043.wav:AR
0044.wav:FR
0045.wav:JP
0046.wav:FR
0047.wav:AR
0048.wav:AR
0049.wav:JP
0050.wav:AR
0051.wav:AR
0052.wav:AR
0053.wav:AR
0054.wav:AR
0055.wav:AR
0056.wav:AR
0057.wav:AR
0058.wav:AR
0059.wav:AR
0060.wav:AR
0061.wav:FR
0062.wav:AR
0063.wav:AR
0064.wav:FR
0065.wav:FR
0066.wav:AR
0067.wav:AR
0068.wav:AR
0069.wav:EN
0070.wav:AR
0071.wav:EN
0072.wav:EN
0073.wav:AR
0074.wav:FR
0075.wav:AR
0076.wav:EN
0077.wav:FR
0078.wav:AR
0079.wav:AR
0080.wav:AR
0081.wav:AR
0082.wav:FR
0083

## Comparaison des performances

In [21]:
# Comparaison des performances
models = {
    "GNB": (best_gnb, y_pred_gnb),
    #"SVM": (best_svm, y_pred_svm),
    "RF": (best_rf, y_pred_rf),
    "MLP": (best_mlp, y_pred_mlp)
}

for name, (model, predictions) in models.items():
    print(f"Performance of {name}:")
    print(f"  Accuracy: {accuracy_score(y_dev, predictions):.4f}")
    print(f"  F1-Score: {f1_score(y_dev, predictions, average='weighted'):.4f}")
    print()


Performance of GNB:
  Accuracy: 0.3729
  F1-Score: 0.3700

Performance of RF:
  Accuracy: 0.6864
  F1-Score: 0.6940

Performance of MLP:
  Accuracy: 0.4068
  F1-Score: 0.3457



## Modèle Dummy pour comparaison

In [19]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score

# Différentes stratégies
strategies = ["most_frequent", "stratified", "uniform"]
results = {}

for strategy in strategies:
    dummy = DummyClassifier(strategy=strategy, random_state=42)
    dummy.fit(x_train, y_train)
    y_pred_dummy = dummy.predict(x_dev)
    
    accuracy = accuracy_score(y_dev, y_pred_dummy)
    f1 = f1_score(y_dev, y_pred_dummy, average='weighted')
    
    results[strategy] = {"Accuracy": accuracy, "F1-Score": f1}
    print(f"Strategy: {strategy}")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print("-" * 30)

# Afficher le meilleur score
best_strategy = max(results, key=lambda x: results[x]["Accuracy"])
print(f"Best Dummy Strategy: {best_strategy} with Accuracy {results[best_strategy]['Accuracy']:.4f}")


Strategy: most_frequent
  Accuracy: 0.1780
  F1-Score: 0.0538
------------------------------
Strategy: stratified
  Accuracy: 0.2881
  F1-Score: 0.2987
------------------------------
Strategy: uniform
  Accuracy: 0.2797
  F1-Score: 0.2696
------------------------------
Best Dummy Strategy: stratified with Accuracy 0.2881


In [20]:
#set data_dir to the directory of your data files
data_dir= "Test_Set/"

#Change below file name:
ourputfile_name="BETCHEM_WARREN_Dimmy_2.0v1"

# Read file info file to get the list of audio files and their labels
file_list=[]
label_list=[]
with open(data_dir+"Info.csv", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # The first column contains the file name
        file_list.append(row[0])

lang_dic={'EN':0,'FR':1,'AR':2,'JP':3}
class2lang_dic={0:"EN",1:"FR",2:"AR",3:"JP"}
with open(data_dir+f"{ourputfile_name}.csv",'w') as file:
    file.write(f"ID,Label\n")
for test_sample in file_list[1:]:
    test_sample_feature=feature_extractor_1(data_dir+test_sample)
    predicted=class2lang_dic[dummy.predict([test_sample_feature])[0]]
    print(f'{test_sample}:{predicted}')
    # save the predicted output in Output_evaluation.txt
    with open(data_dir+f"{ourputfile_name}.csv",'a+') as file:
        file.write(f"{test_sample},{predicted}\n")

0000.wav:AR
0001.wav:AR
0002.wav:AR
0003.wav:AR
0004.wav:AR
0005.wav:AR
0006.wav:AR
0007.wav:AR
0008.wav:AR
0009.wav:AR
0010.wav:AR
0011.wav:AR
0012.wav:AR
0013.wav:AR
0014.wav:AR
0015.wav:AR
0016.wav:AR
0017.wav:AR
0018.wav:AR
0019.wav:AR
0020.wav:AR
0021.wav:AR
0022.wav:AR
0023.wav:AR
0024.wav:AR
0025.wav:AR
0026.wav:AR
0027.wav:AR
0028.wav:AR
0029.wav:AR
0030.wav:AR
0031.wav:AR
0032.wav:AR
0033.wav:AR
0034.wav:AR
0035.wav:AR
0036.wav:AR
0037.wav:AR
0038.wav:AR
0039.wav:AR
0040.wav:AR
0041.wav:AR
0042.wav:AR
0043.wav:AR
0044.wav:AR
0045.wav:AR
0046.wav:AR
0047.wav:AR
0048.wav:AR
0049.wav:AR
0050.wav:AR
0051.wav:AR
0052.wav:AR
0053.wav:AR
0054.wav:AR
0055.wav:AR
0056.wav:AR
0057.wav:AR
0058.wav:AR
0059.wav:AR
0060.wav:AR
0061.wav:AR
0062.wav:AR
0063.wav:AR
0064.wav:AR
0065.wav:AR
0066.wav:AR
0067.wav:AR
0068.wav:AR
0069.wav:AR
0070.wav:AR
0071.wav:AR
0072.wav:AR
0073.wav:AR
0074.wav:AR
0075.wav:AR
0076.wav:AR
0077.wav:AR
0078.wav:AR
0079.wav:AR
0080.wav:AR
0081.wav:AR
0082.wav:AR
0083

# Intervalle de confiance

In [25]:
# Fonction pour calculer l'intervalle de confiance
def bootstrap_confidence_interval(model, x_test, y_test, metric, n_bootstrap=1000, alpha=0.05):
    """
    Calcule un intervalle de confiance pour une métrique donnée en utilisant le bootstrap.

    :param model: Modèle à évaluer
    :param x_test: Données de test
    :param y_test: Labels de test
    :param metric: Fonction de la métrique (par exemple, accuracy_score ou f1_score)
    :param n_bootstrap: Nombre d'itérations bootstrap
    :param alpha: Niveau de confiance (par défaut, 95 %)
    :return: Intervalle de confiance (borne inférieure, borne supérieure)
    """
    scores = []

    # Générer des échantillons bootstrap et calculer la métrique pour chacun
    for _ in range(n_bootstrap):
        indices = resample(np.arange(len(x_test)), replace=True)
        x_sample = x_test[indices]
        y_sample = y_test[indices]
        y_pred = model.predict(x_sample)
        scores.append(metric(y_sample, y_pred))
    
    # Calculer les bornes de l'intervalle de confiance
    lower_bound = np.percentile(scores, 100 * (alpha / 2))
    upper_bound = np.percentile(scores, 100 * (1 - alpha / 2))
    return lower_bound, upper_bound

# Modèles à tester
models = {
    "GNB": best_gnb,
    #"SVM": best_svm,
    "Random Forest": best_rf,
    "MLP": best_mlp
}

# Calcul des intervalles de confiance pour chaque modèle
for model_name, model in models.items():
    x_dev = np.array(x_dev)
    y_dev = np.array(y_dev)
    accuracy_interval = bootstrap_confidence_interval(model, x_dev, y_dev, accuracy_score)
    f1_interval = bootstrap_confidence_interval(model, x_dev, y_dev, lambda y_true, y_pred: f1_score(y_true, y_pred, average='weighted'))
    
    print(f"Modèle : {model_name}")
    print(f"  Intervalle de confiance (95 %) pour l'exactitude : {accuracy_interval}")
    print(f"  Intervalle de confiance (95 %) pour le F1-Score : {f1_interval}")
    print("-" * 40)


Modèle : GNB
  Intervalle de confiance (95 %) pour l'exactitude : (0.288135593220339, 0.4576271186440678)
  Intervalle de confiance (95 %) pour le F1-Score : (0.2839743291687461, 0.47138238222261575)
----------------------------------------
Modèle : Random Forest
  Intervalle de confiance (95 %) pour l'exactitude : (0.6016949152542372, 0.7627118644067796)
  Intervalle de confiance (95 %) pour le F1-Score : (0.6100609829719781, 0.7759240906862226)
----------------------------------------
Modèle : MLP
  Intervalle de confiance (95 %) pour l'exactitude : (0.3135593220338983, 0.5)
  Intervalle de confiance (95 %) pour le F1-Score : (0.2438425279129819, 0.4410121411604957)
----------------------------------------


# Utilisation de PCA


In [27]:
# Initialisation de PCA pour conserver 95 % de la variance
pca = PCA(n_components=0.95, random_state=42)

# Ajustement de PCA sur les données d'entraînement et transformation des données
x_train_pca = pca.fit_transform(x_train)
x_dev_pca = pca.transform(x_dev)

# Affichage du nombre de composantes retenues
print(f"Nombre de composantes après PCA : {pca.n_components_}")


Nombre de composantes après PCA : 1


## RandomForest avec PCA

In [28]:
rf_pca = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_pca.fit(x_train_pca, y_train)
y_pred_rf_pca = rf_pca.predict(x_dev_pca)

# Évaluation
accuracy_rf_pca = accuracy_score(y_dev, y_pred_rf_pca)
f1_rf_pca = f1_score(y_dev, y_pred_rf_pca, average='weighted')
print(f"Random Forest avec PCA - Accuracy : {accuracy_rf_pca:.4f}, F1-Score : {f1_rf_pca:.4f}")


Random Forest avec PCA - Accuracy : 0.2881, F1-Score : 0.2902


In [29]:
def bootstrap_confidence_interval_with_pca(model, pca, x_test, y_test, metric, n_bootstrap=1000, alpha=0.05):
    """
    Calcule un intervalle de confiance pour une métrique donnée en utilisant le bootstrap avec PCA.

    :param model: Modèle à évaluer
    :param pca: Modèle PCA déjà ajusté
    :param x_test: Données de test
    :param y_test: Labels de test
    :param metric: Fonction de la métrique
    :param n_bootstrap: Nombre d'itérations bootstrap
    :param alpha: Niveau de confiance
    :return: Intervalle de confiance (borne inférieure, borne supérieure)
    """
    scores = []

    for _ in range(n_bootstrap):
        indices = resample(np.arange(len(x_test)), replace=True)
        x_sample = pca.transform(x_test[indices])  # Appliquer PCA
        y_sample = y_test[indices]
        y_pred = model.predict(x_sample)
        scores.append(metric(y_sample, y_pred))

    lower_bound = np.percentile(scores, 100 * (alpha / 2))
    upper_bound = np.percentile(scores, 100 * (1 - alpha / 2))
    return lower_bound, upper_bound

# Exemple d'utilisation pour Random Forest avec PCA
accuracy_interval_rf_pca = bootstrap_confidence_interval_with_pca(rf_pca, pca, x_dev, y_dev, accuracy_score)
print(f"Intervalle de confiance pour Random Forest avec PCA (Accuracy) : {accuracy_interval_rf_pca}")


Intervalle de confiance pour Random Forest avec PCA (Accuracy) : (0.211864406779661, 0.3728813559322034)


In [31]:
# Résultats avec et sans PCA
results = {
    #"Random Forest (sans PCA)": {"Accuracy": accuracy_rf, "F1-Score": f1_rf},
    "Random Forest (avec PCA)": {"Accuracy": accuracy_rf_pca, "F1-Score": f1_rf_pca},
}

for model, metrics in results.items():
    print(f"{model} - Accuracy: {metrics['Accuracy']:.4f}, F1-Score: {metrics['F1-Score']:.4f}")


Random Forest (avec PCA) - Accuracy: 0.2881, F1-Score: 0.2902


## SVM avec PCA

In [None]:
# SVM avec PCA
svm_pca = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)
svm_pca.fit(x_train_pca, y_train)
y_pred_svm_pca = svm_pca.predict(x_dev_pca)

# Évaluation
accuracy_svm_pca = accuracy_score(y_dev, y_pred_svm_pca)
f1_svm_pca = f1_score(y_dev, y_pred_svm_pca, average='weighted')
print(f"SVM avec PCA - Accuracy : {accuracy_svm_pca:.4f}, F1-Score : {f1_svm_pca:.4f}")


## MLP avec PCA

In [32]:
# MLP avec PCA
mlp_pca = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp_pca.fit(x_train_pca, y_train)
y_pred_mlp_pca = mlp_pca.predict(x_dev_pca)

# Évaluation
accuracy_mlp_pca = accuracy_score(y_dev, y_pred_mlp_pca)
f1_mlp_pca = f1_score(y_dev, y_pred_mlp_pca, average='weighted')
print(f"MLP avec PCA - Accuracy : {accuracy_mlp_pca:.4f}, F1-Score : {f1_mlp_pca:.4f}")


MLP avec PCA - Accuracy : 0.2373, F1-Score : 0.0910


## GNB avec PCA

In [33]:
# GNB avec PCA
gnb_pca = GaussianNB()
gnb_pca.fit(x_train_pca, y_train)
y_pred_gnb_pca = gnb_pca.predict(x_dev_pca)

# Évaluation
accuracy_gnb_pca = accuracy_score(y_dev, y_pred_gnb_pca)
f1_gnb_pca = f1_score(y_dev, y_pred_gnb_pca, average='weighted')
print(f"GNB avec PCA - Accuracy : {accuracy_gnb_pca:.4f}, F1-Score : {f1_gnb_pca:.4f}")


GNB avec PCA - Accuracy : 0.2627, F1-Score : 0.2131


In [37]:
# Résultats avec et sans PCA
results = {
    "GNB": {#"Sans PCA": {"Accuracy": accuracy_gnb, "F1-Score": f1_gnb},
            "Avec PCA": {"Accuracy": accuracy_gnb_pca, "F1-Score": f1_gnb_pca}},
    #"SVM": {"Sans PCA": {"Accuracy": accuracy_svm, "F1-Score": f1_svm},
            #"Avec PCA": {"Accuracy": accuracy_svm_pca, "F1-Score": f1_svm_pca}},
    "RF": {#"Sans PCA": {"Accuracy": accuracy_rf, "F1-Score": f1_rf},
           "Avec PCA": {"Accuracy": accuracy_rf_pca, "F1-Score": f1_rf_pca}},
    "MLP": {#"Sans PCA": {"Accuracy": accuracy_mlp, "F1-Score": f1_mlp},
            "Avec PCA": {"Accuracy": accuracy_mlp_pca, "F1-Score": f1_mlp_pca}},
}

# Afficher les résultats
for model, metrics in results.items():
    print(f"\nModèle : {model}")
    for condition, values in metrics.items():
        print(f"  {condition} - Accuracy : {values['Accuracy']:.4f}, F1-Score : {values['F1-Score']:.4f}")



Modèle : GNB
  Avec PCA - Accuracy : 0.2627, F1-Score : 0.2131

Modèle : RF
  Avec PCA - Accuracy : 0.2881, F1-Score : 0.2902

Modèle : MLP
  Avec PCA - Accuracy : 0.2373, F1-Score : 0.0910


# Test sur chaque modèle pour chaque hyperparamètres

In [15]:
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import time

def bootstrap_confidence_interval(model, x_test, y_test, metric, n_bootstrap=1000, alpha=0.05):
    scores = []
    x_test = np.array(x_test)  # Conversion en tableau NumPy
    y_test = np.array(y_test)  # Conversion en tableau NumPy

    for _ in range(n_bootstrap):
        indices = resample(np.arange(len(x_test)), replace=True)
        x_sample = x_test[indices]
        y_sample = y_test[indices]
        y_pred = model.predict(x_sample)
        score = metric(y_sample, y_pred)
        scores.append(score)
    
    lower_bound = np.percentile(scores, alpha / 2 * 100)
    upper_bound = np.percentile(scores, (1 - alpha / 2) * 100)
    return (lower_bound, upper_bound)

def evaluate_model(model, x_train, y_train, x_dev, y_dev, metric_func, additional_params=None):
    if additional_params:
        model.set_params(**additional_params)
    
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_dev = np.array(x_dev)
    y_dev = np.array(y_dev)

    start_time = time.time()
    model.fit(x_train, y_train)
    train_time = time.time() - start_time

    y_pred = model.predict(x_dev)
    accuracy = accuracy_score(y_dev, y_pred)
    f1 = f1_score(y_dev, y_pred, average='weighted')
    confidence_interval = bootstrap_confidence_interval(model, x_dev, y_dev, metric_func)

    return {
        "Accuracy": accuracy,
        "F1-Score": f1,
        "Time (s)": train_time,
        "Confidence Interval": confidence_interval,
    }


In [16]:
from sklearn.utils import resample
import numpy as np

def bootstrap_confidence_interval(model, x_test, y_test, metric, n_bootstrap=1000, alpha=0.05):
    """
    Calcule un intervalle de confiance pour une métrique donnée en utilisant le bootstrap.
    
    :param model: Le modèle entraîné
    :param x_test: Les données de test
    :param y_test: Les labels de test
    :param metric: Fonction pour calculer la métrique (par ex. accuracy_score)
    :param n_bootstrap: Nombre d'échantillons bootstrap
    :param alpha: Niveau de confiance (par défaut 95%)
    :return: Tuple contenant les bornes inférieure et supérieure de l'intervalle de confiance
    """
    scores = []
    for _ in range(n_bootstrap):
        # Générer des indices bootstrap
        indices = resample(np.arange(len(x_test)), replace=True)
        x_sample = x_test[indices]
        y_sample = y_test[indices]
        # Prédire et calculer la métrique
        y_pred = model.predict(x_sample)
        score = metric(y_sample, y_pred)
        scores.append(score)
    
    # Calculer les percentiles pour l'intervalle de confiance
    lower_bound = np.percentile(scores, alpha / 2 * 100)
    upper_bound = np.percentile(scores, (1 - alpha / 2) * 100)
    return (lower_bound, upper_bound)


In [17]:
# Hyperparamètres à tester
rf_params = [{"n_estimators": 50, "max_depth": 10}, {"n_estimators": 100, "max_depth": 15}]
svm_params = [{"kernel": "poly", "C": 1}, {"kernel": "rbf", "C": 1}, {"kernel": "rbf", "C": 10},{"kernel": "poly", "C": 10},{"kernel": "sigmoid", "C": 1},{"kernel": "sigmoid", "C": 10}]
mlp_params = [{"hidden_layer_sizes": (50,), "max_iter": 300}, {"hidden_layer_sizes": (100,), "max_iter": 500}]
gnb_params = [{}]  # GNB n'a pas de paramètres significatifs à varier


In [18]:
# Initialiser les modèles
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

models_params = {
    "GaussianNB": (GaussianNB(), gnb_params),
    "SVM": (SVC(random_state=42), svm_params),
    "RandomForest": (RandomForestClassifier(random_state=42), rf_params),
    "MLP": (MLPClassifier(random_state=42), mlp_params),
}

# Résultats
results = {}

for model_name, (model, param_list) in models_params.items():
    print(f"Entraînement de {model_name}...")
    results[model_name] = []
    for params in param_list:
        result = evaluate_model(model, x_train, y_train, x_dev, y_dev, accuracy_score, params)
        result["Hyperparameters"] = params
        results[model_name].append(result)


Entraînement de GaussianNB...


TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:
import pandas as pd

for model_name, model_results in results.items():
    print(f"\nRésultats pour {model_name} :\n")
    df = pd.DataFrame(model_results)
    print(df[["Hyperparameters", "Accuracy", "F1-Score", "Time (s)", "Confidence Interval"]])
