In [1]:
import librosa

In [2]:
import numpy as np
def feature_extractor_1(audio_file_dir):

    #load the audio files
    x,freq = librosa.load(audio_file_dir,sr=16000)
    #extract 20 MFCCs
    mfcc=librosa.feature.mfcc(y=x,sr=freq,n_mfcc=20)
    #calculate the mean and variance of each MFFC 
    mean_mfccs=np.mean(mfcc,axis=1)
    var_mfccs=np.var(mfcc,axis=1)
    #return mean and variance as the audio file feature 
    return list(mean_mfccs)+list(var_mfccs)

In [3]:
def feature_extractor_2(audio_file_dir):

    #load the audio files
    x,freq = librosa.load(audio_file_dir,sr=16000)
    # trim the first 5 seconds (Sequence Truncation)
    length_of_5seconds=5*16000
    x_5sec=x[:length_of_5seconds]
    # extract 20 MFCCs
    mfccs_5sec=librosa.feature.mfcc(y=x_5sec,sr=freq,n_mfcc=20)
    # return mfcc of the first 5 sec as the audio file feature
    return mfccs_5sec

In [4]:
import csv

#set data_dir to the directory of your data files
data_dir="Dataset/"

# Read file info file to get the list of audio files and their labels
file_list=[]
label_list=[]
with open(data_dir+"Info.txt", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # The first column contains the file name
        file_list.append(row[0])
        # The last column contains the lable (language)
        label_list.append(row[-1]) 
        
        
# create a dictionary for labels
lang_dic={'EN':0,'FR':1,'AR':2,'JP':3}

# create a list of extracted feature (MFCC) for files
x_data=[]

for audio_file in file_list:
    file_feature = feature_extractor_1(data_dir+audio_file)
    #add extracted feature to dataset 
    x_data.append(file_feature)

# create a list of labels for files
y_data=[]
for lang_label in label_list:
    #convert the label to a value in {0,1,2,3} as the class label
    y_data.append(lang_dic[lang_label])

  "cipher": algorithms.TripleDES,
  "class": algorithms.Blowfish,
  "class": algorithms.TripleDES,


In [6]:
# Utilisation pour generer les caractésistiques du RandomForest sans les tableaux et les graphs
### 3. Shuffle your data

import random

# shuffle two lists
temp_list = list(zip(x_data, y_data))
random.shuffle(temp_list)
x_data, y_data = zip(*temp_list)



**Random Forest Model**

In [8]:


import numpy as np
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
import seaborn as sns
import pandas as pd

# Supposons que x_data est une liste de listes (chaque élément est un MFCC)
# Convertir les éléments de x_data en tableaux NumPy
x_data = [np.array(mfcc) for mfcc in x_data]

x_data_flat = [mfcc.flatten() for mfcc in x_data] 

X_train, X_test, y_train, y_test = train_test_split(x_data_flat, y_data, test_size=0.1, random_state=42) 

# Utilisation des meilleurs paramètres pour RandomForest
best_params = {'bootstrap': False, 'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 100}
clf = RandomForestClassifier(**best_params) 

clf.fit(X_train, y_train) 

# Prédiction sur l'ensemble d'entraînement 
y_train_pred = clf.predict(X_train) 
train_accuracy = accuracy_score(y_train, y_train_pred) 

# Prédiction sur l'ensemble de test 
y_test_pred = clf.predict(X_test) 
test_accuracy = accuracy_score(y_test, y_test_pred) 

print("Accurancy", clf.score(x_data_flat, y_data)) 
print("Training Accuracy:", train_accuracy) 
print("Test Accuracy:", test_accuracy)



Accurancy 0.9609507640067911
Training Accuracy: 0.9943396226415094
Test Accuracy: 0.6610169491525424


In [9]:

### 5. Have you used different data for train and test?


from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Définir les grilles de paramètres pour Random Forest
param_grid_rf = {
    'n_estimators': [10, 100, 1000],
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Créer des instances des classificateurs
#clf = RandomForestClassifier(n_estimators=10, max_depth=2, max_features='sqrt', random_state=42)
 
    
# Utilisation de 50% des caractéristiques
rf = RandomForestClassifier(random_state=42)



# Recherche d'hyperparamètres pour Random Forest
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)



best_rf=grid_search_rf.best_estimator_
# Afficher les meilleurs paramètres et scores
print("Best parameters for RandomForest:", grid_search_rf.best_params_)
print("Best cross-validation accuracy for RandomForest:", grid_search_rf.best_score_)


60 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Utilisateur\AppData\Roaming\Python\Python39\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Utilisateur\AppData\Roaming\Python\Python39\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "C:\Users\Utilisateur\AppData\Roaming\Python\Python39\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Utilisateur\AppData\Roaming\Python\Python39\site-packages\sklearn\u

Best parameters for RandomForest: {'bootstrap': False, 'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 1000}
Best cross-validation accuracy for RandomForest: 0.6698113207547169


In [10]:
#set data_dir to the directory of your data files
data_dir= "Test_Set/"

#Change below file name:
ourputfile_name="YONKOUA_INS_NAWEL_RandomForestClassifier_V2"

# Read file info file to get the list of audio files and their labels
file_list=[]
label_list=[]
with open(data_dir+"Info.csv", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # The first column contains the file name
        file_list.append(row[0])

lang_dic={'EN':0,'FR':1,'AR':2,'JP':3}
class2lang_dic={0:"EN",1:"FR",2:"AR",3:"JP"}
with open(data_dir+f"{ourputfile_name}.csv",'w') as file:
    file.write(f"ID,Label\n")
for test_sample in file_list[1:]:
    test_sample_feature=feature_extractor_1(data_dir+test_sample)
    predicted=class2lang_dic[best_rf.predict([test_sample_feature])[0]]
    print(f'{test_sample}:{predicted}')
    # save the predicted output in Output_evaluation.txt
    with open(data_dir+f"{ourputfile_name}.csv",'a+') as file:
        file.write(f"{test_sample},{predicted}\n")

0000.wav:JP
0001.wav:EN
0002.wav:FR
0003.wav:AR
0004.wav:EN
0005.wav:AR
0006.wav:FR
0007.wav:JP
0008.wav:JP
0009.wav:JP
0010.wav:EN
0011.wav:AR
0012.wav:JP
0013.wav:JP
0014.wav:FR
0015.wav:JP
0016.wav:JP
0017.wav:FR
0018.wav:AR
0019.wav:AR
0020.wav:FR
0021.wav:JP
0022.wav:JP
0023.wav:EN
0024.wav:JP
0025.wav:AR
0026.wav:EN
0027.wav:JP
0028.wav:AR
0029.wav:AR
0030.wav:EN
0031.wav:EN
0032.wav:FR
0033.wav:FR
0034.wav:AR
0035.wav:EN
0036.wav:FR
0037.wav:JP
0038.wav:JP
0039.wav:AR
0040.wav:FR
0041.wav:AR
0042.wav:EN
0043.wav:FR
0044.wav:EN
0045.wav:JP
0046.wav:JP
0047.wav:JP
0048.wav:JP
0049.wav:JP
0050.wav:FR
0051.wav:EN
0052.wav:AR
0053.wav:EN
0054.wav:JP
0055.wav:FR
0056.wav:AR
0057.wav:EN
0058.wav:FR
0059.wav:EN
0060.wav:EN
0061.wav:FR
0062.wav:AR
0063.wav:AR
0064.wav:FR
0065.wav:FR
0066.wav:FR
0067.wav:EN
0068.wav:JP
0069.wav:EN
0070.wav:EN
0071.wav:AR
0072.wav:EN
0073.wav:EN
0074.wav:FR
0075.wav:EN
0076.wav:FR
0077.wav:FR
0078.wav:JP
0079.wav:EN
0080.wav:JP
0081.wav:FR
0082.wav:AR
0083

0684.wav:FR
0685.wav:AR
0686.wav:EN
0687.wav:EN
0688.wav:AR
0689.wav:FR
0690.wav:FR
0691.wav:EN
0692.wav:AR
0693.wav:FR
0694.wav:FR
0695.wav:EN
0696.wav:JP
0697.wav:FR
0698.wav:AR
0699.wav:JP
0700.wav:FR
0701.wav:JP
0702.wav:EN
0703.wav:JP
0704.wav:AR
0705.wav:JP
0706.wav:JP
0707.wav:EN
0708.wav:EN
0709.wav:JP
0710.wav:FR
0711.wav:FR
0712.wav:FR
0713.wav:JP
0714.wav:JP
0715.wav:AR
0716.wav:AR
0717.wav:FR
0718.wav:AR
0719.wav:EN
0720.wav:AR
0721.wav:JP
0722.wav:EN
0723.wav:JP
0724.wav:EN
0725.wav:JP
0726.wav:JP
0727.wav:AR
0728.wav:FR
0729.wav:FR
0730.wav:FR
0731.wav:EN
0732.wav:JP
0733.wav:EN
0734.wav:AR
0735.wav:FR
0736.wav:EN
0737.wav:EN
0738.wav:FR
0739.wav:FR
0740.wav:EN
0741.wav:EN
0742.wav:FR
0743.wav:FR
0744.wav:EN
0745.wav:JP
0746.wav:FR
0747.wav:AR
0748.wav:JP
0749.wav:FR
0750.wav:JP
0751.wav:EN
0752.wav:AR
0753.wav:AR
0754.wav:EN
0755.wav:EN
0756.wav:JP
0757.wav:JP
0758.wav:EN
0759.wav:AR
0760.wav:FR
0761.wav:JP
0762.wav:AR
0763.wav:JP
0764.wav:FR
0765.wav:FR
0766.wav:FR
0767

**MLP model**

In [7]:
import random
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Supposons que x_data est une liste de listes (chaque élément est un MFCC)
# Shuffle des deux listes
temp_list = list(zip(x_data, y_data))
random.shuffle(temp_list)
x_data, y_data = zip(*temp_list)

# Convertir les éléments de x_data en tableaux NumPy
x_data = [np.array(mfcc) for mfcc in x_data]

# Aplatir les données
x_data_flat = [mfcc.flatten() for mfcc in x_data] 

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(x_data_flat, y_data, test_size=0.1, random_state=42) 

# Utilisation des meilleurs paramètres pour MLP
best_params = {'activation': 'tanh', 'hidden_layer_sizes': (1000, 100), 'solver': 'adam'}
clf = MLPClassifier(**best_params, max_iter=500, random_state=42)
clf.fit(X_train, y_train)

# Prédiction sur l'ensemble d'entraînement 
y_train_pred = clf.predict(X_train) 
train_accuracy = accuracy_score(y_train, y_train_pred) 

# Prédiction sur l'ensemble de test 
y_test_pred = clf.predict(X_test) 
test_accuracy = accuracy_score(y_test, y_test_pred) 

# Affichage des résultats
results_df = pd.DataFrame({
    "Métrique": ["Accuracy", "Training Accuracy", "Test Accuracy"],
    "Valeur": [clf.score(x_data_flat, y_data), train_accuracy, test_accuracy]
})

print("Résultats:")
print(results_df)

Résultats:
            Métrique    Valeur
0           Accuracy  0.410866
1  Training Accuracy  0.416981
2      Test Accuracy  0.355932


In [9]:
### 5. Have you used different data for train and test?


from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

# Définir les grilles de paramètres pour MLP
param_grid_mlp = {
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'hidden_layer_sizes': [(100, 10), (1000, 100)]
}

mlp = MLPClassifier(random_state=42)

# Recherche d'hyperparamètres pour MLP
grid_search_mlp = GridSearchCV(mlp, param_grid_mlp, cv=5, scoring='accuracy')
grid_search_mlp.fit(X_train, y_train)

print("Best parameters for MLP:", grid_search_mlp.best_params_)
print("Best cross-validation accuracy for MLP:", grid_search_mlp.best_score_)



Best parameters for MLP: {'activation': 'tanh', 'hidden_layer_sizes': (100, 10), 'solver': 'adam'}
Best cross-validation accuracy for MLP: 0.39622641509433965


In [10]:
import csv
import numpy as np
from joblib import load
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

#set data_dir to the directory of your data files
data_dir= "Test_Set/"

#Change below file name:
ourputfile_name="YONKOUA_INS_NAWEL_MLP_V1"

# Read file info file to get the list of audio files and their labels
file_list=[]
label_list=[]
with open(data_dir+"Info.csv", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # The first column contains the file name
        file_list.append(row[0])

lang_dic={'EN':0,'FR':1,'AR':2,'JP':3}
class2lang_dic={0:"EN",1:"FR",2:"AR",3:"JP"}
with open(data_dir+f"{ourputfile_name}.csv",'w') as file:
    file.write(f"ID,Label\n")
for test_sample in file_list[1:]:
    test_sample_feature=feature_extractor_1(data_dir+test_sample)
    predicted=class2lang_dic[grid_search_mlp.predict([test_sample_feature])[0]]
    print(f'{test_sample}:{predicted}')
    # save the predicted output in Output_evaluation.txt
    with open(data_dir+f"{ourputfile_name}.csv",'a+') as file:
        file.write(f"{test_sample},{predicted}\n")

0000.wav:FR
0001.wav:EN
0002.wav:AR
0003.wav:JP
0004.wav:EN
0005.wav:AR
0006.wav:JP
0007.wav:JP
0008.wav:AR
0009.wav:JP
0010.wav:AR
0011.wav:AR
0012.wav:JP
0013.wav:JP
0014.wav:JP
0015.wav:EN
0016.wav:FR
0017.wav:FR
0018.wav:AR
0019.wav:FR
0020.wav:EN
0021.wav:JP
0022.wav:JP
0023.wav:EN
0024.wav:AR
0025.wav:AR
0026.wav:EN
0027.wav:AR
0028.wav:AR
0029.wav:JP
0030.wav:EN
0031.wav:EN
0032.wav:JP
0033.wav:JP
0034.wav:EN
0035.wav:EN
0036.wav:JP
0037.wav:AR
0038.wav:AR
0039.wav:AR
0040.wav:FR
0041.wav:AR
0042.wav:AR
0043.wav:FR
0044.wav:FR
0045.wav:JP
0046.wav:JP
0047.wav:JP
0048.wav:JP
0049.wav:AR
0050.wav:JP
0051.wav:JP
0052.wav:JP
0053.wav:AR
0054.wav:FR
0055.wav:EN
0056.wav:AR
0057.wav:AR
0058.wav:JP
0059.wav:JP
0060.wav:EN
0061.wav:FR
0062.wav:EN
0063.wav:JP
0064.wav:JP
0065.wav:JP
0066.wav:JP
0067.wav:EN
0068.wav:JP
0069.wav:EN
0070.wav:AR
0071.wav:EN
0072.wav:EN
0073.wav:JP
0074.wav:FR
0075.wav:EN
0076.wav:AR
0077.wav:JP
0078.wav:JP
0079.wav:AR
0080.wav:AR
0081.wav:JP
0082.wav:AR
0083

# SVC

In [36]:
import csv

#set data_dir to the directory of your data files
data_dir= "Dataset/"

# Read file info file to get the list of audio files and their labels
file_list=[]
label_list=[]
with open(data_dir+"Info.txt", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # The first column contains the file name
        file_list.append(row[0])
        # The last column contains the lable (language)
        label_list.append(row[-1]) 
        
        
# create a dictionary for labels
lang_dic={'EN':0,'FR':1,'AR':2,'JP':3}

# create a list of extracted feature (MFCC) for files
x_data=[]

for audio_file in file_list:
    #file_feature = feature_extractor_2(data_dir+audio_file)
    file_feature = feature_extractor_1(data_dir+audio_file)
    #add extracted feature to dataset 
    x_data.append(file_feature)

# create a list of labels for files
y_data=[]
for lang_label in label_list:
    #convert the label to a value in {0,1,2,3} as the class label
    y_data.append(lang_dic[lang_label])

In [44]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# 1. Division du jeu de données
X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data,
    test_size=0.33,
    shuffle=True,
    stratify=y_data,
    random_state=42
)

# 2. Pipeline : StandardScaler + SVC
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

# 3. Grille d'hyperparamètres étendue
param_grid = {
    'svm__C': [0.01, 0.1, 1, 10, 100],
    'svm__kernel': ['linear', 'rbf'],
    'svm__gamma': ['scale', 'auto', 0.01, 0.001, 0.0001]
}

# 4. GridSearchCV avec validation croisée
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

# 5. Résultats
print(" Meilleurs hyperparamètres :", grid_search.best_params_)
print(" Score moyen en validation croisée :", grid_search.best_score_)

# 6. Évaluation sur le jeu de test
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print(" Accuracy sur le jeu de test :", accuracy_score(y_test, y_pred))
print("\n Rapport de classification :\n")
print(classification_report(y_test, y_pred, target_names=["EN", "FR", "AR", "JP"])) 

Fitting 5 folds for each of 50 candidates, totalling 250 fits
 Meilleurs hyperparamètres : {'svm__C': 10, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}
 Score moyen en validation croisée : 0.6268419344368712
 Accuracy sur le jeu de test : 0.676923076923077

 Rapport de classification :

              precision    recall  f1-score   support

          EN       0.66      0.65      0.65        48
          FR       0.74      0.64      0.69        50
          AR       0.67      0.69      0.68        49
          JP       0.65      0.73      0.69        48

    accuracy                           0.68       195
   macro avg       0.68      0.68      0.68       195
weighted avg       0.68      0.68      0.68       195



In [42]:
from joblib import dump
dump(grid_search.best_estimator_, "best_svm_pipeline.joblib")



['best_svm_pipeline.joblib']

In [54]:
import csv
import numpy as np
from joblib import load
#from feature_extractor import feature_extractor_1  # adapte si besoin

# Dossier contenant les fichiers de test et Info.csv
data_dir = "Test_Set/"
output_filename = "Nawel_TP2_SVM_Version2"

# Chargement du modèle entraîné (Pipeline)
model = load("best_svm_pipeline.joblib")

# Dictionnaires de correspondance
class2lang_dic = {0: "EN", 1: "FR", 2: "AR", 3: "JP"}

# Lecture du fichier Info.csv
file_list = []
with open(data_dir + "Info.csv", 'r') as f:
    reader = csv.reader(f)
    next(reader)  # Skip header
    for row in reader:
        file_list.append(row[0])  # nom du fichier audio

# Fichier de sortie
with open(data_dir + f"{output_filename}.csv", 'w') as f:
    f.write("ID,Label\n")

    for filename in file_list:
        filepath = data_dir + filename
        features = feature_extractor_1(filepath)  # → vecteur numpy
        features = np.array(features).reshape(1, -1)
        predicted_class = voting_clf.predict(features)[0]
        predicted_lang = class2lang_dic[predicted_class]

        print(f"{filename}: {predicted_lang}")
        f.write(f"{filename},{predicted_lang}\n")




0000.wav: JP
0001.wav: EN
0002.wav: FR
0003.wav: AR
0004.wav: FR
0005.wav: AR
0006.wav: AR
0007.wav: JP
0008.wav: EN
0009.wav: JP
0010.wav: AR
0011.wav: JP
0012.wav: AR
0013.wav: JP
0014.wav: FR
0015.wav: JP
0016.wav: AR
0017.wav: FR
0018.wav: AR
0019.wav: EN
0020.wav: FR
0021.wav: JP
0022.wav: JP
0023.wav: EN
0024.wav: EN
0025.wav: AR
0026.wav: EN
0027.wav: EN
0028.wav: AR
0029.wav: FR
0030.wav: EN
0031.wav: EN
0032.wav: EN
0033.wav: FR
0034.wav: AR
0035.wav: EN
0036.wav: FR
0037.wav: JP
0038.wav: JP
0039.wav: AR
0040.wav: FR
0041.wav: AR
0042.wav: EN
0043.wav: EN
0044.wav: FR
0045.wav: JP
0046.wav: JP
0047.wav: JP
0048.wav: JP
0049.wav: JP
0050.wav: FR
0051.wav: FR
0052.wav: AR
0053.wav: EN
0054.wav: EN
0055.wav: AR
0056.wav: AR
0057.wav: EN
0058.wav: JP
0059.wav: EN
0060.wav: EN
0061.wav: EN
0062.wav: JP
0063.wav: JP
0064.wav: FR
0065.wav: FR
0066.wav: FR
0067.wav: AR
0068.wav: JP
0069.wav: EN
0070.wav: EN
0071.wav: AR
0072.wav: EN
0073.wav: JP
0074.wav: FR
0075.wav: EN
0076.wav: AR

0633.wav: FR
0634.wav: JP
0635.wav: AR
0636.wav: EN
0637.wav: JP
0638.wav: EN
0639.wav: EN
0640.wav: JP
0641.wav: JP
0642.wav: FR
0643.wav: FR
0644.wav: EN
0645.wav: AR
0646.wav: EN
0647.wav: FR
0648.wav: JP
0649.wav: JP
0650.wav: EN
0651.wav: FR
0652.wav: EN
0653.wav: JP
0654.wav: JP
0655.wav: JP
0656.wav: FR
0657.wav: FR
0658.wav: JP
0659.wav: AR
0660.wav: FR
0661.wav: FR
0662.wav: AR
0663.wav: EN
0664.wav: AR
0665.wav: JP
0666.wav: JP
0667.wav: AR
0668.wav: EN
0669.wav: EN
0670.wav: AR
0671.wav: EN
0672.wav: EN
0673.wav: FR
0674.wav: FR
0675.wav: AR
0676.wav: JP
0677.wav: FR
0678.wav: EN
0679.wav: JP
0680.wav: JP
0681.wav: EN
0682.wav: JP
0683.wav: EN
0684.wav: JP
0685.wav: AR
0686.wav: AR
0687.wav: EN
0688.wav: AR
0689.wav: FR
0690.wav: FR
0691.wav: EN
0692.wav: AR
0693.wav: FR
0694.wav: FR
0695.wav: FR
0696.wav: JP
0697.wav: FR
0698.wav: AR
0699.wav: JP
0700.wav: FR
0701.wav: FR
0702.wav: EN
0703.wav: EN
0704.wav: JP
0705.wav: JP
0706.wav: JP
0707.wav: AR
0708.wav: EN
0709.wav: JP

# GNB

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score
x_train,x_test,y_train,y_test=train_test_split(x_data,y_data,test_size=0.2,shuffle=True,random_state=42)
gnb = GaussianNB(var_smoothing=1e-8)
pred_y=gnb.fit(x_train,y_train).predict(x_test)
print("Accuracy of Gaussian Naive Bayes:", accuracy_score(y_test, pred_y))

Accuracy of Gaussian Naive Bayes: 0.4152542372881356


In [36]:
from sklearn.model_selection import GridSearchCV


# Définir la grille d'hyperparamètres

param_grid = {'var_smoothing': np.logspace(-15, -3, num=25)}  # Plage plus large

gnb = GaussianNB()

grid_search = GridSearchCV(gnb, param_grid, cv=5, scoring='accuracy', n_jobs=-1,verbose=1)
grid_search.fit(x_train, y_train)

# Afficher les meilleurs paramètres
print("Meilleurs paramètres :", grid_search.best_params_)
print("Meilleure exactitude (validation croisée) :", grid_search.best_score_)

best_gnb = grid_search.best_estimator_
pred_y = best_gnb.predict(x_test)
accuracy = accuracy_score(y_test, pred_y)
print(f"Exactitude sur l'ensemble de validation : {accuracy:.4f}")



Fitting 5 folds for each of 25 candidates, totalling 125 fits
Meilleurs paramètres : {'var_smoothing': 3.162277660168379e-08}
Meilleure exactitude (validation croisée) : 0.4183426651735722
Exactitude sur l'ensemble de validation : 0.4576


In [37]:
#set data_dir to the directory of your data files
data_dir= "Test_Set/"

#Change below file name:
ourputfile_name="yonkoua_NAWEL_GNB_V3"

# Read file info file to get the list of audio files and their labels
file_list=[]
label_list=[]
with open(data_dir+"Info.csv", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # The first column contains the file name
        file_list.append(row[0])

lang_dic={'EN':0,'FR':1,'AR':2,'JP':3}
class2lang_dic={0:"EN",1:"FR",2:"AR",3:"JP"}
with open(data_dir+f"{ourputfile_name}.csv",'w') as file:
    file.write(f"ID,Label\n")
for test_sample in file_list[1:]:
    test_sample_feature=feature_extractor_1(data_dir+test_sample)
    predicted=class2lang_dic[best_gnb.predict([test_sample_feature])[0]]
    print(f'{test_sample}:{predicted}')
    # save the predicted output in Output_evaluation.txt
    with open(data_dir+f"{ourputfile_name}.csv",'a+') as file:
        file.write(f"{test_sample},{predicted}\n")

0000.wav:JP
0001.wav:FR
0002.wav:FR
0003.wav:JP
0004.wav:FR
0005.wav:EN
0006.wav:FR
0007.wav:FR
0008.wav:EN
0009.wav:FR
0010.wav:EN
0011.wav:FR
0012.wav:JP
0013.wav:JP
0014.wav:FR
0015.wav:JP
0016.wav:FR
0017.wav:FR
0018.wav:AR
0019.wav:EN
0020.wav:FR
0021.wav:AR
0022.wav:JP
0023.wav:EN
0024.wav:JP
0025.wav:AR
0026.wav:EN
0027.wav:AR
0028.wav:FR
0029.wav:FR
0030.wav:FR
0031.wav:EN
0032.wav:FR
0033.wav:FR
0034.wav:EN
0035.wav:FR
0036.wav:FR
0037.wav:JP
0038.wav:EN
0039.wav:FR
0040.wav:FR
0041.wav:FR
0042.wav:EN
0043.wav:FR
0044.wav:EN
0045.wav:JP
0046.wav:FR
0047.wav:EN
0048.wav:AR
0049.wav:JP
0050.wav:FR
0051.wav:EN
0052.wav:AR
0053.wav:FR
0054.wav:FR
0055.wav:EN
0056.wav:AR
0057.wav:EN
0058.wav:FR
0059.wav:EN
0060.wav:FR
0061.wav:JP
0062.wav:AR
0063.wav:AR
0064.wav:FR
0065.wav:FR
0066.wav:EN
0067.wav:FR
0068.wav:EN
0069.wav:AR
0070.wav:EN
0071.wav:FR
0072.wav:AR
0073.wav:AR
0074.wav:FR
0075.wav:EN
0076.wav:FR
0077.wav:FR
0078.wav:JP
0079.wav:EN
0080.wav:AR
0081.wav:FR
0082.wav:FR
0083

0686.wav:EN
0687.wav:EN
0688.wav:FR
0689.wav:FR
0690.wav:EN
0691.wav:FR
0692.wav:AR
0693.wav:FR
0694.wav:AR
0695.wav:EN
0696.wav:JP
0697.wav:FR
0698.wav:FR
0699.wav:FR
0700.wav:FR
0701.wav:FR
0702.wav:FR
0703.wav:JP
0704.wav:FR
0705.wav:EN
0706.wav:FR
0707.wav:AR
0708.wav:EN
0709.wav:JP
0710.wav:AR
0711.wav:FR
0712.wav:FR
0713.wav:AR
0714.wav:JP
0715.wav:EN
0716.wav:FR
0717.wav:FR
0718.wav:EN
0719.wav:FR
0720.wav:FR
0721.wav:JP
0722.wav:EN
0723.wav:AR
0724.wav:EN
0725.wav:FR
0726.wav:FR
0727.wav:FR
0728.wav:FR
0729.wav:FR
0730.wav:FR
0731.wav:FR
0732.wav:FR
0733.wav:FR
0734.wav:FR
0735.wav:FR
0736.wav:AR
0737.wav:EN
0738.wav:FR
0739.wav:FR
0740.wav:EN
0741.wav:EN
0742.wav:FR
0743.wav:FR
0744.wav:EN
0745.wav:EN
0746.wav:FR
0747.wav:FR
0748.wav:AR
0749.wav:FR
0750.wav:AR
0751.wav:FR
0752.wav:FR
0753.wav:FR
0754.wav:AR
0755.wav:AR
0756.wav:FR
0757.wav:EN
0758.wav:FR
0759.wav:EN
0760.wav:JP
0761.wav:JP
0762.wav:EN
0763.wav:JP
0764.wav:FR
0765.wav:FR
0766.wav:FR
0767.wav:JP
0768.wav:FR
0769

In [50]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score

# Différentes stratégies
strategies = ["most_frequent", "stratified", "uniform"]
results = {}

for strategy in strategies:
    dummy = DummyClassifier(strategy=strategy, random_state=42)
    dummy.fit(X_train, y_train)
    y_pred_dummy = dummy.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred_dummy)
    f1 = f1_score(y_test, y_pred_dummy, average='weighted')
    
    results[strategy] = {"Accuracy": accuracy, "F1-Score": f1}
    print(f"Strategy: {strategy}")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print("-" * 30)

# Afficher le meilleur score
best_strategy = max(results, key=lambda x: results[x]["Accuracy"])
print(f"Best Dummy Strategy: {best_strategy} with Accuracy {results[best_strategy]['Accuracy']:.4f}")

Strategy: most_frequent
  Accuracy: 0.2564
  F1-Score: 0.1047
------------------------------
Strategy: stratified
  Accuracy: 0.2462
  F1-Score: 0.2466
------------------------------
Strategy: uniform
  Accuracy: 0.2308
  F1-Score: 0.2303
------------------------------
Best Dummy Strategy: most_frequent with Accuracy 0.2564


In [52]:

#set data_dir to the directory of your data files
data_dir= "Test_Set/"

#Change below file name:
ourputfile_name="YONKOUA_INS_NAWEL_Dimmy_2.0v1"

# Read file info file to get the list of audio files and their labels
file_list=[]
label_list=[]
with open(data_dir+"Info.csv", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # The first column contains the file name
        file_list.append(row[0])

lang_dic={'EN':0,'FR':1,'AR':2,'JP':3}
class2lang_dic={0:"EN",1:"FR",2:"AR",3:"JP"}
with open(data_dir+f"{ourputfile_name}.csv",'w') as file:
    file.write(f"ID,Label\n")
for test_sample in file_list[1:]:
    test_sample_feature=feature_extractor_1(data_dir+test_sample)
    predicted=class2lang_dic[dummy.predict([test_sample_feature])[0]]
    print(f'{test_sample}:{predicted}')
    # save the predicted output in Output_evaluation.txt
    with open(data_dir+f"{ourputfile_name}.csv",'a+') as file:
        file.write(f"{test_sample},{predicted}\n")

0000.wav:AR
0001.wav:AR
0002.wav:AR
0003.wav:AR
0004.wav:AR
0005.wav:AR
0006.wav:AR
0007.wav:AR
0008.wav:AR
0009.wav:AR
0010.wav:AR
0011.wav:AR
0012.wav:AR
0013.wav:AR
0014.wav:AR
0015.wav:AR
0016.wav:AR
0017.wav:AR
0018.wav:AR
0019.wav:AR
0020.wav:AR
0021.wav:AR
0022.wav:AR
0023.wav:AR
0024.wav:AR
0025.wav:AR
0026.wav:AR
0027.wav:AR
0028.wav:AR
0029.wav:AR
0030.wav:AR
0031.wav:AR
0032.wav:AR
0033.wav:AR
0034.wav:AR
0035.wav:AR
0036.wav:AR
0037.wav:AR
0038.wav:AR
0039.wav:AR
0040.wav:AR
0041.wav:AR
0042.wav:AR
0043.wav:AR
0044.wav:AR
0045.wav:AR
0046.wav:AR
0047.wav:AR
0048.wav:AR
0049.wav:AR
0050.wav:AR
0051.wav:AR
0052.wav:AR
0053.wav:AR
0054.wav:AR
0055.wav:AR
0056.wav:AR
0057.wav:AR
0058.wav:AR
0059.wav:AR
0060.wav:AR
0061.wav:AR
0062.wav:AR
0063.wav:AR
0064.wav:AR
0065.wav:AR
0066.wav:AR
0067.wav:AR
0068.wav:AR
0069.wav:AR
0070.wav:AR
0071.wav:AR
0072.wav:AR
0073.wav:AR
0074.wav:AR
0075.wav:AR
0076.wav:AR
0077.wav:AR
0078.wav:AR
0079.wav:AR
0080.wav:AR
0081.wav:AR
0082.wav:AR
0083

0683.wav:AR
0684.wav:AR
0685.wav:AR
0686.wav:AR
0687.wav:AR
0688.wav:AR
0689.wav:AR
0690.wav:AR
0691.wav:AR
0692.wav:AR
0693.wav:AR
0694.wav:AR
0695.wav:AR
0696.wav:AR
0697.wav:AR
0698.wav:AR
0699.wav:AR
0700.wav:AR
0701.wav:AR
0702.wav:AR
0703.wav:AR
0704.wav:AR
0705.wav:AR
0706.wav:AR
0707.wav:AR
0708.wav:AR
0709.wav:AR
0710.wav:AR
0711.wav:AR
0712.wav:AR
0713.wav:AR
0714.wav:AR
0715.wav:AR
0716.wav:AR
0717.wav:AR
0718.wav:AR
0719.wav:AR
0720.wav:AR
0721.wav:AR
0722.wav:AR
0723.wav:AR
0724.wav:AR
0725.wav:AR
0726.wav:AR
0727.wav:AR
0728.wav:AR
0729.wav:AR
0730.wav:AR
0731.wav:AR
0732.wav:AR
0733.wav:AR
0734.wav:AR
0735.wav:AR
0736.wav:AR
0737.wav:AR
0738.wav:AR
0739.wav:AR
0740.wav:AR
0741.wav:AR
0742.wav:AR
0743.wav:AR
0744.wav:AR
0745.wav:AR
0746.wav:AR
0747.wav:AR
0748.wav:AR
0749.wav:AR
0750.wav:AR
0751.wav:AR
0752.wav:AR
0753.wav:AR
0754.wav:AR
0755.wav:AR
0756.wav:AR
0757.wav:AR
0758.wav:AR
0759.wav:AR
0760.wav:AR
0761.wav:AR
0762.wav:AR
0763.wav:AR
0764.wav:AR
0765.wav:AR
0766

In [53]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split des données
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=42)

# Définir les modèles optimisés
rf = RandomForestClassifier(n_estimators=300, max_depth=20, random_state=42)
svm = make_pipeline(StandardScaler(), SVC(C=10, kernel='rbf', probability=True, random_state=42))
mlp = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(100, 50), alpha=1e-4, max_iter=500, random_state=42))

# Créer le VotingClassifier (vote majoritaire)
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf),
        ('svm', svm),
        ('mlp', mlp)
    ],
    voting='hard'  # 'soft' si tu veux voter par probabilité (il faut alors probability=True dans SVC)
)

# Entraîner le VotingClassifier
voting_clf.fit(x_train, y_train)

# Prédictions sur le jeu test
y_pred = voting_clf.predict(x_test)

# Évaluer la précision
print("Accuracy VotingClassifier :", accuracy_score(y_test, y_pred))

Accuracy VotingClassifier : 0.6440677966101694
