# In [None]:
import pandas as pd
import numpy as np
import sklearn
from tensorflow.keras.utils import to_categorical
import seaborn as sns
import matplotlib.pyplot as plt
import IPython.display as ipd
import tensorflow as tf
import librosa
import os
import soundfile as sf
import torch, torchaudio
import torchaudio.functional as f
import re
from scipy.signal import resample
from sklearn.metrics import accuracy_score

from google.colab import drive

# Mount Google Drive so the dataset folder is reachable from the Colab runtime.
drive.mount('/content/drive')

# Dataset location; adjust it to match your own Drive folder structure.
# NOTE: this name shadows the built-in `dir`, but later code reads it, so it is kept.
dir = '/content/drive/My Drive/archive/recordings/recordings'

import os

# Every entry in the folder is treated as one recording.
audio_list = os.listdir(dir)
print('Dataset length:', len(audio_list))

# Sanity check that the listing worked: show the first five file names.
print('\nFirst few files:')
print(audio_list[:5])

# A label is the file name with its trailing "<digits>.mp3" removed.
labels = [re.sub(r'\d+\.mp3$', '', audio) for audio in audio_list]

# One row per recording: file name plus the label derived from it.
df = pd.DataFrame({'speech': audio_list, 'labels': labels})

df.head()

# Number of recordings per label, as a table and as a bar chart.
df['labels'].value_counts()
sns.countplot(x=df['labels'], data=df)

target_sample_rate = 44100  # Target sample rate for the audio


def extract_features(audio_file, n_fft=128, hop_length=32, n_mfcc=13):
    """Return the MFCC matrix for the first 5 seconds of an audio file.

    The file is read with soundfile, down-mixed to mono when it has several
    channels, resampled to the module-level ``target_sample_rate`` when its
    native rate differs, truncated to 5 seconds and passed to
    ``librosa.feature.mfcc``.

    Args:
        audio_file: Path (or file object) handed to ``sf.read``.
        n_fft: Accepted for backward compatibility; not forwarded to librosa.
        hop_length: Accepted for backward compatibility; not forwarded to
            librosa.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        The array produced by ``librosa.feature.mfcc`` (presumably shaped
        ``(n_mfcc, frames)``), un-normalised.
    """
    y, sr = sf.read(audio_file)

    # soundfile returns multi-channel audio as (frames, channels); the
    # slicing and MFCC steps below expect a 1-D signal, so average channels.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Bring every recording to the same sample rate before slicing by time.
    if sr != target_sample_rate:
        y = resample(y, int(len(y) * target_sample_rate / sr))
        sr = target_sample_rate

    # Keep at most the first 5 seconds (shorter files are used in full).
    clip = y[:target_sample_rate * 5]

    # NOTE: the previous version also computed a z-scored copy of the MFCCs
    # but returned the raw matrix. The raw return value is kept so the
    # downstream features do not change; the unused computation is dropped.
    return librosa.feature.mfcc(y=clip, sr=sr, n_mfcc=n_mfcc)


# (feature, label) pairs collected by the extraction loop. This must live at
# module level: it used to sit after the function's `return`, where it was
# unreachable and never defined.
data = []

# Build one (mfcc_matrix, label) pair per audio file.
# The list is initialised here so the loop never depends on an
# initialisation placed elsewhere in the file.
data = []

for audio in audio_list:
    # Full path to the recording inside the dataset folder
    audio_path = os.path.join(dir, audio)

    # MFCC matrix for this recording
    feature = extract_features(audio_path)

    # Label = file name without its extension and without any digits
    label = re.sub(r'\d+', '', os.path.splitext(audio)[0])

    data.append((feature, label))

# Per-recording MFCC dictionaries, filled by the next step. Initialised once,
# outside the loop, so it also exists when audio_list is empty.
separated_data = []
# Collapse every MFCC matrix to one number per coefficient (its mean over
# time) and collect the matching labels.
separated_data = []  # One {'MFCC_1': ..., 'MFCC_2': ...} dict per recording
label_arr = []       # Label of each recording, in the same order

# The loop variable is deliberately not called `f`: that name is the
# module-level alias for torchaudio.functional.
for feature, label in data:
    # Iterating the matrix yields one row per coefficient; keys are 1-based.
    separated_data.append(
        {f'MFCC_{i}': np.mean(row) for i, row in enumerate(feature, start=1)}
    )
    label_arr.append(label)

# One row per recording, one column per MFCC coefficient
df_new = pd.DataFrame(separated_data)

# Add the labels as a new column in the DataFrame
df_new['label'] = label_arr

# Display the first few rows of the DataFrame to check the data
df_new.head()

# Print the number of rows before removing missing values
print(len(df_new))

# Remove rows with missing (null) values from the DataFrame
df_cleaned = df_new.dropna()

# Print the number of rows after removing missing values, so the two counts
# can be compared (the original comment was cut off and printed nothing).
print(len(df_cleaned))

# Number of samples per accent label
accent_counts = df_cleaned['label'].value_counts()
print(accent_counts)

# Accent with the most samples, and how many samples it has
largest_accent = accent_counts.idxmax()
max_count = accent_counts.max()

# One DataFrame per retained accent; they are concatenated below.
oversampled_data = []

for accent, count in accent_counts.items():
    accent_rows = df_cleaned[df_cleaned['label'] == accent]

    if accent == largest_accent:
        # The largest group is kept exactly as it is.
        oversampled_data.append(accent_rows)
    elif count > 30:
        # Smaller groups with more than 30 samples are drawn with
        # replacement until they are as large as the largest group.
        oversampled_data.append(
            accent_rows.sample(n=max_count, replace=True, random_state=42)
        )
    # Accents with 30 samples or fewer are left out of the balanced set.

# Stack the per-accent frames into one balanced DataFrame.
balanced_df = pd.concat(oversampled_data)

# Randomise the row order (fixed seed) and renumber the index.
df_shuffled = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Peek at the result.
df_shuffled.head()
# Target column and feature columns of the balanced data.
y = df_shuffled["label"]
x = df_shuffled.drop('label', axis=1)

from sklearn.preprocessing import LabelEncoder

# Map every accent name to an integer code.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(y)

# Original label -> integer code, keyed in order of first appearance in y.
label_mapping = dict(zip(y, encoded_labels))

# Original labels, in the same order as the mapping.
class_label = []
print("Label mapping:")
for label, encoded_label in label_mapping.items():
    class_label.append(label)
    print(f"{label}: {encoded_label}")

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit

# Feature matrix: every column of the shuffled DataFrame except 'label'
feature_frame = df_shuffled.drop('label', axis=1)
X = np.array(feature_frame.values)

# Target labels as a NumPy array
y = df_shuffled['label'].values

# Encode the accent names as integers, then as one-hot rows
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_onehot = to_categorical(y_encoded)

# df_shuffled was oversampled with replacement, so it contains exact copies
# of the same recording. A plain random split would put copies of one
# recording in both the training and the test set and inflate the test
# score. Rows with identical feature values are therefore given the same
# group id, and the split keeps each group entirely on one side.
groups = (
    feature_frame
    .groupby(list(feature_frame.columns), sort=False)
    .ngroup()
    .to_numpy()
)

# Roughly 80% of the groups go to training and 20% to testing
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X, y_encoded, groups=groups))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y_onehot[train_idx], y_onehot[test_idx]




import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Conv1D, MaxPooling1D

# Function to build the CNN model for accent classification
def build_cnn_model(input_shape, num_classes=None):
    """Build and compile a two-block 1D CNN classifier.

    Args:
        input_shape: Shape of one input sample, forwarded to the first
            Conv1D layer. The caller in this file passes
            ``(number of features, 1)``.
        num_classes: Width of the softmax output layer. When omitted it is
            read from the module-level one-hot training labels
            (``y_train.shape[1]``), which is what this function always did
            before the parameter existed.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    if num_classes is None:
        # Backward-compatible fallback for callers that pass only input_shape.
        num_classes = y_train.shape[1]

    model = Sequential()

    # First convolution block: 32 filters of width 3, then halve the length
    model.add(Conv1D(32, kernel_size=3, activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))

    # Second convolution block: 64 filters of width 3, then halve again
    model.add(Conv1D(64, 3, activation='relu'))
    model.add(MaxPooling1D(2))

    # Flatten the pooled feature map into a vector for the dense layers
    model.add(Flatten())

    # Fully connected layer, with 50% dropout to reduce overfitting
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))

    # One softmax output per class
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Each sample is a vector of X_train.shape[1] features treated as a
# one-channel sequence: (number of features, 1).
input_shape = (X_train.shape[1], 1)

# Build the CNN model using the defined input shape
model = build_cnn_model(input_shape)

# Train for 150 epochs with a batch size of 32, printing one line per epoch.
# NOTE(review): the test split is also used as the validation set here.
history = model.fit(X_train, y_train, epochs=150, batch_size=32, validation_data=(X_test, y_test), verbose=2)

import matplotlib.pyplot as plt

# Per-epoch metrics recorded during training
metrics = history.history

# Two plots side by side: accuracy on the left, loss on the right
fig, axs = plt.subplots(1, 2, figsize=(12, 6))  # 1 row, 2 columns

# Accuracy, in percent. Each curve is plotted explicitly against
# history.epoch: plot(x, y1, y2) would draw y2 against its own implicit
# index instead of the epoch numbers.
axs[0].plot(history.epoch, 100 * np.array(metrics['accuracy']), label='accuracy')
axs[0].plot(history.epoch, 100 * np.array(metrics['val_accuracy']), label='val_accuracy')
axs[0].set_title("Accuracy CNN")
axs[0].set_ylim([0, 100])
axs[0].set_xlabel("Epochs")
axs[0].set_ylabel("Accuracy")
axs[0].legend()

# Loss
axs[1].plot(history.epoch, np.array(metrics['loss']), label='loss')
axs[1].plot(history.epoch, np.array(metrics['val_loss']), label='val_loss')
axs[1].set_title("Loss CNN")
axs[1].set_xlabel("Epochs")
axs[1].set_ylabel("Loss")
axs[1].legend()

# Display the plots
plt.tight_layout()
plt.show()
import numpy as np

# Predicted class probabilities for every test sample
y_pred_probs = model.predict(X_test)

# Predicted class = index of the highest probability
y_pred = np.argmax(y_pred_probs, axis=1)
# True class = index of the 1 in each one-hot row
y_true = np.argmax(y_test, axis=1)
from sklearn.metrics import classification_report

# Take the class names from the encoder that produced the integer labels, so
# names and indices cannot drift apart when the set of accents changes.
# Capitalised for display (e.g. "arabic" -> "Arabic").
class_names = [str(name).capitalize() for name in label_encoder.classes_]

# Precision, recall and f1-score per class. `labels` lists every class index
# so the report keeps one row per name even if a class is missing from this
# test split.
report = classification_report(
    y_true,
    y_pred,
    labels=np.arange(len(class_names)),
    target_names=class_names,
)

# Print the classification report to see the evaluation results
print(report)
# Save the trained model to 'cnn.h5' in the working directory
model.save('cnn.h5')

# Reload the model from that file to confirm it can be read back.
# NOTE: this file later defines its own `load_model` function, which
# replaces the Keras `load_model` imported here.
from tensorflow.keras.models import load_model
model = load_model('cnn.h5')
# Save a second copy under a '.keras' file name.
# NOTE(review): the '.keras' suffix normally selects the native Keras archive
# format rather than SavedModel - confirm against the installed Keras version.
model.save('cnn_model.keras')

import os
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras_tuner import HyperModel
import tensorflow as tf
from joblib import load
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

# Output locations, relative to the current working directory.
BASE_PATH = 'drive/MyDrive'
MODEL_PATH = os.path.join(BASE_PATH, 'models')
TUNING_PATH = os.path.join(BASE_PATH, 'model_tuning')

# Make sure both output folders exist; existing folders are left untouched.
for _path in (MODEL_PATH, TUNING_PATH):
    os.makedirs(_path, exist_ok=True)

class CNNHyperModel(HyperModel):
    """KerasTuner hypermodel for a two-block 1D CNN classifier.

    The search space covers the filter count and kernel size of both
    convolution layers, the width of the dense layer, the dropout rate and
    the Adam learning rate.
    """

    def __init__(self, input_shape, num_classes):
        """Store the fixed parts of the architecture.

        Args:
            input_shape: Shape of one input sample, given to the first
                Conv1D layer.
            num_classes: Number of units in the softmax output layer.
        """
        # Let the HyperModel base class run its own initialisation.
        super().__init__()
        self.input_shape = input_shape
        self.num_classes = num_classes

    def build(self, hp):
        """Build and compile one candidate model.

        Args:
            hp: Hyperparameter container supplied by the tuner; each
                ``hp.Int`` / ``hp.Float`` call below declares one tunable
                value.

        Returns:
            The compiled ``Sequential`` model for this set of values.
        """
        model = Sequential()

        # First convolution block: 32-128 filters, kernel width 2-5
        model.add(Conv1D(
            filters=hp.Int('filters1', min_value=32, max_value=128, step=32),
            kernel_size=hp.Int('kernel_size1', min_value=2, max_value=5, step=1),
            activation='relu',
            padding='same',
            input_shape=self.input_shape
        ))
        model.add(MaxPooling1D(pool_size=2))

        # Second convolution block: 64-256 filters, kernel width 2-5
        model.add(Conv1D(
            filters=hp.Int('filters2', min_value=64, max_value=256, step=32),
            kernel_size=hp.Int('kernel_size2', min_value=2, max_value=5, step=1),
            activation='relu',
            padding='same'
        ))
        model.add(MaxPooling1D(pool_size=2))

        model.add(Flatten())

        # Dense layer of 64-256 units
        model.add(Dense(
            hp.Int('dense_units', min_value=64, max_value=256, step=64),
            activation='relu'
        ))

        # Dropout rate between 0.2 and 0.5
        model.add(Dropout(hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)))

        model.add(Dense(self.num_classes, activation='softmax'))

        # Learning rate sampled on a log scale between 1e-4 and 1e-2
        model.compile(
            optimizer=Adam(
                hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')
            ),
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Stop a run once validation loss has gone 10 epochs without improving, and
# ask Keras to restore the best weights seen.
early_stopper = EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True, verbose=1
)

# Halve the learning rate after 5 epochs without improvement in validation
# loss, never going below 1e-6.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1
)

# Import from `keras_tuner`, the same package name used for HyperModel
# earlier in this file; `kerastuner` is the legacy import name.
from keras_tuner import RandomSearch

# Input shape and class count are taken from the training arrays
input_shape = (X_train.shape[1], 1)
num_classes = y_train.shape[1]

hypermodel = CNNHyperModel(input_shape=input_shape, num_classes=num_classes)

# Random search: 20 hyperparameter sets, each trained twice, ranked by
# validation accuracy. Results are written under TUNING_PATH.
tuner = RandomSearch(
    hypermodel,
    objective='val_accuracy',
    max_trials=20,
    executions_per_trial=2,
    directory=TUNING_PATH,
    project_name='AccentClassification'
)

# NOTE(review): the test split is used as validation data for the search and
# is then used again below to score the winner, so that score is optimistic.
tuner.search(x=X_train, y=y_train, epochs=50, validation_data=(X_test, y_test),
             callbacks=[early_stopper, reduce_lr])

# Best model found by the search, scored on the test split
best_model = tuner.get_best_models(num_models=1)[0]
loss, accuracy = best_model.evaluate(X_test, y_test)
print("Best model accuracy: {:.2f}%".format(accuracy * 100))

def create_cnn_model(input_shape, num_classes):
    """Build and compile the 1D CNN with fixed hyperparameters.

    Args:
        input_shape: Shape of one input sample, given to the first Conv1D
            layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled ``Sequential`` model (Adam with learning rate
        0.0011876, categorical cross-entropy loss, accuracy metric).
    """
    layers = [
        # Convolution block 1
        Conv1D(filters=32, kernel_size=4, activation='relu', padding='same',
               input_shape=input_shape),
        MaxPooling1D(pool_size=2),
        # Convolution block 2
        Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
        MaxPooling1D(pool_size=2),
        # Classifier head
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.2),
        Dense(num_classes, activation='softmax'),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(
        optimizer=Adam(learning_rate=0.0011876),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Rebuild the model with the fixed hyperparameters and train it from scratch.
best_model = create_cnn_model(input_shape, num_classes)

# NOTE(review): the test split is also used as the validation set here.
history = best_model.fit(X_train, y_train, epochs=150, batch_size=32,
                        validation_data=(X_test, y_test), verbose=2)

# Plotting: accuracy on the left, loss on the right
metrics = history.history
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# Each curve is plotted explicitly against history.epoch: plot(x, y1, y2)
# would draw y2 against its own implicit index instead of the epoch numbers.
axs[0].plot(history.epoch, 100 * np.array(metrics['accuracy']), label='accuracy')
axs[0].plot(history.epoch, 100 * np.array(metrics['val_accuracy']), label='val_accuracy')
axs[0].set_title("Accuracy CNN")
axs[0].set_ylim([0, 100])
axs[0].set_xlabel("Epochs")
axs[0].set_ylabel("Accuracy")
axs[0].legend()

axs[1].plot(history.epoch, np.array(metrics['loss']), label='loss')
axs[1].plot(history.epoch, np.array(metrics['val_loss']), label='val_loss')
axs[1].set_title("Loss CNN")
axs[1].set_xlabel("Epochs")
axs[1].set_ylabel("Loss")
axs[1].legend()

plt.tight_layout()
plt.show()

# Predictions and evaluation
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)  # index of the highest probability
y_true = np.argmax(y_test, axis=1)        # index of the 1 in each one-hot row

# Take the class names from the encoder that produced the integer labels, so
# names and indices cannot drift apart when the set of accents changes.
# Capitalised for display (e.g. "arabic" -> "Arabic").
class_names = [str(name).capitalize() for name in label_encoder.classes_]

# `labels` lists every class index so the report keeps one row per name even
# if a class is missing from this test split.
report = classification_report(
    y_true,
    y_pred,
    labels=np.arange(len(class_names)),
    target_names=class_names,
)
print(report)

# Save the trained model twice under MODEL_PATH: once with an '.h5' file
# name and once with a '.keras' file name.
model_h5_path = os.path.join(MODEL_PATH, 'cnn_tunning.h5')
model_keras_path = os.path.join(MODEL_PATH, 'cnn_tunning_keras.keras')

for _path in (model_h5_path, model_keras_path):
    best_model.save(_path)

# Read the '.h5' copy back and score it on the test split to confirm the
# saved file is usable.
loaded_model = tf.keras.models.load_model(model_h5_path)
loss, accuracy = loaded_model.evaluate(X_test, y_test)
print("Loaded model accuracy:", accuracy)

def preprocess_audio(file_path, target_sample_rate=44100, n_mfcc=13):
    """Turn one audio file into a single model input.

    The features are computed the same way as the training features in this
    file: MFCCs over the first 5 seconds of audio at the target sample rate,
    reduced to one mean value per coefficient, with no normalisation.

    The previous version z-scored the MFCCs, kept only the first 13 time
    frames and averaged across coefficients. That produced an array of the
    expected shape, but its values did not correspond to the per-coefficient
    means the model is trained on.

    Args:
        file_path: Path of the audio file to load with librosa.
        target_sample_rate: Sample rate the audio is loaded at.
        n_mfcc: Number of MFCC coefficients; must match the number of
            feature columns used for training.

    Returns:
        Array of shape ``(1, n_mfcc, 1)``: one sample, ``n_mfcc`` features,
        one channel.
    """
    # Load at most the first 5 seconds; training slices the signal to
    # target_sample_rate * 5 samples before computing MFCCs.
    y, sr = librosa.load(file_path, sr=target_sample_rate, duration=5.0)

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # One mean per coefficient across time, matching the MFCC_1..MFCC_n
    # columns of the training DataFrame.
    features = np.array([np.mean(row) for row in mfcc])

    # Add the batch and channel axes expected by the Conv1D model.
    return features.reshape(1, -1, 1)

# Function to load model
def load_model(model_name):
    """Load a saved model from disk.

    Args:
        model_name: Path of a Keras model file ending in '.h5' or '.keras',
            or the path of a joblib file without its '.joblib' suffix.

    Returns:
        The model returned by ``tf.keras.models.load_model`` for Keras
        files, otherwise the object returned by joblib's ``load`` for
        ``<model_name>.joblib``.
    """
    # '.keras' is accepted as well as '.h5' because this file saves models
    # under both names; a '.keras' path used to fall through to joblib.
    if model_name.endswith(('.h5', '.keras')):
        return tf.keras.models.load_model(model_name)
    return load(f'{model_name}.joblib')

# Example: classify a single recording with the saved model.
# Point this at your own audio file.
audio_file_path = os.path.join(BASE_PATH, 'audio', 'test.mp3')

# Convert the recording into one model input and report its shape.
audio_data = preprocess_audio(audio_file_path)
print("Shape of preprocessed audio data:", audio_data.shape)

# Load the '.h5' model saved earlier and take the most probable class.
model = load_model(model_h5_path)
predictions = model.predict(audio_data)
predicted_class = np.argmax(predictions, axis=1)

print(f"The predicted class for the audio file is: {class_names[predicted_class[0]]}")