In [43]:
import os
import tarfile
import gzip
import shutil
import glob

extracted_dir = "extracted_data"

os.makedirs(extracted_dir, exist_ok=True)
# Check if the extraction was successful
extracted_files = glob.glob(os.path.join(extracted_dir, '**/*'), recursive=True)
if len(extracted_files) > 0:
    print(f"Extraction successful! {len(extracted_files)} files found.")
else:
    print("Extraction failed or no files found.")

Extraction successful! 46570 files found.


In [44]:
%pip install librosa numpy

Note: you may need to restart the kernel to use updated packages.


In [45]:
import librosa
import numpy as np
import glob
import os

# Function to load and preprocess audio files
def load_and_preprocess_audio(file_paths):
    features, labels = [], []
    for file_path in file_paths:
        try:
            # Load the audio file
            audio, sr = librosa.load(file_path, sr=None)
            # Extract MFCC features
            mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
            # Take the mean of the MFCCs over time
            features.append(np.mean(mfccs, axis=1))
            # Extract label from the file path (assuming it's in the parent directory name)
            label = os.path.basename(os.path.dirname(file_path))
            labels.append(label)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    return np.array(features), np.array(labels)

# Collect all audio file paths
file_paths = glob.glob(os.path.join(extracted_dir, '**/*.mp3'), recursive=True)  # Adjust extension if needed

# Load and preprocess the audio data
features, labels = load_and_preprocess_audio(file_paths)

# Check if data is loaded and preprocessed correctly
if len(features) > 0 and len(labels) > 0:
    print(f"Data loading and preprocessing successful! {len(features)} samples loaded.")
else:
    print("Data loading failed or no features extracted.")

Data loading and preprocessing successful! 46560 samples loaded.


In [46]:
from sklearn.preprocessing import LabelEncoder

# Encode the labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Check label encoding
print("Labels encoded successfully.")

Labels encoded successfully.


In [47]:
%pip install keras
%pip install tensorflow 

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [48]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Split the data into training and testing sets
if len(features) > 1:
    X_train, X_test, y_train, y_test = train_test_split(features, encoded_labels, test_size=0.2, random_state=42)

    # Convert labels to categorical (one-hot encoding)
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)

    print(f"Data split successful: {X_train.shape[0]} training samples, {X_test.shape[0]} test samples.")
else:
    print("Insufficient data to perform a train-test split.")

Data split successful: 37248 training samples, 9312 test samples.


In [49]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization

# Reshape the features to have 3 dimensions: (samples, timesteps, features)
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

# Define the improved 1D CNN model
model = Sequential()

# First Conv1D layer with Batch Normalization
model.add(Conv1D(64, 3, padding='same', activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))

# Second Conv1D layer
model.add(Conv1D(128, 3, padding='same', activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))

# Third Conv1D layer
model.add(Conv1D(256, 3, padding='same', activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))

# Flatten the feature maps to feed into dense layers
model.add(Flatten())

# Fully connected dense layer with dropout
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))

# Output layer
model.add(Dense(len(np.unique(encoded_labels)), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [50]:
# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/20
[1m1164/1164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 17ms/step - accuracy: 0.6177 - loss: 1.2948 - val_accuracy: 0.8240 - val_loss: 0.5761
Epoch 2/20
[1m1164/1164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 15ms/step - accuracy: 0.8158 - loss: 0.5919 - val_accuracy: 0.8735 - val_loss: 0.4196
Epoch 3/20
[1m1164/1164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 14ms/step - accuracy: 0.8673 - loss: 0.4261 - val_accuracy: 0.9009 - val_loss: 0.3265
Epoch 4/20
[1m1164/1164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 15ms/step - accuracy: 0.8910 - loss: 0.3444 - val_accuracy: 0.9075 - val_loss: 0.3052
Epoch 5/20
[1m1164/1164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 15ms/step - accuracy: 0.9123 - loss: 0.2808 - val_accuracy: 0.9226 - val_loss: 0.2626
Epoch 6/20
[1m1164/1164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 16ms/step - accuracy: 0.9277 - loss: 0.2286 - val_accuracy: 0.9265 - val_loss: 0.2577
Epoc

In [51]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9531 - loss: 0.1853
Test Accuracy: 95.32%


In [52]:
# Save the model
model.save("language_detection_model.h5")

# Save the label encoder
import pickle

with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

print("Model and label encoder saved successfully.")




Model and label encoder saved successfully.


In [53]:
from tensorflow.keras.models import load_model
import pickle

# Load the model and label encoder
model = load_model("language_detection_model.h5")

with open("label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

# Load and preprocess a new audio file
new_file_path = "/content/common_voice_te_40163657.mp3"
new_audio, sr = librosa.load(new_file_path, sr=None)
new_mfcc = librosa.feature.mfcc(y=new_audio, sr=sr, n_mfcc=40)
new_feature = np.mean(new_mfcc, axis=1).reshape(1, -1)

# Predict the language
prediction = model.predict(new_feature)
predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])
print(f"Predicted Language: {predicted_label[0]}")

  new_audio, sr = librosa.load(new_file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: '/content/common_voice_te_40163657.mp3'