<a href="https://colab.research.google.com/github/Ruchir1807/Web-Stuff/blob/main/moosicgenre.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
!mkdir -p ~/.kaggle
!pip install pydub




In [34]:
!kaggle datasets download -d andradaolteanu/gtzan-dataset-music-genre-classification


Dataset URL: https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification
License(s): other
gtzan-dataset-music-genre-classification.zip: Skipping, found more recently modified local copy (use --force to force download)


In [35]:
import zipfile
zip_ref = zipfile.ZipFile('/content/gtzan-dataset-music-genre-classification.zip','r')
zip_ref.extractall('/content')
zip_ref.close()

In [45]:
from pydub import AudioSegment
import librosa
import numpy as np

file_name = '/content/Data/genres_original'
file_path = '/content/Data/genres_original/metal/metal.00001.wav'


def convert_wav_format(file_path):
    """Convert wav file to a format readable by librosa"""
    sound = AudioSegment.from_wav(file_path)
    sound = sound.set_frame_rate(22050).set_channels(1)
    new_file_path = file_path.replace(".wav", "_converted.wav")
    sound.export(new_file_path, format="wav")
    return new_file_path



def extract_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') #res_type='kaiser_fast' specifies the resampling algorithm, which is a fast and efficient method.
        # Extracting MFCCs(Mel-Frequency Cepstral Coefficients) captures timbral aspects of sound
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        mfccsscaled = np.mean(mfccs.T,axis=0)

        # Extracting Mel Spectrogram, A Mel Spectrogram represents the intensity of frequencies over time, mapped to the Mel scale, which is closer to human perception of sound.
        mel = librosa.feature.melspectrogram(y=audio, sr=sample_rate)
        melsscaled = np.mean(mel.T, axis=0)

        # Extracting Chroma Feature,Chroma features represent the energy of each pitch class (like notes in music) over time.

        chroma = librosa.feature.chroma_stft(y=audio, sr=sample_rate)
        chroma_scaled = np.mean(chroma.T, axis=0)

        return np.hstack([mfccsscaled, melsscaled, chroma_scaled])
    except Exception as e:
        print(f"Error encountered while parsing file: {file_name}")
        return None


In [42]:
import os
import pandas as pd

# Path to your audio files
data_dir = "/content/Data/genres_original"

# Prepare lists to hold features and labels
features_list = []
labels_list = []

# Loop over all files in your dataset
for genre in os.listdir(data_dir):
    genre_path = os.path.join(data_dir, genre)
    for file in os.listdir(genre_path):
        file_path = os.path.join(genre_path, file)
        features = extract_features(file_path)
        if features is not None:
            features_list.append(features)
            labels_list.append(genre)

# Convert to DataFrame for easier processing
features_df = pd.DataFrame(features_list)
labels_df = pd.DataFrame(labels_list, columns=['label'])

# Combine the features and labels into one DataFrame
data = pd.concat([features_df, labels_df], axis=1)

# Display the first few rows of the data
print(data.head())

Error encountered while parsing file: /content/Data/genres_original/reggae/smells-like-teen-spirit-_-nirvana-_-no-copyright-_-made-with-Voicemod.wav


  audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') #res_type='kaiser_fast' specifies the resampling algorithm, which is a fast and efficient method.
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error encountered while parsing file: /content/Data/genres_original/reggae/.ipynb_checkpoints
Error encountered while parsing file: /content/Data/genres_original/jazz/jazz.00054.wav
            0           1          2          3          4          5  \
0 -185.063400  120.512108  29.720230  56.372219  -3.028252  22.213934   
1  -24.671669   62.440269  -7.754089  28.432596  10.997148   5.551390   
2 -302.410675  150.116394  -3.713836  26.951956  -0.261415  13.506062   
3 -141.014435  123.921112  -2.510380  27.954958  -0.004282   1.210126   
4  -20.817461   64.907516   6.767101  26.357128  11.665423  13.185630   

           6          7          8         9  ...       171       172  \
0 -10.350042  20.053120 -13.679162  9.170144  ...  0.394442  0.257475   
1  -0.086744  13.745090   2.899817  9.909151  ...  0.366442  0.442675   
2   7.233895   4.227106   1.757852  6.571226  ...  0.400966  0.440800   
3 -11.015174  -1.365306  -7.019753 -6.129560  ...  0.216462  0.360627   
4  -1.979107  

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Encode labels
encoder = LabelEncoder()
y = encoder.fit_transform(labels_df['label'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(features_df, y, test_size=0.2, random_state=42)

# Build the model
model = Sequential()
model.add(Dense(256, input_shape=(X_train.shape[1],), activation='relu')) #input layer
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))#hidden layer
model.add(Dropout(0.3))
model.add(Dense(10, activation='softmax'))#outer layer

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

model.save("genre_classification_model.keras") #saving the model for loading later
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

from tensorflow.keras.models import load_model

# Load the previously saved model
model1 = load_model("genre_classification_model.keras")

# Define a function to predict the genre of a new audio file
def predict_genre(file_path, model):
    # Extract features from the audio file
    features = extract_features(file_path)
    if features is not None:
        # Reshape features to match the input format of the classifier
        features = features.reshape(1, -1)
        # Predict the genre
        prediction = model.predict(features)
         # Find the index of the genre with the highest probability
        predicted_index = np.argmax(prediction)

        # List of genres corresponding to the indices
        genre_labels = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']

        # Return the predicted genre
        return genre_labels[predicted_index]

    else:
        print("Failed to extract features from the audio file.")
        return None

# Path to the new audio file
new_audio_file = "/content/Data/genres_original/metal/metal.00001.wav"

# Predict the genre of the new audio file
predicted_genre = predict_genre(new_audio_file, model1)
print(f"Predicted Genre: {predicted_genre}")


Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 107ms/step - accuracy: 0.1128 - loss: 24.7147 - val_accuracy: 0.2900 - val_loss: 5.2237
Epoch 2/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2822 - loss: 8.9832 - val_accuracy: 0.3150 - val_loss: 2.9922
Epoch 3/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2694 - loss: 4.8015 - val_accuracy: 0.3000 - val_loss: 2.2433
Epoch 4/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2667 - loss: 2.9998 - val_accuracy: 0.3350 - val_loss: 2.0867
Epoch 5/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2605 - loss: 2.7424 - val_accuracy: 0.2750 - val_loss: 2.0990
Epoch 6/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2815 - loss: 2.6118 - val_accuracy: 0.3150 - val_loss: 1.9658
Epoch 7/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━