**1. Genre**

In [None]:
import os
import json
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import librosa
import torch

# Class index -> genre label lookup; a label's position in the list is its class index
genre_mapping = dict(enumerate([
    "Electronic",
    "Rock",
    "Punk",
    "Experimental",
    "Hip-Hop",
    "Folk",
    "Chiptune / Glitch",
    "Instrumental",
    "Pop",
    "International",
]))

# Hugging Face checkpoints: the gastonduault music-classifier weights and the
# Facebook Wav2Vec2 feature extractor configuration
GENRE_CLASSIFIER_CHECKPOINT = "gastonduault/music-classifier"
FEATURE_EXTRACTOR_CHECKPOINT = "facebook/wav2vec2-large"

model = Wav2Vec2ForSequenceClassification.from_pretrained(GENRE_CLASSIFIER_CHECKPOINT)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(FEATURE_EXTRACTOR_CHECKPOINT)

# Turn an audio file on disk into inputs for the classifier
def preprocess_audio(audio_path):
    """Load ``audio_path`` at 16 kHz and pass the samples to ``feature_extractor``.

    Returns whatever ``feature_extractor`` produces (requested as PyTorch
    tensors with padding enabled). If loading or feature extraction raises,
    the error is printed and ``None`` is returned instead.
    """
    try:
        # The sample rate returned by librosa is not needed: it was requested explicitly
        waveform, _ = librosa.load(audio_path, sr=16000)
        model_inputs = feature_extractor(
            waveform,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True,
        )
    except Exception as err:
        print(f"Error processing {audio_path}: {err}")
        return None
    return model_inputs

# Classify one audio file and report the winning genre with its probability
def predict_genre(audio_path):
    """Return ``(genre_label, confidence)`` for ``audio_path``.

    Returns ``None`` when ``preprocess_audio`` returns ``None``. The label is
    looked up in ``genre_mapping`` using the arg-max of the model's logits;
    the confidence is the softmax probability of that class for the first
    (only) item in the batch.
    """
    features = preprocess_audio(audio_path)
    if features is not None:
        # Inference only: no gradients are needed
        with torch.no_grad():
            scores = model(**features).logits
            label_id = scores.argmax(dim=-1).item()
            probabilities = scores.softmax(dim=-1)
            score = probabilities[0, label_id].item()
        return genre_mapping[label_id], score
    return None

# Directory of audio samples
audio_directory = "/content/rhythm"  # Replace with your directory path
output_results = []

# Iterate over all .wav files in the directory.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# console log and the JSON file in the same order on every run.
for filename in sorted(os.listdir(audio_directory)):
    # Case-insensitive match so files named "*.WAV" are not silently skipped
    if not filename.lower().endswith(".wav"):
        continue
    audio_path = os.path.join(audio_directory, filename)
    print(f"Processing: {filename}")
    result = predict_genre(audio_path)
    if result is None:
        # predict_genre returns None when preprocessing failed; the error was already printed
        continue
    genre, confidence = result
    output_results.append({
        "filename": filename,
        "genre": genre,
        "confidence": f"{confidence:.2%}"  # stored as a formatted percentage string
    })
    print(f"Predicted genre: {genre} (Confidence: {confidence:.2%})")

# Save results to a JSON file (written next to the audio files)
output_file = os.path.join(audio_directory, "genre_predictions.json")
with open(output_file, "w") as f:
    json.dump(output_results, f, indent=4)

print(f"Processed {len(output_results)} files. Results saved to {output_file}.")


Processing: Rhythm Machine_segment_2.wav
Predicted genre: Electronic (Confidence: 74.95%)
Processing: Rhythm Machine_segment_4.wav
Predicted genre: International (Confidence: 64.78%)
Processing: Rhythm Machine_segment_5.wav
Predicted genre: Electronic (Confidence: 73.78%)
Processing: Rhythm Machine_segment_6.wav
Predicted genre: Electronic (Confidence: 67.87%)
Processing: Rhythm Machine_segment_1.wav
Predicted genre: Electronic (Confidence: 73.83%)
Processing: Rhythm Machine_segment_3.wav
Predicted genre: Electronic (Confidence: 72.13%)
Processing: Rhythm Machine_segment_7.wav
Predicted genre: Experimental (Confidence: 34.56%)
Processed 7 files. Results saved to /content/rhythm/genre_predictions.json.


**2. Lyrics**

In [None]:
# -y answers the confirmation prompt so the cell cannot stall when ffmpeg is missing; -qq trims the log
!apt-get install -y -qq ffmpeg

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
# %pip installs into the running kernel's environment; pinned to the release shown in this cell's install log
%pip install -q openai-whisper==20240930

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting triton>=2.0.0 (from openai-whisper)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m5.4 MB/s[0m eta [36m0:0

In [None]:
import whisper
import os
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

# Load the Whisper model (use a model like "base" or larger for more accuracy).
# It is bound to its own name instead of `model`, because `model` already
# refers to the Wav2Vec2 genre classifier that predict_genre() reads as a
# global; reusing the name here broke predict_genre() once this cell had run.
whisper_model = whisper.load_model("medium")

# Directory containing audio files
audio_dir = "/content/rhythm"

def transcribe_music(audio_path, **transcribe_options):
    """Transcribe an audio file with the loaded Whisper model.

    Args:
        audio_path: Path of the audio file to transcribe.
        **transcribe_options: Optional keyword arguments forwarded unchanged
            to ``whisper_model.transcribe`` (for example
            ``condition_on_previous_text=False``, which Whisper documents as
            making repetition loops less likely). With no options the call is
            the same as before.

    Returns:
        The transcribed text exactly as Whisper returned it, or the string
        "Lyrics are not provided." when the transcription is empty or
        whitespace only.
    """
    # Transcribe the audio file
    result = whisper_model.transcribe(audio_path, **transcribe_options)
    text = result["text"]

    # An empty / whitespace-only transcription is reported as "no lyrics"
    return text if text.strip() else "Lyrics are not provided."

def transcribe_all_audio_in_directory(directory_path):
    """Transcribe every supported audio file in ``directory_path`` and print the result.

    Files are visited in sorted name order (``os.listdir`` alone returns them
    in arbitrary order) and the extension check is case-insensitive, so
    ``track.WAV`` is handled like ``track.wav``. Each file is passed to
    ``transcribe_music`` and its transcription is printed, followed by a
    separator line. Nothing is returned.

    Args:
        directory_path: Directory whose direct entries are scanned.
    """
    audio_extensions = (".mp3", ".wav", ".flac", ".ogg")

    for filename in sorted(os.listdir(directory_path)):
        # Skip anything that does not look like an audio file
        if not filename.lower().endswith(audio_extensions):
            continue

        file_path = os.path.join(directory_path, filename)
        print(f"Transcribing {filename}...")
        transcription = transcribe_music(file_path)
        print(f"Transcription for {filename}:")
        print(transcription)
        print("-" * 50)

# Apply the transcription process to all audio files in the directory.
# The function prints each transcription and returns nothing, so there is no result to capture.
transcribe_all_audio_in_directory(audio_dir)

100%|█████████████████████████████████████| 1.42G/1.42G [00:18<00:00, 84.2MiB/s]


Transcribing Rhythm Machine_segment_2.wav...
Transcription for Rhythm Machine_segment_2.wav:
 A love addiction, my intuition A real sensation, an activation You have a mission, cause my ignition A heart admission, intoxication
--------------------------------------------------
Transcribing Rhythm Machine_segment_4.wav...
Transcription for Rhythm Machine_segment_4.wav:
 You light my fire, my desire, a real feeling, one with meaning You have permission, push my ignition, disarm the system, caught in the rhythm The rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rhythm, the rh

**3. Mood / Emotions**

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install -q transformers librosa numpy




In [None]:
import os
import json
from transformers import pipeline

# Load the audio classification pipeline.
# NOTE(review): the load log captured for this cell lists 'classifier.weight' /
# 'classifier.bias' as "newly initialized", i.e. not restored from the
# checkpoint. Confirm the predictions are meaningful before relying on them.
classifier = pipeline("audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")

# Path to your folder containing audio files
audio_folder = "/content/rhythm"

# List to store classification results
results_metadata = []

# Iterate over all WAV files in the folder, in sorted order so the JSON is
# reproducible (os.listdir() returns entries in arbitrary order)
for audio_file in sorted(os.listdir(audio_folder)):
    if not audio_file.lower().endswith(".wav"):  # case-insensitive WAV check
        continue
    audio_path = os.path.join(audio_folder, audio_file)
    try:
        # Perform classification
        result = classifier(audio_path)
        # Append the result as metadata
        results_metadata.append({
            "file_name": audio_file,
            "predictions": result
        })
        print(f"Processed {audio_file}")
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest
        print(f"Error processing {audio_file}: {e}")

# Path to save the metadata JSON file
output_json_path = "/content/output/mood_classification_metadata.json"

# Create the output folder if needed; open() would otherwise fail on a fresh runtime
os.makedirs(os.path.dirname(output_json_path), exist_ok=True)

# Save results to JSON file
with open(output_json_path, "w") as json_file:
    json.dump(results_metadata, json_file, indent=4)

print(f"Metadata saved to {output_json_path}")


Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.output.bias', 'classifier.output.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', '

Processed Rhythm Machine_segment_2.wav
Processed Rhythm Machine_segment_4.wav
Processed Rhythm Machine_segment_5.wav
Processed Rhythm Machine_segment_6.wav
Processed Rhythm Machine_segment_1.wav
Processed Rhythm Machine_segment_3.wav
Processed Rhythm Machine_segment_7.wav
Metadata saved to /content/output/mood_classification_metadata.json


**4. Tempo (BPM) and Key**


In [None]:
# Musical note mapping based on chroma feature indices
note_mapping = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']

# Function to extract BPM and key
def extract_key_bpm(audio_file):
    """Estimate the tempo and the dominant note of an audio file.

    The file is loaded at 32 kHz. Tempo comes from ``librosa.beat.beat_track``.
    The "key" is the entry of ``note_mapping`` whose chroma row has the highest
    mean over all frames, i.e. the strongest note on average; no major/minor
    mode is estimated.

    Args:
        audio_file: Path of the audio file to analyse.

    Returns:
        ``(tempo, key)`` where ``tempo`` is a plain Python ``float`` (BPM) and
        ``key`` is a note name from ``note_mapping``.
    """
    import numpy as np  # local import: this cell does not import numpy itself

    y, sr = librosa.load(audio_file, sr=32000)

    # Extract tempo (BPM); the beat positions are not used
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)

    # Average each chroma row over time and pick the strongest one
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
    key_index = chroma.mean(axis=1).argmax()
    key = note_mapping[key_index]  # Map index to corresponding musical note

    # beat_track may hand back the tempo as a 1-element array (as seen in this
    # notebook's output); unwrap it so callers always get a scalar float that
    # can be formatted and JSON-serialised.
    return float(np.asarray(tempo).reshape(-1)[0]), key

# Collect one [filename, bpm, key] row per WAV file in the training directory
train_dir = '/content/rhythm'
extracted_features = []
for file in os.listdir(train_dir):
    if not file.endswith('.wav'):
        continue  # skip anything that is not a WAV file
    bpm, key = extract_key_bpm(os.path.join(train_dir, file))
    extracted_features.append([file, bpm, key])

# Print each collected row on its own line
for row in extracted_features:
    print(row)

['Rhythm Machine_segment_2.wav', array([129.31034483]), 'D#']
['Rhythm Machine_segment_4.wav', array([129.31034483]), 'D#']
['Rhythm Machine_segment_5.wav', array([129.31034483]), 'D#']
['Rhythm Machine_segment_6.wav', array([129.31034483]), 'D#']
['Rhythm Machine_segment_1.wav', array([129.31034483]), 'D#']
['Rhythm Machine_segment_3.wav', array([129.31034483]), 'D#']
['Rhythm Machine_segment_7.wav', array([129.31034483]), 'D#']


In [None]:
import os
import json
import librosa

# Musical note mapping based on chroma feature indices
note_mapping = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']

# Function to extract BPM and key
def extract_key_bpm(audio_file):
    """Estimate the tempo and the dominant note of an audio file.

    The file is loaded at 32 kHz. Tempo comes from ``librosa.beat.beat_track``.
    The "key" is the entry of ``note_mapping`` whose chroma row has the highest
    mean over all frames, i.e. the strongest note on average; no major/minor
    mode is estimated.

    Args:
        audio_file: Path of the audio file to analyse.

    Returns:
        ``(tempo, key)`` where ``tempo`` is a plain Python ``float`` (BPM),
        safe to format and to serialise as JSON, and ``key`` is a note name
        from ``note_mapping``.
    """
    import numpy as np  # local import: this cell does not import numpy itself

    y, sr = librosa.load(audio_file, sr=32000)

    # Extract tempo (BPM); the beat positions are not used
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)

    # Average each chroma row over time and pick the strongest one
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
    key_index = chroma.mean(axis=1).argmax()
    key = note_mapping[key_index]  # Map index to corresponding musical note

    # Calling float() directly on an array with ndim > 0 is deprecated in
    # recent NumPy releases, so take the single element explicitly. This works
    # for a plain scalar, a 0-d array, and a 1-element array alike.
    return float(np.asarray(tempo).reshape(-1)[0]), key

# List to store extracted features
extracted_features = []

# Generate and collect metadata for training files.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# JSON file in the same order on every run.
train_dir = '/content/rhythm'  # Replace with your directory path
for file in sorted(os.listdir(train_dir)):
    # Case-insensitive match so files named "*.WAV" are not silently skipped
    if not file.lower().endswith('.wav'):
        continue
    file_path = os.path.join(train_dir, file)
    bpm, key = extract_key_bpm(file_path)
    extracted_features.append({
        "filename": file,
        "tempo": f"{bpm:.1f} BPM",  # Format tempo with "BPM"
        "key": key
    })

# Save extracted features to a JSON file (written next to the audio files)
output_file = os.path.join(train_dir, "extracted_features.json")
with open(output_file, "w") as f:
    json.dump(extracted_features, f, indent=4)

print(f"Extracted features saved to {output_file}")

Extracted features saved to /content/rhythm/extracted_features.json
