# Recommender system that takes user mood, time and certain audio features into consideration

In [10]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [11]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU detected and enabled:", gpus)
    except RuntimeError as e:
        print("Error enabling GPU:", e)
else:
    print("No GPU detected. Running on CPU.")


GPU detected and enabled: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
# Adjust path as needed
DATA_PATH = './Data/cars_dataset.csv'
df = pd.read_csv(DATA_PATH)

# Quick look
df.head()

# %% [markdown]
## 3. Preprocessing

### 3.1 Encode Target (Genre)
# One-hot encode genres
genre_encoder = OneHotEncoder(sparse_output=False)
genre_ohe = genre_encoder.fit_transform(df[['genre']])

### 3.2 Numerical Features Scaling
# Scale numerical audio features
num_cols = ['popularity','release','danceability','energy','valence','tempo','duration_ms']
scaler = StandardScaler()
num_scaled = scaler.fit_transform(df[num_cols])

### 3.3 Categorical Context (Mood & Time of Day)
# One-hot encode user mood and time of day
mood_encoder = OneHotEncoder(sparse_output=False)
mood_ohe = mood_encoder.fit_transform(df[['mood']])

time_encoder = OneHotEncoder(sparse_output=False)
time_ohe = time_encoder.fit_transform(df[['time_of_day']])

### 3.4 Combine Features
# Audio and context features
X_audio = num_scaled
X_context = np.hstack([mood_ohe, time_ohe])

# Full feature matrix and target labels
X = np.hstack([X_audio, X_context])
y = genre_ohe
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=df['genre']
)

model_dir = './saved_models/genre_recommender'
if os.path.exists(model_dir):
    model = tf.keras.models.load_model(model_dir)
    print("Loaded existing model from disk.")
else:
    print("No existing model found; training from scratch.")
    audio_inputs = layers.Input(shape=(X_audio.shape[1],), name='audio_input')
    context_inputs = layers.Input(shape=(X_context.shape[1],), name='context_input')

    # Audio branch
    x1 = layers.Dense(64, activation='relu')(audio_inputs)
    x1 = layers.Dropout(0.3)(x1)

    # Context branch
    x2 = layers.Dense(32, activation='relu')(context_inputs)
    x2 = layers.Dropout(0.3)(x2)

    # Fuse
    x = layers.Concatenate()([x1, x2])
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    outputs = layers.Dense(genre_ohe.shape[1], activation='softmax', name='genre_output')(x)
    
    # Build and compile model
    model = models.Model(inputs=[audio_inputs, context_inputs], outputs=outputs)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    model.summary()

# %% [markdown]
## 5. Train

history = model.fit(
    {'audio_input': X_train[:, :X_audio.shape[1]],
     'context_input': X_train[:, X_audio.shape[1]:]},
    y_train,
    validation_split=0.1,
    epochs=30,
    batch_size=32
)

# %% [markdown]
## 6. Evaluate

eval_results = model.evaluate(
    {'audio_input': X_test[:, :X_audio.shape[1]],
     'context_input': X_test[:, X_audio.shape[1]:]},
    y_test
)
print(f"Test Loss: {eval_results[0]:.4f}, Test Accuracy: {eval_results[1]:.4f}")

# save model
os.makedirs('./saved_models', exist_ok=True)
model.save('./saved_models/genre_recommender')
print("Model saved to ./saved_models/genre_recommender")

# %% [markdown]
## 7. Inference: Recommending a Genre for New Context

def recommend_genre(audio_feature_vec, user_mood, user_time_of_day, top_k=3):
    # Encode new sample
    audio_scaled = scaler.transform([audio_feature_vec])
    mood_vec = mood_encoder.transform([[user_mood]])
    time_vec = time_encoder.transform([[user_time_of_day]])
    context_vec = np.hstack([mood_vec, time_vec])
    
    # Predict genre probabilities
    probs = model.predict(
        {'audio_input': audio_scaled,
         'context_input': context_vec}
    )[0]
    top_idx = np.argsort(probs)[-top_k:][::-1]
    return [(genre_encoder.categories_[0][i], probs[i]) for i in top_idx]

# Example usage:
sample_audio = df.loc[0, num_cols].values
print(recommend_genre(sample_audio, user_mood='energetic', user_time_of_day='morning'))

Loaded existing model from disk.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Loss: 1.2161, Test Accuracy: 0.5624
INFO:tensorflow:Assets written to: ./saved_models/genre_recommender\assets
Model saved to ./saved_models/genre_recommender
[('pop', 0.7248019), ('disco', 0.24400248), ('rock', 0.010168788)]




# Export encoders

In [13]:
import pickle

# Save encoders
with open('./saved_models/genre_recommender/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('./saved_models/genre_recommender/mood_encoder.pkl', 'wb') as f:
    pickle.dump(mood_encoder, f)

with open('./saved_models/genre_recommender/time_encoder.pkl', 'wb') as f:
    pickle.dump(time_encoder, f)

with open('./saved_models/genre_recommender/genre_encoder.pkl', 'wb') as f:
    pickle.dump(genre_encoder, f)

print("Saved encoders to ./saved_models/genre_recommender/")

Saved encoders to ./saved_models/genre_recommender/


# Extracting required audio features for the recommender system

In [14]:
import librosa
import numpy as np
from scipy.stats import hmean

def extract_audio_features(path):
    # 1) Load audio
    y, sr = librosa.load(path, sr=22050, mono=True)
    duration = librosa.get_duration(y=y, sr=sr)  # in seconds

    # 2) Tempo (BPM) & Beat‐based features
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    # danceability ≈ normalized variance of inter‐beat interval
    ibis = np.diff(librosa.frames_to_time(beat_frames, sr=sr))
    danceability = 1.0 / (1.0 + np.std(ibis) / np.mean(ibis))

    # 3) Energy: average root‐mean‐square energy over frames
    rms = librosa.feature.rms(y=y)[0]
    energy = np.mean(rms)

    # 4) Valence proxy: spectral centroid & spectral contrast
    cent = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)[0]
    # combine into a “brightness” metric as a rough valence estimate
    valence = (np.mean(cent) / np.max(cent) + np.mean(contrast) / np.max(contrast)) / 2

    # 5) Key & Mode: via chroma
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
    chroma_avg = np.mean(chroma, axis=1)
    # simplest: pick the pitch class with highest energy
    key_idx = chroma_avg.argmax()
    # mode: compare averaged energy in minor thirds vs. major thirds
    # (this is very heuristic—better models exist!)
    # major third above key
    major_third = chroma_avg[(key_idx + 4) % 12]
    minor_third = chroma_avg[(key_idx + 3) % 12]
    mode = 1 if major_third > minor_third else 0

    return {
        'danceability': float(danceability),
        'energy':      float(energy),
        'valence':     float(valence),
        'tempo':       float(tempo),
        'key':         int(key_idx),
        'mode':        int(mode),
        'duration_ms': int(duration * 1000)
    }

# Example usage:
features = extract_audio_features('./Data/genres_original/blues/blues.00000.wav')
print(features)


{'danceability': 0.9776987920699817, 'energy': 0.1301843225955963, 'valence': 0.438683973870867, 'tempo': 123.046875, 'key': 7, 'mode': 1, 'duration_ms': 30013}


  'tempo':       float(tempo),


In [15]:
def display_available_categories():
    # Load the model first
    model = tf.keras.models.load_model('./saved_models/genre_recommender')
    
    # Load your data to recreate the encoders
    df = pd.read_csv('./Data/cars_dataset.csv')
    
    # Recreate encoders and fit them to get the categories
    mood_encoder = OneHotEncoder(sparse_output=False)
    mood_encoder.fit(df[['mood']])
    
    time_encoder = OneHotEncoder(sparse_output=False)
    time_encoder.fit(df[['time_of_day']])
    
    # Display the categories
    print("Available mood categories:")
    print(mood_encoder.categories_[0].tolist())
    print(f"Total number of mood categories: {len(mood_encoder.categories_[0])}")
    
    print("\nAvailable time of day categories:")
    print(time_encoder.categories_[0].tolist())
    print(f"Total number of time of day categories: {len(time_encoder.categories_[0])}")
    
    # Also check the expected shape of the context input
    context_input = model.get_layer('context_input')
    print(f"\nModel's context input shape: {context_input.input_shape}")
    print(f"Expected total number of context features: {context_input.input_shape[1]}")
    
    # For comparison with your current constants
    print("\nCurrent MOOD_MAP length:", len(MOOD_MAP))  # Your global constant
    print("Current TIME_MAP length:", len(TIME_MAP))    # Your global constant
    
    # Calculate expected vs actual context size
    expected_context_size = context_input.input_shape[1]
    actual_context_size = len(mood_encoder.categories_[0]) + len(time_encoder.categories_[0])
    
    if expected_context_size != actual_context_size:
        print(f"\nWARNING: Model expects {expected_context_size} context features, but encoders produce {actual_context_size} features")
        print("This mismatch is likely causing your error")

# Call this function to display the information
display_available_categories()

Available mood categories:
['angry', 'dreamy', 'emotional', 'energetic', 'happy', 'intense', 'peaceful', 'relaxed', 'romantic', 'sad']
Total number of mood categories: 10

Available time of day categories:
['afternoon', 'evening', 'morning', 'night']
Total number of time of day categories: 4

Model's context input shape: [(None, 14)]


IndexError: list index out of range