## Enrich Music4All Metadata with Mood, Time-of-Day, and Genre

This notebook will:
1. Load the base tables from `./Data/music4all/`  
2. Extract **mood** tags per song from `id_tags.csv` and map them to predefined moods
3. Derive **time_of_day** from the first listen timestamp in `listening_history.csv`  
4. Merge in the song’s **genre** from `id_genres.csv` and map other related genres to GTZAN 10 genres
5. Combine `id_metadata.csv` with the new columns, drop rows whose mood, time of day, or genre is unknown, and save the result as `./Data/cars_dataset.csv`

In [None]:
# %%
# 1. Imports and paths
import pandas as pd
from datetime import datetime

# Folder holding the Music4All tables and the two mapping files.
BASE_DIR = "./Data/music4all"

# Input tables.
META_CSV = f"{BASE_DIR}/id_metadata.csv"
TAGS_CSV = f"{BASE_DIR}/id_tags.csv"
HIST_CSV = f"{BASE_DIR}/listening_history.csv"
GEN_CSV = f"{BASE_DIR}/id_genres.csv"
ID_INFO_CSV = f"{BASE_DIR}/id_information.csv"

# "source,target" lookup files used to standardize genres and moods.
GENRE_MAP_CSV = f"{BASE_DIR}/genre_map.csv"
MOOD_MAP_CSV = f"{BASE_DIR}/mood_map.csv"

# Destination of the enriched dataset.
OUT_CSV = "./Data/cars_dataset.csv"


In [None]:
# %%
# 2. Load base tables
# These two tables take their column names from the first row of the file.
meta = pd.read_csv(META_CSV, sep="\t")
id_info = pd.read_csv(ID_INFO_CSV, sep="\t")

# These three tables get explicit column names. Because `names=` is passed
# without `header=`, pandas treats every line as data, so a header line in
# the file (if there is one) shows up as an ordinary first row.
tags = pd.read_csv(
    TAGS_CSV,
    sep="\t",
    names=["id", "tags"],
)
hist = pd.read_csv(
    HIST_CSV,
    sep="\t",
    names=["user", "song", "timestamp"],
)
genres = pd.read_csv(
    GEN_CSV,
    sep="\t",
    names=["id", "genres"],
)

# Import mood and genre mappings from text files
def load_mapping_from_file(file_path, encoding="utf-8-sig"):
    """Read a ``source,target`` text file into a dict.

    Each usable line holds exactly one comma; the text before it becomes the
    key and the text after it the value, both stripped of surrounding
    whitespace. Blank lines, lines starting with ``#``, lines without exactly
    one comma, and lines whose key or value is empty are skipped.

    Parameters
    ----------
    file_path : str or path-like
        Location of the mapping file.
    encoding : str, default "utf-8-sig"
        Text encoding used to decode the file. The default reads UTF-8 with
        or without a byte-order mark, so the result does not depend on the
        platform's locale encoding.

    Returns
    -------
    dict
        Mapping of source string to target string. When a source appears more
        than once, the last occurrence wins.
    """
    mapping = {}
    with open(file_path, "r", encoding=encoding) as f:
        for raw_line in f:
            line = raw_line.strip()
            # Skip comments and empty lines
            if not line or line.startswith("#"):
                continue
            parts = line.split(",")
            # Skip lines that don't have exactly one comma
            if len(parts) != 2:
                continue
            source, target = (part.strip() for part in parts)
            # An empty key or value would later surface as an empty-string
            # label, so such entries are ignored rather than stored.
            if not source or not target:
                continue
            mapping[source] = target
    return mapping

# Load mappings
# Lookup tables read from the two mapping files.
MOOD_MAPPING = load_mapping_from_file(MOOD_MAP_CSV)
GENRE_MAP = load_mapping_from_file(GENRE_MAP_CSV)

# Moods in preference order: when a song carries several moods, the one that
# appears earliest in this list is chosen.
MOOD_PRIORITY = [
    "happy",
    "sad",
    "angry",
    "romantic",
    "relaxed",
    "energetic",
    "intense",
    "dreamy",
    "peaceful",
    "emotional",
    "groovy",
    "melancholic",
    "sophisticated",
    "playful",
    "upbeat",
]

In [None]:
# %%
# 3. Extract a single 'mood' per song from tags with enhanced mapping
# One row per (song, tag): split the comma-separated tag string, explode it,
# normalize each tag (lowercase, no surrounding whitespace), then look the tag
# up in MOOD_MAPPING. Tags without an entry get NaN in `mapped_mood`.
tags = (
    tags
    .assign(tag=lambda frame: frame["tags"].str.split(","))
    .explode("tag")
    .assign(tag=lambda frame: frame["tag"].str.lower().str.strip())
    .assign(mapped_mood=lambda frame: frame["tag"].map(MOOD_MAPPING))
)

# Keep only the rows whose tag mapped to a mood
mood_tags = tags.loc[tags["mapped_mood"].notna()].copy()

# Function to pick highest priority mood when multiple moods are present
def pick_priority_mood(moods, priority=None):
    """Choose a single mood from a collection of candidate moods.

    Parameters
    ----------
    moods : iterable of str
        Candidate moods for one song. Any iterable works (a pandas Series, a
        list, a tuple, ...); duplicates are allowed.
    priority : sequence of str, optional
        Moods in preference order. Defaults to the module-level
        ``MOOD_PRIORITY``.

    Returns
    -------
    str
        The first entry of ``priority`` that occurs in ``moods``. If none
        occurs, the first element of ``moods``; if ``moods`` is empty,
        ``"neutral"``.
    """
    if priority is None:
        priority = MOOD_PRIORITY

    # Materialize once: keeps the original order for the fallback and lets
    # the function accept iterables that have no `.iloc`.
    candidates = list(moods)
    present = set(candidates)

    for mood in priority:
        if mood in present:
            return mood

    # None of the candidates is a prioritized mood
    return candidates[0] if candidates else "neutral"

# Collapse to one row per song id, keeping the highest-priority mood, and
# expose the result in a column named "mood"
mood_tags = (
    mood_tags
    .groupby("id")["mapped_mood"]
    .apply(pick_priority_mood)
    .reset_index()
    .rename(columns={"mapped_mood": "mood"})
)

In [None]:
# %%
# 4. Process listening history data for time of day
# Discard rows that merely repeat the column header (timestamp == "timestamp")
hist = hist[hist["timestamp"].ne("timestamp")]

# Smallest raw timestamp value per song, returned as a (song, timestamp) frame.
# NOTE(review): the minimum is taken before parsing; presumably the values are
# zero-padded strings so that this is also the earliest listen -- confirm.
hist_first = hist.groupby("song", as_index=False)["timestamp"].min()

# Parse the selected timestamps; a value that does not match the format raises
hist_first["dt"] = pd.to_datetime(
    hist_first["timestamp"], format="%Y-%m-%d %H:%M", errors="raise"
)

# Bucket into parts of day
def bucket_tod(hour):
    """Return the part-of-day label for an hour value.

    5 <= hour < 12 is "morning", 12 <= hour < 17 is "afternoon",
    17 <= hour < 21 is "evening"; every other value is "night".
    """
    windows = (
        (5, 12, "morning"),
        (12, 17, "afternoon"),
        (17, 21, "evening"),
    )
    for start, stop, label in windows:
        if start <= hour < stop:
            return label
    return "night"

# Label each song by the hour of its selected timestamp
listen_hour = hist_first["dt"].dt.hour
hist_first["time_of_day"] = listen_hour.map(bucket_tod)

# Only the song id and its label are needed downstream
time_df = hist_first.loc[:, ["song", "time_of_day"]]

In [None]:
# %%
# 5. Map genres to standardized categories
# Function to standardize genres
def standardize_genre(genre_list, genre_map=None, fallback="pop"):
    """Map a song's raw genres to one standardized genre.

    Parameters
    ----------
    genre_list : iterable of str, or a missing value
        Raw genre names for one song. Each name is lowercased and stripped
        before lookup. A non-iterable value (``None``, NaN -- which is what
        ``Series.str.split`` produces for a missing cell) is treated as
        "no genres".
    genre_map : dict, optional
        Lookup from normalized raw genre to standardized genre. Defaults to
        the module-level ``GENRE_MAP``.
    fallback : str, default "pop"
        Value returned when no raw genre has a mapping.

    Returns
    -------
    str
        The mapped value of the first raw genre (in input order) that has a
        non-None entry in ``genre_map``, otherwise ``fallback``.
    """
    if genre_map is None:
        genre_map = GENRE_MAP

    try:
        raw_genres = iter(genre_list)
    except TypeError:
        # Missing value instead of a list: nothing to map
        return fallback

    for raw in raw_genres:
        mapped = genre_map.get(raw.lower().strip())
        # Stop at the first hit; later genres cannot change the result
        if mapped is not None:
            return mapped
    return fallback

# Split each comma-separated genre string into a list, reduce that list to a
# single standardized genre, and keep only the id together with the result
# (exposed as "genre").
genres = genres.assign(
    genre=genres["genres"].str.split(",").apply(standardize_genre)
)[["id", "genre"]]

In [None]:
# %%
# 6. Merge everything together
# Align the join key name of the time-of-day table with the other tables
time_df = time_df.rename(columns={"song": "id"})

# Human-readable "artist - song" label for each id
id_info["id_label"] = id_info["artist"].str.strip() + " - " + id_info["song"].str.strip()

# Left-join every derived table onto the metadata so that each metadata row
# is kept even when a lookup has no match for it
df = (
    meta
    .merge(mood_tags, on="id", how="left")
    .merge(time_df, on="id", how="left")
    .merge(genres, on="id", how="left")
    .merge(id_info[["id", "id_label"]], on="id", how="left")
)

# Rows without a match get the placeholder "unknown" in the derived columns
for column in ("mood", "time_of_day", "genre"):
    df[column] = df[column].fillna("unknown")

In [None]:
# 7. Select and reorder columns, drop rows with an unknown mood, time of day, or genre, then save
out_cols = [
    "id_label","spotify_id","popularity","release","danceability","energy",
    "key","mode","valence","tempo","duration_ms",
    "mood","time_of_day","genre"
]

# Keep a row only when none of the three derived columns is "unknown"
has_all_labels = (
    (df["mood"] != "unknown")
    & (df["time_of_day"] != "unknown")
    & (df["genre"] != "unknown")
)
filtered_df = df.loc[has_all_labels, out_cols]

# Save the filtered dataset
filtered_df.to_csv(OUT_CSV, index=False)

# NOTE(review): the rename happens after saving, so the CSV keeps the column
# name "id_label" while the in-memory frame uses "song" -- confirm this is
# the intended naming for consumers of the file.
filtered_df = filtered_df.rename(columns={"id_label": "song"})

print(f"Enriched dataset saved to {OUT_CSV}")
print(f"Original dataset size: {len(df)} rows")
print(f"Filtered dataset size: {len(filtered_df)} rows")
print(f"Removed {len(df) - len(filtered_df)} rows with unknown mood, time of day, or genre")

In [None]:
# Sanity check: number of songs that received a mood, and how often each
# mood was assigned
mood_counts = mood_tags["mood"].value_counts()
print("Number of songs with mood tags:", len(mood_tags))
print("Mood distribution:")
print(mood_counts)