# Produce the tables for the README.md of PMQD

This notebook requires `numpy`, `pandas`, `tabulate` and `gcsfs` (used by pandas to read the dataset from Google Cloud Storage). Run the cell below to install the requirements.

In [None]:
!pip install numpy pandas tabulate gcsfs

In [5]:
import numpy as np
import pandas as pd

In [6]:
# Load the PMQD metadata table from the public bucket, indexed by the `id` column.
metadata = pd.read_csv("gs://pmqd/pmqd.csv", index_col="id")
# Preview a few rows; the fixed seed keeps the preview identical across re-runs.
metadata.sample(n=5, random_state=0)

Unnamed: 0_level_0,genre,artist,title,degradation_type,degradation_intensity,rating,sample_start,sample_filename
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
459,Acoustic,Emily Rubye,Wish You Were Here (Scaled Down Version),limiter,45.128028,4.0,118,897cf1c0361a4eb3a84486b16944d2e7.wav
920,Classical,Traditional,Mendelson On Wings Of Song,noise,47.615986,3.0,11,f2f260f71ee64405b7b3439894a91743.wav
47,Pop,Happy Republic,Illuminate The Night (Instrumental Version),original,0.0,4.0,68,6f4f49977a6541ffa2492480aceed778.wav
877,Hip Hop,Dylan Sitts,Rotation,noise,34.263129,4.0,21,04ef854f20944d49bd6041fd4fb0ce49.wav
732,Country,Victor Olsson,On The Farm 2,lowpass,0.732528,4.0,108,9a616793dcaa4a9e8c2cbe69d30a26e5.wav


In [7]:
# Count the rows with a title in each genre and give the columns README-friendly names.
counts_by_genre = (
    metadata.groupby("genre")["title"]
    .count()
    .reset_index()
    .rename(columns={"genre": "Genre", "title": "Count"})
)

# Add a bold "All" row with the total number of rows in the dataset.
# `DataFrame.append` was removed in pandas 2.0, so the row is added with `pd.concat`.
total_row = pd.DataFrame([{"Genre": "**All**", "Count": f"**{len(metadata)}**"}])
counts_by_genre = pd.concat([counts_by_genre, total_row], ignore_index=True)

# Printed (not displayed) on purpose: the markdown text is pasted into the README.
print(counts_by_genre.to_markdown(index=False))

| Genre               | Count   |
|:--------------------|:--------|
| Acoustic            | 75      |
| Blues               | 75      |
| Classical           | 75      |
| Country             | 75      |
| Electronica & Dance | 75      |
| Funk                | 75      |
| Hip Hop             | 75      |
| Jazz                | 75      |
| Latin               | 75      |
| Pop                 | 75      |
| Reggae              | 75      |
| Rnb & Soul          | 75      |
| Rock                | 75      |
| **All**             | **975** |


In [8]:
# Split the 0-100 degradation-intensity scale into `n` equal-width, left-closed
# bins: [0, 20), [20, 40), ...
# NOTE: because the bins are right-open, an intensity of exactly 100 would fall
# outside the last bin and be excluded from the table.
n = 5
metadata["degradation_intensity_range"] = pd.cut(
    metadata["degradation_intensity"],
    bins=np.linspace(0, 100, n + 1),
    right=False,
)

# Mean rating for every (degradation type, intensity bin) pair, reshaped so that
# bins are rows and degradation types are columns.
# `observed=False` is explicit because the bin column is categorical and the
# default value of `observed` is not the same in every pandas version.
degradation_keys = ["degradation_type", "degradation_intensity_range"]
rating_by_degradation = (
    metadata.groupby(degradation_keys, observed=False)["rating"]
    .mean()
    .reset_index()
    .pivot(
        index="degradation_intensity_range",
        columns="degradation_type",
        values="rating",
    )
)
rating_by_degradation.index.name = "Degradation intensity"
rating_by_degradation.columns = rating_by_degradation.columns.str.capitalize()

# Replace NaN with None for better markdown table formatting: tabulate only
# substitutes `missingval` for None. The cast to object is required because a
# float column cannot hold None (pandas would silently turn it back into NaN).
rating_by_degradation = rating_by_degradation.astype(object).where(
    rating_by_degradation.notna(), None
)

print("Rating by degradation type and intensity\n")
print(rating_by_degradation.to_markdown(floatfmt=".2f", missingval="-"))

Rating by degradation type and intensity

| Degradation intensity   |   Distortion |   Limiter |   Lowpass |   Noise |   Original |
|:------------------------|-------------:|----------:|----------:|--------:|-----------:|
| [0.0, 20.0)             |         3.05 |      4.04 |      3.97 |    3.47 |       4.02 |
| [20.0, 40.0)            |         2.69 |      3.72 |      4.00 |    3.00 |       -    |
| [40.0, 60.0)            |         2.39 |      3.86 |      3.82 |    2.37 |       -    |
| [60.0, 80.0)            |         2.17 |      3.90 |      3.55 |    1.78 |       -    |
| [80.0, 100.0)           |         1.59 |      3.74 |      3.31 |    1.37 |       -    |
