# Data cleanup and statistics

## Setup

Update dependencies

In [None]:
try:
  import google.colab
  %pip install matplotlib --upgrade
except:
  print('not on Google colab')

Load data

In [None]:
import pandas as pd

# Load the data list: from the mounted Google Drive when running on Colab,
# otherwise from the current working directory.
is_colab = False
try:
    from google.colab import drive
    drive.mount('/content/gdrive')
    df = pd.read_csv('gdrive/MyDrive/valid_data_list.csv')
    is_colab = True
except Exception:
    # Best-effort fallback for any Colab-side failure (package missing, mount
    # refused, file absent on Drive). `Exception` instead of a bare `except`
    # so that KeyboardInterrupt/SystemExit still propagate.
    df = pd.read_csv('valid_data_list.csv')

# 1 when the row has a non-zero CoverArtID, 0 otherwise (vectorised instead of
# a Python-level lambda evaluated once per row).
df['HasCoverArt'] = (df['CoverArtID'] != 0).astype(int)

# Names of the per-genre indicator columns used by the plotting helpers and
# the resampling step.
genres = [
    'African', 'Asian', 'AvantGardeOrExperimental', 'Blues', 'Classical',
    'Country', 'EasyListening', 'Electronic', 'Folk', 'HipHop', 'Jazz',
    'LatinOrCarribean', 'Metal', 'Pop', 'Punk', 'RnBOrSoul', 'Rock',
]
df.head()

## Statistics

Explore the dataset

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def plot_genrecount(data, log=True, group_cover=False):
    """Draw a count plot of the ``GenreCount`` column and show it.

    Each bar is annotated with its height.

    Args:
        data: DataFrame with a ``GenreCount`` column (and a ``HasCoverArt``
            column when ``group_cover`` is set).
        log: Use a logarithmic y axis when truthy, a linear one otherwise.
        group_cover: When truthy, split every bar by ``HasCoverArt``.
    """
    sns.set(rc={'figure.figsize': (16, 6)})
    plt.yscale('log' if log else 'linear')
    plt.xscale('linear')

    hue = 'HasCoverArt' if group_cover else None
    ax = sns.countplot(x='GenreCount', data=data, hue=hue)

    for bar in ax.patches:
        height = bar.get_height()
        # Skip rectangles that are not real bars: NaN heights and zero-width
        # patches (presumably legend proxies that some seaborn releases add to
        # the axes) would otherwise produce stray 'nan'/'0' labels.
        if bar.get_width() == 0 or not np.isfinite(height):
            continue
        ax.annotate('{:.0f}'.format(height), (bar.get_x() + 0.15, height + 1))
    plt.show()

def plot_class(data, log=True):
    """Plot how many rows fall into each value of the ``Class`` column.

    Args:
        data: DataFrame with a ``Class`` column.
        log: Use a logarithmic y axis when truthy, a linear one otherwise.
    """
    sns.set(rc={'figure.figsize': (16, 6)})
    plt.yscale('log' if log else 'linear')
    plt.xscale('linear')

    # Occurrences per class, largest first; named so the y axis keeps the
    # 'Counts' label.
    counts = (
        data['Class']
        .value_counts()
        .sort_values(ascending=False)
        .rename('Counts')
    )
    # A line plot is uglier than bars here, but much faster to draw.
    sns.lineplot(x=counts.index, y=counts, legend=False)
    plt.show()

def plot_genre(data, log=True, mode='default'):
    """Draw a bar plot of per-genre totals and show it.

    The genre columns are taken from the module-level ``genres`` list; each
    bar is annotated with its height.

    Args:
        data: DataFrame holding one column per entry of ``genres`` (plus
            ``HasCoverArt`` for ``mode='cover'``).
        log: Use a logarithmic y axis when truthy, a linear one otherwise.
        mode: ``'cover'`` splits the totals by ``HasCoverArt``;
            ``'neg_count'`` plots, for every genre, the column total next to
            ``len(data)`` minus that total; any other value plots the plain
            column totals.
    """
    sns.set(rc={'figure.figsize': (16, 6)})
    plt.yscale('log' if log else 'linear')
    plt.xscale('linear')
    plt.xticks(rotation=90)

    sums = pd.DataFrame(columns=data.columns)
    if mode == 'cover':
        cover_columns = genres + ['HasCoverArt']
        sums.loc[0] = data.loc[data['HasCoverArt'] == 1, cover_columns].sum()
        sums.loc[1] = data.loc[data['HasCoverArt'] == 0, cover_columns].sum()
        # The summed flag is a row count; clip it back to a 0/1 hue label.
        sums['HasCoverArt'] = sums['HasCoverArt'].clip(upper=1)
        sums = sums[cover_columns].melt(
            id_vars=['HasCoverArt'], var_name='Genre', value_name='Count')
        ax = sns.barplot(x='Genre', y='Count', hue='HasCoverArt', data=sums)
    elif mode == 'neg_count':
        # Use the frame that was passed in; this branch previously read the
        # notebook-global `samples` and silently ignored `data`.
        sums.loc[0] = data[genres].sum()
        sums.loc[1] = len(data.index) - sums.loc[0][genres]
        # Row 0 holds the column totals, row 1 the complementary counts.
        sums['Inverse'] = [0, 1]
        sums = sums[genres + ['Inverse']].melt(
            id_vars=['Inverse'], var_name='Genre', value_name='Count')
        ax = sns.barplot(x='Genre', y='Count', hue='Inverse', data=sums)
    else:
        sums.loc[0] = data[genres].sum()
        sums = sums[genres].melt(var_name='Genre', value_name='Count')
        ax = sns.barplot(x='Genre', y='Count', data=sums)

    for bar in ax.patches:
        height = bar.get_height()
        # Skip rectangles that are not real bars: NaN heights and zero-width
        # patches (presumably legend proxies that some seaborn releases add to
        # the axes) would otherwise produce stray 'nan'/'0' labels.
        if bar.get_width() == 0 or not np.isfinite(height):
            continue
        ax.annotate('{:.0f}'.format(height), (bar.get_x() + 0.15, height + 1))
    plt.show()

def corr_matrix(data):
    """Show an annotated heatmap of the Pearson correlation between genres.

    The columns are taken from the module-level ``genres`` list and the
    coefficients are rounded to two decimals.
    """
    # NOTE: a co-occurrence matrix would describe this data better.
    sns.set(rc={'figure.figsize': (15, 15)})
    pearson = data[genres].corr(method='pearson')
    sns.heatmap(data=pearson.round(2), annot=True)
    plt.show()

In [None]:
# GenreCount distribution of the full dataset, split by cover art: first with
# a logarithmic y axis, then with a linear one.
for log_scale in (True, False):
    plot_genrecount(df, log=log_scale, group_cover=True)

In [None]:
# Per-genre totals of the full dataset, split by cover art: first with a
# logarithmic y axis, then with a linear one.
for log_scale in (True, False):
    plot_genre(df, log=log_scale, mode='cover')

In [None]:
# Class frequencies of the full dataset: first with a logarithmic y axis,
# then with a linear one.
for log_scale in (True, False):
    plot_class(df, log=log_scale)

In [None]:
# Genre correlation heatmap for the full dataset.
corr_matrix(data=df)

## Cleanup

Remove rows that have no cover art or no genres

In [None]:
# Work on an independent copy that keeps only the rows whose HasCoverArt and
# GenreCount are both non-zero; `df` itself is left untouched.
cldf = df.loc[(df['HasCoverArt'] != 0) & (df['GenreCount'] != 0)].copy(deep=True)
cldf.head()

In [None]:
# GenreCount distribution after cleanup: first with a logarithmic y axis,
# then with a linear one.
for log_scale in (True, False):
    plot_genrecount(cldf, log=log_scale, group_cover=False)

In [None]:
# Class frequencies after cleanup: first with a logarithmic y axis, then with
# a linear one.
for log_scale in (True, False):
    plot_class(cldf, log=log_scale)

In [None]:
# Genre correlation heatmap for the cleaned dataset.
corr_matrix(data=cldf)

## Resampling

Get a representative sample: up to 4000 randomly chosen rows per genre, de-duplicated by `CoverArtID`

This is not the best approach, but it's simple and it works... ¯\\_(ツ)_/¯

In [None]:
# Upper bound on the number of rows drawn for each genre.
SAMPLES_PER_GENRE = 4000

# Draw a random batch per genre from the cleaned data. The batches are
# collected in a list and concatenated once, instead of growing a DataFrame
# inside the loop from an empty, object-typed seed frame.
genre_batches = []
for genre in genres:
    in_genre = cldf[cldf[genre] == 1]
    batch = in_genre.sample(n=min(SAMPLES_PER_GENRE, len(in_genre)))
    genre_batches.append(batch.reset_index(drop=True))

samples = pd.concat(genre_batches)

# A row tagged with several genres can be drawn more than once; keep only its
# first occurrence, so later genres may end up with fewer rows than drawn.
samples = samples.drop_duplicates(subset=['CoverArtID'], keep='first')

samples.head()

In [None]:
# GenreCount distribution of the resampled set: first with a logarithmic
# y axis, then with a linear one.
for log_scale in (True, False):
    plot_genrecount(samples, log=log_scale, group_cover=False)

In [None]:
# Per-genre totals versus their complement for the resampled set: first with
# a logarithmic y axis, then with a linear one. The frame being inspected is
# passed explicitly rather than `df`.
for log_scale in (True, False):
    plot_genre(samples, log=log_scale, mode='neg_count')

In [None]:
# NOTE(review): this assignment overrides whatever the loading cell detected,
# so the Google Drive path below is never chosen -- confirm this is intended.
is_colab = False

# Write the resampled rows, without the helper HasCoverArt column, as CSV.
output_path = 'gdrive/MyDrive/output.csv' if is_colab else 'output.csv'
samples.drop(columns=['HasCoverArt']).to_csv(output_path, sep=',')