# Data cleanup and statistics

## Setup

Install or upgrade dependencies (only when running on Google Colab)

In [None]:
is_colab = False
try:
  import google.colab
  !pip install matplotlib --upgrade
  !pip install iterative-stratification
  is_colab = True
except:
  print('not on Google colab')

Load data

In [None]:
import pandas as pd

# Prefer the CSV stored on Google Drive; if Colab is unavailable or mounting /
# reading from Drive fails for any reason, fall back to a CSV in the working
# directory.
# NOTE(review): the Drive path is relative while the mount point is
# '/content/gdrive', so this relies on the working directory being '/content'
# -- confirm.
try:
    from google.colab import drive
    drive.mount('/content/gdrive')
    df = pd.read_csv('gdrive/MyDrive/valid_data_list.csv')
except Exception:
    # Deliberately broad (best-effort fallback), but unlike a bare `except:`
    # it lets KeyboardInterrupt / SystemExit propagate.
    df = pd.read_csv('valid_data_list.csv')

# Flag column: 1 when CoverArtID is non-zero, 0 otherwise. Computed on the
# whole column at once instead of calling a Python lambda per row.
df['HasCoverArt'] = (df['CoverArtID'] != 0).astype(int)
df.head()

## Statistics

Explore the dataset

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def GenreCountByHasCoverArt(df, logarithmic=True):
    """Plot the number of rows per ``GenreCount`` value, split by ``HasCoverArt``.

    Draws a seaborn count plot on a 24x8 figure, writes each bar's height next
    to its top and shows the figure.

    Args:
        df: DataFrame providing the ``GenreCount`` and ``HasCoverArt`` columns.
        logarithmic: When truthy the y axis uses a log scale, otherwise a
            linear scale.
    """
    sns.set(rc={'figure.figsize': (24, 8)})
    # Configure the current axes' scales before seaborn draws onto them.
    plt.yscale('log' if logarithmic else 'linear')
    plt.xscale('linear')
    ax = sns.countplot(
        x='GenreCount',
        data=df,
        hue='HasCoverArt',
        palette=sns.color_palette('rocket', n_colors=2),
    )
    # Label every bar with its height, slightly offset from the bar's corner.
    for p in ax.patches:
        ax.annotate(f'{p.get_height():.0f}', (p.get_x() + 0.15, p.get_height() + 1))
    plt.show()

def ClassCount(df, logarithmic=True, bins=10000):
    """Plot a histogram of the ``Class`` column and show it.

    Args:
        df: DataFrame providing the ``Class`` column.
        logarithmic: When truthy the y axis uses a log scale, otherwise a
            linear scale.
        bins: Passed to ``sns.histplot`` as ``bins``; defaults to the
            previously hard-coded value of 10000.
    """
    sns.set(rc={'figure.figsize': (24, 8)})
    # Configure the current axes' scales before seaborn draws onto them.
    plt.yscale('log' if logarithmic else 'linear')
    plt.xscale('linear')
    sns.histplot(x='Class', data=df, bins=bins)
    plt.show()

def CountByGenreGroupByHasCoverArt(trimmed_df, logarithmic=True):
    """Plot per-column totals of ``trimmed_df``, split by ``HasCoverArt``.

    The rows are divided into those with ``HasCoverArt`` set and those
    without, every column is summed within each group, and the sums are drawn
    as a grouped bar plot (one bar pair per column, labelled ``Genre``).

    Args:
        trimmed_df: DataFrame with a ``HasCoverArt`` flag column; every other
            column is summed and plotted as one "genre".
        logarithmic: When truthy the y axis uses a log scale, otherwise a
            linear scale.
    """
    sns.set(rc={'figure.figsize': (24, 8)})

    # Use the frame that was passed in, not a module-level `genres` global.
    has_cover_art = trimmed_df['HasCoverArt']
    sums = pd.DataFrame(columns=trimmed_df.columns)
    sums.loc[0] = trimmed_df.loc[has_cover_art == 1].sum()
    sums.loc[1] = trimmed_df.loc[has_cover_art == 0].sum()
    # Summing also summed the flag column itself; clip it back to a 0/1 label.
    sums['HasCoverArt'] = sums['HasCoverArt'].clip(upper=1)
    sums = sums.melt(id_vars=['HasCoverArt'], var_name='Genre', value_name='Count')

    # Configure the current axes before seaborn draws onto them.
    plt.yscale('log' if logarithmic else 'linear')
    plt.xscale('linear')
    plt.xticks(rotation=90)
    ax = sns.barplot(
        x='Genre',
        y='Count',
        hue='HasCoverArt',
        data=sums,
        palette=sns.color_palette('rocket', n_colors=2),
    )
    # Label every bar with its height, slightly offset from the bar's corner.
    for p in ax.patches:
        ax.annotate(f'{p.get_height():.0f}', (p.get_x() + 0.15, p.get_height() + 1))
    plt.show()

def CorrelationGenre(trimmed_df):
    """Show an annotated heatmap of the pairwise Pearson correlations.

    The correlation matrix of ``trimmed_df``'s columns is rounded to two
    decimals and rendered on a 15x15 figure.

    Note: a co-occurrence matrix would arguably be a better fit here.
    """
    pearson = trimmed_df.corr(method='pearson')
    rounded = pearson.round(2)
    sns.set(rc={'figure.figsize': (15, 15)})
    sns.heatmap(annot=True, data=rounded)
    plt.show()

In [None]:
# Genre-count distribution: log-scaled y axis first, then linear.
GenreCountByHasCoverArt(df, logarithmic=True)
GenreCountByHasCoverArt(df, logarithmic=False)

In [None]:
# Class histogram: log-scaled y axis first, then linear.
ClassCount(df, logarithmic=True)
ClassCount(df, logarithmic=False)

In [None]:
# Drop the identifier / bookkeeping columns; what remains is HasCoverArt plus
# (presumably) one column per genre. drop() already returns a new DataFrame,
# so no explicit copy is needed and df itself is left untouched.
genres = df.drop(
    columns=['Class', 'GroupID', 'ReleaseGUID', 'CoverArtID', 'ImageType', 'GenreCount']
)
CountByGenreGroupByHasCoverArt(genres, logarithmic=True)
CountByGenreGroupByHasCoverArt(genres, logarithmic=False)

In [None]:
# Correlation heatmap over the columns kept in `genres`.
CorrelationGenre(trimmed_df=genres)

## Cleanup

Remove rows that have no genres or no cover art

In [None]:
# Remove, in place, every row that lacks a cover art (HasCoverArt == 0) or
# has no genre assigned (GenreCount == 0).
df.drop(
    index=df.index[(df['HasCoverArt'] == 0) | (df['GenreCount'] == 0)],
    inplace=True,
)
df.head()

In [None]:
# Genre-count distribution of the cleaned frame: log scale, then linear.
GenreCountByHasCoverArt(df, logarithmic=True)
GenreCountByHasCoverArt(df, logarithmic=False)

In [None]:
# Class histogram of the cleaned frame: log scale, then linear.
ClassCount(df, logarithmic=True)
ClassCount(df, logarithmic=False)

In [None]:
# Rebuild `genres` from the cleaned frame. drop() already returns a new
# DataFrame, so no explicit copy is needed and df itself is left untouched.
genres = df.drop(
    columns=['Class', 'GroupID', 'ReleaseGUID', 'CoverArtID', 'ImageType', 'GenreCount']
)

In [None]:
# Correlation heatmap over the columns kept in the rebuilt `genres` frame.
CorrelationGenre(trimmed_df=genres)

## Resampling

Get a representative sample