In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
from scipy.stats import pearsonr
from collections import Counter

import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None

# Load and preliminary steps

In [2]:
dataset_genre = pd.read_json("database_genres_classification.json")
dataset_genre = dataset_genre.sort_index(axis=0)

In [3]:
dataset_genre.head(5)

Unnamed: 0,track_id,title_billboard,artist_billboard,title_spotify,artist_spotify,key,time_signature,mode,tempo,loudness,...,energy,valence,genre_list,weeks_list,ranks_list,highest_rank,weeks_on_chart,year,genre_main,main_genre
0,6RkUeQHTyqeqnqIygrqnt7,La Paloma,Billy Vaughn And His Orchestra,La Paloma,Billy Vaughn,0,4,1,108.158,-12.301,...,0.322,0.502,ballroom,[1958-08-08],[92],92.0,1,1958,,other
1,6eCsRg7OBi1zT2CEKKyoXN,Fever,Peggy Lee,Fever - Remastered,Peggy Lee,7,4,1,68.331,-21.305,...,0.0715,0.333,big band,[1958-08-08],[13],13.0,1,1958,,other
2,4FrMb1ckGBrhARbHySQKx5,Devoted To You,The Everly Brothers,Devoted to You,The Everly Brothers,3,4,1,90.523,-9.387,...,0.34,0.526,brill building pop,[1958-08-08],[43],43.0,1,1958,pop,pop
3,2ieO6C3dUH2LajM2wSOJfe,The Wizard,Jimmie Rodgers,The Wizard,Jimmie Rodgers,3,4,0,118.888,-10.72,...,0.596,0.906,cowboy western,[1958-08-08],[85],85.0,1,1958,country,other
4,6MV23t3HuEmNR9NmdSI6Ny,The Morning Side Of The Mountain,Tommy Edwards,The Morning Side Of The Mountain,Tommy Edwards,0,3,1,78.462,-12.623,...,0.299,0.52,deep adult standards,"[1959-03-01, 1959-03-08, 1959-03-15, 1959-03-2...","[93, , 79, , 47, , 32, , 27, , 31, ...",,21,1959,adult standards,other


In [4]:
years_list = []

for ind, item in dataset_genre['weeks_list'].iteritems():
    occurences = [] 
    
    for week in item:
        year = week[:4]
        if not(year in occurences):
            occurences.append(year)
            
    years_list.append(occurences)
    
dataset_genre['years_list'] = years_list

In [5]:
all_years2 = []

for ind, item in dataset_genre['years_list'].iteritems():
    for year in item:
        all_years2.append(year)
        
years2 = pd.Series(all_years2).value_counts().index

In [6]:
years2 = sorted(years2)[1:]

In [7]:
yearly_features2 = {'loudness': [], 'duration_ms': [], 'tempo': [], 'energy': [], 'valence': []}

In [8]:
#Genres we decided to keep
genres = ['pop', 'rock', 'folk', 'jazz', 'country', 'r&b and soul']

# Functions

In [9]:
def make_dataframe_per_genre(feature):
    df_ = pd.DataFrame(columns=genres)
    for genre in genres:
        df = dataset_genre.loc[dataset_genre['main_genre'] == genre]
        feature_per_genre = []
        for year in years2:
            df2 = df.loc[df['year'] == int(year)]
            temp = df2.loc[:, feature]
            if(len(temp) > 0):
                #remove values that represent missing values
                if(feature == 'energy' or feature == 'valence' or feature == 'duration_ms'):
                    a = list(filter(lambda x: x!= -1, temp.tolist()))
                    feature_per_genre.append(a)
                elif(feature == 'tempo'):
                    a = list(filter(lambda x: x!= 0, temp.tolist()))
                    b = list(filter(lambda x: x!= -1, a))
                    feature_per_genre.append(b)
                else:
                    feature_per_genre.append(temp.tolist())
            else:
                feature_per_genre.append([])
        df_[genre] = feature_per_genre
    df_['year'] = years2
    return df_

In [10]:
def mean_confidence_interval(data, confidence = 0.95):
    
    a = np.array(data)
    if(len(a) > 0):
        n = len(a)
        m, se = np.mean(a), scipy.stats.sem(a)
        h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    else:
        return np.nan, np.nan, np.nan
    return m, m-h, m+h

In [11]:
def compute_bounds(features):
    
    lower_bound, mean, upper_bound = [], [], []

    # Compute the 95% confidence interval
    for list_ in features:
        m, lb, ub = mean_confidence_interval(list_)
    
        lower_bound.append(lb)
        mean.append(m)
        upper_bound.append(ub)
        
    return lower_bound, mean, upper_bound

In [12]:
def plot_by_genre(df, title, min_, max_, savefig):
    # libraries and data
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd

 
    # Initialize the figure
    plt.style.use('seaborn-darkgrid')
    plt.figure(figsize=(20, 10))
 
    # create a color palette
    color = 'b'
 
    # multiple line plot
    num=0
    for column in df.drop('year', axis=1):
        num+=1
 
        # Find the right spot on the plot
        plt.subplot(3,3, num)
 
        # Plot the lineplot
        lower_bound, mean, upper_bound = compute_bounds(df[column].tolist())
        plt.fill_between(range(len(mean)), upper_bound, lower_bound, color = color, alpha = .3)
        plt.plot(df['year'], mean, color)

 
        # Not ticks everywhere
        if num in range(4) :
            plt.tick_params(labelbottom='off')
        if num not in [1,4] :
            plt.tick_params(labelleft='off')
            
        years = []
        for y in df['year']:
            if int(y)%5 == 0:
                years.append(y)
            else:
                years.append(' ')
        
        plt.xticks(years, rotation='vertical', fontsize=13)
        
 
        # Add title
        plt.title(column, loc='left', fontsize=15, color=color)
    
    # Same limits for everybody!
    #plt.xlim(1900, 2000)
    #print(mean, max_, min_)
        plt.ylim(min_, max_)
        plt.yticks(fontsize=13)

 
    # general title
    plt.suptitle(title + " change by year", fontsize=20, fontweight=0, color='black', y=0.97)
    #plt.savefig(savefig)

# Continuous features by genre 

In [13]:
df_ = make_dataframe_per_genre('loudness')
plot_by_genre_2(df_, 'Loudness', -20, 0, 'loudness_by_genre.png')

NameError: name 'plot_by_genre_2' is not defined

In [None]:
df_ = make_dataframe_per_genre('duration_ms')
plot_by_genre_2(df_, 'Duration', 100000, 450000, 'duration_by_genre.png')

In [None]:
df_ = make_dataframe_per_genre('tempo')
plot_by_genre_2(df_, 'Tempo', 75, 160, 'tempo_by_genre.png')

In [None]:
df_ = make_dataframe_per_genre('energy')
plot_by_genre_2(df_, 'Energy', 0.2, 1, 'energy_by_genre.png')

In [None]:
df_ = make_dataframe_per_genre('valence')
plot_by_genre_2(df_, 'Valence', 0.3, 1, 'valence_by_genre.png')

# Discrete features by genre

## Mode by genre

In [None]:
feature = 'mode'
df_m = pd.DataFrame(columns=genres)
for genre in genres:
    df = dataset_genre.loc[dataset_genre['main_genre'] == genre]
    feature_per_genre = []
    for year in years2:
        df2 = df.loc[df['year'] == int(year)]
        temp = df2.loc[:, feature]
        if(len(temp) > 0):
            a = list(filter(lambda x: x!= -1, temp.tolist()))
            counter = Counter(a)
            dict_ = dict(sorted(counter.items()))
            if 0 not in dict_.keys():
                dict_.update({0:0})
            if 1 not in dict_.keys():
                dict_.update({1:0})
            dict_ = dict(sorted(dict_.items()))
            feature_per_genre.append(dict_)
        else:
            feature_per_genre.append({})      
    df_m[genre] = feature_per_genre
df_m['year'] = years2

In [None]:
df_m.head(5)

In [None]:
# Initialize the figure
plt.style.use('seaborn-notebook')
plt.figure(figsize=(20, 10))
 
# Color of title for genre
color = 'black'
 
# Multiple line plot
num=0
for column in df_m.drop('year', axis=1):
    num+=1
 
    # Find the right spot on the plot
    plt.subplot(3,3, num)
 
    # Plot the lineplot
    list_of_list = []

    for counter in df_m[column]:
        row = []
        for key, value in counter.items():
            row.append(value)
        list_of_list.append(row)
 
    # We need to transform the data from raw data to percentage (fraction)
    data = pd.DataFrame(data=list_of_list)
    data_perc = data.divide(data.sum(axis=1), axis=0)
    
    
    if num in range(6):
        plt.stackplot(df_m['year'],  data_perc.iloc[:,1], data_perc.iloc[:,0])
    else:
        plt.stackplot(df_m['year'],  data_perc.iloc[:,1], data_perc.iloc[:,0], labels=['major','minor'])
        plt.legend(loc='lower right')

 
        # Not ticks everywhere
    if num in range(4) :
        plt.tick_params(labelbottom='off')
    if num not in [1,4] :
        plt.tick_params(labelleft='off')
            
    years = []
    for y in df_m['year']:
        if int(y)%5 == 0:
            years.append(y)
        else:
            years.append(' ')  
    plt.xticks(years, rotation='vertical', fontsize=14)
        
 
        # Add title
    plt.title(column, loc='left', fontsize=15, color=color)
    
#General title
plt.suptitle("Mode change by year", fontsize=20, fontweight=0, color='black', y=0.97)
#plt.savefig('mode_by_genre.png')

## Time signature by genre

In [None]:
feature = 'time_signature'
df_t = pd.DataFrame(columns=genres)
for genre in genres:
    df = dataset_genre.loc[dataset_genre['main_genre'] == genre]
    feature_per_genre = []
    for year in years2:
        df2 = df.loc[df['year'] == int(year)]
        temp = df2.loc[:, feature]
        if(len(temp) > 0):
            a = list(filter(lambda x: x!= -1, temp.tolist()))
            counter = Counter(a)
            dict_ = dict(sorted(counter.items()))
            if 1 not in dict_.keys():
                dict_.update({1:0})
            if 2 not in dict_.keys():
                dict_.update({2:0})
            if 3 not in dict_.keys():
                dict_.update({3:0})
            if 4 not in dict_.keys():
                dict_.update({4:0})
            if 5 not in dict_.keys():
                dict_.update({5:0})
            dict_ = dict(sorted(dict_.items()))
            feature_per_genre.append(dict_)
        else:
            feature_per_genre.append({})      
    df_t[genre] = feature_per_genre
df_t['year'] = years2

In [None]:
df_t.head(5)

In [None]:
# Initialize the figure
plt.style.use('seaborn-notebook')
plt.figure(figsize=(20, 10))
 
# Color of title for genre
color = 'black'
 
# Multiple line plot
num=0
for column in df_t.drop('year', axis=1):
    num+=1
 
    # Find the right spot on the plot
    plt.subplot(3,3, num)
 
    # Plot the lineplot
    list_of_list = []

    for counter in df_t[column]:
        row = []
        for key, value in counter.items():
            row.append(value)
        list_of_list.append(row)
 
    # We need to transform the data from raw data to percentage (fraction)
    data = pd.DataFrame(data=list_of_list)
    data_perc = data.divide(data.sum(axis=1), axis=0)
    
    
    if num in range(6):
        plt.stackplot(df_t['year'], data_perc.iloc[:,0], data_perc.iloc[:,1], data_perc.iloc[:,2], data_perc.iloc[:,3], data_perc.iloc[:,4])
    else:
        plt.stackplot(df_t['year'],  data_perc.iloc[:,0], data_perc.iloc[:,1], data_perc.iloc[:,2], data_perc.iloc[:,3], data_perc.iloc[:,4], labels=['1/4','2/4', '3/4', '4/4', '5/4'])
        plt.legend(loc='upper right')

 
        # Not ticks everywhere
    if num in range(4) :
        plt.tick_params(labelbottom='off')
    if num not in [1,4] :
        plt.tick_params(labelleft='off')
            
    years = []
    for y in df_t['year']:
        if int(y)%5 == 0:
            years.append(y)
        else:
            years.append(' ')  
    plt.xticks(years, rotation='vertical')
        
 
        # Add title
    plt.title(column, loc='left', fontsize=15, color=color)

#General title
plt.suptitle("Time signature change by year", fontsize=20, fontweight=0, color='black', y=0.97)
#plt.savefig('time_signature_by_year.png')

## Root by genre

In [None]:
feature = 'key'
df_r = pd.DataFrame(columns=genres)
for genre in genres:
    df = dataset_genre.loc[dataset_genre['main_genre'] == genre]
    feature_per_genre = []
    for year in years2:
        df2 = df.loc[df['year'] == int(year)]
        temp = df2.loc[:, feature]
        if(len(temp) > 0):
            a = list(filter(lambda x: x!= -1, temp.tolist()))
            counter = Counter(a)
            dict_ = dict(sorted(counter.items()))
            if 0 not in dict_.keys():
                dict_.update({0:0})
            if 1 not in dict_.keys():
                dict_.update({1:0})
            if 2 not in dict_.keys():
                dict_.update({2:0})
            if 3 not in dict_.keys():
                dict_.update({3:0})
            if 4 not in dict_.keys():
                dict_.update({4:0})
            if 5 not in dict_.keys():
                dict_.update({5:0})
            if 6 not in dict_.keys():
                dict_.update({6:0})
            if 7 not in dict_.keys():
                dict_.update({7:0})
            if 8 not in dict_.keys():
                dict_.update({8:0})
            if 9 not in dict_.keys():
                dict_.update({9:0})
            if 10 not in dict_.keys():
                dict_.update({10:0})
            if 11 not in dict_.keys():
                dict_.update({11:0})
            dict_ = dict(sorted(dict_.items()))
            feature_per_genre.append(dict_)
        else:
            feature_per_genre.append({})      
    df_r[genre] = feature_per_genre
df_r['year'] = years2

In [None]:
df_r.head()

In [None]:
# Initialize the figure
plt.style.use('seaborn-notebook')
plt.figure(figsize=(20, 10))
 
# Color of title for genre
color = 'black'
 
# Multiple line plot
num=0
for column in df_r.drop('year', axis=1):
    num+=1
 
    # Find the right spot on the plot
    plt.subplot(3,3, num)
 
    # Plot the lineplot
    list_of_list = []

    for counter in df_r[column]:
        row = []
        for key, value in counter.items():
            row.append(value)
        list_of_list.append(row)
 
    # We need to transform the data from raw data to percentage (fraction)
    data = pd.DataFrame(data=list_of_list)
    data_perc = data.divide(data.sum(axis=1), axis=0)
    
    
    if num in range(6):
        plt.stackplot(df_r['year'], data_perc.iloc[:,0], data_perc.iloc[:,1], data_perc.iloc[:,2], data_perc.iloc[:,3], data_perc.iloc[:,4], data_perc.iloc[:,5], data_perc.iloc[:,6], data_perc.iloc[:,7], data_perc.iloc[:,8], data_perc.iloc[:,9], data_perc.iloc[:,10], data_perc.iloc[:,11])
    else:
        plt.stackplot(df_r['year'],  data_perc.iloc[:,0], data_perc.iloc[:,1], data_perc.iloc[:,2], data_perc.iloc[:,3], data_perc.iloc[:,4], data_perc.iloc[:,5], data_perc.iloc[:,6], data_perc.iloc[:,7], data_perc.iloc[:,8], data_perc.iloc[:,9], data_perc.iloc[:,10], data_perc.iloc[:,11], labels=['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'])
        plt.legend(loc='lower right')

 
        # Not ticks everywhere
    if num in range(4) :
        plt.tick_params(labelbottom='off')
    if num not in [1,4] :
        plt.tick_params(labelleft='off')
            
    years = []
    for y in df_r['year']:
        if int(y)%5 == 0:
            years.append(y)
        else:
            years.append(' ')  
    plt.xticks(years, rotation='vertical')
        
 
        # Add title
    plt.title(column, loc='left', fontsize=15, color=color)
    
# Same limits for everybody!
#plt.xlim(1900, 2000)
#print(mean, max_, min_)
    #plt.ylim(min_, max_)

#General title
plt.suptitle("Root change by year", fontsize=20, fontweight=0, color='black', y=0.97)
#plt.savefig('root_by_year.png')