In [9]:
# Global Jukebox Lullaby Notebook

In [10]:
import pandas as pd
import plotly.express as px
from itertools import combinations
import numpy as np

In [11]:


import pandas as pd
import plotly.express as px
from itertools import combinations
import numpy as np

# Import the Global Jukebox cantometrics data.
# List of URLs to the data files
data_files_list = [
    'https://raw.githubusercontent.com/theglobaljukebox/cantometrics/main/raw/data.csv',
    'https://raw.githubusercontent.com/theglobaljukebox/cantometrics/main/raw/societies.csv',
    'https://raw.githubusercontent.com/theglobaljukebox/cantometrics/main/raw/songs.csv',
    'https://raw.githubusercontent.com/theglobaljukebox/cantometrics/main/etc/codes.csv',
    'https://raw.githubusercontent.com/theglobaljukebox/cantometrics/main/etc/variables.csv',
    'https://raw.githubusercontent.com/theglobaljukebox/cantometrics/main/etc/raw_codes.csv'
]

# Short names for DataFrames, in the same order as data_files_list
short_names = ['canto', 'societies', 'songs', 'codes', 'lines_explained', 'raw_codes']

# The two lists are paired positionally, so they must stay the same length
assert len(data_files_list) == len(short_names), "every URL needs exactly one short name"

NBSP = '\xa0'


def _replace_nbsp(value):
    """Return `value` with non-breaking spaces turned into regular spaces.

    Anything that is not a `str` (NaN, numbers, ...) is returned unchanged, so
    mixed-type columns keep their non-string cells instead of losing them.
    """
    return value.replace(NBSP, ' ') if isinstance(value, str) else value


def load_clean_csv(url):
    """Read the CSV at `url` and normalise non-breaking spaces.

    Non-breaking spaces are replaced in the column names and in every cell of
    the object-dtype columns. Returns the cleaned DataFrame.
    """
    df = pd.read_csv(url)

    # Replace non-breaking spaces in column names with regular spaces
    df.columns = df.columns.str.replace(NBSP, ' ', regex=False)

    # Only object columns can hold strings; clean them cell by cell
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].map(_replace_nbsp)

    return df


# Load every table once, keyed by its short name
frames = {short_name: load_clean_csv(url)
          for url, short_name in zip(data_files_list, short_names)}

# Bind each table to an explicit module-level name (no globals() injection)
canto = frames['canto']
societies = frames['societies']
songs = frames['songs']
codes = frames['codes']
lines_explained = frames['lines_explained']
raw_codes = frames['raw_codes']

# lines explained dictionary of lines and short titles
my_dict = pd.Series(lines_explained.short_title.values, index=lines_explained.id).to_dict()
my_dict

{'line_1': 'Social Org Vocal',
 'line_2': 'Social Org Voc/Orch',
 'line_3': 'Social Org Orch',
 'line_4': 'Musical Org Vocal',
 'line_5': 'Tonal Blend Vocal',
 'line_6': 'Rhythm Blend Vocal',
 'line_7': 'Music Orch Org',
 'line_8': 'Tonal Blend Orch',
 'line_9': 'Rhythm Blend Orch',
 'line_10': 'Text Repetition',
 'line_11': 'Overall Voc Rhythm',
 'line_12': "Rhythm Rel'n Vocal",
 'line_13': 'Overall Orch Rhythm',
 'line_14': "Rythm Rel'n Orch",
 'line_15': 'Melodic Shape',
 'line_16': 'Melodic Form',
 'line_17': 'Phrase Length',
 'line_18': 'No. Phrases',
 'line_19': 'Pos Final Tone',
 'line_20': 'Melodic Range',
 'line_21': 'Interval Width',
 'line_22': 'Polyphonic Type',
 'line_23': 'Embellishment',
 'line_24': 'Tempo',
 'line_25': 'Volume',
 'line_26': 'Vocal Rubato',
 'line_27': 'Orch Rubato',
 'line_28': 'Glissando',
 'line_29': 'Melisma',
 'line_30': 'Tremolo',
 'line_31': 'Glottal',
 'line_32': 'Vocal Register',
 'line_33': 'Vocal Width',
 'line_34': 'Nasality',
 'line_35': 'Ra

## Powers of 2 and Unpacking Canto

In [12]:


# Code to unpack the power-of-2 coding in the canto table.
# A lookup is built that maps every sum of one, two, or three distinct powers
# of 2 (2**1 .. 2**13) back to the exponents that produced it.

# number of leading non-'line' columns in canto (song names, etc.) that are kept as they are
N_ID_COLS = 3

# 2 to the n for all n values from 1 to 13
powers = [2**n for n in range(1, 14)]

# make a list of all combinations of the previous, for 1, 2, and 3 numbers
combo_list = [combo for size in (1, 2, 3) for combo in combinations(powers, size)]

# one record per combination: its sum and the powers that were added
sums = [{"sum": sum(t), "full_tuple": t} for t in combo_list]

# as a df
sums_df = pd.DataFrame(sums)

# turn each power back into its exponent, largest first;
# float() keeps plain Python floats in the tuples so they display as (6.0,)
sums_df['sorted_original_values'] = sums_df.full_tuple.apply(
    lambda x: tuple(sorted([float(np.log2(value)) for value in x], reverse=True))
)

# sort_values returns a new frame, so the result has to be assigned to take effect
sums_df = sums_df.sort_values(by="sum")

# create a dictionary that maps the summed values to their original meanings:
dictionary_of_value_sets = dict(zip(sums_df["sum"], sums_df["sorted_original_values"]))

# create dictionary of lines and short title meanings
short_title_dict = pd.Series(lines_explained.short_title.values, index=lines_explained.id).to_dict()

# unpack the sums for all 'line' columns, using dict of value sets created above.
# NOTE: a value that is not a sum of one to three of these powers falls back to
# the scalar 0 (not a tuple).
canto_transformed_features = canto.iloc[:, N_ID_COLS:].map(
    lambda x: dictionary_of_value_sets.get(x, 0)
)

# put the transformed columns back in place with the song names, etc.,
# and rename the columns with the short_title dictionary built in this cell
canto_renamed = pd.concat(
    [canto.iloc[:, :N_ID_COLS], canto_transformed_features], axis="columns"
).rename(columns=short_title_dict)
canto_unpacked = canto_renamed
canto_unpacked.head(3)


Unnamed: 0,song_id,Preferred_name,society_id,Social Org Vocal,Social Org Voc/Orch,Social Org Orch,Musical Org Vocal,Tonal Blend Vocal,Rhythm Blend Vocal,Music Orch Org,...,Glissando,Melisma,Tremolo,Glottal,Vocal Register,Vocal Width,Nasality,Rasp,Accent,Enunciation
0,4241,'Are'are,10000,"(6.0,)","(1.0,)","(1.0,)","(13.0,)","(10.0,)","(10.0,)","(1.0,)",...,"(9.0,)","(7.0,)","(4.0,)","(13.0,)","(10.0,)","(10.0,)","(4.0,)","(7.0,)","(10.0,)","(7.0,)"
1,4246,'Are'are,10000,"(6.0,)","(12.0,)","(13.0,)","(7.0,)","(13.0,)","(13.0,)","(13.0,)",...,"(13.0,)","(1.0,)","(13.0,)","(7.0,)","(10.0,)","(13.0,)","(4.0,)","(10.0,)","(10.0,)","(13.0,)"
2,30075,'Are'are,10000,"(13.0, 4.0)","(1.0,)","(1.0,)","(13.0,)","(13.0,)","(10.0,)","(1.0,)",...,"(9.0,)","(1.0,)","(10.0,)","(13.0,)","(7.0, 4.0)","(10.0,)","(13.0,)","(13.0,)","(13.0,)","(13.0,)"


## Select Song Data Columns

In [13]:

# Keep only the song-metadata columns used in the rest of the notebook
selected_song_cols = [
    'song_id', 'Genre', 'Performers', 'Instruments', 'Vocalist_gender', 'Year',
    'society_id', 'Region', 'Division', 'Subregion', 'Area',
    'Local_latitude', 'Local_longitude', 'Preferred_name', 'Society_location',
]

# independent copy, so later edits never touch the full songs table
songs_some_cols = songs.loc[:, selected_song_cols].copy()
songs_some_cols.head(3)


Unnamed: 0,song_id,Genre,Performers,Instruments,Vocalist_gender,Year,society_id,Region,Division,Subregion,Area,Local_latitude,Local_longitude,Preferred_name,Society_location
0,30119,Responsorial Song; Call & Response,"Daudi Mutekete, male chorus, drum",Male Voice; Male Voices; Drum,Men,1950,10027,Africa,East Africa,Great Lakes Africa,"Luhyaland, W Kenya",0.5,34.58,Abaluhya,"Western Province, Kenya"
1,30117,Responsorial Song; Call & Response,"Jeremiah Mukandal, Wanga musician",Male Voice,Men,1950,10027,Africa,East Africa,Great Lakes Africa,"Luhyaland, W Kenya",0.5,34.58,Abaluhya,"Western Province, Kenya"
2,30115,Responsorial Song; Call & Response,"Salome Nolega, group of children",Female Voice; Children's Voices,Mixed children; Women,1950,10027,Africa,East Africa,Great Lakes Africa,"Luhyaland, W Kenya",0.5,34.58,Abaluhya,"Western Province, Kenya"


## Explode Genres for Study

In [14]:
# 'Genre' as the example column: list its distinct raw values before splitting them
songs_some_cols = songs.loc[:, selected_song_cols].copy()
songs_some_cols['Genre'].drop_duplicates().tolist()

# selected output

['Responsorial Song; Call & Response',
 'Dance Song',
 'Ceremonial Song; Song For Royalty',
 'Spirit Song; Dance Song; Cult Song',
 "Boys' Song; Adolescents' Song",
 'Funeral Song; Mourning Song',
 "Wedding Song; Girls' Song; Responsorial Song",
 'Chant; Song For Royalty',
 "Men's Song; Song For Royalty"]

# work on a fresh copy so the songs table itself is never modified
songs_some_cols = songs.loc[:, selected_song_cols].copy()

# turn each ';'-separated genre string into a list of genre strings
songs_some_cols['Genre'] = songs_some_cols['Genre'].str.split(';')

# one row per (song, genre): explode the lists, trim the leftover
# leading/trailing spaces, then replace missing values with ''
songs_exploded = (
    songs_some_cols
    .explode('Genre')
    .assign(Genre=lambda frame: frame['Genre'].str.strip())
    .fillna('')
)
songs_exploded.head(3)


Unnamed: 0,song_id,Genre,Performers,Instruments,Vocalist_gender,Year,society_id,Region,Division,Subregion,Area,Local_latitude,Local_longitude,Preferred_name,Society_location
0,30119,Responsorial Song,"Daudi Mutekete, male chorus, drum",Male Voice; Male Voices; Drum,Men,1950,10027,Africa,East Africa,Great Lakes Africa,"Luhyaland, W Kenya",0.5,34.58,Abaluhya,"Western Province, Kenya"
0,30119,Call & Response,"Daudi Mutekete, male chorus, drum",Male Voice; Male Voices; Drum,Men,1950,10027,Africa,East Africa,Great Lakes Africa,"Luhyaland, W Kenya",0.5,34.58,Abaluhya,"Western Province, Kenya"
1,30117,Responsorial Song,"Jeremiah Mukandal, Wanga musician",Male Voice,Men,1950,10027,Africa,East Africa,Great Lakes Africa,"Luhyaland, W Kenya",0.5,34.58,Abaluhya,"Western Province, Kenya"


## Lullaby Demo

In [15]:
# get selected features

# copy to safeguard data
canto_selected_features = canto.copy()

# song_id is number, so convert to string for matching
canto_selected_features['song_id'] = canto_selected_features['song_id'].astype('str')

# dict to rename columns with more useful names
lullaby_name_dict = {'line_1': 'Social_Org_Group',
                     'line_10': 'Repetition',
                     'line_11': 'Vocal_Rhythm',
                     'line_16': 'Melodic_Form',
                     'line_18': 'Number_Phrases',
                     'line_20': 'Melodic_Range',
                     'line_24': 'Tempo',
                     'line_25': 'Volume',
                     'line_26': 'Vocal_Rubato',
                     'line_28': 'Glissando'}

# rename cols
canto_selected_features = canto_selected_features.rename(columns=lullaby_name_dict)

# Now we select only the columns (lines) that Anna suggests are relevant to the Lullaby Project.
# Columns are picked by name, not by position, so a change in the column order
# of the source CSV raises a KeyError instead of silently selecting other lines.
lullaby_id_cols = ['song_id', 'Preferred_name', 'society_id']
lullaby_feature_cols = ['Social_Org_Group',
                        'Repetition',
                        'Vocal_Rhythm',
                        'Number_Phrases',
                        'Melodic_Range',
                        'Tempo',
                        'Volume',
                        'Vocal_Rubato',
                        'Glissando']
# NOTE(review): 'Melodic_Form' (line_16) is renamed above but is not part of the
# selection, matching the previous positional selection — confirm whether it
# should be included.
canto_selected_features = canto_selected_features[lullaby_id_cols + lullaby_feature_cols]

canto_selected_features.head(3)

Unnamed: 0,song_id,Preferred_name,society_id,Social_Org_Group,Repetition,Vocal_Rhythm,Number_Phrases,Melodic_Range,Tempo,Volume,Vocal_Rubato,Glissando
0,4241,'Are'are,10000,64,16,2048,512,1024,512,2,512,512
1,4246,'Are'are,10000,64,8192,512,8192,128,32,16,512,8192
2,30075,'Are'are,10000,8208,1024,64,2048,128,512,2,8192,512


In [16]:
# unpack powers of 2 for the selected lullaby features

# number of leading non-feature columns in canto_selected_features that are kept as they are
N_ID_COLS = 3

# 2 to the n for all n values from 1 to 13
powers = [2**n for n in range(1, 14)]
# make a list of all combinations of the previous, for 1, 2, and 3 numbers
combo_list = [combo for size in (1, 2, 3) for combo in combinations(powers, size)]
# one record per combination: its sum and the powers that were added
sums = [{"sum": sum(t), "full_tuple": t} for t in combo_list]
# as a df
sums_df = pd.DataFrame(sums)
# turn each power back into its exponent, largest first;
# float() keeps plain Python floats in the tuples so they display as (6.0,)
sums_df['sorted_original_values'] = sums_df.full_tuple.apply(
    lambda x: tuple(sorted([float(np.log2(value)) for value in x], reverse=True))
)
# sort_values returns a new frame, so the result has to be assigned to take effect
sums_df = sums_df.sort_values(by="sum")
# create a dictionary that maps the summed values to their original meanings:
dictionary_of_value_sets = dict(zip(sums_df["sum"], sums_df["sorted_original_values"]))
my_dict = pd.Series(lines_explained.short_title.values, index=lines_explained.id).to_dict()

# unpack every feature column; a value that is not a sum of one to three of
# these powers falls back to the scalar 0 (not a tuple)
canto_transformed_features = canto_selected_features.iloc[:, N_ID_COLS:].map(
    lambda x: dictionary_of_value_sets.get(x, 0)
)
canto_unpacked = pd.concat(
    [canto_selected_features.iloc[:, :N_ID_COLS], canto_transformed_features],
    axis="columns",
)
# only columns still carrying a 'line_N' id are affected by this rename
canto_unpacked = canto_unpacked.rename(columns=my_dict)
# fix song_id as string
canto_unpacked['song_id'] = canto_unpacked['song_id'].astype('str')

canto_unpacked.head(3)


Unnamed: 0,song_id,Preferred_name,society_id,Social_Org_Group,Repetition,Vocal_Rhythm,Number_Phrases,Melodic_Range,Tempo,Volume,Vocal_Rubato,Glissando
0,4241,'Are'are,10000,"(6.0,)","(4.0,)","(11.0,)","(9.0,)","(10.0,)","(9.0,)","(1.0,)","(9.0,)","(9.0,)"
1,4246,'Are'are,10000,"(6.0,)","(13.0,)","(9.0,)","(13.0,)","(7.0,)","(5.0,)","(4.0,)","(9.0,)","(13.0,)"
2,30075,'Are'are,10000,"(13.0, 4.0)","(10.0,)","(6.0,)","(11.0,)","(7.0,)","(9.0,)","(1.0,)","(13.0,)","(9.0,)"


In [17]:
# Find Lullabies from the Song Table, and Collect Metadata

# a shortlist of columns to use from songs table
selected_song_cols = ['song_id',
                      'Genre',
                      'Performers',
                      'Instruments',
                      'Vocalist_gender',
                      'Year',
                      'society_id',
                      'Region',
                      'Division',
                      'Subregion',
                      'Area',
                      'Local_latitude',
                      'Local_longitude',
                      'Preferred_name',
                      'Society_location']

# slicing out just the relevant columns
songs_some_cols = songs[selected_song_cols].copy()

# getting just the lullabies: a literal, case-sensitive substring match on 'Genre'.
# na=False makes rows without a genre count as "not a lullaby", so the mask is
# purely boolean and no separate notna() check is needed.
is_lullaby = songs_some_cols['Genre'].str.contains("Lullaby", na=False, regex=False)
lullabies_metadata = songs_some_cols[is_lullaby]
lullabies_metadata.head(3)


Unnamed: 0,song_id,Genre,Performers,Instruments,Vocalist_gender,Year,society_id,Region,Division,Subregion,Area,Local_latitude,Local_longitude,Preferred_name,Society_location
139,1914,Lullaby,"Female solo, group of children",Female Voice; Children's Voices,Mixed children; Women,1954,11793,Africa,East Africa,Great Lakes Africa,"Busoga Kingdom, E Uganda",1.04,33.48,Basoga,"Bulamogi, Kaliro District, Busoga, Eastern Reg..."
328,420,Lullaby,Female solo,Female Voice,Women,1959,15577,Africa,Central Africa,N W Central Africa,N W Cameroon,6.09,10.3,Fut,"Bamenda, Mezam, Northwest Region, Cameroon"
329,2565,Lullaby,Female solo,Female Voice,Women,1959,15577,Africa,Central Africa,N W Central Africa,N W Cameroon,6.09,10.3,Fut,"Bamenda, Mezam, Northwest Region, Cameroon"


In [18]:
# Attach the unpacked feature columns to each lullaby's context data.
# A left join on song_id keeps every lullaby row; missing values in the
# result are then replaced by ''.
lullabies_final = (
    lullabies_metadata
    .merge(canto_unpacked, how='left', on='song_id')
    .fillna('')
)
lullabies_final.head(3)



Unnamed: 0,song_id,Genre,Performers,Instruments,Vocalist_gender,Year,society_id_x,Region,Division,Subregion,...,society_id_y,Social_Org_Group,Repetition,Vocal_Rhythm,Number_Phrases,Melodic_Range,Tempo,Volume,Vocal_Rubato,Glissando
0,1914,Lullaby,"Female solo, group of children",Female Voice; Children's Voices,Mixed children; Women,1954,11793,Africa,East Africa,Great Lakes Africa,...,11793,"(8.0,)","(10.0,)","(11.0,)","(13.0,)","(13.0,)","(9.0,)","(7.0,)","(13.0,)","(5.0,)"
1,420,Lullaby,Female solo,Female Voice,Women,1959,15577,Africa,Central Africa,N W Central Africa,...,15577,"(2.0,)","(4.0,)","(6.0,)","(3.0,)","(7.0,)","(5.0,)","(4.0,)","(13.0,)","(5.0,)"
2,2565,Lullaby,Female solo,Female Voice,Women,1959,15577,Africa,Central Africa,N W Central Africa,...,15577,"(2.0,)","(10.0,)","(6.0,)","(1.0,)","(7.0,)","(9.0,)","(4.0,)","(13.0,)","(9.0,)"


In [19]:
# Count the songs for every (Region, Melodic_Range) pair
grouped = lullabies_final.groupby(['Region', 'Melodic_Range'])['song_id'].count()

# flatten the grouped index back into columns and name the count column
regional_lullabies = grouped.reset_index(name='Song_Count')
regional_lullabies



Unnamed: 0,Region,Melodic_Range,Song_Count
0,Africa,"(4.0,)",1
1,Africa,"(7.0,)",2
2,Africa,"(10.0,)",2
3,Africa,"(13.0,)",1
4,Australia,"(7.0,)",1
5,Central America,"(4.0,)",5
6,Central America,"(7.0,)",9
7,Central America,"(10.0,)",1
8,Central Asia,"(7.0,)",1
9,East Asia,"(7.0,)",1


In [22]:
# copy and prepare data
def _first_code(codes):
    """Return the first entry of a non-empty code tuple, otherwise NaN.

    Values that are not tuples (for example '' or 0) carry no code to plot and
    are turned into NaN instead of raising on `codes[0]`.
    """
    if isinstance(codes, tuple) and codes:
        return codes[0]
    return np.nan


regional_lullabies_plot_data = regional_lullabies.copy()
regional_lullabies_plot_data['Melodic_Range'] = (
    regional_lullabies_plot_data['Melodic_Range'].apply(_first_code)
)
# rows without a usable code cannot be coloured or ordered, so leave them out of the plot
regional_lullabies_plot_data = regional_lullabies_plot_data.dropna(subset=['Melodic_Range'])

# plot
# NOTE(review): Melodic_Range is numeric here; confirm whether plotly applies
# category_orders to a numeric color column or whether the values should be
# converted to strings for discrete colors.
fig = px.bar(regional_lullabies_plot_data, x="Region", y="Song_Count", color="Melodic_Range",
             category_orders={"Melodic_Range": sorted(set(regional_lullabies_plot_data['Melodic_Range']))},
             labels={'Melodic_Range': 'Melodic Range'},
             title="Comparison of Melodic Range Ratings for Lullabies across Regions")

# Show the figure
fig.show()