In [1]:
import pandas as pd
import numpy as np
from helpers import *
from constants import *
import re

### Load Data
- remove empty column
- clean column names

In [2]:
# Load the "Campus Radio Charts" sheet of the playlist workbook.
df = pd.read_excel("data/Canadian Radio Playlist_V.21.xlsx", "Campus Radio Charts")
df = df.drop('Unnamed: 24', axis=1)   # Drop empty column
# Clean column names: lowercase, trim, then turn " - ", whitespace runs and hyphens
# into underscores. The pattern is a raw string so that `\s` reaches the regex engine
# unchanged; in a plain literal '\s' is an invalid escape sequence and triggers a warning.
df.columns = [re.sub(r'\s-\s|\s+|-', '_', x.lower().strip()) for x in df.columns]


df.head(1)

  warn(msg)


Unnamed: 0,week_of,station,station_city,station_province,station_latitude,station_longitude,chart_position,artist_name(s),artist_country,artist_home_city,...,label_name,label_type,language_of_music,visible_ethnic_minority,census_race_classification,artist_gender,m_music,a_artist,p_performance,l_lyrics
0,2006-01-10 00:00:00,CJSR,Edmonton,AB,53.55,-113.5,4,Breakestra,US,"Los Angeles, CA",...,Ubiquity,Indie,English,Yes,Mixed Group,Male Group,No,No,No,No


### Strings
- remove extra white spaces
- lowercase
- convert placeholders to nan values

In [3]:

# Normalise every string cell: trim, lowercase, and collapse each whitespace run to a
# single space. Non-string cells are returned unchanged.
# Raw-string pattern avoids the invalid-escape warning of a plain '\s+' literal, and
# isinstance() also catches str subclasses that `type(s) == str` would skip.
df = df.applymap(lambda s: re.sub(r'\s+', ' ', s.strip().lower()) if isinstance(s, str) else s)
# PLACEHOLDERS is supplied by the star-imported project modules; every value it lists
# is replaced with NaN.
df = df.replace(PLACEHOLDERS, np.nan)

### Date Times

- Change '3/2/1010' to '2010-03-02'
- Convert to datetime object

In [4]:
# Rewrite the literal '3/2/1010' entries as the ISO date '2010-03-02' ...
bad_date_rows = df['week_of'] == '3/2/1010'
df.loc[bad_date_rows, 'week_of'] = '2010-03-02'
# ... then parse the whole column into datetime values.
week_of_parsed = pd.to_datetime(df['week_of'])
df['week_of'] = week_of_parsed

### Change home cities with multiple entries to be only the first entry
- example: "Toronto, Canada/Kingston, Jamaica" would become "Toronto, Canada"

In [5]:
# When several home cities are listed separated by '/', keep only the first one.
home_city_parts = df['artist_home_city'].str.split('/')
df['artist_home_city'] = home_city_parts.str.get(0)

### Split 'various' artists into unique groupings
- although the artists are not known, those with the same demographic features and locations are grouped and considered the same

In [6]:
# make_unique is a project helper brought in by the star imports; its implementation is
# not visible in this notebook.
# NOTE(review): presumably it relabels 'artist_name(s)' entries listed in VALUE_EXCEPTIONS
# so that rows sharing the same ARTIST_COLUMNS values become one distinct artist — confirm
# against the helper's source.
df = make_unique(df,'artist_name(s)',VALUE_EXCEPTIONS,ARTIST_COLUMNS)

### Synchronize Data by key identifiers
- 'identifiers' in this case are Artist Name, Station Name, or Album Name. It is assumed that the information corresponding to each identifier should always be the same.
- Remove any records with missing identifiers
- Ensure that each instance of a station, artist, or album always has the same corresponding information

In [7]:
# Keep only rows in which all three key identifiers are present.
identifier_columns = ['artist_name(s)', 'station', 'album_name']
df = df.dropna(subset=identifier_columns)

In [8]:
# (key column, dependent columns) pairs, handed to syncrhonize_data in this exact order.
# syncrhonize_data (sic) is the project helper's actual name.
sync_plan = [
    ('artist_home_city', ['artist_home_latitude', 'artist_home_longitude','artist_country']),
    ('station_city', ['station_latitude', 'station_longitude']),
    ('station', STATION_COLUMNS),
    ('artist_name(s)', ARTIST_COLUMNS),
    ('album_name', ALBUM_COLUMNS),
]
for key_column, dependent_columns in sync_plan:
    df = syncrhonize_data(df, key_column, dependent_columns)

100%|██████████| 334/334 [00:02<00:00, 122.64it/s]
100%|██████████| 22/22 [00:00<00:00, 156.64it/s]
100%|██████████| 25/25 [00:00<00:00, 68.97it/s]
100%|██████████| 2058/2058 [01:04<00:00, 32.10it/s]
100%|██████████| 2673/2673 [00:22<00:00, 120.16it/s]


### Replace nans in string columns with 'unknown'

In [9]:
# Take every non-numeric column and fill its missing values with the literal 'unknown'.
non_numeric = df.select_dtypes(exclude=np.number)
df[non_numeric.columns] = non_numeric.fillna('unknown')

### Give latitude and longitude to locations without them

In [10]:

# Distinct ARTIST_COLUMNS combinations whose home city is not 'unknown' but whose
# latitude is still missing.
city_is_unknown = df['artist_home_city'] == 'unknown'
latitude_missing = df['artist_home_latitude'].isna()
df[~city_is_unknown & latitude_missing][ARTIST_COLUMNS].drop_duplicates()

Unnamed: 0,artist_country,artist_home_city,artist_home_latitude,artist_home_longitude,visible_ethnic_minority,census_race_classification,artist_gender,m_music,a_artist,p_performance,l_lyrics
7526,us,california,,,yes,black,male,no,no,no,no
8593,int,germany,,,no,white,male group,no,no,no,no
14606,int,spain,,,no,white,male,no,no,no,no
18732,int,jamaica,,,yes,black,mixed group,no,no,no,no
23036,int,france,,,no,white,male,no,no,no,no
28442,cdn,"fort mcmurray, ab",,,no,white,male,yes,yes,yes,yes


In [11]:
# (location label, latitude, longitude) triples passed to assign_lat_long, in order.
fallback_coordinates = [
    ('fort mcmurray, ab', 56.72, -111.37),
    ('california', 36.77, -119.41),   # Center of state
    ('germany', 52.52, 13.40),        # Berlin
    ('spain', 40.41, -3.70),          # Madrid
    ('jamaica', 18.01, -76.80),       # Kingston
    ('france', 48.85, 2.35),          # Paris
]
for location, latitude, longitude in fallback_coordinates:
    df = assign_lat_long(df, location, latitude, longitude)

### Gender values
- remove group designation from gender and create new column to identify groups from solo artists
- fit all values into 'male', 'mixed', 'female', 'unknown'

In [12]:
# Frequency of each artist_gender value before recoding (dropna=False keeps NaN visible).
df['artist_gender'].value_counts(dropna=False)

male            20180
male group      11765
mixed group      2214
female           1316
unknown           285
female group       97
no                  1
Name: artist_gender, dtype: int64

In [13]:
# Record group membership first, while the labels still carry the 'group' suffix.
df['artist_is_group'] = df['artist_gender'].str.contains('group')
# Recode and assign back. Calling replace(..., inplace=True) on df['artist_gender']
# mutates an intermediate Series: pandas 2.x warns about it and under Copy-on-Write
# the change never reaches df.
gender_recode = {
    'male group': 'male',
    'mixed group': 'mixed',
    'female group': 'female',
    'no': 'unknown',   # stray value folded into 'unknown'
}
df['artist_gender'] = df['artist_gender'].replace(gender_recode)
df['artist_gender'].value_counts(dropna=False)

male       31945
mixed       2214
female      1413
unknown      286
Name: artist_gender, dtype: int64

### Visible ethnic minority values
- one instance of "black" is changed to "yes"

In [14]:
# Frequency of each visible_ethnic_minority value before cleanup (NaN included).
df['visible_ethnic_minority'].value_counts(dropna=False)

yes        23965
no         11555
unknown      337
black          1
Name: visible_ethnic_minority, dtype: int64

In [15]:
# Map the stray 'black' entry to 'yes'. The result is assigned back instead of using
# replace(..., inplace=True) on df[...], which acts on an intermediate Series (warns on
# pandas 2.x, has no effect under Copy-on-Write).
df['visible_ethnic_minority'] = df['visible_ethnic_minority'].replace('black', 'yes')
df['visible_ethnic_minority'].value_counts(dropna=False)


yes        23966
no         11555
unknown      337
Name: visible_ethnic_minority, dtype: int64

### Census race classification values
- consolidate similar classifications into more general categories

In [16]:
# Frequency of each census_race_classification value before consolidation (NaN included).
df['census_race_classification'].value_counts(dropna=False)

black              15445
white              11546
mixed group         6692
hispanic             499
asian                406
middle eastern       342
unknown              337
native canadian      160
asian indian         132
unidentified          68
other asian           66
indian asian          48
jewish                38
asian other           24
native american       21
inuit                 19
metis                  7
romany                 3
east asian             3
indian                 1
male                   1
Name: census_race_classification, dtype: int64

In [17]:
# Target category -> raw labels folded into it.
race_groups = {
    'unknown': ['male', 'unidentified'],
    'asian': ['asian other', 'other asian', 'east asian'],
    'indian': ['asian indian', 'indian asian'],
    'native american': ['native canadian', 'native american', 'inuit', 'metis'],
}
# Flatten to a raw-label -> category lookup. No target is itself remapped to a different
# category, so one pass gives the same result as the four sequential replacements.
race_recode = {raw: category for category, raws in race_groups.items() for raw in raws}
# Assign back rather than replace(..., inplace=True) on df[...]: the in-place form mutates
# an intermediate Series (warning on pandas 2.x, no effect under Copy-on-Write).
df['census_race_classification'] = df['census_race_classification'].replace(race_recode)
df['census_race_classification'].value_counts(dropna=False)


black              15445
white              11546
mixed group         6692
hispanic             499
asian                499
unknown              406
middle eastern       342
native american      207
indian               181
jewish                38
romany                 3
Name: census_race_classification, dtype: int64

### Label type
- replace one instance of 'english' with 'unknown'

In [18]:
# Frequency of each label_type value before cleanup (NaN included).
df['label_type'].value_counts(dropna=False)

indie      27413
major       5027
self        3388
unknown       29
english        1
Name: label_type, dtype: int64

In [19]:
# Fold the stray 'english' entry and any NaN into 'unknown'. Assigned back instead of
# replace(..., inplace=True) on df[...], which mutates an intermediate Series (warns on
# pandas 2.x, has no effect under Copy-on-Write).
df['label_type'] = df['label_type'].replace(['english', np.nan], 'unknown')
df['label_type'].value_counts(dropna=False)

indie      27413
major       5027
self        3388
unknown       30
Name: label_type, dtype: int64

### Language of music
- consolidate classifications with multiple languages into 'multiple languages'
- consolidate unknowns

In [20]:
# Frequency of each language_of_music value before consolidation (NaN included).
df['language_of_music'].value_counts(dropna=False)

english           34879
french              610
unknown             186
multi                72
english/arabic       38
spanish              20
english/french       13
other                 9
german                8
creole                5
english/patois        4
english/zulu          4
portuguese            4
basque                3
yes                   1
various               1
punjabi               1
Name: language_of_music, dtype: int64

In [21]:
# Labels that denote more than one language.
multiple_languages = [  'english/arabic',
                        'english/french',
                        'english/patois',
                        'english/zulu',
                        'various',
                        'multi']
# Collapse multi-language labels, then fold 'yes', 'other' and NaN into 'unknown'.
# The result is assigned back: replace(..., inplace=True) on df[...] mutates an
# intermediate Series (warns on pandas 2.x, has no effect under Copy-on-Write).
df['language_of_music'] = (
    df['language_of_music']
    .replace(multiple_languages, 'multiple languages')
    .replace(['yes', 'other', np.nan], 'unknown')
)
df['language_of_music'].value_counts(dropna=False)

english               34879
french                  610
unknown                 196
multiple languages      132
spanish                  20
german                    8
creole                    5
portuguese                4
basque                    3
punjabi                   1
Name: language_of_music, dtype: int64

### Distance

- calculate the haversine distance where it is missing and the required coordinates are available

In [26]:
# Count rows with (True) and without (False) a missing distance before the fill.
df['km_distance_(home_station)'].isna().value_counts()

False    34117
True      1741
Name: km_distance_(home_station), dtype: int64

In [27]:
# Rows whose distance is missing while the artist's home latitude is present.
row_filter = df['km_distance_(home_station)'].isna() & df['artist_home_latitude'].notna()
fillable = df[row_filter]
# haversine is a project helper; it is called with station coordinates first, then the
# artist's home coordinates.
# NOTE(review): the recorded outputs before and after this cell both show 1741 missing
# distances — confirm that this fill actually matches any rows.
df.loc[row_filter, 'km_distance_(home_station)'] = haversine(
    fillable['station_latitude'],
    fillable['station_longitude'],
    fillable['artist_home_latitude'],
    fillable['artist_home_longitude'],
)

In [28]:
# Re-count missing distances after the fill, for comparison with the count taken before it.
df['km_distance_(home_station)'].isna().value_counts()

False    34117
True      1741
Name: km_distance_(home_station), dtype: int64

## Add canadian-content status (met by having at least 2 MAPL ratings)

In [29]:
# A row counts as Canadian content when at least two of its four MAPL flags are 'yes'.
mapl_columns = ['m_music','a_artist','l_lyrics','p_performance']
yes_per_row = df[mapl_columns].eq('yes').sum(axis=1)
df['canadian_content'] = yes_per_row >= 2

### Save data

In [30]:
# Write the cleaned frame to CSV. No index argument is passed, so to_csv's default
# applies and the row index is written as the first column.
df.to_csv('data/clean_data.csv')