In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics.pairwise import haversine_distances

In [2]:
df = pd.read_excel("Canadian Radio Playlist_V.21.xlsx", "Campus Radio Charts")
df.head(1)

  warn(msg)


Unnamed: 0,WEEK OF,STATION,STATION CITY,STATION PROVINCE,STATION LATITUDE,STATION LONGITUDE,CHART POSITION,ARTIST NAME(S),ARTIST COUNTRY,ARTIST HOME CITY,...,LABEL TYPE,LANGUAGE OF MUSIC,VISIBLE ETHNIC MINORITY,CENSUS RACE CLASSIFICATION,ARTIST GENDER,M-MUSIC,A-ARTIST,P-PERFORMANCE,L-LYRICS,Unnamed: 24
0,2006-01-10 00:00:00,CJSR,Edmonton,AB,53.55,-113.5,4,Breakestra,US,"Los Angeles, CA",...,Indie,English,Yes,Mixed Group,Male Group,No,No,No,No,


In [3]:
df.drop('Unnamed: 24', axis=1, inplace=True)

# Helpers

In [4]:
def replace_nans(key:str, columns:list) -> None:
    """Make ``columns`` consistent within every ``key`` group of the global ``df`` that has gaps.

    The working assumption is that all rows sharing a value in ``key`` should
    carry the same values in ``columns``.

    For each distinct ``key`` value that has at least one row with a missing
    value in any of ``columns``, every listed column is set on ALL rows of
    that key value (not only the rows with gaps) to the column's most
    frequent non-null value within the group. When a column has no non-null
    value in the group it is set to NaN.

    Parameters
    ----------
    key : str
        Name of the column whose values define the groups.
    columns : list
        Names of the columns to normalise within each group.

    Returns
    -------
    None
        The notebook-level ``df`` is modified in place.
    """
    # Rows with a missing value in at least one of the target columns.
    has_gap = df[columns].isna().any(axis=1)

    # The key values are collected once, before any row is modified.
    for key_value in df.loc[has_gap, key].unique():
        key_rows = df[key] == key_value
        group = df.loc[key_rows, columns]

        # One replacement value per column, in the same order as ``columns``.
        values = []
        for column in columns:
            mode = group[column].mode()
            # mode() is empty when the group has no non-null value for this column.
            values.append(mode.iloc[0] if mode.shape[0] > 0 else np.nan)

        df.loc[key_rows, columns] = values

In [5]:
def display_unique_locations(artist:str) -> pd.DataFrame:
    """Tabulate the distinct home coordinates recorded for ``artist`` in the global ``df``.

    Returns one row per (name, latitude, longitude) combination with a
    ``COUNT`` column holding the number of records at that combination.
    Records with a missing latitude or longitude do not appear, because
    ``groupby`` drops missing keys by default.
    """
    group_columns = ['ARTIST NAME(S)','ARTIST HOME LATITUDE', 'ARTIST HOME LONGITUDE']
    artist_rows = df[df['ARTIST NAME(S)'] == artist][group_columns]
    # A constant, never-null helper column so that count() tallies the rows of each group.
    artist_rows = artist_rows.assign(COUNT='')
    return artist_rows.groupby(group_columns).count().reset_index()


In [6]:
def replace_artist_location(artist:str, location:list) ->pd.DataFrame:
    """Overwrite the home coordinates of every record of ``artist`` in the global ``df``.

    ``location`` is ``[latitude, longitude]``. The return value is the
    artist's location summary produced by ``display_unique_locations`` after
    the update.
    """
    artist_rows = df['ARTIST NAME(S)'] == artist
    coordinate_columns = ['ARTIST HOME LATITUDE','ARTIST HOME LONGITUDE']
    df.loc[artist_rows, coordinate_columns] = location
    return display_unique_locations(artist)

In [7]:
def haversine(row:pd.Series) -> float:
    """Great-circle (haversine) distance in kilometres between two points.

    Parameters
    ----------
    row : pd.Series
        Four values read BY POSITION, all in degrees: latitude and longitude
        of the first point, then latitude and longitude of the second point.
        Each value must be convertible with ``float``.

    Returns
    -------
    float
        Distance in kilometres on a sphere of radius 6371 km, or NaN when any
        of the four coordinates is NaN.
    """
    earth_radius_km = 6371

    # .iloc makes the positional access explicit; plain row[0] on a Series with
    # string labels relies on a deprecated positional fallback.
    lat1, lon1, lat2, lon2 = (math.radians(float(row.iloc[i])) for i in range(4))

    # Haversine term, computed directly instead of building a 2x2 pairwise
    # distance matrix for a single pair of points.
    h = (
        math.sin((lat2 - lat1) / 2) ** 2
        + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2
    )
    if math.isnan(h):
        return math.nan

    # Guard sqrt/asin against rounding that pushes h marginally outside [0, 1].
    h = min(1.0, max(0.0, h))
    return 2 * earth_radius_km * math.asin(math.sqrt(h))


In [8]:
def is_can_con(row:pd.Series) -> str:
    """Return the Canadian Content status for one row of M, A, P, L ratings.

    The result is 'yes' when "yes" occurs at least twice across the row's
    string values, otherwise 'no'.
    """
    yes_total = row.str.count("yes").sum()
    return 'yes' if yes_total >= 2 else 'no'

# Strings
- removed white spaces
- lowercase

In [11]:
# Column names
# Trim each header and collapse internal runs of whitespace to a single space.
# The pattern is a raw string so that "\s" reaches the regex engine instead of
# being parsed as an (invalid) string escape.
df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)

In [12]:
# Data
# Every column that is neither numeric nor datetime is treated as text.
string_columns = df.select_dtypes(exclude=[np.number, np.datetime64]).columns

for column in string_columns:
    df[column] = (
        df[column]
        .astype(str)                            # missing values become the literal 'nan' here
        .str.strip()
        .str.replace(r'\s+', ' ', regex=True)   # raw string: "\s" is a regex class, not a string escape
        .str.lower()
    )

# Turn the 'nan' strings produced by astype(str) back into real missing values.
df.replace('nan',np.nan, inplace=True)

- placeholders to nan

In [13]:
df.replace(['', '-', '?'], np.nan, inplace=True)

Date Times

In [14]:
df['WEEK OF']

0        2006-01-10 00:00:00
1        2006-01-10 00:00:00
2        2006-01-10 00:00:00
3        2006-01-10 00:00:00
4        2006-01-10 00:00:00
                ...         
35854               3/2/1010
35855               3/2/1010
35856               3/2/1010
35857               3/2/1010
35858               3/2/1010
Name: WEEK OF, Length: 35859, dtype: object

- The week of 2010-03-02 is otherwise missing from the dataset, so `3/2/1010` looks like a typo for that date

In [15]:
# Keep only the first whitespace-separated token of each value (the date part),
# repair the one mistyped week, then parse the column as datetimes.
dates = (
    df['WEEK OF']
    .astype(str)
    .str.split()
    .str[0]
    .replace('3/2/1010', '2010-03-02')
)
df['WEEK OF'] = pd.to_datetime(dates)

In [16]:
df.isna().sum()[df.isna().sum() > 0]

STATION CITY                       1
STATION PROVINCE                   1
STATION LATITUDE                   1
STATION LONGITUDE                 11
CHART POSITION                    25
ARTIST NAME(S)                     1
ARTIST COUNTRY                   395
ARTIST HOME CITY                1746
ARTIST HOME LATITUDE            1872
ARTIST HOME LONGITUDE           1905
KM DISTANCE (HOME - STATION)    1872
ALBUM NAME                         1
LABEL NAME                         1
LABEL TYPE                        33
LANGUAGE OF MUSIC                190
VISIBLE ETHNIC MINORITY          341
CENSUS RACE CLASSIFICATION       340
ARTIST GENDER                    288
M-MUSIC                          188
A-ARTIST                         188
P-PERFORMANCE                    188
L-LYRICS                         191
dtype: int64

# Station
- match null values to existing data

In [17]:
replace_nans('STATION', ['STATION CITY',
                         'STATION PROVINCE',
                         'STATION LATITUDE',
                         'STATION LONGITUDE']  )

In [18]:
df.isna().sum()[df.isna().sum() > 0]

CHART POSITION                    25
ARTIST NAME(S)                     1
ARTIST COUNTRY                   395
ARTIST HOME CITY                1746
ARTIST HOME LATITUDE            1872
ARTIST HOME LONGITUDE           1905
KM DISTANCE (HOME - STATION)    1872
ALBUM NAME                         1
LABEL NAME                         1
LABEL TYPE                        33
LANGUAGE OF MUSIC                190
VISIBLE ETHNIC MINORITY          341
CENSUS RACE CLASSIFICATION       340
ARTIST GENDER                    288
M-MUSIC                          188
A-ARTIST                         188
P-PERFORMANCE                    188
L-LYRICS                         191
dtype: int64

# Artist

- remove record with no data

In [19]:
df[df['ARTIST NAME(S)'].isna()].values

array([[Timestamp('2007-07-17 00:00:00'), 'cfuv', 'victoria', 'bc',
        48.43, -123.35, '10', nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan]], dtype=object)

In [20]:
# Drop row with no artist data
df = df[~df['ARTIST NAME(S)'].isna()]

In [21]:
# One row per (artist, latitude, longitude) combination, with its record count.
duplicate_location_set = (
    df.groupby(['ARTIST NAME(S)','ARTIST HOME LATITUDE','ARTIST HOME LONGITUDE'])['ARTIST NAME(S)']
    .count()
    .reset_index(name='count')
)
# Number of distinct coordinate pairs per artist; keep artists that have more than one.
locations_per_artist = duplicate_location_set['ARTIST NAME(S)'].value_counts()
duplicates = locations_per_artist[locations_per_artist > 1]
duplicates

various                        29
birdman                         2
rhymekeepers                    2
john smith                      2
m.i.a.                          2
the dirty sample                2
j57                             2
justin bieber                   2
roots manuva meets wrongtom     2
alias and tarsier               2
d-sisive                        2
ghostface killah                2
bobby digital (rza)             2
rick ross                       2
Name: ARTIST NAME(S), dtype: int64

- give distinct names to 'various' artists grouped by shared latitude/longitude

In [22]:

# Distinct (latitude, longitude) pairs recorded for 'various'; groupby returns
# them sorted, which fixes the numbering used in the new names.
various_locations = df[df['ARTIST NAME(S)'] == 'various'].groupby(['ARTIST HOME LATITUDE', 'ARTIST HOME LONGITUDE']).count().reset_index()
for i, location in enumerate(various_locations[['ARTIST HOME LATITUDE', 'ARTIST HOME LONGITUDE']].values):
    # Named `at_location` rather than `filter` so the builtin is not rebound at notebook scope.
    at_location = (
        (df['ARTIST NAME(S)'] == 'various')
        & (df['ARTIST HOME LATITUDE'] == location[0])
        & (df['ARTIST HOME LONGITUDE'] == location[1])
    )
    df.loc[at_location, 'ARTIST NAME(S)'] = f'various_{i}'

- 1478 Remaining 'various' records have no location - will keep them for statistical information but exclude them from geographic visualizations.

In [23]:
df[df['ARTIST NAME(S)'] == 'various'].shape

(1478, 24)

- artists with more than one latitude/longitude pair are fixed so that each has only one

In [24]:
# Build every per-artist summary first and concatenate once, instead of
# growing a DataFrame inside the loop.
location_tables = [display_unique_locations(artist) for artist in duplicates.index]
# pd.concat raises on an empty list, so fall back to an empty frame.
duplicate_table = pd.concat(location_tables) if location_tables else pd.DataFrame()
duplicate_table

Unnamed: 0,ARTIST NAME(S),ARTIST HOME LATITUDE,ARTIST HOME LONGITUDE,COUNT
0,birdman,29.95,-90.07,4
1,birdman,29.95,90.07,1
0,rhymekeepers,39.76,-84.19,1
1,rhymekeepers,43.66,-79.42,12
0,john smith,49.88,-97.16,52
1,john smith,49.88,-97.17,2
0,m.i.a.,42.33,-83.04,1
1,m.i.a.,51.5,-0.117,65
0,the dirty sample,49.25,-123.13,56
1,the dirty sample,51.08,-114.08,1


- assign each of these artists a single latitude/longitude

In [25]:
# The single [latitude, longitude] to keep for each artist that had conflicting coordinates.
canonical_locations = {
    'birdman':                      [29.95, -90.07],
    'rhymekeepers':                 [43.66, -79.42],
    'john smith':                   [49.88, -97.16],
    'm.i.a.':                       [51.50, -0.117],
    'the dirty sample':             [49.25, -123.13],
    'j57':                          [40.71, -74],
    'justin bieber':                [43.03, -80.98],
    'roots manuva meets wrongtom':  [51.50, -0.117],
    'alias and tarsier':            [43.66, -70.25],
    'd-sisive':                     [43.66, -79.42],
    'ghostface killah':             [40.71, -74],
    'bobby digital (rza)':          [40.65, -73.95],
    'rick ross':                    [25.94, -80.24],
}

for artist, location in canonical_locations.items():
    corrected = replace_artist_location(artist, location)

# Display the summary returned by the last correction.
corrected

Unnamed: 0,ARTIST NAME(S),ARTIST HOME LATITUDE,ARTIST HOME LONGITUDE,COUNT
0,rick ross,25.94,-80.24,63


- match null values to existing information

In [26]:
replace_nans('ARTIST NAME(S)', ['ARTIST HOME CITY',
                                'ARTIST HOME LATITUDE',
                                'ARTIST HOME LONGITUDE',
                                'VISIBLE ETHNIC MINORITY', 
                                'CENSUS RACE CLASSIFICATION', 
                                'ARTIST GENDER',
                                'M-MUSIC',
                                'A-ARTIST',
                                'P-PERFORMANCE',
                                'L-LYRICS'
                                ]  )

In [27]:
df.isna().sum()[df.isna().sum() > 0]

CHART POSITION                    25
ARTIST COUNTRY                   394
ARTIST HOME CITY                 297
ARTIST HOME LATITUDE             421
ARTIST HOME LONGITUDE           1902
KM DISTANCE (HOME - STATION)    1871
LABEL TYPE                        32
LANGUAGE OF MUSIC                189
VISIBLE ETHNIC MINORITY          211
CENSUS RACE CLASSIFICATION       211
ARTIST GENDER                    159
M-MUSIC                          125
A-ARTIST                         125
P-PERFORMANCE                    125
L-LYRICS                         126
dtype: int64

- replace MAPL nans with 'no'
- replace remaining nans with 'unknown'

In [28]:
df['ARTIST COUNTRY'].unique()

array(['us', 'uk', 'int', 'cdn', 'us/cdn', nan], dtype=object)

In [29]:
df['ARTIST GENDER'].unique()

array(['male group', 'male', 'mixed group', 'female', nan, 'female group',
       'no'], dtype=object)

In [30]:
df['VISIBLE ETHNIC MINORITY'].unique()

array(['yes', 'no', nan, 'black'], dtype=object)

In [31]:
df['CENSUS RACE CLASSIFICATION'].unique()

array(['mixed group', 'white', 'black', 'hispanic', 'asian other',
       'asian', nan, 'native american', 'native canadian',
       'middle eastern', 'unidentified', 'asian indian', 'jewish',
       'romany', 'other asian', 'indian', 'indian asian', 'inuit',
       'metis', 'male', 'east asian'], dtype=object)

In [32]:
df['M-MUSIC'].unique()

array(['no', 'yes', nan], dtype=object)

In [34]:
# Columns with NO existing category suitable for nan replacement
unknown_columns = ['ARTIST COUNTRY', 'ARTIST HOME CITY', 'ARTIST GENDER', 'VISIBLE ETHNIC MINORITY', 'CENSUS RACE CLASSIFICATION']
# Assign the result back to df: calling an inplace method on df[key] acts on an
# intermediate object and does not reliably update df (it never does under Copy-on-Write).
df[unknown_columns] = df[unknown_columns].fillna('unknown')

# Columns with existing category suitable for nan replacement
mapl_columns = ['M-MUSIC', 'A-ARTIST', 'P-PERFORMANCE', 'L-LYRICS']
df[mapl_columns] = df[mapl_columns].fillna('no')





In [35]:
df.isna().sum()[df.isna().sum() > 0]

CHART POSITION                    25
ARTIST HOME LATITUDE             421
ARTIST HOME LONGITUDE           1902
KM DISTANCE (HOME - STATION)    1871
LABEL TYPE                        32
LANGUAGE OF MUSIC                189
dtype: int64

In [36]:
key = 'ARTIST NAME(S)'
columns = ['ARTIST HOME LATITUDE', 'ARTIST HOME LONGITUDE']
df[(df[columns].transpose().isna().any())][key].unique().shape

(58,)

# Album

- match to existing values

In [37]:
replace_nans('ALBUM NAME', ['LANGUAGE OF MUSIC'])

In [38]:
df['LANGUAGE OF MUSIC'].unique()

array(['english', 'french', nan, 'multi', 'spanish', 'portuguese',
       'english/patois', 'basque', 'other', 'english/arabic', 'german',
       'e', 'english/zulu', 'english/french', 'yes', 'various', 'punjabi',
       'creole'], dtype=object)

- replace e with english
- replace yes with other
- replace nan with 'unknown'


In [39]:
# Map the two stray codes to real categories and label missing values.
# The result is assigned back to the column: an inplace replace on
# df['LANGUAGE OF MUSIC'] acts on an intermediate object and does not reliably update df.
df['LANGUAGE OF MUSIC'] = (
    df['LANGUAGE OF MUSIC']
    .replace({'e': 'english', 'yes': 'other'})
    .fillna('unknown')
)

In [40]:
df.isna().sum()[df.isna().sum() > 0]

CHART POSITION                    25
ARTIST HOME LATITUDE             421
ARTIST HOME LONGITUDE           1902
KM DISTANCE (HOME - STATION)    1871
LABEL TYPE                        32
dtype: int64

# Label

- match null values to existing information

In [41]:
replace_nans('LABEL NAME', ['LABEL TYPE'])

- replace 'english' with 'indie'

In [42]:
df['LABEL TYPE'].unique()

array(['indie', 'major', 'self', nan, 'english'], dtype=object)

In [43]:
# 'english' is recoded to 'indie' and missing values are labelled 'unknown'.
# The result is assigned back to the column: an inplace replace on
# df['LABEL TYPE'] acts on an intermediate object and does not reliably update df.
df['LABEL TYPE'] = df['LABEL TYPE'].replace('english', 'indie').fillna('unknown')

In [44]:
df[df['LABEL TYPE']== 'unknown']['LABEL NAME'].unique()

array(['hr', 'carhartt/because', 'homegrown inc.', 'the hip hop company',
       'word supremacy press'], dtype=object)

In [45]:
df.isna().sum()[df.isna().sum() > 0]

CHART POSITION                    25
ARTIST HOME LATITUDE             421
ARTIST HOME LONGITUDE           1902
KM DISTANCE (HOME - STATION)    1871
dtype: int64

# Chart Position

In [46]:
df['CHART POSITION'].unique()

array(['4', '5', '7', '8', '9', '10', '3', '2', '6', '1', nan],
      dtype=object)

In [47]:
df['CHART POSITION'].value_counts(dropna=False).sort_index()

1      3561
10     3585
2      3586
3      3586
4      3585
5      3586
6      3586
7      3586
8      3586
9      3586
NaN      25
Name: CHART POSITION, dtype: int64

In [48]:
nan_filter = df['CHART POSITION'].isna()
artists_with_nan_chart_numbers = df[ nan_filter]['ARTIST NAME(S)']
artists_with_nan_chart_numbers

552         psyche origami
1179             aceyalone
2063       dilated peoples
3048             jay bizzy
3657              ok cobra
10570                  iam
11716       pharoahe monch
14923    jesse dangerously
15204              othello
17819       cadence weapon
18242       guilty simpson
18345           atmosphere
19267           invincible
19833              j'davey
20428               factor
20778         art of fresh
21168        josh martinez
21614         art of fresh
21828          ghettosocks
22696               p.o.s.
23007               k'naan
25087        le klub des 7
26107           various_19
31753            the roots
34058             dj brace
Name: ARTIST NAME(S), dtype: object

- replace missing chart position with average for artist

In [49]:
# Replace missing chart positions with the rounded average position held by the same artist.
# The whole column is converted to numbers first, so it ends up with a single
# integer dtype instead of a mix of strings and ints.
chart_position = pd.to_numeric(df['CHART POSITION'])
artist_average = chart_position.groupby(df['ARTIST NAME(S)']).transform('mean').round()
# Nullable Int64 keeps the column integer-typed even if an artist has no known
# position to average (that entry stays missing instead of raising on the cast).
df['CHART POSITION'] = chart_position.fillna(artist_average).astype('Int64')

# Distance
  - recalculate the haversine distance between station and artist for every record 

In [50]:
# Column order matters: haversine reads the four values by position
# (first point latitude/longitude, then second point latitude/longitude).
coordinate_columns = [
    'STATION LATITUDE',
    'STATION LONGITUDE',
    'ARTIST HOME LATITUDE',
    'ARTIST HOME LONGITUDE',
]
df['KM DISTANCE (HOME - STATION)'] = df[coordinate_columns].apply(haversine, axis=1)



In [51]:
df.isna().sum()[df.isna().sum() > 0]

ARTIST HOME LATITUDE             421
ARTIST HOME LONGITUDE           1902
KM DISTANCE (HOME - STATION)    1902
dtype: int64

## Add canadian-content status (met by having at least 2 MAPL ratings)

In [52]:
df.rename(columns={"M-MUSIC":"M", "A-ARTIST": "A", "P-PERFORMANCE":"P","L-LYRICS":"L"}, inplace = True)


In [53]:
df['CANADIAN CONTENT'] = df[['M','A','L','P']].apply(is_can_con, axis=1)

In [54]:
df.to_csv('clean_data.csv')

In [55]:
df.head(10)

Unnamed: 0,WEEK OF,STATION,STATION CITY,STATION PROVINCE,STATION LATITUDE,STATION LONGITUDE,CHART POSITION,ARTIST NAME(S),ARTIST COUNTRY,ARTIST HOME CITY,...,LABEL TYPE,LANGUAGE OF MUSIC,VISIBLE ETHNIC MINORITY,CENSUS RACE CLASSIFICATION,ARTIST GENDER,M,A,P,L,CANADIAN CONTENT
0,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,4,breakestra,us,"los angeles, ca",...,indie,english,yes,mixed group,male group,no,no,no,no,no
1,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,5,candy's .22,us,"los angeles, ca",...,indie,english,no,white,male group,no,no,no,no,no
2,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,7,dangerdoom,us,"new york, ny",...,indie,english,yes,black,male,no,no,no,no,no
3,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,8,blockhead,us,"new york, ny",...,indie,english,no,white,male,no,no,no,no,no
4,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,9,blackalicious,us,"sacramento, ca",...,indie,english,yes,black,male group,no,no,no,no,no
5,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,10,onry ozzborn,us,"seattle, wa",...,indie,english,yes,hispanic,male,no,no,no,no,no
6,2006-01-10,cjsw,calgary,ab,51.08,-114.08,3,various,us,"san francisco, ca",...,indie,english,yes,mixed group,mixed group,yes,no,no,no,no
7,2006-01-10,cjsw,calgary,ab,51.08,-114.08,5,psyche origami,us,"atlanta, ga",...,indie,english,yes,mixed group,male group,no,no,no,no,no
8,2006-01-10,cjsw,calgary,ab,51.08,-114.08,7,various,us,"san francisco, ca",...,indie,english,yes,mixed group,mixed group,yes,no,no,no,no
9,2006-01-10,cjsw,calgary,ab,51.08,-114.08,8,mike ladd,us,"boston, mass",...,indie,english,yes,black,male group,no,no,no,no,no
