In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics.pairwise import haversine_distances

In [2]:
# Load the "Campus Radio Charts" sheet of the playlist workbook into `df`
df = pd.read_excel(
    "Canadian Radio Playlist_V.21.xlsx",
    sheet_name="Campus Radio Charts",
)
# Show a single row as a quick look at the available columns
df.head(1)

  warn(msg)


Unnamed: 0,WEEK OF,STATION,STATION CITY,STATION PROVINCE,STATION LATITUDE,STATION LONGITUDE,CHART POSITION,ARTIST NAME(S),ARTIST COUNTRY,ARTIST HOME CITY,...,LABEL TYPE,LANGUAGE OF MUSIC,VISIBLE ETHNIC MINORITY,CENSUS RACE CLASSIFICATION,ARTIST GENDER,M-MUSIC,A-ARTIST,P-PERFORMANCE,L-LYRICS,Unnamed: 24
0,2006-01-10 00:00:00,CJSR,Edmonton,AB,53.55,-113.5,4,Breakestra,US,"Los Angeles, CA",...,Indie,English,Yes,Mixed Group,Male Group,No,No,No,No,


In [3]:
# Remove the 'Unnamed: 24' column. errors='ignore' keeps the cell safe to re-run
# once the column is already gone (the in-place drop raised KeyError on a second run).
df = df.drop(columns='Unnamed: 24', errors='ignore')

# Helpers

In [29]:
def replace_nans(key_column:str, columns:list, frame=None):
    """Fill missing values in `columns` with the most frequent value seen for the same key.

    For every value of `key_column` that has at least one row with a NaN in any
    of `columns`, each of those columns gets its NaN cells (and only its NaN
    cells) filled with the mode of that column among the rows sharing the key.
    Values that are already present are left untouched.

    Parameters
    ----------
    key_column : str
        Column whose values group the rows (e.g. a station or artist name).
    columns : list
        Columns in which NaN values should be filled.
    frame : pd.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level `df`.

    Returns
    -------
    None
        The frame is modified in place.
    """
    target = df if frame is None else frame

    # Keys that have at least one row with a missing value in the given columns
    has_missing = target[columns].isna().any(axis=1)
    for key_value in target.loc[has_missing, key_column].unique():
        # 'unknown' and 'various' are fillers for unspecified artists, so rows
        # sharing them are unrelated and must not borrow values from each other
        if key_value in ('unknown', 'various'):
            continue

        key_rows = target[key_column] == key_value
        for column in columns:
            mode = target.loc[key_rows, column].mode()
            # No mode means every value for this key is NaN: nothing to fill with
            if mode.empty:
                continue
            # Restrict the write to the NaN cells so existing values survive
            missing = key_rows & target[column].isna()
            target.loc[missing, column] = mode.iloc[0]

In [5]:
def display_unique_locations(artist:str) -> pd.DataFrame:
    """Summarise the distinct home coordinates recorded for one artist.

    Reads the notebook-level `df` and returns one row per
    (artist, latitude, longitude) combination, with a COUNT column holding
    the number of records for that combination.
    """
    group_keys = ['ARTIST NAME(S)', 'ARTIST HOME LATITUDE', 'ARTIST HOME LONGITUDE']
    artist_rows = df[df['ARTIST NAME(S)'] == artist][group_keys]
    # A never-null placeholder column, so counting it gives the group size
    artist_rows['COUNT'] = ''
    return artist_rows.groupby(group_keys).count().reset_index()


In [6]:
def replace_artist_location(artist:str, location:list) ->pd.DataFrame:
    """Set the home coordinates of every record of `artist` in the notebook-level `df`.

    `location` is a [latitude, longitude] pair. Returns the artist's
    unique-location summary after the overwrite.
    """
    coordinate_columns = ['ARTIST HOME LATITUDE','ARTIST HOME LONGITUDE']
    artist_rows = df['ARTIST NAME(S)'] == artist
    df.loc[artist_rows, coordinate_columns] = location
    return display_unique_locations(artist)

In [7]:
def haversine(row:pd.Series) -> float:
    """Great-circle distance in kilometres between two points given in degrees.

    Parameters
    ----------
    row : pd.Series
        Four values read by position: latitude and longitude of the first
        point, then latitude and longitude of the second point.

    Returns
    -------
    float
        Haversine distance using an Earth radius of 6371 km, or NaN when any
        of the four coordinates is missing.
    """
    earth_radius_km = 6371
    # Iterate over the values rather than indexing row[0]..row[3]: integer keys
    # on a label-indexed Series are deprecated as positional access in pandas.
    lat1, lon1, lat2, lon2 = (math.radians(float(value)) for value in tuple(row)[:4])
    if any(math.isnan(angle) for angle in (lat1, lon1, lat2, lon2)):
        return float('nan')

    half_chord_sq = (
        math.sin((lat2 - lat1) / 2) ** 2
        + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2
    )
    # Clamp so rounding slightly above 1 (near-antipodal points) cannot break asin
    return 2 * earth_radius_km * math.asin(min(1.0, math.sqrt(half_chord_sq)))


In [8]:
def is_can_con(row:pd.Series) -> str:
    """Decide the Canadian Content status of one record.

    `row` holds the M, A, L and P ratings as strings. Returns 'yes' when
    "yes" occurs at least twice across the row, otherwise 'no'.
    """
    yes_total = row.str.count("yes").sum()
    return 'yes' if yes_total >= 2 else 'no'

In [9]:
def list_artists_by_city(cities:list)->pd.DataFrame:
    """List the distinct artists whose home city is one of `cities`.

    Reads the notebook-level `df` and returns one row per unique combination
    of artist name, home latitude, home longitude, home city and country.
    Combinations with missing coordinates are kept.
    """
    key_columns = [
        'ARTIST NAME(S)',
        'ARTIST HOME LATITUDE',
        'ARTIST HOME LONGITUDE',
        'ARTIST HOME CITY',
        'ARTIST COUNTRY',
    ]
    rows_per_city = [df[df['ARTIST HOME CITY']== city] for city in cities]
    combined = pd.concat(rows_per_city)
    # dropna=False keeps the groups whose coordinates are NaN
    unique_artists = combined.groupby(key_columns, dropna=False).count().reset_index()
    return unique_artists[key_columns]

# Strings
- removed white spaces
- lowercase

In [10]:
# Column names: trim both ends, then collapse internal whitespace runs to one space.
# The pattern is a raw string because '\s' is an invalid escape in a normal literal.
df.columns = df.columns.str.strip()
df.columns = df.columns.str.replace(r'\s+', ' ', regex=True)

In [11]:
# Data: normalise every non-numeric, non-datetime column to trimmed,
# single-spaced, lower-case text
string_columns = df.select_dtypes(exclude=[np.number, np.datetime64]).columns

for column in string_columns:
    # astype(str) turns missing values into the text 'nan'; that is undone below
    cleaned = df[column].astype(str)
    cleaned = cleaned.str.strip()
    # Raw string: '\s' is an invalid escape sequence in a normal string literal
    cleaned = cleaned.str.replace(r'\s+', ' ', regex=True)
    cleaned = cleaned.str.lower()

    df[column] = cleaned

# Restore the missing values stringified above.
# NOTE(review): a genuine text value that reads 'nan' after cleaning becomes missing too.
df.replace('nan',np.nan, inplace=True)

- placeholders to nan

In [12]:
# Treat empty strings and the '-' / '?' placeholder entries as missing values
df.replace(['', '-', '?'], np.nan, inplace=True)

Date Times

In [13]:
df['WEEK OF']

0        2006-01-10 00:00:00
1        2006-01-10 00:00:00
2        2006-01-10 00:00:00
3        2006-01-10 00:00:00
4        2006-01-10 00:00:00
                ...         
35854               3/2/1010
35855               3/2/1010
35856               3/2/1010
35857               3/2/1010
35858               3/2/1010
Name: WEEK OF, Length: 35859, dtype: object

- Looking at the dataset, 2010-03-02 appears to be a missing week, and the value '3/2/1010' looks like a typo for that date

In [14]:
# Keep only the date part of each value (dropping any time component),
# repair the one mistyped week, then parse everything as datetimes
dates = (
    df['WEEK OF']
    .astype(str)
    .str.split()
    .str[0]
    .replace('3/2/1010', '2010-03-02')
)
df['WEEK OF'] = pd.to_datetime(dates)

In [15]:
df.isna().sum()[df.isna().sum() > 0]

STATION CITY                       1
STATION PROVINCE                   1
STATION LATITUDE                   1
STATION LONGITUDE                 11
CHART POSITION                    25
ARTIST NAME(S)                     1
ARTIST COUNTRY                   395
ARTIST HOME CITY                1746
ARTIST HOME LATITUDE            1872
ARTIST HOME LONGITUDE           1905
KM DISTANCE (HOME - STATION)    1872
ALBUM NAME                         1
LABEL NAME                         1
LABEL TYPE                        33
LANGUAGE OF MUSIC                190
VISIBLE ETHNIC MINORITY          341
CENSUS RACE CLASSIFICATION       340
ARTIST GENDER                    288
M-MUSIC                          188
A-ARTIST                         188
P-PERFORMANCE                    188
L-LYRICS                         191
dtype: int64

# Station
- match null values to existing data

In [16]:
replace_nans('STATION', ['STATION CITY',
                         'STATION PROVINCE',
                         'STATION LATITUDE',
                         'STATION LONGITUDE']  )

In [17]:
df.isna().sum()[df.isna().sum() > 0]

CHART POSITION                    25
ARTIST NAME(S)                     1
ARTIST COUNTRY                   395
ARTIST HOME CITY                1746
ARTIST HOME LATITUDE            1872
ARTIST HOME LONGITUDE           1905
KM DISTANCE (HOME - STATION)    1872
ALBUM NAME                         1
LABEL NAME                         1
LABEL TYPE                        33
LANGUAGE OF MUSIC                190
VISIBLE ETHNIC MINORITY          341
CENSUS RACE CLASSIFICATION       340
ARTIST GENDER                    288
M-MUSIC                          188
A-ARTIST                         188
P-PERFORMANCE                    188
L-LYRICS                         191
dtype: int64

# Artist

- remove record with no data

In [18]:
df[df['ARTIST NAME(S)'].isna()].values

array([[Timestamp('2007-07-17 00:00:00'), 'cfuv', 'victoria', 'bc',
        48.43, -123.35, '10', nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan]], dtype=object)

In [19]:
# Drop row with no artist data.
# .copy() makes `df` an independent frame, so the later .loc assignments do not
# write into a filtered slice of the original (SettingWithCopyWarning).
df = df[df['ARTIST NAME(S)'].notna()].copy()

In [20]:
# One row per (artist, latitude, longitude) combination with its record count
duplicate_location_set = (
    df.groupby(['ARTIST NAME(S)','ARTIST HOME LATITUDE','ARTIST HOME LONGITUDE'])['ARTIST NAME(S)']
    .count()
    .reset_index(name='count')
)
# Artists that appear with more than one distinct coordinate pair
locations_per_artist = duplicate_location_set['ARTIST NAME(S)'].value_counts()
duplicates = locations_per_artist[locations_per_artist > 1]
duplicates

various                        29
birdman                         2
rhymekeepers                    2
john smith                      2
m.i.a.                          2
the dirty sample                2
j57                             2
justin bieber                   2
roots manuva meets wrongtom     2
alias and tarsier               2
d-sisive                        2
ghostface killah                2
bobby digital (rza)             2
rick ross                       2
Name: ARTIST NAME(S), dtype: int64

- give distinct names to 'various' artists grouped by shared latitude/longitude

In [21]:

# Give the 'various' records of each distinct (latitude, longitude) pair their own
# name: various_0, various_1, ... in the sorted order produced by the groupby.
# The row mask is not called `filter`, so the builtin is no longer shadowed.
is_various = df['ARTIST NAME(S)'] == 'various'
various_locations = df[is_various].groupby(['ARTIST HOME LATITUDE', 'ARTIST HOME LONGITUDE']).count().reset_index()
for i, location in enumerate(various_locations[['ARTIST HOME LATITUDE', 'ARTIST HOME LONGITUDE']].values):
    same_location = (
        is_various
        & (df['ARTIST HOME LATITUDE'] == location[0])
        & (df['ARTIST HOME LONGITUDE'] == location[1])
    )
    df.loc[same_location, 'ARTIST NAME(S)'] = f'various_{i}'

In [22]:
df[df['ARTIST NAME(S)'] == 'various'].shape

(1478, 24)

- remaining 'various' names are switched to 'unknown'

In [23]:
# Assign back instead of using inplace=True on a column selection, which does
# not update `df` under pandas Copy-on-Write.
df['ARTIST NAME(S)'] = df['ARTIST NAME(S)'].replace('various','unknown')

- artists with more than 1 latitude/longitude are fixed to have only 1

In [24]:
# Collect the per-artist location summaries and concatenate them once, instead of
# growing a frame with concat inside the loop (starting from an empty frame).
duplicate_tables = [display_unique_locations(artist) for artist in duplicates.index]
# pd.concat rejects an empty list, so fall back to an empty frame as before
duplicate_table = pd.concat(duplicate_tables) if duplicate_tables else pd.DataFrame()
duplicate_table

Unnamed: 0,ARTIST NAME(S),ARTIST HOME LATITUDE,ARTIST HOME LONGITUDE,COUNT
0,birdman,29.95,-90.07,4
1,birdman,29.95,90.07,1
0,rhymekeepers,39.76,-84.19,1
1,rhymekeepers,43.66,-79.42,12
0,john smith,49.88,-97.16,52
1,john smith,49.88,-97.17,2
0,m.i.a.,42.33,-83.04,1
1,m.i.a.,51.5,-0.117,65
0,the dirty sample,49.25,-123.13,56
1,the dirty sample,51.08,-114.08,1


- match null values to existing information

In [25]:
# Give each artist flagged in `duplicates` a single [latitude, longitude] pair.
# Each call overwrites all of that artist's records; only the last call's
# summary table is displayed as the cell output.
replace_artist_location('birdman',                      [29.95, -90.07])
replace_artist_location('rhymekeepers',                 [43.66, -79.42])
replace_artist_location('john smith',                   [49.88, -97.16])
replace_artist_location('m.i.a.',                       [51.50, -0.117])
replace_artist_location('the dirty sample',             [49.25, -123.13])
replace_artist_location('j57',                          [40.71, -74])
replace_artist_location('justin bieber',                [43.03, -80.98])
replace_artist_location('roots manuva meets wrongtom',  [51.50, -0.117])
replace_artist_location('alias and tarsier',            [43.66, -70.25])
replace_artist_location('d-sisive',                     [43.66, -79.42])
replace_artist_location('ghostface killah',             [40.71, -74])
replace_artist_location('bobby digital (rza)',          [40.65, -73.95])
replace_artist_location('rick ross',                    [25.94, -80.24])

Unnamed: 0,ARTIST NAME(S),ARTIST HOME LATITUDE,ARTIST HOME LONGITUDE,COUNT
0,rick ross,25.94,-80.24,63


- match null values to existing information

In [26]:
replace_nans('ARTIST NAME(S)', ['ARTIST HOME CITY',
                                'ARTIST HOME LATITUDE',
                                'ARTIST HOME LONGITUDE',
                                'VISIBLE ETHNIC MINORITY', 
                                'CENSUS RACE CLASSIFICATION', 
                                'ARTIST GENDER',
                                'M-MUSIC',
                                'A-ARTIST',
                                'P-PERFORMANCE',
                                'L-LYRICS'
                                ]  )

In [27]:
df.isna().sum()[df.isna().sum() > 0]

CHART POSITION                    25
ARTIST COUNTRY                   394
ARTIST HOME CITY                1744
ARTIST HOME LATITUDE            1869
ARTIST HOME LONGITUDE           1902
KM DISTANCE (HOME - STATION)    1871
LABEL TYPE                        32
LANGUAGE OF MUSIC                189
VISIBLE ETHNIC MINORITY          313
CENSUS RACE CLASSIFICATION       313
ARTIST GENDER                    261
M-MUSIC                          161
A-ARTIST                         161
P-PERFORMANCE                    161
L-LYRICS                         162
dtype: int64

- replace MAPL nans with 'no'
- replace remaining nans with 'unknown'

In [28]:
df['ARTIST COUNTRY'].value_counts(dropna=False)

us        18848
cdn       14186
int        1295
uk         1134
NaN         394
us/cdn        1
Name: ARTIST COUNTRY, dtype: int64

In [29]:
# Missing country -> 'unknown'; the single 'us/cdn' entry is folded into 'int'.
# Assigned back rather than mutated with inplace=True on a column selection,
# which does not update `df` under pandas Copy-on-Write.
df['ARTIST COUNTRY'] = df['ARTIST COUNTRY'].fillna('unknown').replace('us/cdn', 'int')
df['ARTIST COUNTRY'].value_counts(dropna=False)

us         18848
cdn        14186
int         1296
uk          1134
unknown      394
Name: ARTIST COUNTRY, dtype: int64

In [30]:
df['ARTIST GENDER'].value_counts(dropna=False)

male            20176
male group      11756
mixed group      2246
female           1321
NaN               261
female group       97
no                  1
Name: ARTIST GENDER, dtype: int64

In [31]:
# Missing values and the stray 'no' become 'unknown'; single-gender groups are
# folded into 'male' / 'female'. Assigned back rather than mutated with
# inplace=True on a column selection (no effect under pandas Copy-on-Write).
df['ARTIST GENDER'] = (
    df['ARTIST GENDER']
    .fillna('unknown')
    .replace({'no': 'unknown', 'male group': 'male', 'female group': 'female'})
)
df['ARTIST GENDER'].value_counts(dropna=False)

male           31932
mixed group     2246
female          1418
unknown          262
Name: ARTIST GENDER, dtype: int64

In [32]:
df['VISIBLE ETHNIC MINORITY'].value_counts(dropna=False)

yes      23972
no       11572
NaN        313
black        1
Name: VISIBLE ETHNIC MINORITY, dtype: int64

In [33]:
# Missing -> 'unknown'; the stray 'black' entry is recorded as 'yes'.
# Assigned back rather than mutated with inplace=True on a column selection
# (no effect under pandas Copy-on-Write).
df['VISIBLE ETHNIC MINORITY'] = df['VISIBLE ETHNIC MINORITY'].fillna('unknown').replace('black','yes')
df['VISIBLE ETHNIC MINORITY'].value_counts(dropna=False)

yes        23973
no         11572
unknown      313
Name: VISIBLE ETHNIC MINORITY, dtype: int64

In [34]:
df['CENSUS RACE CLASSIFICATION'].value_counts(dropna=False)

black              15438
white              11565
mixed group         6714
hispanic             499
asian                406
middle eastern       342
NaN                  313
native canadian      159
asian indian         125
other asian           66
unidentified          66
indian asian          48
jewish                38
asian other           24
native american       21
inuit                 19
metis                  7
romany                 3
east asian             3
indian                 1
male                   1
Name: CENSUS RACE CLASSIFICATION, dtype: int64

In [35]:
df[df['CENSUS RACE CLASSIFICATION']=='indian']['ARTIST NAME(S)']

21296    athavale
Name: ARTIST NAME(S), dtype: object

In [36]:
# Merge spelling variants into one label per category and mark missing or
# unusable entries as 'unknown'. Assigned back rather than mutated with
# inplace=True on a column selection (no effect under pandas Copy-on-Write).
df['CENSUS RACE CLASSIFICATION'] = (
    df['CENSUS RACE CLASSIFICATION']
    .fillna('unknown')
    .replace({
        'male': 'unknown',
        'unidentified': 'unknown',
        'asian other': 'asian',
        'other asian': 'asian',
        'east asian': 'asian',
        'asian indian': 'indian',
        'indian asian': 'indian',
        'native canadian': 'native american',
        'inuit': 'native american',
        'metis': 'native american',
    })
)
df['CENSUS RACE CLASSIFICATION'].value_counts(dropna=False)

black              15438
white              11565
mixed group         6714
hispanic             499
asian                499
unknown              380
middle eastern       342
native american      206
indian               174
jewish                38
romany                 3
Name: CENSUS RACE CLASSIFICATION, dtype: int64

In [39]:
# A missing MAPL rating is treated as 'no'. Assigned back rather than filled with
# inplace=True on a column selection (no effect under pandas Copy-on-Write).
for key in ['M-MUSIC', 'A-ARTIST', 'P-PERFORMANCE', 'L-LYRICS']:
    df[key] = df[key].fillna('no')





In [49]:
# Assigned back rather than filled with inplace=True on a column selection
# (no effect under pandas Copy-on-Write).
df['ARTIST HOME CITY'] = df['ARTIST HOME CITY'].fillna('unknown')

In [52]:
df.isna().sum()[df.isna().sum() > 0]

CHART POSITION                    25
ARTIST HOME LATITUDE            1853
ARTIST HOME LONGITUDE           1853
KM DISTANCE (HOME - STATION)    1871
LABEL TYPE                        32
LANGUAGE OF MUSIC                189
dtype: int64

- give latitude and longitude to records without them, and give each home city a single lat/long

In [41]:
columns = ['ARTIST HOME LATITUDE', 'ARTIST HOME LONGITUDE']

In [42]:
artist_without_cities = df[(df[columns].transpose().isna().any())]['ARTIST HOME CITY'].value_counts()
artist_without_cities

new york, ny/kingston, jamaica       45
saskatoon, sask/brussels, belgium    42
san francisco, ca                    30
detroit, mi/los angeles, ca           7
california                            7
los angeles, ca/belgium               6
winnipeg, mb                          5
germany                               4
kelowna, bc                           4
new york, ny                          3
fort mcmurray, ab                     3
spain                                 2
france                                2
jamaica                               1
Name: ARTIST HOME CITY, dtype: int64

In [51]:

columns = ['ARTIST HOME LATITUDE', 'ARTIST HOME LONGITUDE']
# Give every home city the single most frequent (latitude, longitude) pair recorded for it.
for city in df['ARTIST HOME CITY'].unique():
    # 'unknown' is a filler, not a real place shared by its rows
    if city == 'unknown':
        continue
    in_city = df['ARTIST HOME CITY'] == city
    # Only complete pairs are candidates, so NaN can never win the vote and
    # erase coordinates that are known for the city
    known_pairs = df.loc[in_city, columns].dropna()
    if known_pairs.empty:
        continue  # no coordinates on record for this city; handled manually later
    # Count whole pairs (not each column separately) so the chosen latitude and
    # longitude always come from the same record; ties resolve to the smallest pair
    location = list(known_pairs.groupby(columns).size().idxmax())
    df.loc[in_city, columns] = location
        


In [53]:
artist_without_cities = df[(df[columns].transpose().isna().any())]['ARTIST HOME CITY'].value_counts()
artist_without_cities

unknown                              1741
new york, ny/kingston, jamaica         45
saskatoon, sask/brussels, belgium      42
california                              7
los angeles, ca/belgium                 6
germany                                 4
fort mcmurray, ab                       3
spain                                   2
france                                  2
jamaica                                 1
Name: ARTIST HOME CITY, dtype: int64

In [54]:
list_artists_by_city(artist_without_cities.index.drop('unknown'))

Unnamed: 0,ARTIST NAME(S),ARTIST HOME LATITUDE,ARTIST HOME LONGITUDE,ARTIST HOME CITY,ARTIST COUNTRY
0,brawler,,,"fort mcmurray, ab",cdn
1,doctor flake,,,france,int
2,epic/nomad,,,"saskatoon, sask/brussels, belgium",int
3,geckoturner,,,spain,int
4,leroy brown,,,california,us
5,nas & damian marley,,,"new york, ny/kingston, jamaica",int
6,non + herrmutt lobby,,,"los angeles, ca/belgium",int
7,snowgoons,,,germany,int
8,unknown,,,jamaica,int


In [55]:
df[df['ARTIST HOME CITY']=='los angeles, ca'][columns].mode()

Unnamed: 0,ARTIST HOME LATITUDE,ARTIST HOME LONGITUDE
0,34.05,-118.24


In [56]:
# Hand-assign coordinates to the artists listed above, whose home city has no
# coordinates anywhere in the data. For two-city or country-only entries a
# single representative city is used, named in the trailing comment.
# 'brawler' is listed with home city "fort mcmurray, ab".
replace_artist_location('brawler', [56.72, -111.37])
replace_artist_location('doctor flake', [48.86, 2.34]) #Paris
replace_artist_location('epic/nomad', [50.84, 4.35]) # Brussels
replace_artist_location('geckoturner', [40.41, -3.70]) #Madrid
# 'leroy brown' is listed only as "california"; this is the 'los angeles, ca' mode shown above
replace_artist_location('leroy brown', [34.05,	-118.24])
replace_artist_location('nas & damian marley', [18.01, -76.80]) #Kingston, Jamaica
replace_artist_location('non + herrmutt lobby', [50.84, 4.35]) # Brussels
replace_artist_location('snowgoons', [52.52, 13.40]) #Berlin
# The remaining 'jamaica' record has artist 'unknown', so it is matched by city instead of by name
df.loc[df['ARTIST HOME CITY'] == 'jamaica', columns] = [18.01, -76.80] #Kingston, Jamaica

In [57]:
df[(df[columns].transpose().isna().any())]['ARTIST HOME CITY'].value_counts()

unknown    1741
Name: ARTIST HOME CITY, dtype: int64

# Album

- match to existing values

In [58]:
replace_nans('ALBUM NAME', ['LANGUAGE OF MUSIC'])

In [59]:
df['LANGUAGE OF MUSIC'].unique()

array(['english', 'french', nan, 'multi', 'spanish', 'portuguese',
       'english/patois', 'basque', 'other', 'english/arabic', 'german',
       'e', 'english/zulu', 'english/french', 'yes', 'various', 'punjabi',
       'creole'], dtype=object)

- replace e with english
- replace yes with other
- replace nan with 'unknown'


In [60]:
# 'e' -> 'english', 'yes' -> 'other', missing -> 'unknown'.
# Assigned back rather than mutated with inplace=True on a column selection
# (no effect under pandas Copy-on-Write).
df['LANGUAGE OF MUSIC'] = (
    df['LANGUAGE OF MUSIC']
    .replace({'e': 'english', 'yes': 'other'})
    .fillna('unknown')
)

In [61]:
df.isna().sum()[df.isna().sum() > 0]

CHART POSITION                    25
ARTIST HOME LATITUDE            1741
ARTIST HOME LONGITUDE           1741
KM DISTANCE (HOME - STATION)    1871
LABEL TYPE                        32
dtype: int64

# Label

- match null values to existing information

In [62]:
replace_nans('LABEL NAME', ['LABEL TYPE'])

- replace 'english' with 'indie'

In [63]:
df['LABEL TYPE'].unique()

array(['indie', 'major', 'self', nan, 'english'], dtype=object)

In [64]:
# The stray 'english' entry is recorded as 'indie'; missing -> 'unknown'.
# Assigned back rather than mutated with inplace=True on a column selection
# (no effect under pandas Copy-on-Write).
df['LABEL TYPE'] = df['LABEL TYPE'].replace('english', 'indie').fillna('unknown')

In [65]:
df[df['LABEL TYPE']== 'unknown']['LABEL NAME'].unique()

array(['hr', 'carhartt/because', 'homegrown inc.', 'the hip hop company',
       'word supremacy press'], dtype=object)

In [66]:
df.isna().sum()[df.isna().sum() > 0]

CHART POSITION                    25
ARTIST HOME LATITUDE            1741
ARTIST HOME LONGITUDE           1741
KM DISTANCE (HOME - STATION)    1871
dtype: int64

# Chart Position

In [67]:
df['CHART POSITION'].unique()

array(['4', '5', '7', '8', '9', '10', '3', '2', '6', '1', nan],
      dtype=object)

In [68]:
df['CHART POSITION'].value_counts(dropna=False).sort_index()

1      3561
10     3585
2      3586
3      3586
4      3585
5      3586
6      3586
7      3586
8      3586
9      3586
NaN      25
Name: CHART POSITION, dtype: int64

In [69]:
nan_filter = df['CHART POSITION'].isna()
artists_with_nan_chart_numbers = df[ nan_filter]['ARTIST NAME(S)']
artists_with_nan_chart_numbers

552         psyche origami
1179             aceyalone
2063       dilated peoples
3048             jay bizzy
3657              ok cobra
10570                  iam
11716       pharoahe monch
14923    jesse dangerously
15204              othello
17819       cadence weapon
18242       guilty simpson
18345           atmosphere
19267           invincible
19833              j'davey
20428               factor
20778         art of fresh
21168        josh martinez
21614         art of fresh
21828          ghettosocks
22696               p.o.s.
23007               k'naan
25087        le klub des 7
26107           various_19
31753            the roots
34058             dj brace
Name: ARTIST NAME(S), dtype: object

- replace missing chart position with average for artist

In [70]:
# Replace nan values in chart numbers with the (rounded) average chart position
# held by the same artist. The whole column is converted to numbers first, so it
# ends up uniformly int instead of a mix of text ('4') and ints, and every artist
# is handled in one pass even when they have several missing entries.
chart_positions = pd.to_numeric(df['CHART POSITION'])
artist_average = chart_positions.groupby(df['ARTIST NAME(S)']).transform('mean').round()
# astype(int) fails loudly if an artist has no known position to average
df['CHART POSITION'] = chart_positions.fillna(artist_average).astype(int)

# Distance
  - recalculate the haversine distance between station and artist for every record 

In [71]:
# Recompute the station-to-home distance for every record; the column order
# below is the positional order `haversine` reads (station first, then home)
distance_inputs = [
    'STATION LATITUDE',
    'STATION LONGITUDE',
    'ARTIST HOME LATITUDE',
    'ARTIST HOME LONGITUDE',
]
df['KM DISTANCE (HOME - STATION)'] = df[distance_inputs].apply(haversine, axis=1)



In [72]:
df.isna().sum()[df.isna().sum() > 0]

ARTIST HOME LATITUDE            1741
ARTIST HOME LONGITUDE           1741
KM DISTANCE (HOME - STATION)    1741
dtype: int64

## Add canadian-content status (met by having at least 2 MAPL ratings)

In [73]:
df.rename(columns={"M-MUSIC":"M", "A-ARTIST": "A", "P-PERFORMANCE":"P","L-LYRICS":"L"}, inplace = True)


In [74]:
df['CANADIAN CONTENT'] = df[['M','A','L','P']].apply(is_can_con, axis=1)

In [75]:
# Save the cleaned frame as 'clean_data.csv' in the working directory.
# NOTE(review): no `index` argument is passed, so with the pandas default the row
# index is presumably written as a leading unnamed column — confirm consumers expect it.
df.to_csv('clean_data.csv')

In [76]:
df.head(10)

Unnamed: 0,WEEK OF,STATION,STATION CITY,STATION PROVINCE,STATION LATITUDE,STATION LONGITUDE,CHART POSITION,ARTIST NAME(S),ARTIST COUNTRY,ARTIST HOME CITY,...,LABEL TYPE,LANGUAGE OF MUSIC,VISIBLE ETHNIC MINORITY,CENSUS RACE CLASSIFICATION,ARTIST GENDER,M,A,P,L,CANADIAN CONTENT
0,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,4,breakestra,us,"los angeles, ca",...,indie,english,yes,mixed group,male,no,no,no,no,no
1,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,5,candy's .22,us,"los angeles, ca",...,indie,english,no,white,male,no,no,no,no,no
2,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,7,dangerdoom,us,"new york, ny",...,indie,english,yes,black,male,no,no,no,no,no
3,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,8,blockhead,us,"new york, ny",...,indie,english,no,white,male,no,no,no,no,no
4,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,9,blackalicious,us,"sacramento, ca",...,indie,english,yes,black,male,no,no,no,no,no
5,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,10,onry ozzborn,us,"seattle, wa",...,indie,english,yes,hispanic,male,no,no,no,no,no
6,2006-01-10,cjsw,calgary,ab,51.08,-114.08,3,unknown,us,unknown,...,indie,english,yes,mixed group,male,no,no,no,no,no
7,2006-01-10,cjsw,calgary,ab,51.08,-114.08,5,psyche origami,us,"atlanta, ga",...,indie,english,yes,mixed group,male,no,no,no,no,no
8,2006-01-10,cjsw,calgary,ab,51.08,-114.08,7,unknown,us,unknown,...,indie,english,yes,mixed group,mixed group,no,no,no,no,no
9,2006-01-10,cjsw,calgary,ab,51.08,-114.08,8,mike ladd,us,"boston, mass",...,indie,english,yes,black,male,no,no,no,no,no


In [77]:
df.isna().sum()

WEEK OF                            0
STATION                            0
STATION CITY                       0
STATION PROVINCE                   0
STATION LATITUDE                   0
STATION LONGITUDE                  0
CHART POSITION                     0
ARTIST NAME(S)                     0
ARTIST COUNTRY                     0
ARTIST HOME CITY                   0
ARTIST HOME LATITUDE            1741
ARTIST HOME LONGITUDE           1741
KM DISTANCE (HOME - STATION)    1741
ALBUM NAME                         0
LABEL NAME                         0
LABEL TYPE                         0
LANGUAGE OF MUSIC                  0
VISIBLE ETHNIC MINORITY            0
CENSUS RACE CLASSIFICATION         0
ARTIST GENDER                      0
M                                  0
A                                  0
P                                  0
L                                  0
CANADIAN CONTENT                   0
dtype: int64

In [78]:
df[df.isna().any(axis=1)]['ARTIST NAME(S)'].unique().shape

(46,)

In [79]:
df[df['ARTIST NAME(S)'] == 'various']

Unnamed: 0,WEEK OF,STATION,STATION CITY,STATION PROVINCE,STATION LATITUDE,STATION LONGITUDE,CHART POSITION,ARTIST NAME(S),ARTIST COUNTRY,ARTIST HOME CITY,...,LABEL TYPE,LANGUAGE OF MUSIC,VISIBLE ETHNIC MINORITY,CENSUS RACE CLASSIFICATION,ARTIST GENDER,M,A,P,L,CANADIAN CONTENT
