In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the cleaned chart data; the first CSV column is used as the row index.
df = pd.read_csv('clean_data.csv', index_col=0)
# Preview the first three rows.
df.head(3)

Unnamed: 0,WEEK OF,STATION,STATION CITY,STATION PROVINCE,STATION LATITUDE,STATION LONGITUDE,CHART POSITION,ARTIST NAME(S),ARTIST COUNTRY,ARTIST HOME CITY,...,LABEL TYPE,LANGUAGE OF MUSIC,VISIBLE ETHNIC MINORITY,CENSUS RACE CLASSIFICATION,ARTIST GENDER,M,A,P,L,CANADIAN CONTENT
0,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,4,breakestra,us,"los angeles, ca",...,indie,english,yes,mixed group,male group,no,no,no,no,no
1,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,5,candy's .22,us,"los angeles, ca",...,indie,english,no,white,male group,no,no,no,no,no
2,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,7,dangerdoom,us,"new york, ny",...,indie,english,yes,black,male,no,no,no,no,no


In [3]:
def expand_column_to_percentiles(column:str, group:str, dataframe:pd.DataFrame, origin:pd.DataFrame):
    """Add one share-of-rows column to ``dataframe`` per distinct value of ``column``.

    For every distinct non-null value ``x`` found in ``origin[column]``, this
    computes, for each ``group`` key, the fraction (0.0-1.0) of that group's
    rows whose ``column`` equals ``x`` and stores it in ``dataframe`` under the
    column name ``f'{column} {x}'``.

    Parameters
    ----------
    column : str
        Column of ``origin`` whose distinct values are expanded.
    group : str
        Column of ``origin`` to group by; ``dataframe`` is expected to be
        indexed by these group keys so the new columns align on assignment.
    dataframe : pd.DataFrame
        Frame that receives the new columns (mutated in place).
    origin : pd.DataFrame
        Row-level data the shares are computed from.

    Returns
    -------
    None
    """
    # Denominator is loop-invariant: non-null ``column`` rows per group.
    total_plays = origin.groupby(group)[column].count()

    # dropna() keeps sorted() from failing on NaN mixed with strings.
    for x in sorted(origin[column].dropna().unique()):
        x_plays = origin[origin[column] == x].groupby(group)[column].count()

        # Groups that never had value ``x`` are absent from ``x_plays``; give
        # them an explicit 0 instead of zeroing the whole column when any
        # single group is missing.
        share = x_plays.reindex(total_plays.index, fill_value=0) / total_plays
        share = share.fillna(0)  # 0/0 for groups whose ``column`` is all null

        # Integer-valued floats (e.g. chart position 4.0) are labelled "4";
        # non-integer floats keep their value so labels cannot collide.
        if isinstance(x, np.floating) and float(x).is_integer():
            label = int(x)
        else:
            label = x
        dataframe[f'{column} {label}'] = share

In [4]:
# Group the chart rows by artist once and reuse the grouping for every
# aggregate below.
plays_by_artist = df.groupby('ARTIST NAME(S)')

# Most frequent value per artist for each descriptive attribute. The columns
# are selected *before* aggregating so the Python-level mode lambda only runs
# on the columns that are kept instead of on every column of df.
artist_data = plays_by_artist[
    [
        'ARTIST HOME LATITUDE',
        'ARTIST HOME LONGITUDE',
        'ARTIST HOME CITY',
        'ARTIST COUNTRY',
        'LANGUAGE OF MUSIC',
        'VISIBLE ETHNIC MINORITY',
        'CENSUS RACE CLASSIFICATION',
        'ARTIST GENDER',
        'CANADIAN CONTENT',
    ]
].agg(lambda x: pd.Series.mode(x, dropna=False)[0])

# Per-artist summary statistics.
artist_data['ALBUMS PLAYED'] = plays_by_artist['ALBUM NAME'].nunique()
artist_data['STATIONS PLAYED ON'] = plays_by_artist['STATION'].nunique()
artist_data['MEAN CHART POSITION'] = plays_by_artist['CHART POSITION'].mean().round().astype(int)
# count() tallies rows with a non-null 'ALBUM NAME' for each artist.
artist_data['TOTAL DAYS PLAYED'] = plays_by_artist['ALBUM NAME'].count()

# Add one share column per station and per chart position.
expand_column_to_percentiles('STATION', 'ARTIST NAME(S)', artist_data, df)
expand_column_to_percentiles('CHART POSITION', 'ARTIST NAME(S)', artist_data, df)


In [5]:
# Persist the per-artist table. reset_index() moves the 'ARTIST NAME(S)' group
# key out of the index into a regular column; to_csv() then also writes the new
# integer index as the first (unnamed) column.
artist_data.reset_index().to_csv('artist_table.csv')

In [6]:
# Most frequent value per station for each descriptive attribute. The columns
# are selected *before* aggregating so the Python-level mode lambda only runs
# on the columns that are kept instead of on every column of df.
station_data = df.groupby('STATION')[
    [
        'STATION LATITUDE',
        'STATION LONGITUDE',
        'STATION CITY',
        'STATION PROVINCE',
        'ARTIST HOME CITY',
        'ARTIST COUNTRY',
        'LABEL TYPE',
        'LABEL NAME',
    ]
].agg(lambda x: pd.Series.mode(x, dropna=False)[0])

# Add one share column per distinct value of each demographic attribute.
expand_column_to_percentiles('ARTIST GENDER', 'STATION', station_data, df)
expand_column_to_percentiles('CENSUS RACE CLASSIFICATION', 'STATION', station_data, df)
expand_column_to_percentiles('ARTIST COUNTRY', 'STATION', station_data, df)
expand_column_to_percentiles('CANADIAN CONTENT', 'STATION', station_data, df)

# Give the mode columns and the Canadian-content shares descriptive names.
# expand_column_to_percentiles names its columns f'{column} {value}', so the
# shares are 'CANADIAN CONTENT yes' / 'CANADIAN CONTENT no'; renaming the bare
# keys 'yes' / 'no' matched nothing and was silently ignored.
# NOTE(review): 'MOST PLAYED LABEL ' keeps its trailing space so the exported
# header is unchanged — confirm whether downstream consumers rely on it.
station_data = station_data.rename(columns={
    'ARTIST HOME CITY': 'MOST PLAYED ARTIST CITY',
    'LABEL TYPE': 'MOST PLAYED LABEL TYPE',
    'LABEL NAME': 'MOST PLAYED LABEL ',
    'CANADIAN CONTENT yes': 'CAN CON',
    'CANADIAN CONTENT no': 'NOT CAN CON',
})



In [7]:
# Persist the per-station table. reset_index() moves the 'STATION' group key
# out of the index into a regular column; to_csv() then also writes the new
# integer index as the first (unnamed) column.
station_data.reset_index().to_csv('station_table.csv')

In [9]:
# Number of chart rows for each (week, station, artist) combination.
weekly_plays = (
    df
    .groupby(['WEEK OF', 'STATION', 'ARTIST NAME(S)'])['ARTIST NAME(S)']
    .count()
    .reset_index(name='count')
)
# How often each per-week play count occurs.
weekly_plays['count'].value_counts()

1    35236
2      273
3       24
4        1
Name: count, dtype: int64

In [11]:
# Empty frame listing the spider-map columns. spider_map_data is reassigned
# from build_spider_map_dataframe() in a later cell.
spider_map_data = pd.DataFrame(columns=['STATION-ARTIST', 'STATION', 'ARTIST', 'PATH ID', 'LATITUDE','LONGITUDE', 'WEEK OF', 'PLAYS'])
def build_spider_map_dataframe(plays_df=None, stations=None, artists=None):
    """Build the two-rows-per-path table used to draw station-to-artist lines.

    For every (week, station, artist) combination present in the play data,
    two rows sharing the same ``PATH ID`` (``'<station>-<artist>'``) are
    emitted: a ``'station'`` endpoint at the station coordinates with
    ``PLAYS`` 0, and an ``'artist'`` endpoint at the artist's home coordinates
    with ``PLAYS`` set to the number of rows for that combination.
    Combinations whose artist has a missing home latitude or longitude are
    skipped, as are rows with a missing artist name.

    Parameters
    ----------
    plays_df : pd.DataFrame, optional
        Row-level play data with 'WEEK OF', 'STATION' and 'ARTIST NAME(S)'.
        Defaults to the notebook-level ``df``.
    stations : pd.DataFrame, optional
        Indexed by station, with 'STATION LATITUDE' / 'STATION LONGITUDE'.
        Defaults to the notebook-level ``station_data``.
    artists : pd.DataFrame, optional
        Indexed by artist, with 'ARTIST HOME LATITUDE' / 'ARTIST HOME LONGITUDE'.
        Defaults to the notebook-level ``artist_data``.

    Returns
    -------
    pd.DataFrame
        Columns: 'STATION-ARTIST', 'STATION', 'ARTIST', 'PATH ID', 'LATITUDE',
        'LONGITUDE', 'WEEK OF', 'PLAYS'. The columns are present even when no
        rows are produced.

    Raises
    ------
    KeyError
        If a station or artist in the play data is missing from the lookup
        frames.
    """
    # Fall back to the notebook-level frames so zero-argument calls still work.
    plays_df = df if plays_df is None else plays_df
    stations = station_data if stations is None else stations
    artists = artist_data if artists is None else artists

    columns = ['STATION-ARTIST', 'STATION', 'ARTIST', 'PATH ID',
               'LATITUDE', 'LONGITUDE', 'WEEK OF', 'PLAYS']
    rows = []

    # sort=False yields groups in order of first appearance, matching the
    # order produced by iterating over .unique(); each subset is sliced once
    # instead of re-filtering the full frame for every artist.
    for week, week_rows in plays_df.groupby('WEEK OF', sort=False):
        for station, station_rows in week_rows.groupby('STATION', sort=False):
            station_lat = stations.loc[station, 'STATION LATITUDE']
            station_long = stations.loc[station, 'STATION LONGITUDE']

            artist_names = station_rows['ARTIST NAME(S)'].dropna()
            play_counts = artist_names.value_counts()

            for artist in artist_names.unique():
                artist_lat = artists.loc[artist, 'ARTIST HOME LATITUDE']
                artist_long = artists.loc[artist, 'ARTIST HOME LONGITUDE']

                # A path cannot be drawn without both artist coordinates.
                if pd.isna(artist_lat) or pd.isna(artist_long):
                    continue

                path_id = f'{station}-{artist}'
                rows.append({'STATION-ARTIST': 'station',
                             'STATION': station,
                             'ARTIST': artist,
                             'PATH ID': path_id,
                             'LATITUDE': station_lat,
                             'LONGITUDE': station_long,
                             'WEEK OF': week,
                             'PLAYS': 0})
                rows.append({'STATION-ARTIST': 'artist',
                             'STATION': station,
                             'ARTIST': artist,
                             'PATH ID': path_id,
                             'LATITUDE': artist_lat,
                             'LONGITUDE': artist_long,
                             'WEEK OF': week,
                             'PLAYS': play_counts[artist]})

    return pd.DataFrame(rows, columns=columns)
        

In [12]:
# Build the station-to-artist path table from the notebook-level frames.
spider_map_data  = build_spider_map_dataframe()


# Persist the path table (to_csv also writes the integer row index).
spider_map_data.to_csv('spider_map.csv')

In [13]:
# Display the path table; the rendered output is truncated for long frames.
spider_map_data

Unnamed: 0,STATION-ARTIST,STATION,ARTIST,PATH ID,LATITUDE,LONGITUDE,WEEK OF,PLAYS
0,station,cjsr,breakestra,cjsr-breakestra,53.55,-113.50,2006-01-10,0
1,artist,cjsr,breakestra,cjsr-breakestra,34.05,-118.24,2006-01-10,1
2,station,cjsr,candy's .22,cjsr-candy's .22,53.55,-113.50,2006-01-10,0
3,artist,cjsr,candy's .22,cjsr-candy's .22,34.05,-118.24,2006-01-10,1
4,station,cjsr,dangerdoom,cjsr-dangerdoom,53.55,-113.50,2006-01-10,0
...,...,...,...,...,...,...,...,...
68089,artist,ckut,swollen members,ckut-swollen members,49.25,-123.13,2010-03-02,1
68090,station,ckut,chokeules,ckut-chokeules,45.50,-73.58,2010-03-02,0
68091,artist,ckut,chokeules,ckut-chokeules,43.66,-79.42,2010-03-02,1
68092,station,ckut,radio radio,ckut-radio radio,45.50,-73.58,2010-03-02,0


In [118]:
# Sanity check: number of missing values in each column of the path table.
spider_map_data.isna().sum()

STATION-ARTIST    0
STATION           0
ARTIST            0
PATH ID           0
LATITUDE          0
LONGITUDE         0
WEEK OF           0
PLAYS             0
dtype: int64

In [114]:
# Distribution of PLAYS; station endpoint rows are always written with PLAYS 0.
spider_map_data['PLAYS'].value_counts()

0    34047
1    34008
2       39
Name: PLAYS, dtype: int64