In [45]:
import pandas as pd
import numpy as np


In [46]:
# Load the cleaned chart data; the CSV's first column is used as the row index.
df = pd.read_csv('clean_data.csv', index_col=0)
# Preview the first three rows as the cell output.
df.head(3)

Unnamed: 0,WEEK OF,STATION,STATION CITY,STATION PROVINCE,STATION LATITUDE,STATION LONGITUDE,CHART POSITION,ARTIST NAME(S),ARTIST COUNTRY,ARTIST HOME CITY,...,LABEL TYPE,LANGUAGE OF MUSIC,VISIBLE ETHNIC MINORITY,CENSUS RACE CLASSIFICATION,ARTIST GENDER,M,A,P,L,CANADIAN CONTENT
0,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,4.0,breakestra,us,"los angeles, ca",...,indie,english,yes,mixed group,male group,no,no,no,no,no
1,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,5.0,candy's .22,us,"los angeles, ca",...,indie,english,no,white,male group,no,no,no,no,no
2,2006-01-10,cjsr,edmonton,ab,53.55,-113.5,7.0,dangerdoom,us,"new york, ny",...,indie,english,yes,black,male,no,no,no,no,no


In [47]:
def expand_column_to_percentiles(column:str, group:str, dataframe:pd.DataFrame, origin:pd.DataFrame):
    """Add one share-of-rows column to ``dataframe`` per distinct value of ``column``.

    For every distinct non-null value ``x`` found in ``origin[column]``, a column
    named ``f'{column} {x}'`` is written into ``dataframe``. Each entry is the
    fraction of a ``group`` key's non-null ``column`` entries that equal ``x``.
    Despite the function name, the values are fractions between 0 and 1, not
    percentiles.

    Parameters
    ----------
    column : str
        Column of ``origin`` whose distinct values are expanded.
    group : str
        Column of ``origin`` to group by; the new columns are aligned to
        ``dataframe`` on these group keys.
    dataframe : pd.DataFrame
        Target frame, modified in place. Its index is expected to hold the
        ``group`` keys.
    origin : pd.DataFrame
        Source rows. Every value and every mask is taken from this frame, not
        from any notebook-level global.

    Returns
    -------
    None

    Notes
    -----
    A group that has no row equal to ``x`` receives NaN (not 0) in that column,
    because it is absent from the numerator; callers that want zeros must fill
    them afterwards.
    """
    values = origin[column]

    # The denominator does not depend on the value being expanded, so compute it once.
    total_plays = origin.groupby(group)[column].count()

    # NaN is skipped: it never compares equal to itself, cannot be sorted together
    # with strings, and cannot be converted to int for the label.
    for x in sorted(values.dropna().unique()):
        x_plays = origin[values == x].groupby(group)[column].count()

        # Label whole-number floats as integers (4.0 -> "4"). Non-integral floats
        # keep their own label so that 4.5 cannot overwrite the column for 4.
        label = int(x) if isinstance(x, np.floating) and float(x).is_integer() else x
        dataframe[f'{column} {label}'] = (x_plays / total_plays)

In [48]:
# Per-artist summary table, indexed by artist name.
plays_by_artist = df.groupby('ARTIST NAME(S)')

# Most frequent value per artist for each descriptive column. The columns are
# selected before aggregating so the Python-level lambda only runs on the columns
# that are kept, and an all-null group in an unrelated column cannot raise here.
artist_data = plays_by_artist[
    [
        'ARTIST HOME LATITUDE',
        'ARTIST HOME LONGITUDE',
        'ARTIST HOME CITY',
        'ARTIST COUNTRY',
        'LANGUAGE OF MUSIC',
        'VISIBLE ETHNIC MINORITY',
        'CENSUS RACE CLASSIFICATION',
        'ARTIST GENDER',
        'CANADIAN CONTENT',
    ]
].agg(lambda values: values.value_counts().index[0])

# Per-artist counts and averages taken from the chart rows.
artist_data['ALBUMS PLAYED'] = plays_by_artist['ALBUM NAME'].nunique()
artist_data['STATIONS PLAYED ON'] = plays_by_artist['STATION'].nunique()
artist_data['MEAN CHART POSITION'] = plays_by_artist['CHART POSITION'].mean().round().astype(int)
# NOTE(review): this counts rows with a non-null 'ALBUM NAME' per artist — confirm
# that one row corresponds to the unit the column name promises.
artist_data['TOTAL DAYS PLAYED'] = plays_by_artist['ALBUM NAME'].count()

# One share-of-plays column per station and per chart position.
expand_column_to_percentiles('STATION', 'ARTIST NAME(S)', artist_data, df)
expand_column_to_percentiles('CHART POSITION', 'ARTIST NAME(S)', artist_data, df)

# Artists never seen on a station / at a position get NaN above; store those as 0.
artist_data = artist_data.replace(np.nan, 0)

In [51]:
# Write the per-artist table to CSV; reset_index() moves the current index into a
# regular column first.
# NOTE(review): to_csv also writes the new integer index as an unnamed first
# column — pass index=False only after confirming no consumer relies on it.
artist_data.reset_index().to_csv('artist_table.csv')

In [52]:
# Per-station summary table, indexed by station.
# Most frequent value per station for each descriptive column. The columns are
# selected before aggregating so the Python-level lambda only runs on the columns
# that are kept, and an all-null group in an unrelated column cannot raise here.
station_data = df.groupby('STATION')[
    [
        'STATION LATITUDE',
        'STATION LONGITUDE',
        'STATION CITY',
        'STATION PROVINCE',
        'ARTIST HOME CITY',
        'ARTIST COUNTRY',
        'LABEL TYPE',
        'LABEL NAME',
    ]
].agg(lambda values: values.value_counts().index[0])

# One share-of-plays column per distinct value of each artist attribute.
expand_column_to_percentiles('ARTIST GENDER', 'STATION', station_data, df)
expand_column_to_percentiles('CENSUS RACE CLASSIFICATION', 'STATION', station_data, df)
expand_column_to_percentiles('ARTIST COUNTRY', 'STATION', station_data, df)
expand_column_to_percentiles('CANADIAN CONTENT', 'STATION', station_data, df)

# The expanded columns are named '<column> <value>', so the Canadian-content
# shares are 'CANADIAN CONTENT yes' / 'CANADIAN CONTENT no'. Renaming the bare
# keys 'yes' / 'no' matched nothing and was silently ignored by rename().
# NOTE(review): 'MOST PLAYED LABEL ' carries a trailing space; it is kept as-is
# because it is part of the exported column schema — confirm before trimming.
station_data = station_data.rename(columns={
    'ARTIST HOME CITY': 'MOST PLAYED ARTIST CITY',
    'LABEL TYPE': 'MOST PLAYED LABEL TYPE',
    'LABEL NAME': 'MOST PLAYED LABEL ',
    'CANADIAN CONTENT yes': 'CAN CON',
    'CANADIAN CONTENT no': 'NOT CAN CON',
})

# Stations with no plays for a given value get NaN above; store those as 0.
station_data = station_data.replace(np.nan, 0)


In [53]:
# Write the per-station table to CSV; reset_index() moves the current index into a
# regular column first.
# NOTE(review): to_csv also writes the new integer index as an unnamed first
# column — pass index=False only after confirming no consumer relies on it.
station_data.reset_index().to_csv('station_table.csv')

In [69]:
# Empty frame declaring the spider-map column layout (one row per path endpoint).
spider_map_data = pd.DataFrame(
    columns=[
        'STATION-ARTIST',
        'STATION',
        'ARTIST',
        'PATH ID',
        'LATITUDE',
        'LONGITUDE',
        'WEEK OF',
    ]
)

In [80]:
def build_spider_map_dataframe(plays=None, stations=None, artists=None):
    """Build the long-format endpoint table for the station-to-artist spider map.

    Every row of ``plays`` yields two output rows that share a ``'PATH ID'`` of
    ``f'{station}-{artist}'``: first the station endpoint (``'STATION-ARTIST'``
    is ``'station'``, with the station's coordinates), then the artist endpoint
    (``'STATION-ARTIST'`` is ``'artist'``, with the artist's home coordinates).

    Parameters
    ----------
    plays : pd.DataFrame, optional
        Chart rows with ``'WEEK OF'``, ``'STATION'`` and ``'ARTIST NAME(S)'``
        columns. Defaults to the notebook-level ``df``.
    stations : pd.DataFrame, optional
        Frame indexed by station with ``'STATION LATITUDE'`` and
        ``'STATION LONGITUDE'`` columns. Defaults to the notebook-level
        ``station_data``.
    artists : pd.DataFrame, optional
        Frame indexed by artist name with ``'ARTIST HOME LATITUDE'`` and
        ``'ARTIST HOME LONGITUDE'`` columns. Defaults to the notebook-level
        ``artist_data``.

    Returns
    -------
    pd.DataFrame
        Two rows per input row with the columns ``'STATION-ARTIST'``,
        ``'STATION'``, ``'ARTIST'``, ``'PATH ID'``, ``'LATITUDE'``,
        ``'LONGITUDE'`` and ``'WEEK OF'``. An empty input gives an empty frame
        that still has these columns.

    Raises
    ------
    KeyError
        If a station or artist in ``plays`` is missing from the lookup frames.
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    plays = df if plays is None else plays
    stations = station_data if stations is None else stations
    artists = artist_data if artists is None else artists

    # Build each coordinate lookup once instead of doing four chained
    # .loc[...][...] lookups for every chart row.
    station_lat = stations['STATION LATITUDE'].to_dict()
    station_long = stations['STATION LONGITUDE'].to_dict()
    artist_lat = artists['ARTIST HOME LATITUDE'].to_dict()
    artist_long = artists['ARTIST HOME LONGITUDE'].to_dict()

    columns = ['STATION-ARTIST', 'STATION', 'ARTIST', 'PATH ID', 'LATITUDE', 'LONGITUDE', 'WEEK OF']
    rows = []

    # Read the fields by column name rather than by position in df.values, so the
    # result does not depend on the column order of the input file.
    for week, station, artist in zip(plays['WEEK OF'], plays['STATION'], plays['ARTIST NAME(S)']):
        path_id = f'{station}-{artist}'

        # Station end of the path.
        rows.append({'STATION-ARTIST': 'station',
                     'STATION': station,
                     'ARTIST': artist,
                     'PATH ID': path_id,
                     'LATITUDE': station_lat[station],
                     'LONGITUDE': station_long[station],
                     'WEEK OF': week})

        # Artist end of the path.
        rows.append({'STATION-ARTIST': 'artist',
                     'STATION': station,
                     'ARTIST': artist,
                     'PATH ID': path_id,
                     'LATITUDE': artist_lat[artist],
                     'LONGITUDE': artist_long[artist],
                     'WEEK OF': week})

    # Passing the column list keeps the layout stable even when there are no rows.
    return pd.DataFrame(rows, columns=columns)
        

In [81]:
# Build the station/artist endpoint rows and write them to CSV for the spider map.
spider_map_data  = build_spider_map_dataframe()
spider_map_data.to_csv('spider_map.csv')