# Aggregate Data and Rank Functions using Pandas

In [14]:
import pandas as pd
import numpy as np

In [16]:
df = pd.read_csv("Top 100 most Streamed - Sheet5.csv")
df.head()

Unnamed: 0,title,artist,top genre,year,beats.per.minute,energy,danceability,loudness.dB,liveness,valance,length,acousticness,speechiness,popularity
0,Blinding Lights,The Weeknd,canadian contemporary r&b,2020,171,73,51,-6,9,33,200,0,6,91
1,Watermelon Sugar,Harry Styles,pop,2019,95,82,55,-4,34,56,174,12,5,88
2,Mood (feat. iann dior),24kGoldn,cali rap,2021,91,72,70,-4,32,73,141,17,4,88
3,Someone You Loved,Lewis Capaldi,pop,2019,110,41,50,-6,11,45,182,75,3,86
4,Perfect,Ed Sheeran,pop,2017,95,45,60,-6,11,17,263,16,2,86


In [18]:
df.shape

(100, 14)

In [20]:
df.isnull().sum()

title               0
artist              0
top genre           0
year                0
beats.per.minute    0
energy              0
danceability        0
loudness.dB         0
liveness            0
valance             0
length              0
acousticness        0
speechiness         0
popularity          0
dtype: int64

In [22]:
df.dtypes

title               object
artist              object
top genre           object
year                 int64
beats.per.minute     int64
energy               int64
danceability         int64
loudness.dB          int64
liveness             int64
valance              int64
length               int64
acousticness         int64
speechiness          int64
popularity           int64
dtype: object

In [24]:
df.columns

Index(['title', 'artist', 'top genre', 'year', 'beats.per.minute', 'energy',
       'danceability', 'loudness.dB', 'liveness', 'valance', 'length',
       'acousticness', 'speechiness', 'popularity'],
      dtype='object')

# Aggregate Functions

It is used along with groupby functions to fetch aggregate function values like<br> 
sum, count, min, max, mean, var, std, median

Syntax<br>
agg([agg_fun1,agg_fun2...])<br>
agg({col1:[agg_fun1,agg_fun2...],col2:[agg_fun1,agg_fun2...]})

#### Q) Find sum, min, max, mean of energy and popularity based on year

In [34]:
r1 = df.groupby('year')[['energy','popularity']].agg({'sum','min','max','mean'})
r1

Unnamed: 0_level_0,energy,energy,energy,energy,popularity,popularity,popularity,popularity
Unnamed: 0_level_1,sum,mean,max,min,sum,mean,max,min
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1975,40,40.0,40,40,81,81.0,81,81
1995,90,90.0,90,90,77,77.0,77,77
2004,91,91.0,91,91,81,81.0,81,81
2008,46,46.0,46,46,80,80.0,80,80
2012,241,80.333333,92,71,238,79.333333,82,76
2013,229,57.25,78,26,322,80.5,84,72
2014,338,56.333333,74,38,478,79.666667,83,70
2015,846,65.076923,81,38,1006,77.384615,83,66
2016,1019,63.6875,87,40,1238,77.375,84,53
2017,776,64.666667,82,37,988,82.333333,86,79


#### Q) Find mean, median, variance and standard deviation of length and beats.per.minute based on year, for year 2016 and onwards

In [37]:
df_2016 = df[df['year']>=2016]
df_2016.shape

(70, 14)

In [47]:
r2 = df_2016.groupby('year')[['length','beats.per.minute']].agg({'mean','median','var','std'})
r2

Unnamed: 0_level_0,length,length,length,length,beats.per.minute,beats.per.minute,beats.per.minute,beats.per.minute
Unnamed: 0_level_1,median,std,var,mean,median,std,var,mean
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2016,215.0,19.289786,372.095833,216.6875,108.5,28.041636,786.333333,117.25
2017,202.5,38.261857,1463.969697,203.833333,109.5,28.189161,794.628788,117.916667
2018,214.5,33.625022,1130.642105,213.3,102.5,25.869053,669.207895,111.05
2019,196.5,22.038981,485.716667,199.125,118.5,26.802674,718.383333,123.375
2020,183.0,10.115994,102.333333,188.333333,124.0,40.673497,1654.333333,128.333333
2021,172.0,24.337899,592.333333,167.333333,91.0,20.207259,408.333333,102.666667


### Q) Find min and max for popularity and energy, mean and standard deviations for liveness and acousticness based on year, for 2016 nd onwards 

In [57]:
# popularity and energy - min and max
# liveness and acousticness - mean and std
# based on year 2016 and onwards

In [43]:
df_2016.shape

(70, 14)

In [61]:
# In agg function , we will be using a dict.
# Key of the dict will be the col name
# Value of the dict is going to be list containing differ agg functions
# agg functions like -sum, min, max, mean, var, std, median

r3 = df_2016.groupby('year').agg({'energy': ['min', 'max'],'popularity': ['min', 'max'],
                                  'liveness': ['mean', 'std'],'acousticness': ['mean', 'std']})
r3


Unnamed: 0_level_0,energy,energy,popularity,popularity,liveness,liveness,acousticness,acousticness
Unnamed: 0_level_1,min,max,min,max,mean,std,mean,std
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2016,40,87,53,84,16.1875,8.871067,18.5625,21.57767
2017,37,82,79,86,13.583333,5.946096,17.583333,24.133938
2018,30,86,56,86,17.1,11.827266,29.45,25.394726
2019,11,82,67,88,17.625,18.521609,31.0625,28.990731
2020,69,79,81,91,9.666667,0.57735,8.666667,14.153916
2021,33,83,66,88,26.666667,16.653328,35.333333,42.571508


#### Q) Find min and max for beats.per.minute and liveness, variance and standard deviation for energy and danceability based on top 10 genres

In [66]:
# beats.per.minute and liveness - min and max
# energy and danceability - var and std
# based on top 10 genres

In [70]:
top10_gen = df['top genre'].value_counts()
top10_gen = top10_gen.sort_values(ascending = False).head(10)
top10_gen 

# top 10 genres write their count in desc order

top genre
dance pop                    28
pop                          11
dfw rap                       7
modern rock                   6
canadian pop                  6
canadian contemporary r&b     4
electropop                    4
melodic rap                   3
canadian hip hop              2
rap                           2
Name: count, dtype: int64

In [72]:
top10_gen.index

Index(['dance pop', 'pop', 'dfw rap', 'modern rock', 'canadian pop',
       'canadian contemporary r&b', 'electropop', 'melodic rap',
       'canadian hip hop', 'rap'],
      dtype='object', name='top genre')

In [76]:
r4 = df[df['top genre'].isin(top10_gen.index)].groupby('top genre').agg({'beats.per.minute': ['min', 'max'],'liveness': ['min', 'max'],
                                  'energy': ['var', 'std'],'danceability': ['var', 'std']})
r4

Unnamed: 0_level_0,beats.per.minute,beats.per.minute,liveness,liveness,energy,energy,danceability,danceability
Unnamed: 0_level_1,min,max,min,max,var,std,var,std
top genre,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
canadian contemporary r&b,108,186,9,14,106.25,10.307764,82.25,9.069179
canadian hip hop,77,104,32,55,128.0,11.313708,8.0,2.828427
canadian pop,83,150,5,30,282.666667,16.812694,207.066667,14.389811
dance pop,80,160,3,56,242.522487,15.573133,106.818783,10.335317
dfw rap,90,160,7,25,152.952381,12.367392,59.238095,7.696629
electropop,83,135,9,12,572.666667,23.930455,276.333333,16.623277
melodic rap,117,155,11,79,72.333333,8.504901,79.0,8.888194
modern rock,90,170,6,67,240.166667,15.497312,179.1,13.382825
pop,79,125,8,34,274.490909,16.567767,253.254545,15.913973
rap,130,155,12,15,0.0,0.0,0.5,0.707107


#### Q) For the artist Post Malone, Ed Sheeran and Imagine Gragons,The Weeknd and The Chiansmokers, display max of energy, sum of length, mean of popularity, mean of danceability, variance of beats.per.minute. Rename the columns accordingly.

In [82]:
artists_req = ['Post Malone','Ed Sheeran','Imagine Gragons','The Weeknd','The Chiansmokers']
artists_req

['Post Malone',
 'Ed Sheeran',
 'Imagine Gragons',
 'The Weeknd',
 'The Chiansmokers']

#### Syntax for renaming columns
agg(NewColName1 = (col1,agg_fun),NewColName1 =(col1,agg_fun)) .... NewColNamen = (col1,agg_fun))

In [88]:
r5 = df[df["artist"].isin(artists_req)].groupby('artist').agg(MaXEnergy=('energy', 'max'),
                                                              SumLength=('length', 'sum'),
                                                              MeanPopularity=('popularity', 'mean'),
                                                              MeanDanceability=('danceability', 'mean'),
                                                              VarBeatPerMin=('beats.per.minute', 'var'))
r5

Unnamed: 0_level_0,MaXEnergy,SumLength,MeanPopularity,MeanDanceability,VarBeatPerMin
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ed Sheeran,68,1258,83.2,72.4,117.5
Post Malone,80,1486,81.285714,66.714286,524.904762
The Weeknd,77,886,83.75,62.25,1583.0


# Rank Functions

#### Q) Rank top 10 genres wrt count based on their mean Energy

In [98]:
top10_gen

top genre
dance pop                    28
pop                          11
dfw rap                       7
modern rock                   6
canadian pop                  6
canadian contemporary r&b     4
electropop                    4
melodic rap                   3
canadian hip hop              2
rap                           2
Name: count, dtype: int64

In [111]:
r6 = df[df['top genre'].isin(top10_gen.index)].groupby('top genre').agg(MeanEnergy =('energy','mean'))
r6

Unnamed: 0_level_0,MeanEnergy
top genre,Unnamed: 1_level_1
canadian contemporary r&b,66.25
canadian hip hop,53.0
canadian pop,63.666667
dance pop,63.178571
dfw rap,60.571429
electropop,38.0
melodic rap,68.666667
modern rock,68.833333
pop,55.090909
rap,73.0


In [107]:
r6['Rank_asc'] = r6['MeanEnergy'].rank(ascending = True)
r6['Rank_desc'] = r6['MeanEnergy'].rank(ascending = False)
r6
r6.sort_values(by = 'MeanEnergy')

Unnamed: 0_level_0,MeanEnergy,Rank_asc,Rank_desc
top genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
electropop,38.0,1.0,10.0
canadian hip hop,53.0,2.0,9.0
pop,55.090909,3.0,8.0
dfw rap,60.571429,4.0,7.0
dance pop,63.178571,5.0,6.0
canadian pop,63.666667,6.0,5.0
canadian contemporary r&b,66.25,7.0,4.0
melodic rap,68.666667,8.0,3.0
modern rock,68.833333,9.0,2.0
rap,73.0,10.0,1.0


#### Q) Rank top 10 genre wrt count based top genre for on the count of length of songs. Length of songs needs to be ranked

In [120]:
top10_gen.index
# top 10 genres based on desc order of count

Index(['dance pop', 'pop', 'dfw rap', 'modern rock', 'canadian pop',
       'canadian contemporary r&b', 'electropop', 'melodic rap',
       'canadian hip hop', 'rap'],
      dtype='object', name='top genre')

In [125]:
r7 = df[df['top genre'].isin(top10_gen.index)].groupby('top genre').agg(CountLength = ('length','count'))
r7

Unnamed: 0_level_0,CountLength
top genre,Unnamed: 1_level_1
canadian contemporary r&b,4
canadian hip hop,2
canadian pop,6
dance pop,28
dfw rap,7
electropop,4
melodic rap,3
modern rock,6
pop,11
rap,2


In [142]:
r7['Rank_asc'] = r7['CountLength'].rank(ascending = True)
r7['DRank_asc'] = r7['CountLength'].rank(method='dense',ascending = True)
r7['Rank_desc'] = r7['CountLength'].rank(ascending = False)
r7['DRank_desc'] = r7['CountLength'].rank(method='dense',ascending = False)
r7

Unnamed: 0_level_0,CountLength,Rank_asc,Denserank_asc,DRank_asc,Rank_desc,DRank_desc
top genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
canadian contemporary r&b,4,4.5,5.0,3.0,6.5,5.0
canadian hip hop,2,1.5,7.0,1.0,9.5,7.0
canadian pop,6,6.5,4.0,4.0,4.5,4.0
dance pop,28,10.0,1.0,7.0,1.0,1.0
dfw rap,7,8.0,3.0,5.0,3.0,3.0
electropop,4,4.5,5.0,3.0,6.5,5.0
melodic rap,3,3.0,6.0,2.0,8.0,6.0
modern rock,6,6.5,4.0,4.0,4.5,4.0
pop,11,9.0,2.0,6.0,2.0,2.0
rap,2,1.5,7.0,1.0,9.5,7.0


## Sample Example

In [134]:
data = { 'x' : [100,101,102,101,103,105,107,107,105,99]}
w = pd.DataFrame(data)
w.head()

Unnamed: 0,x
0,100
1,101
2,102
3,101
4,103


In [140]:
w['Rank_asc'] = w['x'].rank(ascending = True)
w['Rank_dessc'] = w['x'].rank(ascending = False)
w['DRank_asc'] = w['x'].rank(method='dense',ascending = True)
w['DRank_desc'] = w['x'].rank(method='dense',ascending = False)
w

Unnamed: 0,x,Rank_asc,DRank_asc,Rank_dessc,DRank_desc
0,100,2.0,2.0,9.0,6.0
1,101,3.5,3.0,7.5,5.0
2,102,5.0,4.0,6.0,4.0
3,101,3.5,3.0,7.5,5.0
4,103,6.0,5.0,5.0,3.0
5,105,7.5,6.0,3.5,2.0
6,107,9.5,7.0,1.5,1.0
7,107,9.5,7.0,1.5,1.0
8,105,7.5,6.0,3.5,2.0
9,99,1.0,1.0,10.0,7.0


In [148]:
# These rank functions are in ascending order
w['MaxRank'] = w['x'].rank(method = 'max')  # ascending = True
w['MinRank'] = w['x'].rank(method = 'min')  # ascending = True
w['MeanRank'] = w['x'].rank(method = 'average')   # ascending = True
w

Unnamed: 0,x,Rank_asc,DRank_asc,Rank_dessc,DRank_desc,MaxRank,MinRank,MeanRank
0,100,2.0,2.0,9.0,6.0,2.0,2.0,2.0
1,101,3.5,3.0,7.5,5.0,4.0,3.0,3.5
2,102,5.0,4.0,6.0,4.0,5.0,5.0,5.0
3,101,3.5,3.0,7.5,5.0,4.0,3.0,3.5
4,103,6.0,5.0,5.0,3.0,6.0,6.0,6.0
5,105,7.5,6.0,3.5,2.0,8.0,7.0,7.5
6,107,9.5,7.0,1.5,1.0,10.0,9.0,9.5
7,107,9.5,7.0,1.5,1.0,10.0,9.0,9.5
8,105,7.5,6.0,3.5,2.0,8.0,7.0,7.5
9,99,1.0,1.0,10.0,7.0,1.0,1.0,1.0


In [152]:
w['MaxRank_desc'] = w['x'].rank(method = 'max')  # ascending = False
w['MinRank_desc'] = w['x'].rank(method = 'min')  # ascending = False
w

Unnamed: 0,x,Rank_asc,DRank_asc,Rank_dessc,DRank_desc,MaxRank,MinRank,MeanRank,MaxRank_desc,MinRank_desc
0,100,2.0,2.0,9.0,6.0,2.0,2.0,2.0,2.0,2.0
1,101,3.5,3.0,7.5,5.0,4.0,3.0,3.5,4.0,3.0
2,102,5.0,4.0,6.0,4.0,5.0,5.0,5.0,5.0,5.0
3,101,3.5,3.0,7.5,5.0,4.0,3.0,3.5,4.0,3.0
4,103,6.0,5.0,5.0,3.0,6.0,6.0,6.0,6.0,6.0
5,105,7.5,6.0,3.5,2.0,8.0,7.0,7.5,8.0,7.0
6,107,9.5,7.0,1.5,1.0,10.0,9.0,9.5,10.0,9.0
7,107,9.5,7.0,1.5,1.0,10.0,9.0,9.5,10.0,9.0
8,105,7.5,6.0,3.5,2.0,8.0,7.0,7.5,8.0,7.0
9,99,1.0,1.0,10.0,7.0,1.0,1.0,1.0,1.0,1.0
