In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
from datetime import datetime
import ast
import calendar
import matplotlib.pyplot as plt

In [None]:
# Plotly
import plotly.io as pio
import plotly.graph_objects as go
from plotly.figure_factory import create_distplot
import plotly.express as px
# Register a custom template named "draft". Its only content is a single layout
# annotation placed at the centre of the figure (paper coordinates 0.5, 0.5),
# rotated by -30 degrees, in large (size 100) black text at 10% opacity, no arrow.
# NOTE(review): the annotation defines no `text`, so presumably nothing visible is
# drawn - confirm whether a watermark string was intended here.
pio.templates["draft"] = go.layout.Template(
    layout_annotations=[
        dict(
            textangle=-30,
            opacity=0.1,
            font=dict(color="black", size=100),
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
    ]
)
# Use the "draft" template as plotly's default template.
pio.templates.default = "draft"

In [None]:
import os
# Walk the Kaggle input directory recursively and print the full path of every
# file found, so the dataset location used below can be checked.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Reading Data

In [None]:
# Load the dataset; the file uses "#" as its field separator.
data = pd.read_csv('/kaggle/input/spotify-daily-top-200-songs-with-genres-20172021/data.csv', sep="#")

In [None]:
# Preview the first rows to check how the columns were parsed.
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

# Processing NaN Values

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Inspect the rows whose track name is missing.
data.loc[pd.isnull(data['Track Name'])]

In [None]:
# Remove every row that contains at least one NaN value.
# `inplace=True` mutates `data` directly instead of returning a new frame.
data.dropna(inplace=True) 

# Feature Engineering

In [None]:
def get_keyword_separator(s):
    """Pick the keyword that introduces featured artists in a track title.

    The check is a case-insensitive substring search. "featuring" takes
    precedence over "feat"; "with" is the fallback and is returned even when
    it does not occur in `s`.
    """
    lowered = s.lower()
    for keyword in ("featuring", "feat"):
        if keyword in lowered:
            return keyword
    return "with"
    
def check_if_song_is_featured(song_name):
    """Extract the featured artists announced in a track title.

    A collaboration marker is recognised only when it is a whole word:
      * "featuring", "feat" / "feat." or "with" directly after an opening
        "(" or "[", e.g. "Havana (feat. Young Thug)", "Eastside (with Halsey & Khalid)";
      * "featuring" or "feat." without a bracket, e.g. "Song feat. X".
    Plain substring matching is deliberately avoided: titles such as
    "Without Me", "Stay With Me" or "Defeated" contain "with"/"feat" but
    announce no guest artist.

    Parameters
    ----------
    song_name : str
        Raw track title.

    Returns
    -------
    list of str or str
        The featured artist names (split on both "," and "&", whitespace
        stripped), or the placeholder string "None" when no guest is found.
        Note that an artist whose own name contains "," or "&" is split too.
    """
    import re  # local import: only this helper needs regular expressions

    marker = re.search(
        r"(?:[(\[]\s*(?:featuring|feat|with)\b\.?|\b(?:featuring\b|feat\.))\s*"
        r"(?P<artists>[^)\]]*)",  # artist list runs up to the closing bracket, if any
        song_name,
        flags=re.IGNORECASE,
    )
    if marker is None:
        return "None"

    artists = [part.strip() for part in re.split(r"[,&]", marker.group("artists"))]
    artists = [artist for artist in artists if artist]
    return artists if artists else "None"
    
def get_number_of_singers(feat):
    """Count the performers on a track: the main artist plus every featured one.

    `feat` is the list of featured artists; any value that is not a list
    (for instance the "None" placeholder string) counts as a solo track.
    """
    return len(feat) + 1 if isinstance(feat, list) else 1
    
def get_song_name_only(song_name):
    """Return the track title without its "(feat. ...)" / "(with ...)" part.

    The title is cut at the first collaboration marker, where a marker is
    "featuring", "feat" / "feat." or "with" as a whole word directly after an
    opening "(" or "[", or an un-bracketed "featuring" / "feat.".
    Plain substring matching is deliberately avoided, otherwise "Without Me"
    would be truncated to "Without M" and "Stay With Me" to "Stay".

    Parameters
    ----------
    song_name : str
        Raw track title.

    Returns
    -------
    str
        The text before the marker with surrounding whitespace stripped, or
        `song_name` unchanged when there is no marker (or nothing precedes it).
    """
    import re  # local import: only this helper needs regular expressions

    marker = re.search(
        r"[(\[]\s*(?:featuring|feat|with)\b|\b(?:featuring\b|feat\.)",
        song_name,
        flags=re.IGNORECASE,
    )
    if marker is None:
        return song_name

    title = song_name[:marker.start()].strip()
    # Never hand back an empty name: it would merge unrelated tracks when grouping.
    return title if title else song_name

In [None]:
# Parse the "YYYY-MM-DD" strings once, vectorised, instead of calling
# datetime.strptime three times per row; the "Date" column itself stays a string.
parsed_dates = pd.to_datetime(data["Date"], format="%Y-%m-%d")
data["Year"] = parsed_dates.dt.year
data["Month"] = parsed_dates.dt.month
# ISO weekday (Monday=1 ... Sunday=7); pandas' dayofweek counts from Monday=0.
data["DayOfWeek"] = parsed_dates.dt.dayofweek + 1

In [None]:
# Featured artists parsed from each title, then the performer count derived from them.
data["Featuring"] = data["Track Name"].apply(check_if_song_is_featured)
data['Number Of Singers'] = data['Featuring'].apply(get_number_of_singers)

In [None]:
# Replace each title by its bare name; this must run after "Featuring" has been
# extracted, because the collaboration part of the title is discarded here.
data["Track Name"] = data["Track Name"].apply(get_song_name_only)

In [None]:
# One 0/1 indicator column per chart tier: 1 when the row's position is within the tier.
for top_n in (5, 10, 50, 100):
    data[f'In Top {top_n}'] = data['Position'].le(top_n).astype("int64")

In [None]:
# Each "Genre" cell is the text of a Python literal; ast.literal_eval turns it
# into the corresponding Python object.
data["Genre"] = data["Genre"].apply(ast.literal_eval)

In [None]:
# Checking if there are incorrect Genre names: store the length of the longest
# genre name of each row. `default=0` covers rows whose genre collection is
# empty, for which a bare max() would raise ValueError.
data["Genre Max Length"] = data['Genre'].apply(
    lambda names: max((len(name) for name in names), default=0)
)

In [None]:
# How many rows have each "longest genre name" length.
genre_length_counts = data['Genre Max Length'].value_counts()
genre_length_counts.plot.bar(
    title='Genre Name Max Lengths Distribution',
    figsize=(14, 7),
    xlabel='Max Length',
    ylabel='Count',
)
plt.show()

In [None]:
# Rows whose longest genre name exceeds 30 characters get the placeholder genre;
# the helper length column is no longer needed afterwards.
too_long = data['Genre Max Length'] > 30
data.loc[too_long, 'Genre'] = ["Not Precised"] * int(too_long.sum())
data.drop(columns=['Genre Max Length'], inplace=True)

In [None]:
# Renumber the rows from 0 in place; drop=True discards the old index instead
# of keeping it as a column.
data.reset_index(drop=True,inplace=True)

In [None]:
# Display the processed DataFrame.
data

# Exploratory Data Analysis

## Tracks

### Example of Tracks Position Evolution

In [None]:
# Example tracks to chart; names are matched exactly against the cleaned "Track Name" column.
track_names = ["Bodak Yellow", "Taki Taki", "Despacito", "Shape of You", "All I Want for Christmas Is You",
               "bad guy", "One Kiss", "Dance Monkey", "Peaches", "Señorita", "thank u, next"]

In [None]:
fig = go.Figure()

# One bar trace per example track; only the first track of the list starts visible.
for name in track_names:
    track_rows = data.loc[data['Track Name'] == name]
    fig.add_trace(go.Bar(x=track_rows['Date'],
                         y=track_rows['Position'],
                         visible=(name == track_names[0])))

# Each dropdown entry shows exactly one trace and updates the title to match.
buttons = [
    dict(method="update",
         label=name,
         args=[{"visible": [other == name for other in track_names]},
               {"title": f"<b>{name}</b> - Evolution of Position in Spotify"}])
    for name in track_names
]

fig.update_layout(title=f"<b>{track_names[0]}</b> - Evolution of Position in Spotify",
                  xaxis_title="Date", yaxis_title="Position",
                  updatemenus=[{"buttons": buttons, "active": 0, "showactive": False,
                                "direction": "down", "x": 0.1, "y": 1.5}])

fig.show()

### Most Lasting in Top Positions

In [None]:
# Number of chart rows at position 1 for each track name, keeping the ten largest.
# After count() every column holds the group's row count; "Position" is used as the counter.
first_position_occurences = (
    data.loc[data['Position'] == 1]
    .groupby('Track Name')
    .count()
    .sort_values(by="Position", ascending=False)
    .reset_index()
    .head(10)
)

In [None]:
# Bar chart: row count at position 1 for each of the ten leading tracks.
fig = go.Figure(data=[
    go.Bar(x=first_position_occurences['Track Name'],
           y=first_position_occurences['Position'])
])

fig.update_layout(
    title="Spotify Top 10 Tracks Lasting in First Position",
    xaxis_title="Track Name",
    yaxis_title="Number of Days",
)

fig.show()

In [None]:
fig = go.Figure()

top_positions_idxs = [5, 10, 50, 100]

# One trace per tier: the ten track names with the most rows inside that tier.
# Only the first tier starts visible.
for t in top_positions_idxs:
    top_positions_occurences = (
        data.loc[data[f'In Top {t}'] == 1]
        .groupby('Track Name')
        .count()
        .sort_values(by="Position", ascending=False)
        .reset_index()
        .head(10)
    )
    fig.add_trace(go.Bar(x=top_positions_occurences['Track Name'],
                         y=top_positions_occurences['Position'],
                         visible=(t == top_positions_idxs[0])))

# Each dropdown entry shows the trace of one tier and retitles the figure.
buttons = [
    dict(method="update",
         label=f"Top {tier}",
         args=[{"visible": [idx == tier for idx in top_positions_idxs]},
               {"title": f"Spotify Top 10 Tracks Most Lasting In Top {tier} Positions"}])
    for tier in top_positions_idxs
]

fig.update_layout(title=f"Spotify Top 10 Tracks Most Lasting in Top {top_positions_idxs[0]} Positions",
                  xaxis=go.layout.XAxis(tickangle=15, title="Track Name"),
                  yaxis_title="Number of Days",
                  updatemenus=[{"buttons": buttons, "active": 0, "showactive": False,
                                "direction": "down", "x": 0.1, "y": 1.5}])

fig.show()

### Most Streamed Tracks

In [None]:
# Total streams per track name, ten largest. Only "Streams" is summed: summing
# the whole frame would also try to add the text/list columns ("Date", "Genre",
# "Featuring"), which can raise TypeError on pandas >= 2.0, where
# numeric_only defaults to False.
most_streamed_tracks = (
    data.groupby('Track Name')['Streams'].sum()
    .sort_values(ascending=False)
    .reset_index()
)
most_streamed_tracks = most_streamed_tracks[['Track Name', 'Streams']][:10]

fig = go.Figure()

fig.add_trace(go.Bar(x=most_streamed_tracks['Track Name'],
                     y=most_streamed_tracks['Streams']))

fig.update_layout(title="Spotify Top 10 Most Streamed Tracks",
                  xaxis_title="Track Name", yaxis_title="Streams")

fig.show()

In [None]:
# Totals per (year, track name). numeric_only=True restricts the sum to the
# numeric columns; on pandas >= 2.0 a plain .sum() also tries to add the
# text/list columns ("Date", "Genre", "Featuring") and can raise TypeError.
yearly_streams = data.groupby(['Year', 'Track Name']).sum(numeric_only=True).reset_index()
yearly_streams = yearly_streams.sort_values(by=['Year', 'Streams'], ascending=False)

years = data.Year.unique()

fig = go.Figure()

# One trace per year with that year's ten most streamed tracks; only the first
# year of `years` starts visible.
for year in years:
    year_data = yearly_streams.loc[yearly_streams.Year == year].reset_index(drop=True)[:10]
    fig.add_trace(go.Bar(x=year_data['Track Name'],
                         y=year_data['Streams'],
                         visible=year==years[0]))

fig.update_layout(title=f"Spotify Top 10 Most Streamed Tracks in {years[0]}",
                  xaxis = go.layout.XAxis(tickangle=15, title="Track Name"), yaxis_title="Streams")

# Each dropdown entry shows the trace of one year and retitles the figure.
buttons = []
for year in years:
    buttons.append(dict(method="update",
                        label=str(year),
                        args=[{"visible":[y==year for y in years]},
                              {"title":f"Spotify Top 10 Most Streamed Tracks in {year}"}]
                       ))

fig.update_layout(updatemenus=[{"buttons":buttons, "active":0, "showactive":False, "direction":"down",
                               "x":0.1, "y":1.5}])

fig.show()

In [None]:
# Totals per (calendar month, track name), months pooled across all years.
# numeric_only=True restricts the sum to the numeric columns; on pandas >= 2.0
# a plain .sum() also tries to add the text/list columns ("Date", "Genre",
# "Featuring") and can raise TypeError.
monthly_streams = data.groupby(['Month', 'Track Name']).sum(numeric_only=True).reset_index()
monthly_streams = monthly_streams.sort_values(by=['Month', 'Streams'], ascending=False)

months = data.Month.unique()

fig = go.Figure()

# One trace per month with that month's ten most streamed tracks; only the
# first month of `months` starts visible.
for month in months:
    month_data = monthly_streams.loc[monthly_streams.Month == month].reset_index(drop=True)[:10]
    fig.add_trace(go.Bar(x=month_data['Track Name'],
                         y=month_data['Streams'],
                         visible=month==months[0]))

fig.update_layout(title=f"Spotify Top 10 Most Streamed Tracks in {calendar.month_name[months[0]]}",
                  xaxis = go.layout.XAxis(tickangle=15, title="Track Name"), yaxis_title="Streams")

# Each dropdown entry shows the trace of one month and retitles the figure.
buttons = []
for month in months:
    buttons.append(dict(method="update",
                        label=calendar.month_name[month],
                        args=[{"visible":[m==month for m in months]},
                              {"title":f"Spotify Top 10 Most Streamed Tracks in {calendar.month_name[month]}"}]
                       ))

fig.update_layout(updatemenus=[{"buttons":buttons, "active":0, "showactive":False, "direction":"down",
                               "x":0.1, "y":1.5}])

fig.show()

In [None]:
# Highest single-row "Streams" value reached by each track name, ten largest.
daily_records = (
    data.groupby('Track Name')['Streams']
    .max()
    .reset_index()
    .sort_values(by='Streams', ascending=False)
    .head(10)
)

fig = go.Figure(data=[
    go.Bar(x=daily_records['Track Name'],
           y=daily_records['Streams'])
])

fig.update_layout(
    title="Spotify Top 10 Most Streamed-In-a-Day Tracks",
    xaxis_title="Track Name",
    yaxis_title="Streams",
)

fig.show()

In [None]:
# Highest single-row "Streams" value of each track name within each year.
daily_records_by_year = (
    data.groupby(['Year', 'Track Name'])['Streams']
    .max()
    .reset_index()
    .sort_values(by=['Year', 'Streams'], ascending=False)
)

years = data.Year.unique()

fig = go.Figure()

# One trace per year holding its ten highest records; only the first year starts visible.
for year in years:
    in_year = daily_records_by_year.Year == year
    daily_records_year = daily_records_by_year.loc[in_year].reset_index(drop=True).head(10)
    fig.add_trace(go.Bar(x=daily_records_year['Track Name'],
                         y=daily_records_year['Streams'],
                         visible=(year == years[0])))

# Each dropdown entry shows the trace of one year and retitles the figure.
buttons = [
    dict(method="update",
         label=str(year),
         args=[{"visible": [y == year for y in years]},
               {"title": f"Spotify Top 10 Most Streamed-in-a-Day Tracks - {year}"}])
    for year in years
]

fig.update_layout(title=f"Spotify Top 10 Most Streamed-in-a-Day Tracks - {years[0]}",
                  xaxis=go.layout.XAxis(tickangle=15, title="Track Name"),
                  yaxis_title="Streams",
                  updatemenus=[{"buttons": buttons, "active": 0, "showactive": False,
                                "direction": "down", "x": 0.1, "y": 1.5}])

fig.show()

## Artists

### Most Streamed Artists

In [None]:
# Total streams per artist, ten largest. "Streams" is selected before summing:
# summing the whole frame first would also try to add the text/list columns
# ("Date", "Genre", "Featuring"), which can raise TypeError on pandas >= 2.0.
most_streamed_artists = (
    data.groupby('Artist')['Streams'].sum()
    .reset_index()
    .sort_values(by="Streams", ascending=False)[:10]
)

fig = go.Figure()

fig.add_trace(go.Bar(x=most_streamed_artists['Artist'],
                     y=most_streamed_artists['Streams']))

fig.update_layout(title="Spotify Top 10 Streamed Artists",
                  xaxis_title="Artist", yaxis_title="Streams")

fig.show()

In [None]:
# Total streams per (year, artist). "Streams" is selected before summing so the
# text/list columns are never added up (a plain .sum() over the whole frame can
# raise TypeError on pandas >= 2.0).
yearly_artist_streams = data.groupby(['Year', 'Artist'])['Streams'].sum()
yearly_artist_streams = yearly_artist_streams.reset_index().sort_values(by=['Year', 'Streams'], ascending=False)

# Ten most streamed artists of every year. head(10) keeps the descending-streams
# order inside each year; the stable sort then puts the years in ascending order
# for the animation without disturbing that ranking. This replaces growing a
# DataFrame with pd.concat inside a loop.
df = (
    yearly_artist_streams.groupby('Year')
    .head(10)
    .sort_values(by='Year', kind='mergesort')
)

fig = px.bar(data_frame=df, x='Artist', y='Streams', animation_frame="Year")

fig.update_layout(title="Spotify Most Streamed Artists By Year", xaxis_title="Artist", yaxis_title="Yearly Streams")

fig.show()

In [None]:
# Daily stream totals of the ten most streamed artists. "Streams" is selected
# before summing so the text/list columns are never added up (a plain .sum()
# over the whole frame can raise TypeError on pandas >= 2.0).
top_10_artists_data = data.loc[data.Artist.isin(most_streamed_artists.Artist.unique())]
top_10_artists_data = top_10_artists_data.groupby(['Artist', 'Date'])['Streams'].sum().reset_index()
top_10_artists_data = top_10_artists_data.sort_values(by=['Artist','Date'])

fig = go.Figure()

# One line per artist, dates on the x axis.
for artist in top_10_artists_data.Artist.unique():
    artist_data = top_10_artists_data.loc[top_10_artists_data.Artist == artist]
    fig.add_trace(go.Scatter(x=artist_data['Date'], y=artist_data['Streams'], name=artist))

fig.update_layout(xaxis_title="Date", yaxis_title="Daily Streams", title="Daily Streams Evolution of Spotify Top 10 Artists")

fig.show()

In [None]:
# Daily totals per (artist, date). numeric_only=True restricts the sum to the
# numeric columns; on pandas >= 2.0 a plain .sum() also tries to add the
# text/list columns ("Genre", "Featuring", ...) and can raise TypeError.
cumsum_streams_data = data.groupby(['Artist', 'Date']).sum(numeric_only=True).reset_index()
# The groupby returns rows ordered by Artist, then Date ("YYYY-MM-DD" strings sort
# chronologically), so a grouped cumsum yields each artist's running total in
# date order without looping over the artists one by one.
cumsum_streams_data['Cummulative Streams'] = cumsum_streams_data.groupby('Artist')['Streams'].cumsum()

In [None]:
# Running totals restricted to the artists of the daily top-10 chart, ordered
# by artist and date.
is_top_10_artist = cumsum_streams_data.Artist.isin(top_10_artists_data.Artist.unique())
top_10_cumsum_data = (
    cumsum_streams_data.loc[is_top_10_artist]
    .sort_values(by=['Artist', 'Date'])
)

fig = go.Figure()

# One line per artist showing how the running total grows over time.
for artist in top_10_cumsum_data.Artist.unique():
    artist_data = top_10_cumsum_data.loc[top_10_cumsum_data.Artist == artist]
    fig.add_trace(go.Scatter(x=artist_data['Date'],
                             y=artist_data['Cummulative Streams'],
                             name=artist))

fig.update_layout(
    xaxis_title="Date",
    yaxis_title="Streams",
    title="Cumulative Streams Evolution of Spotify Top 10 Artists",
)

fig.show()