# Spotify Data Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the tracks dataset; the path is relative to the current working directory
df_tracks = pd.read_csv('tracks.csv')

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts)
df_tracks.info()

In [None]:
# Preview the first rows of the raw data
df_tracks.head()

In [None]:
# Count the missing values in each column
df_tracks.isna().sum()

In [None]:
# The ten tracks with the lowest popularity score
sorted_df = df_tracks.sort_values(by='popularity').iloc[:10]
sorted_df

In [None]:
# Summary statistics, transposed so that each feature is a row
df_tracks.describe().T

In [None]:
# Tracks with a popularity score above 90, highest first; display the top ten
most_popular = df_tracks.loc[df_tracks['popularity'] > 90].sort_values(
    by='popularity', ascending=False
)
most_popular.head(10)

In [None]:
# Parse the release dates, then use them as the (datetime) row index
df_tracks['release_date'] = pd.to_datetime(df_tracks['release_date'])
df_tracks.set_index('release_date', inplace=True)
df_tracks.head()

In [None]:
# Indexing along both axes: pick two columns by label, then one row by position

artist_and_name = df_tracks.loc[:, ['artists', 'name']]
artist_and_name.iloc[18]

In [None]:
# Converting the duration from milliseconds into whole seconds.
# The arithmetic is vectorised instead of calling a Python lambda once per row;
# Series.round() rounds halves to even, the same rule as the built-in round(),
# and the explicit int64 cast keeps the column integer-typed on every platform.

df_tracks["duration"] = (df_tracks["duration_ms"] / 1000).round().astype("int64")
# The millisecond column is no longer needed once the seconds column exists
df_tracks.drop(columns="duration_ms", inplace=True)

In [None]:
# Preview the frame again (now with 'duration' in place of 'duration_ms')
df_tracks.head()

In [None]:
# The most popular artist and song
# Which artist has the most listeners on Spotify?

In [None]:
# Finding the Pearson correlation between the numerical features.
# 'key', 'mode' and 'explicit' are left out (presumably because they are
# categorical codes rather than continuous measurements -- confirm).
# Only numeric/bool columns are kept before calling corr(): the frame still
# holds text columns such as 'artists' and 'name', and pandas >= 2.0 raises on
# those instead of silently skipping them as older versions did.
numeric_features = (
    df_tracks
    .drop(['key', 'mode', 'explicit'], axis=1)
    .select_dtypes(include=['number', 'bool'])
)
corr_df = numeric_features.corr(method='pearson')

# Plotting the correlation matrix as an annotated heatmap
plt.figure(figsize=(14,6))

# vmin/vmax/center pin the colour scale to the full [-1, 1] correlation range
heatmap = sns.heatmap(corr_df, annot=True, fmt=".1g", vmin=-1, vmax=1, center=0, cmap="inferno", linewidths=1, linecolor="Black")
heatmap.set_title("Correlation HeatMap Between Variable")
# Rotate the x tick labels to vertical
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=90)

In [None]:
# Taking a 0.4% sample from the population for further analysis.
# A fixed seed keeps the sample, and every plot built from it, identical
# across re-runs of the notebook.
sample_size = int(0.004 * len(df_tracks))
sample_df = df_tracks.sample(n=sample_size, random_state=42)

In [None]:
# Preview the first rows of the sample
sample_df.head()

In [None]:
# Regression plot of loudness against energy.
# The population heatmap reported a correlation of about 0.8 between the two,
# so check whether the sample shows the same relationship.

plt.figure(figsize=(10, 6))
energy_ax = sns.regplot(x="energy", y="loudness", data=sample_df, color="c")
energy_ax.set(title="Loudness Vs. Energy Correlation")

In [None]:
# Regression plot of popularity against acousticness, on the same sample

plt.figure(figsize=(10, 6))
acoustic_ax = sns.regplot(x="acousticness", y="popularity", data=sample_df, color="b")
acoustic_ax.set(title="Popularity Vs. Acousticness Correlation")

In [None]:
# Copy the 'release_date' index into a datetime column and derive the release year

release_index = df_tracks.index.get_level_values('release_date')
df_tracks['dates'] = pd.to_datetime(release_index)
years = df_tracks['dates'].dt.year

In [None]:
# Histogram of the number of songs released in each year

year_grid = sns.displot(data=years, kind='hist', discrete=True, height=5, aspect=2)
year_grid.set(title='Number of songs per year')

In [None]:
# Bar chart of song duration (seconds) per release year; barplot aggregates
# the durations within each year with its default estimator (the mean).
# 'fig_dims' stands for figure dimensions
# 'fig, ax' stands for figure and axes

total_dr = df_tracks.duration
fig_dims = (18, 7)
fig, ax = plt.subplots(figsize=fig_dims)
# errwidth=False gives the error-bar lines zero width, so only the bars show.
# The title is set on 'ax' directly so that 'fig' keeps pointing at the Figure
# rather than being overwritten by the return value of .set().
sns.barplot(x=years, y=total_dr, ax=ax, errwidth=False)
ax.set(title="Year Vs. Duration")
plt.xticks(rotation=90)

In [None]:
# Average song duration (seconds) per release year, drawn as a line plot

total_dr = df_tracks.duration
sns.set_style(style='whitegrid')
fig_dims = (10, 5)
fig, ax = plt.subplots(figsize=fig_dims)
# The title is set on 'ax' directly so that 'fig' keeps pointing at the Figure
# rather than being overwritten by the return value of .set().
sns.lineplot(x=years, y=total_dr, ax=ax)
ax.set(title="Years Vs. Duration")
plt.xticks(rotation=60)

In [None]:
# Load a second dataset from 'SpotifyFeatures.csv' (path relative to the working directory)
df_genre = pd.read_csv('SpotifyFeatures.csv')