# Exploring Sample Spotify Dataset to get Insights using Pandas & Matplotlib

##### Requirements step (uncomment the code below to install any missing library if you are not using a Jupyter or Anaconda installation)

In [None]:
# Uncomment to install the libraries this notebook imports.
# (Inside Jupyter, `%pip install ...` is preferable to `!pip` because it
# installs into the environment of the running kernel.)
# !pip install pandas numpy matplotlib

#### Import the libraries

In [None]:
import pandas as pd  # Data preprocessing
import numpy as np   # Mathematical computation
import matplotlib.pyplot as plt  # Visualization

#### Read the dataset

In [None]:
# Load the dataset from a CSV file; the path is relative to the working directory.
df = pd.read_csv('../data/processed/Spotify Top 100 most Streamed.csv')
# df is a DataFrame
df.head()  # first 5 rows

In [None]:
# (number of rows, number of columns)
df.shape
# rows = 100, columns = 14

### Data Preprocessing

#### 1) Handle the Null Values

In [None]:
# Count the missing entries in each column (isna performs the same check as isnull).
df.isna().sum()

#### Inference
The data does not contain any null values.

#### 2) Handle Duplicates

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

#### Inference
The data does not contain any duplicated records.

#### 3) Check data types

In [None]:
# Keep the column labels in `col`; later cells display it again as a
# reminder of which columns are available.
col = df.columns
print(col)

In [None]:
# Data type of each column.
df.dtypes

#### Data Analysis using Pandas

#### 1) Top N Artists with most songs in top 100 spotify songs

In [None]:
def topN_feat_with_most_songs(feat, N, data=None):
    """Return the N most frequent values of a column together with their counts.

    Parameters
    ----------
    feat : str
        Name of the column to count (e.g. 'artist', 'top genre', 'year').
    N : int
        Number of top entries to keep.
    data : pandas.DataFrame, optional
        Frame to analyse. Defaults to the notebook-level ``df`` so existing
        two-argument calls keep working unchanged.

    Returns
    -------
    pandas.Series
        Counts indexed by the column's values, ordered from most to least
        frequent, truncated to at most N entries.
    """
    # The global is looked up only when no frame is passed, so the function can
    # also be applied to any other frame without relying on notebook state.
    frame = df if data is None else data
    return frame[feat].value_counts().head(N)

In [None]:
# Earlier artist-only version, kept for reference only. It is superseded by
# topN_feat_with_most_songs('artist', N), which works for any column.
# def topN_Artists_with_most_songs(N):
#     res = df['artist'].value_counts().head(N)
#     return res

In [None]:
# The 15 artists with the most songs in the dataset.
topN_feat_with_most_songs('artist', 15)

#### 2) Top N Genres with most songs in top 100 spotify songs

In [None]:
# Column names, for reference.
col

In [None]:
# The 10 most frequent genres in the dataset.
topN_feat_with_most_songs('top genre', 10)

#### 3) Percentage Contribution of top genres in Top N Artists with most songs

In [None]:
# Names of the seven artists with the most songs.
top7_artist = topN_feat_with_most_songs('artist', 7).index

# Keep only the songs by those artists and preview the subset.
top7_artists_df = df.loc[df['artist'].isin(top7_artist)]
top7_artists_df.head()

In [None]:
# Share of each genre among the top-7 artists' songs, expressed in percent.
top7_artists_df['top genre'].value_counts(normalize=True).mul(100)

In [None]:
# Manual sanity check of two shares as fractions (not percentages).
# NOTE(review): presumably 8 and 7 are the two largest genre counts and 29 is
# the number of songs by the top-7 artists — confirm against the output above.
print(8/29)
print(7/29)

#### 4) Top 5 years contributing most to the Spotify top 100 playlist

In [None]:
# The 5 years with the most songs in the dataset.
topN_feat_with_most_songs('year', 5)

In [None]:
# Column names, for reference.
print(col)

#### 5) Compute Top Genre wise mean of Energy and Popularity

In [None]:
# T is transpose
res5 = df.groupby('top genre')[['energy', 'popularity']].mean().T
res5

#### 6) Compute Top Genre wise min, max, mean of Energy and mean and median of Liveness

In [None]:
# agg = aggregate
res6 = df.groupby('top genre').agg({'energy':  ['min', 'max', 'mean'],
                                    'liveness': ['mean', 'median']})
res6.head()

#### 7) Compute artist wise min, max and mean of popularity and mean & count of liveness

In [None]:
# Standard deviation = spread of the data around the mean =>
# sqrt(sum((xi - mean)^2) / N)
# (pandas' .std() divides by N - 1 by default, i.e. ddof=1.)

In [None]:
# Column names, for reference.
print(col)

In [None]:
# Per-artist summary: popularity min / max / mean, plus liveness mean and the
# number of songs (count). The statistics are named by string because passing
# the builtins `min` / `max` to .agg() is deprecated and raises a FutureWarning
# on pandas >= 2.1; the resulting column labels ('min', 'max') are unchanged.
res7 = df.groupby('artist').agg({'popularity': ['min', 'max', 'mean'],
                                 'liveness': ['mean', 'count']})
res7.head()

#### 8) Generate the statistical summary of the data

In [None]:
# The statistical summary consists of count, mean, std, min, 25%, 50%, 75% and max.
# By default it is generated only for the numerical columns.
df.describe()

In [None]:
# Rows whose year is 1975.
df.loc[df['year'].eq(1975)]

In [None]:
# Box plot of energy: median, quartiles, and any points beyond the whiskers.
plt.boxplot(df['energy'])
# A title and axis label let the figure be read without the surrounding cells.
plt.title('Distribution of Energy')
plt.ylabel('Energy')
plt.show()

#### 9) Capture Outliers in Energy

##### Quartile Function & Formulas

```
Q1 = 1st Quartile = 25th percentile
Q2 = 2nd Quartile = 50th percentile (median)
Q3 = 3rd Quartile = 75th percentile
IQR = Inter Quartile Range = Q3 - Q1
bmin = Q1 - 1.5*IQR
bmax = Q3 + 1.5*IQR
Outliers are data points > bmax or < bmin

Reliance Mart -> Avg Daily Sales - 20-30K
1 or 2 days before 15Aug => avg daily sales = 45-55K
3 days before RakshaBandhan => avg daily sales = 55k-70K
next day after RakshaBandhan => avg daily sales = 8K-12K
Dec-Jan and July-Aug
```

In [None]:
# IQR rule: energy values lying more than 1.5 * IQR below the first quartile
# or above the third quartile are treated as outliers.
en_q1, en_q3 = df['energy'].quantile(0.25), df['energy'].quantile(0.75)
iqr = en_q3 - en_q1
en_min = en_q1 - 1.5*iqr
en_max = en_q3 + 1.5*iqr
en_outliers = df[(df['energy'] < en_min) | (df['energy'] > en_max)]
en_outliers

#### EDA(Exploratory Data Analysis) using Matplotlib and Pandas

#### 1) Depict Top N Artists with most songs in top 100 spotify songs

In [None]:
def plot_topN_col(feat, N, featname):
    """Draw a horizontal bar chart of the N most frequent values of a column.

    Parameters
    ----------
    feat : str
        Column of the notebook-level ``df`` whose values are counted.
    N : int
        Number of top entries to plot.
    featname : str
        Human-readable name used in the title and in the y-axis label.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    res = df[feat].value_counts().head(N)
    # An explicit figure/axes pair avoids drawing onto leftover pyplot state.
    _fig, ax = plt.subplots()
    ax.barh(res.index, res.values, color='orange', edgecolor='black')
    # barh puts the first entry at the bottom; flip the axis so the entry with
    # the most songs is shown at the top of the chart.
    ax.invert_yaxis()
    ax.set_title(f'Top {N} {featname} with most songs spotify top 100')
    ax.set_xlabel('Count')
    ax.set_ylabel(f'{featname} Name')
    ax.grid()
    plt.show()

#### Bar Chart

In [None]:
# Bar chart of the 7 artists with the most songs.
plot_topN_col('artist', 7, 'Artist')

In [None]:
# Bar chart of the 10 most frequent genres.
plot_topN_col('top genre', 10, 'Genre')

#### 3) Depict Percentage Contribution of top genres in Top 7 Artists with most songs

In [None]:
# Raw number of songs per genre among the top-7 artists' songs.
pres3 = top7_artists_df['top genre'].value_counts()
pres3

#### Pie Chart

In [None]:
# autopct = auto percentage
plt.pie(pres3.values, labels=pres3.index, autopct='%.0f%%')
plt.title('Percentage Contribution of top genres in Top 7 Artists with most songs')
plt.show()

####  Depict year wise number of songs in Spotify top 100

In [None]:
# The five years with the most songs, ordered by song count (not chronologically).
pres4 = df['year'].value_counts().head(5)
pres4

In [None]:
# lw  = linewidth,ms= markersize,
plt.plot(pres4.index, pres4.values, color='maroon', marker='o', lw=3, ms=10,
         linestyle='dotted', mfc='blue')
plt.title('Year wise number of songs in Spotify top 100')
plt.ylim(8, 22)
plt.grid()
plt.show()

In [None]:
# First 5 rows.
df.head()

In [None]:
# Summary statistics of song length (length in sec).
df['length'].describe()

#### Binning length of songs (seconds) into minute ranges

In [None]:
# Bucket song length (seconds) into ranges: 1-2, 2-3, 3-4, 4-5 and 5+ min.
# The last edge is open-ended (np.inf) so that songs longer than 6 minutes
# fall into '5+min' instead of becoming NaN and dropping out of the counts.
df['Length_Bins'] = pd.cut(df['length'], bins=[60, 120, 180, 240, 300, np.inf],
                           labels=['1-2min', '2-3min', '3-4min', '4-5min', '5+min'])
# Sort by bin (shortest to longest) rather than by frequency, so that the
# counts read as a distribution over song length.
res10 = df['Length_Bins'].value_counts().sort_index()
res10

In [None]:
# Bar chart of the number of songs in each length bucket.
plt.bar(res10.index, res10.values, color='cyan', edgecolor='black')
plt.title('Length of Songs')
# Axis labels so the figure can be read on its own.
plt.xlabel('Song length')
plt.ylabel('Number of songs')
plt.grid()
plt.show()

#### Histogram
Used to capture frequency Distribution

In [None]:
# Column names, for reference.
col

#### Depict the frequency distribution for speechiness

In [None]:
# Histogram of speechiness using matplotlib's default binning.
plt.hist(df['speechiness'], color='lightgreen', edgecolor='black')
# Title and axis labels so the figure can be read on its own.
plt.title('Frequency distribution of Speechiness')
plt.xlabel('Speechiness')
plt.ylabel('Number of songs')
plt.show()

In [None]:
# (number of rows, number of columns) of the full frame.
df.shape

In [None]:
# Shape of the subset with speechiness below 10; its row count can be
# compared with the full frame's row count.
df[df['speechiness'] < 10].shape

#### Depict Energy vs Liveness on a ScatterChart

In [None]:
# One diamond marker per song: energy on the x-axis, liveness on the y-axis.
plt.scatter(x=df['energy'], y=df['liveness'], s=50, marker='D', color='maroon')
plt.xlabel('Energy')
plt.ylabel('Liveness')
plt.title('Energy vs Liveness')
plt.show()

In [None]:
# Plots used in this notebook -
# Bar (to compare a numeric value across different categories)
# Pie Chart (percentage distribution)
# Line Chart (to represent a trend)
# Histogram (frequency distribution)
# Boxplot (quartile distribution)
# Scatter Chart (correlation)

----

### End Of EDA

----