In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Show every column when a DataFrame is displayed. Call the documented
# top-level `pd.set_option` rather than going through `pd.pandas`, which is
# not part of the public API.
pd.set_option('display.max_columns', None)


**Primary**:
- id (Id of track generated by Spotify)

**Numerical**:
- acousticness (Ranges from 0 to 1)
- danceability (Ranges from 0 to 1)
- energy (Ranges from 0 to 1)
- duration_ms (Integer typically ranging from 200k to 300k)
- instrumentalness (Ranges from 0 to 1)
- valence (Ranges from 0 to 1)
- popularity (Ranges from 0 to 100)
- tempo (Float typically ranging from 50 to 150)
- liveness (Ranges from 0 to 1)
- loudness (Float typically ranging from -60 to 0)
- speechiness (Ranges from 0 to 1)
- year (Ranges from 1921 to 2020)


**Dummy**:
- mode (0 = Minor, 1 = Major)
- explicit (0 = No explicit content, 1 = Explicit content)

**Categorical**:
- key (Musical key of the track, encoded as an integer from 0 to 11 in pitch-class order: C = 0, C# = 1, and so on…)
- artists (List of artists mentioned)
- release_date (Date of release mostly in yyyy-mm-dd format, however precision of date may vary)
- name (Name of the song)

In [None]:
# Read the Spotify tracks CSV from the Kaggle input directory and preview
# the first five rows.
df = pd.read_csv(
    filepath_or_buffer='../input/spotify-dataset-19212020-160k-tracks/data.csv'
)
df.head(n=5)

In [None]:
# Preview the last five rows of the raw frame.
df.tail(n=5)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of missing values in each column (`isna` is the same check as `isnull`).
df.isna().sum()

In [None]:
# Remove the `id` column. Rebinding `df` with `errors='ignore'` keeps the
# cell idempotent: re-running it no longer raises KeyError once `id` is gone.
df = df.drop(columns=['id'], errors='ignore')
df.head()

# Explicit vs Non Explicit

In [None]:
# Count of explicit vs non-explicit tracks. The column is passed by keyword
# because current seaborn treats the first positional argument as `data`,
# not as the x variable.
sns.countplot(x='explicit', data=df)

# Valence vs Year

In [None]:

# Yearly median of `valence`, drawn on an explicitly created 10x10 axes.
fig, ax = plt.subplots(figsize=(10, 10))
median_by_year = df.groupby('year')['valence'].median()
median_by_year.plot(ax=ax)
ax.set_xlabel('year')
ax.set_ylabel('Median valence')



# Energy vs Year

In [None]:

# Yearly median of `energy`, drawn on an explicitly created 10x10 axes.
fig, ax = plt.subplots(figsize=(10, 10))
median_by_year = df.groupby('year')['energy'].median()
median_by_year.plot(ax=ax)
ax.set_xlabel('year')
ax.set_ylabel('Median energy')



# Popularity vs Year

In [None]:

# Yearly median of `popularity`, drawn on an explicitly created 10x10 axes.
fig, ax = plt.subplots(figsize=(10, 10))
median_by_year = df.groupby('year')['popularity'].median()
median_by_year.plot(ax=ax)
ax.set_xlabel('year')
ax.set_ylabel('Median popularity')



# Loudness vs Year

In [None]:

# Yearly median of `loudness`, drawn on an explicitly created 10x10 axes.
fig, ax = plt.subplots(figsize=(10, 10))
median_by_year = df.groupby('year')['loudness'].median()
median_by_year.plot(ax=ax)
ax.set_xlabel('year')
ax.set_ylabel('Median loudness')



# Speechiness vs Year

In [None]:

# Yearly median of `speechiness`, drawn on an explicitly created 10x10 axes.
fig, ax = plt.subplots(figsize=(10, 10))
median_by_year = df.groupby('year')['speechiness'].median()
median_by_year.plot(ax=ax)
ax.set_xlabel('year')
ax.set_ylabel('Median speechiness')



# Tempo vs Year

In [None]:

# Yearly median of `tempo`, drawn on an explicitly created 10x10 axes.
fig, ax = plt.subplots(figsize=(10, 10))
median_by_year = df.groupby('year')['tempo'].median()
median_by_year.plot(ax=ax)
ax.set_xlabel('year')
ax.set_ylabel('Median tempo')



# Liveness vs Year

In [None]:

# Yearly median of `liveness`, drawn on an explicitly created 10x10 axes.
fig, ax = plt.subplots(figsize=(10, 10))
median_by_year = df.groupby('year')['liveness'].median()
median_by_year.plot(ax=ax)
ax.set_xlabel('year')
ax.set_ylabel('Median liveness')



# Instrumentalness vs Year

In [None]:

# Yearly median of `instrumentalness`, drawn on an explicitly created 10x10 axes.
fig, ax = plt.subplots(figsize=(10, 10))
median_by_year = df.groupby('year')['instrumentalness'].median()
median_by_year.plot(ax=ax)
ax.set_xlabel('year')
ax.set_ylabel('Median instrumentalness')



# Duration of song vs Year

In [None]:

# Yearly median of `duration_ms`, drawn on an explicitly created 10x10 axes.
fig, ax = plt.subplots(figsize=(10, 10))
median_by_year = df.groupby('year')['duration_ms'].median()
median_by_year.plot(ax=ax)
ax.set_xlabel('year')
ax.set_ylabel('Median duration_ms')



# Acousticness vs Year

In [None]:

# Yearly median of `acousticness`, drawn on an explicitly created 10x10 axes.
fig, ax = plt.subplots(figsize=(10, 10))
median_by_year = df.groupby('year')['acousticness'].median()
median_by_year.plot(ax=ax)
ax.set_xlabel('year')
ax.set_ylabel('Median acousticness')



# Valence w.r.t. Year, explicit in colors

In [None]:

# Scatter of valence against year, one colour per `explicit` value.
# `height` replaces the old `size` keyword, which newer seaborn no longer accepts.
grid = sns.FacetGrid(df, hue='explicit', height=10)
grid.map(plt.scatter, 'year', 'valence')
grid.add_legend()


# Energy w.r.t Year, explicit or not determined by colors

In [None]:

# Scatter of energy against year, one colour per `explicit` value.
# `height` replaces the old `size` keyword, which newer seaborn no longer accepts.
grid = sns.FacetGrid(df, hue='explicit', height=10)
grid.map(plt.scatter, 'year', 'energy')
grid.add_legend()


# Popularity w.r.t Year, explicit in colors

In [None]:

# Scatter of popularity against year, one colour per `explicit` value.
# `height` replaces the old `size` keyword, which newer seaborn no longer accepts.
grid = sns.FacetGrid(df, hue='explicit', height=10)
grid.map(plt.scatter, 'year', 'popularity')
grid.add_legend()


# Duration w.r.t Years, explicit in colors

In [None]:

# Scatter of duration_ms against year, one colour per `explicit` value.
# `height` replaces the old `size` keyword, which newer seaborn no longer accepts.
grid = sns.FacetGrid(df, hue='explicit', height=10)
grid.map(plt.scatter, 'year', 'duration_ms')
grid.add_legend()


# Danceability w.r.t Year, explicit in colors

In [None]:

# Scatter of danceability against year, one colour per `explicit` value.
# `height` replaces the old `size` keyword, which newer seaborn no longer accepts.
grid = sns.FacetGrid(df, hue='explicit', height=10)
grid.map(plt.scatter, 'year', 'danceability')
grid.add_legend()


# Loudness w.r.t Year, explicit in colors

In [None]:

# Scatter of loudness against year, one colour per `explicit` value.
# `height` replaces the old `size` keyword, which newer seaborn no longer accepts.
grid = sns.FacetGrid(df, hue='explicit', height=10)
grid.map(plt.scatter, 'year', 'loudness')
grid.add_legend()


# Tempo vs Year, explicit in colors

In [None]:

# Scatter of tempo against year, one colour per `explicit` value.
# `height` replaces the old `size` keyword, which newer seaborn no longer accepts.
grid = sns.FacetGrid(df, hue='explicit', height=10)
grid.map(plt.scatter, 'year', 'tempo')
grid.add_legend()


# Speechiness vs Year, explicit in colors

In [None]:

# Scatter of speechiness against year, one colour per `explicit` value.
# `height` replaces the old `size` keyword, which newer seaborn no longer accepts.
grid = sns.FacetGrid(df, hue='explicit', height=10)
grid.map(plt.scatter, 'year', 'speechiness')
grid.add_legend()


# Correlation Chart

In [None]:
# Correlation heatmap of the numeric features.
# At this point `df` still holds string columns (artists, name, release_date);
# newer pandas raises when `corr()` meets them, so restrict to numeric columns.
correlations = df.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(20, 20))
# Draw on the axes created above rather than on whichever axes is current.
sns.heatmap(correlations, annot=True, ax=ax)


In [None]:
# Preview the first rows of the current frame.
df.head()

In [None]:
# Remove the free-text columns so only numeric features remain.
# Rebinding with `errors='ignore'` keeps the cell safe to re-run, whereas the
# in-place drop raised KeyError on a second execution.
df = df.drop(columns=['artists', 'name', 'release_date'], errors='ignore')
df.head()

In [None]:
# Column dtypes and non-null counts of the remaining columns.
df.info()

# Standard scaling of the data

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every remaining column before clustering.
sc = StandardScaler()
df_scaled = sc.fit_transform(df)
# `fit_transform` returns a bare array; restore the column labels and show
# only the first rows so the preview stays readable.
pd.DataFrame(df_scaled, columns=df.columns, index=df.index).head()


# Elbow method to find the optimal number of clusters for K-Means

In [None]:
from sklearn.cluster import KMeans

# Within-cluster sum of squares (inertia) for each candidate cluster count.
cluster_counts = range(1, 50)
wcss = []
for i in cluster_counts:
    # Fixed seed so the elbow curve is reproducible between runs.
    kmeans = KMeans(n_clusters = i, random_state = 42)
    kmeans.fit(df_scaled)
    wcss.append(kmeans.inertia_)

# Plot against the real cluster counts; plotting `wcss` alone put k=1 at x=0
# and shifted the whole curve one position to the left.
plt.plot(cluster_counts, wcss, 'bx-')
plt.xlabel('Clusters')
plt.ylabel('Scores WCSS')


# Applying K-Means

In [None]:
# Fit the final model with 12 clusters. The seed is fixed so that cluster ids
# (and the per-cluster frames built from them) stay the same across reruns.
kmeans = KMeans(n_clusters = 12, random_state = 42)
kmeans.fit(df_scaled)
labels = kmeans.labels_
labels


In [None]:

# Attach each row's cluster label as a new `cluster` column.
# `assign` matches labels to rows by position; the previous `pd.concat(axis=1)`
# aligned on index and would misalign if `df` did not have a 0..n-1 index.
df_cluster = df.assign(cluster = labels)
df_cluster.head()


# Plot a histogram of each column for each cluster

In [None]:
# One figure per feature, with one histogram panel per cluster.
# Cluster ids come from the data instead of a hard-coded 12, so the cell
# still works if the number of clusters is changed.
cluster_ids = sorted(df_cluster['cluster'].unique())
for i in df.columns:
    plt.figure(figsize = (35, 5))
    for position, j in enumerate(cluster_ids, start = 1):
        plt.subplot(1, len(cluster_ids), position)
        cluster = df_cluster[df_cluster['cluster'] == j]
        cluster[i].hist(bins = 20)
        plt.title('{} \nCluster {}'.format(i, j))
    # Render each figure as soon as it is complete instead of keeping
    # every figure open until the loop ends.
    plt.show()


# Segregating the clusters in different dataframes

In [None]:

# Split `df_cluster` into one frame per cluster id (0 through 11), bound to
# the names df0 ... df11.
(df0, df1, df2, df3, df4, df5,
 df6, df7, df8, df9, df10, df11) = (
    df_cluster[df_cluster['cluster'] == cluster_id]
    for cluster_id in range(12)
)
