### Importing the modules.

In [None]:
import pandas as pd
import numpy as np

### Loading and Parsing the CSV dataset file.

In [None]:
# Read the track-level feature table into memory and report how many rows it has.
df = pd.read_csv('./tracks_features.csv')
row_count = df.shape[0]
print("Length of the Dataset -", row_count)

### Some information about the data.

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Preview the first two rows of the raw data.
df.iloc[:2]

In [None]:
# Remove every space from the track and album titles so each becomes one token.
for text_column in ('name', 'album'):
    df[text_column] = df[text_column].str.replace(' ', '')

In [None]:
# Inspect the space-free track titles.
df.loc[:, 'name']

In [None]:
# Inspect the space-free album titles.
df.loc[:, 'album']

In [None]:
# Combine title and album into a single "<name>-<album>" label per row.
track_label = df['name'] + '-' + df['album']
df['track'] = track_label
df.head()

In [None]:
# Each value is presumably a stringified list such as "['id1', 'id2']" — confirm
# against the CSV. Drop the surrounding brackets, then remove *every* single
# quote: stripping quotes only at the two ends left stray "', '" fragments
# between the items of multi-artist rows ("id1', 'id2").
df['artist_ids'] = (
    df['artist_ids']
    .str.strip("[]")
    .str.replace("'", "", regex=False)
)
df['artist_ids']

In [None]:
# Each value is presumably a stringified list such as "['A', 'B']", with items
# that contain an apostrophe quoted with double quotes — confirm against the CSV.
artists = df['artists'].str.strip("[]")

# Collapse the quote-comma-quote separator between items into a bare comma.
# Stripping quotes only at the two ends left "A', 'B" for multi-artist rows.
artists = artists.str.replace(r"""['"],\s*['"]""", ",", regex=True)

# Drop the quote that opens the first item and the one that closes the last
# item; apostrophes inside a name are left untouched.
artists = artists.str.replace(r"""^['"]|['"]$""", "", regex=True)

# Remove spaces so each artist name becomes a single token.
df['artists'] = artists.str.replace(' ', '', regex=False)
df['artists']

In [None]:
# Number of distinct (non-missing) artist strings.
len(df['artists'].dropna().unique())

In [None]:
# Columns that are not used by the rest of the notebook.
unused_columns = [
    'name',
    'album',
    'track_number',
    'disc_number',
    'explicit',
    'key',
    'mode',
    'time_signature',
]
df.drop(columns=unused_columns, inplace=True)
df.info()

In [None]:
# Preview the first five rows after the column clean-up.
df.head()

### Normalisation

In [None]:
import scipy.stats as stats

# Give the duration column a unit-free name before it is standardised.
df.rename(columns={'duration_ms': 'duration'}, inplace=True)

# Replace each of these features with its z-score.
for feature in ('tempo', 'loudness', 'duration'):
    df[feature] = stats.zscore(df[feature])

df.head(5)

### Cosine Similarity

In [None]:
# TODO: cosine similarity between tracks is not implemented yet.

### Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of the 'danceability' feature.
sns.histplot(data=df, x='danceability')

In [None]:
# Distribution of the 'energy' feature.
sns.histplot(data=df, x='energy')

In [None]:
# Distribution of the z-scored 'loudness' feature.
sns.histplot(data=df, x='loudness')

In [None]:
# Heatmap of the single 'speechiness' column, one cell per row.
# NOTE(review): the sibling cells use histplot for feature distributions —
# confirm that a heatmap is intended here.
sns.heatmap(df.loc[:, ['speechiness']])

In [None]:
# Distribution of the 'acousticness' feature.
sns.histplot(data=df, x='acousticness')

In [None]:
# Heatmap of the single 'instrumentalness' column, one cell per row.
# NOTE(review): the sibling cells use histplot for feature distributions —
# confirm that a heatmap is intended here.
sns.heatmap(df.loc[:, ['instrumentalness']])

In [None]:
# Heatmap of the single 'liveness' column, one cell per row.
# NOTE(review): the sibling cells use histplot for feature distributions —
# confirm that a heatmap is intended here.
sns.heatmap(df.loc[:, ['liveness']])

In [None]:
# Distribution of the 'valence' feature.
sns.histplot(data=df, x='valence')

In [None]:
# Distribution of the z-scored 'tempo' feature.
sns.histplot(data=df, x='tempo')

### Scaling 'tempo'

In [None]:
from sklearn.preprocessing import MinMaxScaler

# `scaler` is re-fitted on 'duration' in a later cell, so keep it module-level.
scaler = MinMaxScaler()

# Min-max scale 'tempo' (the scaler expects a 2-D, single-column frame).
df['tempo'] = scaler.fit_transform(df[['tempo']])

# Sanity-check the new bounds. Series.max()/min() are vectorised and skip NaN,
# unlike the builtins, which iterate row by row and are order-dependent with NaN.
print(df["tempo"].max())
print(df["tempo"].min())

In [None]:
# Distribution of 'tempo' after min-max scaling.
sns.histplot(data=df, x='tempo')

In [None]:
# Distribution (histogram + KDE) of the z-scored 'duration', with the x-axis
# clipped to [-10, 10] for display.
sns.displot(data=df['duration'], kde=True)
plt.xlim(-10, 10)
plt.show()

# Actual extremes of the column. Series.max()/min() are vectorised and skip
# NaN, unlike the builtins, which iterate row by row in Python.
print(df["duration"].max())
print(df["duration"].min())

### Scaling 'duration'

In [None]:
# Re-fit the shared `scaler` on 'duration' and min-max scale the column.
df['duration'] = scaler.fit_transform(df[['duration']])

# Sanity-check the new bounds. Series.max()/min() are vectorised and skip NaN,
# unlike the builtins, which iterate row by row in Python.
print(df["duration"].max())
print(df["duration"].min())

In [None]:
# Distribution (histogram + KDE) of 'duration' after min-max scaling, with the
# x-axis clipped to [-1, 1] for display.
sns.displot(data=df['duration'], kde=True)
plt.xlim(-1, 1)
plt.show()

# Extremes of the scaled column. Series.max()/min() are vectorised and skip
# NaN, unlike the builtins, which iterate row by row in Python.
print(df["duration"].max())
print(df["duration"].min())

### Clustering

In [None]:
from sklearn.cluster import KMeans

# Audio features the clustering is computed on.
# NOTE(review): 'loudness' is z-scored only, whereas 'tempo' and 'duration' are
# additionally min-max scaled — confirm the mixed scales are intended, since
# K-Means is distance-based.
feature_columns = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration',
]
properties = df[feature_columns]

# Fixed seed so cluster assignments are reproducible between runs; this matches
# the seeded sampling used for the visualisation.
kmeans = KMeans(n_clusters=4, max_iter=1000, n_init=10, random_state=42)

# fit_predict fits the model and returns each row's cluster index in one call,
# avoiding a second pass over the data with a separate predict().
labels = kmeans.fit_predict(properties)
df['cluster'] = labels

# Number of tracks assigned to each cluster.
print(df['cluster'].value_counts())


### Drawing a random sample, because the full dataset is too large to plot

In [None]:
# Keep a seeded 0.01 % random subset of the rows for plotting.
df_sample = df.sample(random_state=42, frac=1e-4)

### Visualising the formed Clusters

In [None]:
# Pairwise scatter plots of the clustered features on the sampled rows,
# coloured by assigned cluster.
pairplot_features = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration',
]
sns.pairplot(df_sample, vars=pairplot_features, hue='cluster')