In [None]:
# Download the Kaggle data source used by this notebook through kagglehub.
# Run this cell first when working outside Kaggle; it may be deleted once the
# data is available.
# NOTE: this environment differs from Kaggle's Python environment, so some
# libraries used by the notebook may be missing.
# The value returned by dataset_download() is kept in the *_path variable below.
import kagglehub
maharshipandya__spotify_tracks_dataset_path = kagglehub.dataset_download('maharshipandya/-spotify-tracks-dataset')

print('Data source import complete.')


### Importing Libraries

In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

### Loading dataset

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(full_path)

In [None]:
# On Kaggle the dataset is mounted under /kaggle/input. Elsewhere that directory
# does not exist, so fall back to the folder returned by kagglehub in the first cell.
# NOTE(review): assumes dataset.csv sits at the top level of the downloaded folder,
# mirroring the Kaggle layout - confirm.
csv_path = '/kaggle/input/-spotify-tracks-dataset/dataset.csv'
if not os.path.exists(csv_path):
    csv_path = os.path.join(maharshipandya__spotify_tracks_dataset_path, 'dataset.csv')

# index_col=0 uses the CSV's first column as the index instead of keeping it as an extra column
df = pd.read_csv(csv_path, index_col=0)

### Viewing the data

In [None]:
# Preview the first rows of the table as a quick sanity check of the load.
df.head()

### Checking summary of the data

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Size of the loaded table.
df.shape #(rows, columns)

### Looking for null values

In [None]:
# Show every row that has a missing value in at least one column.
df[df.isnull().any(axis=1)]

* Since there is only one row containing null values, I'll drop this row

In [None]:
# axis=0 removes rows (not columns) that contain a missing value.
df = df.dropna(axis=0)

In [None]:
df['track_genre'].nunique() # Number of distinct genres in the dataset

In [None]:
# Store the flag as an integer so the later dtype-based column selections treat it as numeric.
df["explicit"]=df["explicit"].astype(int) #True=1 and False=0

In [None]:
# Display rows that repeat an earlier row exactly; this only inspects them, nothing is dropped.
df[df.duplicated()] # checking duplicate rows

### Descriptive Statistics

In [None]:
# Summary statistics shaded column by column. "Blues" is a sequential colormap, so a
# darker cell really is a larger value within its column (the previous qualitative
# "Accent" map has no light-to-dark ordering).
df.describe().style.background_gradient(cmap="Blues")

### Correlation Analysis

In [None]:
# Pairwise correlations between the numerical columns
corr_mat = df.select_dtypes(include=["int", "float"]).corr()

# Annotated heatmap drawn on a cream-coloured figure and axes
BACKGROUND = '#F2EAC5'
fig, ax = plt.subplots(figsize=(12, 10), facecolor=BACKGROUND, edgecolor='black')
ax.set_facecolor(BACKGROUND)
sns.heatmap(
    corr_mat,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    annot_kws={"size": 10},
    ax=ax,
)
ax.set_title('Correlation Analysis')
plt.show()

#### Observations:

* There is a strong positive correlation between "loudness" and "energy".

* There is a strong negative correlation between "acousticness" and "energy".

* There is a strong negative correlation between "acousticness" and "loudness".

* There is a negative correlation between "instrumentalness" and "loudness".

* There is a weak positive correlation between "valence" (positiveness) and "danceability".

* There is a weak positive correlation between "speechiness" and "explicit".

### Selecting Numerical columns for further analysis

In [None]:
# Keep only the float64 / int64 columns for the numerical analysis
num_cols = df.select_dtypes(include=['float64', 'int64'])
num_cols.shape

In [None]:
# Confirm which columns were kept and that none of them has missing values.
num_cols.info()

### Checking distribution of numerical columns

In [None]:
# Plot theme: dark grid on the notebook's cream background
CREAM = "#F2EAC5"
sns.set_style('darkgrid')
sns.set(rc={"axes.facecolor": CREAM, "figure.facecolor": CREAM})

# One 30-bin histogram per numerical column, with small tick labels
hist_options = dict(figsize=(20, 15), bins=30, xlabelsize=8, ylabelsize=8)
num_cols.hist(**hist_options)
plt.tight_layout()
plt.show()

#### Observations:
* The distributions in the columns danceability, tempo, and valence are almost normal.

* The loudness column is skewed to the left, with the majority of tracks having loudness levels between -15 and -5 dB.

* The speechiness, acousticness, instrumentalness, and liveness columns have right-skewed distributions: most songs have low values for these features.

* A large number of songs have a popularity score of 0, while the other songs are mostly within the normal range.

* The duration_ms column has a right-skewed distribution; the longest song is around 5 million ms (83 minutes) long, while the majority of songs last less than 500,000 ms (about 8 minutes).

* The energy column runs from 0 to 1, and its distribution rises towards the upper end: most songs have values between 0.4 and 0.9.

* The values in the key column are evenly distributed and range from 0 to 11.

* Most songs have a "mode" value of 1.

* The most common "time_signature" value is 5.

* The values in columns like "danceability", "energy", "speechiness", etc. range from 0 to 1.

## Business problem
Enhance user engagement and minimize churn on a streaming service by refining party music recommendations. Develop a predictive model using genre aggregation and clustering for precise music suggestions and playlists. Better recommendations aim to boost user satisfaction, increase time spent on the platform, and foster user loyalty through improved personalization and relevance.

In [None]:
# Audio features used for clustering (selected by referring to the correlation matrix)
feature_names = ['danceability', 'energy', 'loudness', 'valence', 'tempo']
features = df[feature_names]

# Standardise the selected features
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

In [None]:
# List the distinct genre names; these are the keys used by the genre-to-category mapping.
df['track_genre'].unique()

In [None]:
# Playlist categories and the genres that belong to each of them
category_genres = {
    'Electronic Dance Music': [
        'edm', 'house', 'electro', 'trance', 'techno', 'dubstep',
        'drum-and-bass', 'deep-house', 'detroit-techno', 'minimal-techno',
        'progressive-house', 'breakbeat',
    ],
    'Rock': [
        'alt-rock', 'rock', 'indie', 'indie-pop', 'punk', 'punk-rock',
        'hard-rock', 'metal', 'heavy-metal', 'black-metal', 'death-metal',
        'grunge',
    ],
    'Hip-Hop and R&B': ['hip-hop', 'r-n-b', 'trap'],
    'Pop': ['pop', 'electro-pop', 'synth-pop', 'k-pop', 'pop-film', 'power-pop'],
    'Latin & Reggae/Dancehall': [
        'latin', 'reggaeton', 'salsa', 'samba', 'reggae', 'dancehall',
    ],
    'Funk and Disco': ['funk', 'disco', 'groove'],
}

# Invert the table into the genre -> category lookup used for playlist creation
genre_to_category = {
    genre: category
    for category, genres in category_genres.items()
    for genre in genres
}

# Label every track with its category; genres missing from the lookup become 'Other'
df['music_category'] = df['track_genre'].map(genre_to_category).fillna('Other')

In [None]:
# Check that the new music_category column is present and fully populated.
df.info()

In [None]:
# Spot-check five random rows (no seed is set, so the rows differ on every run).
df.sample(5)

In [None]:
# Distinct playlist categories produced by the genre mapping, in order of first appearance
df['music_category'].unique()

In [None]:
# Cluster the scaled tracks into 7 groups (one per playlist); the seed is fixed
kmeans = KMeans(n_clusters=7, random_state=48)
kmeans.fit(scaled_features)

# Store each track's cluster id next to its other attributes
df['cluster'] = kmeans.labels_

In [None]:
# PCA for visualisation: project the scaled clustering features onto two
# components so the clusters can be drawn on a 2-D scatter plot.
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(scaled_features)

In [None]:
# One colour per cluster id, in cluster order
colors = ['r', 'g', 'b', 'y', 'c', 'm', 'k']

# Scatter plot of the PCA projection, one series per cluster
fig, ax = plt.subplots(figsize=(8, 6))
for cluster_id, colour in enumerate(colors):
    in_cluster = (df['cluster'] == cluster_id).to_numpy()
    ax.scatter(
        reduced_features[in_cluster, 0],
        reduced_features[in_cluster, 1],
        c=colour,
        label=f'Cluster {cluster_id}',
    )

# Titles, legend and grid
ax.set_title('Clusters visualization with PCA')
ax.set_xlabel('PCA 1')
ax.set_ylabel('PCA 2')
ax.legend()
ax.grid(True)
plt.show()

The scatter plot visualizes the clusters formed by grouping tracks with similar audio features, reduced to two dimensions using PCA. The data is segmented into seven clusters, each representing tracks with shared characteristics. The plot shows tight clustering and some overlap, indicating that while there are distinct groupings, some tracks share traits across clusters. Outliers suggest unique tracks that don't closely align with others in their group.

In [None]:
# Playlist names available for the clusters
mood_labels = ['Other', 'Rock', 'Electronic Dance Music',
       'Latin & Reggae/Dancehall', 'Funk and Disco', 'Hip-Hop and R&B',
       'Pop']

# KMeans cluster ids carry no meaning, so naming cluster i after the i-th label would be
# arbitrary. Instead, give each cluster the category that is most over-represented in it:
# lift = (share of the category inside the cluster) / (share of the category overall).
share_in_cluster = pd.crosstab(df['cluster'], df['music_category'], normalize='index')
overall_share = df['music_category'].value_counts(normalize=True)
lift = share_in_cluster.div(overall_share, axis='columns')

# Greedy one-to-one assignment: walk the (cluster, category) pairs from highest lift
# down and keep a pair only if neither side has been used yet.
ranked_pairs = sorted(
    ((lift.at[cluster_id, category], cluster_id, category)
     for cluster_id in lift.index
     for category in lift.columns),
    key=lambda pair: pair[0],
    reverse=True,
)
cluster_to_category = {}
for _, cluster_id, category in ranked_pairs:
    if cluster_id not in cluster_to_category and category not in cluster_to_category.values():
        cluster_to_category[cluster_id] = category

# Safety net: if there are more clusters than categories, use the remaining
# playlist names, then a generic name.
unused_labels = [label for label in mood_labels if label not in cluster_to_category.values()]
for cluster_id in lift.index:
    if cluster_id not in cluster_to_category:
        cluster_to_category[cluster_id] = unused_labels.pop(0) if unused_labels else f'Cluster {cluster_id}'

# Replace the genre-derived category with the cluster-derived playlist name
df['music_category'] = df['cluster'].map(cluster_to_category)

In [None]:
# Number of tracks per playlist name after the cluster-based relabelling
df['music_category'].value_counts() # check clusters

In [None]:
# Number of tracks per cluster id, for comparison with the per-category counts.
df['cluster'].value_counts()

In [None]:
# Spot-check five random rows with the new cluster and music_category columns.
df.sample(5)

In [None]:
# Write the table, including the added music_category and cluster columns, to a new CSV.
# index=False leaves the DataFrame index out of the file.
df.to_csv('spotify_with_categories.csv', index=False) #saving new file with added columns

### Classification Model creation

In [None]:
# Features: the numerical columns. Target: the playlist category of each track.
X = num_cols
y = df['music_category']

# Turn the category names into integer class codes
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Standardise the feature matrix
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Split the raw (unscaled) features into training and testing sets first, so that
# no statistic of the test rows is used while the scaler is being fitted.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.4, random_state=0)

# Define the models. Each one is a pipeline: a StandardScaler, fitted on the
# training rows only, followed by the classifier. The fitted pipeline is
# self-contained and accepts unscaled feature rows.
models = {
    'Random Forest': make_pipeline(StandardScaler(), RandomForestClassifier(random_state=0)),
    'SVM': make_pipeline(StandardScaler(), SVC(random_state=0)),
    'K-Nearest Neighbors': make_pipeline(StandardScaler(), KNeighborsClassifier()),
    'Decision Tree': make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=0)),
}

# Train each model and report precision / recall / F1 per category on the test set
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name} Classification Report:\n{classification_report(y_test, y_pred, target_names=label_encoder.classes_)}\n")

# The fitted models can now classify new (unscaled) feature rows into playlists

The classification reports detail the effectiveness of four machine learning algorithms in categorizing music into seven genres. Here's a streamlined interpretation:

Precision gauges the accuracy of positive predictions for each genre.
Recall measures the model's ability to identify all relevant cases per genre.
F1-score combines precision and recall into a single metric, crucial for imbalanced datasets.
Accuracy reflects the overall correct predictions made by the model.
Summary of Model Performances:

Random Forest: Exhibits robust classification capabilities with an overall accuracy of 97%. It consistently scores high across all genres, showing its effectiveness in distinguishing between different types of music.

SVM (Support Vector Machine): Outperforms other models with the highest accuracy at 98%. Its precision, recall, and F1-scores are uniformly high, marking it as the most reliable for this dataset.

K-Nearest Neighbors: Registers a lower accuracy of 86%, indicating it might be less suitable for this dataset or could benefit from hyperparameter optimization.

Decision Tree: Achieves an accuracy of 95%. While strong, it slightly lags behind the Random Forest and SVM, and there might be a risk of overfitting.

Considering the overall accuracy and consistency across genres, the SVM is the best-fit model for this classification task.

#### Cross Validation

In [None]:
# Cross-validation to check that the hold-out results are not an artefact of one split

# Define the classifiers
classifiers = {
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=0),
    'Random Forest': RandomForestClassifier(random_state=0),
    'SVM': SVC(random_state=0)
}

# Apply 5-fold cross-validation and print mean accuracy +/- two standard deviations
for name, clf in classifiers.items():
    # Scaling is part of the pipeline, so in every fold the scaler is fitted on that
    # fold's training part only; fitting it once on all rows would leak information
    # from the validation part into training.
    pipeline = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
    print(f"{name} Accuracy: {scores.mean():.2f} (+/- {scores.std() * 2:.2f})")

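The output presents the mean cross-validated accuracy of four machine learning models, together with a spread of plus or minus two standard deviations across the five folds (referred to below as the confidence interval):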

K-Nearest Neighbors: This model has an average accuracy of 85% with a confidence interval of plus or minus 2%. This suggests that the model’s accuracy could vary slightly on different runs, reflecting moderate consistency in performance.

Decision Tree: Achieves a higher average accuracy of 95% with a very narrow confidence interval of plus or minus 1%, indicating that its performance is quite stable across different training sets.

Random Forest: Shows a slightly better average accuracy than the Decision Tree at 96%, with a confidence interval of plus or minus 1%. Like the Decision Tree, its performance is consistent, and being an ensemble method, it generally has better predictive accuracy and robustness.

SVM (Support Vector Machine): Tops the list with the highest average accuracy of 97%, but it has a confidence interval of plus or minus 2%, which is wider than the Decision Tree and Random Forest models. Despite this, it indicates that SVM is likely to perform best in predicting the correct genre of music tracks.

Considering these results, the SVM is potentially the most accurate model for this task, although it has a slightly larger variance in its accuracy. Random Forest also shows a strong and stable performance.

#### Model Comparison

In [None]:
# Measure the hold-out accuracy of every fitted model instead of typing the numbers
# in by hand, so the chart always matches the models that were actually trained.
model_accuracies = {name: model.score(X_test, y_test) for name, model in models.items()}

# Creating the comparison chart
plt.figure(figsize=(10, 6))
plt.bar(list(model_accuracies.keys()), list(model_accuracies.values()), color=['orange', 'green', 'red', 'purple'])
plt.title('Comparison of Model Accuracies')
plt.xlabel('Models')
plt.ylabel('Accuracy')
# Zoom in on the 0.8-1.0 band, but widen it if a model scores lower so its bar stays visible
plt.ylim(min(0.8, min(model_accuracies.values()) - 0.05), 1.0)
plt.show()

In [None]:
# Store the trained models and the fitted preprocessing objects as pickle files
# (pickle is imported in the imports cell at the top of the notebook)

# One file per trained model, named after the model
artifacts = {f"{name}_model.pkl": model for name, model in models.items()}

# The models predict integer class codes; the label encoder is needed to turn
# those codes back into playlist names.
artifacts["label_encoder.pkl"] = label_encoder

# Any estimator that was trained on pre-scaled features needs this same scaler
# applied to new data before predict().
artifacts["scaler.pkl"] = scaler

for file_name, artifact in artifacts.items():
    with open(file_name, 'wb') as file:
        pickle.dump(artifact, file)

Hello,

This is the first time I'm putting out my work, in the hope of learning more.
I'm pursuing data analytics.

I'd appreciate it if you could correct my mistakes and let me know what I can do to progress in this journey.

Thanks much!