# Perform exploratory analysis of music data and create a model song recommendations

https://www.kaggle.com/datasets/travisdaws/spotify-tracks-acoustic-features/data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

## Data Analysis

In [2]:
df = pd.read_csv('tracks.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,uris,names,artist_names,artist_uris,artist_pop,artist_genres,albums,track_pop,danceability,...,liveness,valences,tempos,types,ids,track_hrefs,analysis_urls,durations_ms,time_signatures,playlist_name
0,0,4xhsWYTOGcal8zt0J161CU,Lovin On Me,Jack Harlow,2LIk90788K0zvyj2JJVwkJ,81,"['deep underground hip hop', 'kentucky hip hop...",Lovin On Me,88,0.943,...,0.0937,0.606,104.983,audio_features,4xhsWYTOGcal8zt0J161CU,https://api.spotify.com/v1/tracks/4xhsWYTOGcal...,https://api.spotify.com/v1/audio-analysis/4xhs...,138411,4,Hot Hits Australia
1,1,0cVyQfDyRnMJ0V3rjjdlU3,Lil Boo Thang,Paul Russell,4zoRNhOhsGX3w8yBAnFSQ8,64,['indie r&b'],Lil Boo Thang,86,0.85,...,0.32,0.915,114.481,audio_features,0cVyQfDyRnMJ0V3rjjdlU3,https://api.spotify.com/v1/tracks/0cVyQfDyRnMJ...,https://api.spotify.com/v1/audio-analysis/0cVy...,114234,4,Hot Hits Australia
2,2,31MNHKE86sEXzIglbGQ6mu,Got Me Started,Troye Sivan,3WGpXCj9YhhfX11TToZcXP,78,"['australian pop', 'pop', 'viral pop']",Something To Give Each Other,83,0.777,...,0.318,0.582,127.002,audio_features,31MNHKE86sEXzIglbGQ6mu,https://api.spotify.com/v1/tracks/31MNHKE86sEX...,https://api.spotify.com/v1/audio-analysis/31MN...,198448,4,Hot Hits Australia
3,3,4OMJGnvZfDvsePyCwRGO7X,Houdini,Dua Lipa,6M2wZ9GZgrQXHCFfjv46we,85,"['dance pop', 'pop', 'uk pop']",Houdini,90,0.744,...,0.0947,0.866,116.985,audio_features,4OMJGnvZfDvsePyCwRGO7X,https://api.spotify.com/v1/tracks/4OMJGnvZfDvs...,https://api.spotify.com/v1/audio-analysis/4OMJ...,185918,4,Hot Hits Australia
4,4,5aIVCx5tnk0ntmdiinnYvw,Water,Tyla,3SozjO3Lat463tQICI9LcE,73,[],Water,96,0.673,...,0.137,0.519,117.187,audio_features,5aIVCx5tnk0ntmdiinnYvw,https://api.spotify.com/v1/tracks/5aIVCx5tnk0n...,https://api.spotify.com/v1/audio-analysis/5aIV...,200256,4,Hot Hits Australia


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7994 entries, 0 to 7993
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        7994 non-null   int64  
 1   uris              7994 non-null   object 
 2   names             7994 non-null   object 
 3   artist_names      7994 non-null   object 
 4   artist_uris       7994 non-null   object 
 5   artist_pop        7994 non-null   int64  
 6   artist_genres     7994 non-null   object 
 7   albums            7994 non-null   object 
 8   track_pop         7994 non-null   int64  
 9   danceability      7994 non-null   float64
 10  energy            7994 non-null   float64
 11  keys              7994 non-null   int64  
 12  loudness          7994 non-null   float64
 13  modes             7994 non-null   int64  
 14  speechiness       7994 non-null   float64
 15  acousticness      7994 non-null   float64
 16  instrumentalness  7994 non-null   float64


## Dropping unneeded features

In [4]:
df_need = df.drop(['Unnamed: 0', 'uris', 'artist_uris', 'analysis_urls', 'track_hrefs', 'ids','keys','modes'], axis = 1)
df_need.columns

Index(['names', 'artist_names', 'artist_pop', 'artist_genres', 'albums',
       'track_pop', 'danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valences', 'tempos',
       'types', 'durations_ms', 'time_signatures', 'playlist_name'],
      dtype='object')

In [5]:
df_need.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7994 entries, 0 to 7993
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   names             7994 non-null   object 
 1   artist_names      7994 non-null   object 
 2   artist_pop        7994 non-null   int64  
 3   artist_genres     7994 non-null   object 
 4   albums            7994 non-null   object 
 5   track_pop         7994 non-null   int64  
 6   danceability      7994 non-null   float64
 7   energy            7994 non-null   float64
 8   loudness          7994 non-null   float64
 9   speechiness       7994 non-null   float64
 10  acousticness      7994 non-null   float64
 11  instrumentalness  7994 non-null   float64
 12  liveness          7994 non-null   float64
 13  valences          7994 non-null   float64
 14  tempos            7994 non-null   float64
 15  types             7994 non-null   object 
 16  durations_ms      7994 non-null   int64  


## Making input to recommend songs

In [6]:
df_need['song'] = df_need['names'] + ' - ' + df_need['artist_names']
df_need['song']

Unnamed: 0,song
0,Lovin On Me - Jack Harlow
1,Lil Boo Thang - Paul Russell
2,Got Me Started - Troye Sivan
3,Houdini - Dua Lipa
4,Water - Tyla
...,...
7989,Once You Love Someone - Tep No
7990,Goodbye - SAINT
7991,I Think I’m Over You - Win and Woo
7992,Lonely Dulcimer - Dusky


## Dropping unneeded columns

In [7]:
df_need.drop(['names', 'artist_names'], axis = 1, inplace=True)
df_need.columns

Index(['artist_pop', 'artist_genres', 'albums', 'track_pop', 'danceability',
       'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valences', 'tempos', 'types', 'durations_ms',
       'time_signatures', 'playlist_name', 'song'],
      dtype='object')

## Checking Artist Genres

In [8]:
df_need['artist_genres']

Unnamed: 0,artist_genres
0,"['deep underground hip hop', 'kentucky hip hop..."
1,['indie r&b']
2,"['australian pop', 'pop', 'viral pop']"
3,"['dance pop', 'pop', 'uk pop']"
4,[]
...,...
7989,"['deep tropical house', 'indie poptimism', 'tr..."
7990,[]
7991,"['deep tropical house', 'pop edm']"
7992,"['deep house', 'future garage', 'house']"


In [9]:
df_need['artist_genres'] = df_need['artist_genres'].str.replace(r"[\[\]']", '', regex=True)
df_need['artist_genres']

Unnamed: 0,artist_genres
0,"deep underground hip hop, kentucky hip hop, po..."
1,indie r&b
2,"australian pop, pop, viral pop"
3,"dance pop, pop, uk pop"
4,
...,...
7989,"deep tropical house, indie poptimism, tropical..."
7990,
7991,"deep tropical house, pop edm"
7992,"deep house, future garage, house"


## Checking how many I have different genres

In [10]:
lista = set(sum(df_need['artist_genres'].str.split(','), []))
lista = list(lista)

len(lista)

1411

## Checking DataFrame after some changes

In [11]:
df_need.head()

Unnamed: 0,artist_pop,artist_genres,albums,track_pop,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valences,tempos,types,durations_ms,time_signatures,playlist_name,song
0,81,"deep underground hip hop, kentucky hip hop, po...",Lovin On Me,88,0.943,0.558,-4.911,0.0568,0.0026,2e-06,0.0937,0.606,104.983,audio_features,138411,4,Hot Hits Australia,Lovin On Me - Jack Harlow
1,64,indie r&b,Lil Boo Thang,86,0.85,0.699,-3.292,0.0776,0.152,0.0,0.32,0.915,114.481,audio_features,114234,4,Hot Hits Australia,Lil Boo Thang - Paul Russell
2,78,"australian pop, pop, viral pop",Something To Give Each Other,83,0.777,0.686,-6.498,0.0308,0.223,0.00148,0.318,0.582,127.002,audio_features,198448,4,Hot Hits Australia,Got Me Started - Troye Sivan
3,85,"dance pop, pop, uk pop",Houdini,90,0.744,0.789,-4.876,0.059,0.0036,0.00144,0.0947,0.866,116.985,audio_features,185918,4,Hot Hits Australia,Houdini - Dua Lipa
4,73,,Water,96,0.673,0.722,-3.495,0.0755,0.0856,0.0,0.137,0.519,117.187,audio_features,200256,4,Hot Hits Australia,Water - Tyla


In [12]:
df_need['playlist_name'].value_counts()

Unnamed: 0_level_0,count
playlist_name,Unnamed: 1_level_1
Sleep,302
Peaceful Piano,276
Motivate,265
New Music Friday,216
Peaceful Meditation,200
...,...
Happy Pop Hits,11
Hot Hits USA,11
Easy 90s,11
Mega Hit Mix,7


In [13]:
df_need['types'].value_counts()

Unnamed: 0_level_0,count
types,Unnamed: 1_level_1
audio_features,7994


## Dropping unneeded column

In [14]:
df_need.drop('types', axis=1, inplace=True)

## Encoding Artis genres with Word2Vec

In [15]:
from gensim.models import Word2Vec
import numpy as np
import string

def clean_text(text):
    return ''.join([char for char in text if char not in string.punctuation])

df_need['artist_genres'] = df_need['artist_genres'].apply(clean_text).apply(lambda x: x.split())
sentences = df_need['artist_genres'].tolist()

model = Word2Vec(sentences, vector_size=10, window=2, min_count=1, workers=4)

pop_embedding = model.wv['pop']
print(np.mean(pop_embedding))

0.046105437


In [16]:
import numpy as np
from gensim.models import Word2Vec
import string

def calculate_mean_embedding(genre):
    if genre:
        embeddings = [model.wv[word] for word in genre if word in model.wv]

        if embeddings:
            mean_embedding = np.median(embeddings)
            return mean_embedding
    return 0

df_need['artist_genres'] = df_need['artist_genres'].apply(calculate_mean_embedding)

In [17]:
df_need['artist_genres'].value_counts()

Unnamed: 0_level_0,count
artist_genres,Unnamed: 1_level_1
0.000000,1317
-0.003739,313
0.042590,142
-0.099873,122
-0.092197,120
...,...
0.220510,1
0.052730,1
0.115228,1
0.175161,1


Splitting Data

In [18]:
#Model is not trained in the traditional sense. The recommendation score is directly calculated based on available data. That's why I didn't divide the collection

X = df_need.drop('song', axis=1)
y = df_need['song']

## Encoding rest of the dataframe

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [20]:
numeric_columns = X.select_dtypes(include=['number']).columns.tolist()
non_numeric_columns = X.select_dtypes(exclude=['number']).columns.tolist()
print(numeric_columns)
print(non_numeric_columns)

['artist_pop', 'artist_genres', 'track_pop', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valences', 'tempos', 'durations_ms', 'time_signatures']
['albums', 'playlist_name']


In [21]:
frequency = X['artist_genres'].value_counts(normalize=True)
X['artist_genres'] = X['artist_genres'].map(frequency)

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd


# Kodowanie częstotliwościowe
class FrequencyEncoder:
    def fit(self, X, y=None):
        self.frequency_maps = {}
        for column in X.columns:
            frequency = X[column].value_counts(normalize=True)
            self.frequency_maps[column] = frequency
        return self

    def transform(self, X):
        X_encoded = X.copy()
        for column in X.columns:
            X_encoded[column] = X_encoded[column].map(self.frequency_maps[column])
        return X_encoded

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

    def get_params(self, deep=True):
        return {}


steps = [
    ('frequency_encoder', FrequencyEncoder()),
    ('scaler', StandardScaler())
]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', FrequencyEncoder(), non_numeric_columns),
        ('num', StandardScaler(), numeric_columns)
    ]
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])


pipeline.fit(X)

X_transformed = pipeline.transform(X)

print(X_transformed)

[[ 1.25093820e-04  6.37978484e-03  1.20018438e+00 ... -4.46227993e-01
  -1.04101172e+00  2.04009151e-01]
 [ 1.25093820e-04  6.37978484e-03  2.66312125e-01 ... -1.11118814e-01
  -1.38724086e+00  2.04009151e-01]
 [ 5.00375281e-04  6.37978484e-03  1.03538339e+00 ...  3.30648089e-01
  -1.81245862e-01  2.04009151e-01]
 ...
 [ 1.25093820e-04  1.90142607e-02 -1.10702942e+00 ...  1.89696186e-01
  -2.49827215e-01  2.04009151e-01]
 [ 1.25093820e-04  1.90142607e-02 -8.32361110e-01 ...  5.06035300e-01
  -2.48638605e-01  2.04009151e-01]
 [ 2.50187641e-04  1.90142607e-02 -7.22493787e-01 ...  8.52259522e-02
  -3.59508690e-01  2.04009151e-01]]


## Using Content-Based Filtering with recommendation problem

## KNN

In [23]:
from sklearn.neighbors import NearestNeighbors

In [24]:
def recommend_songs_KNN(song_index=0):
  result = []
  knn = NearestNeighbors(n_neighbors=3, metric='euclidean')
  knn.fit(X_transformed)

  distances, indices = knn.kneighbors([X_transformed[song_index]])

  for idx in indices[0][1:]:
      result.append(idx)
  return result

In [25]:
print("Rekomendowane piosenki:")
result = recommend_songs_KNN()
print(y.iloc[result[0]])
print(y.iloc[result[1]])

Rekomendowane piosenki:
IT'S GETTING HOT - NLE Choppa
Lalkara - Diljit Dosanjh


In [26]:
df_need[df_need['song'] == "IT'S GETTING HOT - NLE Choppa"]

Unnamed: 0,artist_pop,artist_genres,albums,track_pop,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valences,tempos,durations_ms,time_signatures,playlist_name,song
2268,77,-0.12722,IT'S GETTING HOT,78,0.908,0.757,-3.889,0.0606,0.015,0.0,0.0673,0.577,106.031,134425,4,Viral Hits,IT'S GETTING HOT - NLE Choppa


In [27]:
df_need[df_need['song'] == 'Lalkara - Diljit Dosanjh']

Unnamed: 0,artist_pop,artist_genres,albums,track_pop,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valences,tempos,durations_ms,time_signatures,playlist_name,song
5928,74,0.081753,Ghost,78,0.85,0.557,-4.503,0.0431,0.0365,2e-06,0.131,0.468,96.99,160500,4,Hot Hits Punjabi,Lalkara - Diljit Dosanjh


In [28]:
df_need.iloc[0]

Unnamed: 0,0
artist_pop,81
artist_genres,-0.011778
albums,Lovin On Me
track_pop,88
danceability,0.943
energy,0.558
loudness,-4.911
speechiness,0.0568
acousticness,0.0026
instrumentalness,0.000002


## Cosine Similarity

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_songs_Cosine_Similarity(song_index=0):
  result = []
  similarity_matrix = cosine_similarity(X_transformed)

  similarities = similarity_matrix[song_index]

  similar_song_indices = similarities.argsort()[::-1][1:3]

  for idx in similar_song_indices:
      result.append(idx)
  return result

## Cosine Similarity + KNN

In [30]:
def recommend_songs_KNN_and_Cosine_Similarity(song_index=0):
  result = []
  similarity_matrix = cosine_similarity(X_transformed)
  similarities = similarity_matrix[song_index]
  similar_song_indices = similarities.argsort()[::-1][1:3]

  distance_matrix = 1 - similarity_matrix
  distance_matrix[distance_matrix < 0] = 0

  knn = NearestNeighbors(n_neighbors=3, metric='precomputed')
  knn.fit(distance_matrix)

  distances, indices = knn.kneighbors([distance_matrix[song_index]])

  for idx in indices[0][1:]:
      result.append(idx)
  return result

## Hypertuning KNN

In [31]:
for k in range(1, 11):
    knn = NearestNeighbors(n_neighbors=k, metric='euclidean')
    knn.fit(X_transformed)
    distances, indices = knn.kneighbors([X_transformed[0]])
    print(f"Neihbour Count: {k}, Recomendations: {[y.iloc[idx] for idx in indices[0][1:]]}")

Neihbour Count: 1, Recomendations: []
Neihbour Count: 2, Recomendations: ["IT'S GETTING HOT - NLE Choppa"]
Neihbour Count: 3, Recomendations: ["IT'S GETTING HOT - NLE Choppa", 'Lalkara - Diljit Dosanjh']
Neihbour Count: 4, Recomendations: ["IT'S GETTING HOT - NLE Choppa", 'Lalkara - Diljit Dosanjh', 'You & Me - JENNIE']
Neihbour Count: 5, Recomendations: ["IT'S GETTING HOT - NLE Choppa", 'Lalkara - Diljit Dosanjh', 'You & Me - JENNIE', 'Numb - Marshmello']
Neihbour Count: 6, Recomendations: ["IT'S GETTING HOT - NLE Choppa", 'Lalkara - Diljit Dosanjh', 'You & Me - JENNIE', 'Numb - Marshmello', 'AMARGURA - KAROL G']
Neihbour Count: 7, Recomendations: ["IT'S GETTING HOT - NLE Choppa", 'Lalkara - Diljit Dosanjh', 'You & Me - JENNIE', 'Numb - Marshmello', 'AMARGURA - KAROL G', 'ANDO - Jere Klein']
Neihbour Count: 8, Recomendations: ["IT'S GETTING HOT - NLE Choppa", 'Lalkara - Diljit Dosanjh', 'You & Me - JENNIE', 'Numb - Marshmello', 'AMARGURA - KAROL G', 'ANDO - Jere Klein', 'Some - Steve 

### Chosing n_neighbors = 3, because I want two recommendations

In [32]:
print("Rekomendowane piosenki:")
result = recommend_songs_KNN()
print(y.iloc[result[0]])
print(y.iloc[result[1]])

Rekomendowane piosenki:
IT'S GETTING HOT - NLE Choppa
Lalkara - Diljit Dosanjh


In [33]:
print("Rekomendowane piosenki:")
result = recommend_songs_Cosine_Similarity()
print(y.iloc[result[0]])
print(y.iloc[result[1]])

Rekomendowane piosenki:
You & Me - JENNIE
Lalkara - Diljit Dosanjh


In [34]:
print("Rekomendowane piosenki:")
result = recommend_songs_KNN_and_Cosine_Similarity()
print(y.iloc[result[0]])
print(y.iloc[result[1]])

Rekomendowane piosenki:
You & Me - JENNIE
Lalkara - Diljit Dosanjh


## Recommendation System Metrics

In [35]:
def cosine_similarity_recomendation(df, result, song_index):
  similarities  = []
  for i in result:
    row1 = df.iloc[song_index].values
    row2 = df.iloc[i].values
    similarity = cosine_similarity([row1], [row2])[0][0]
    similarities.append(similarity)
  return similarities

In [36]:
X_transformed_df = pd.DataFrame(X_transformed)

print("Podobieństwo kosinusowe KNN")
result1 = recommend_songs_KNN()
cos1 = cosine_similarity_recomendation(X_transformed_df, result1, 0)
print(f"{cos1}\n")

print("Podobieństwo kosinusowe Cosine Similarity")
result2 = recommend_songs_Cosine_Similarity()
cos2 = cosine_similarity_recomendation(X_transformed_df, result2, 0)
print(f"{cos2}\n")

print("Podobieństwo kosinusowe KNN_and_Cosine_Similarity")
result3 = recommend_songs_KNN_and_Cosine_Similarity()
cos3 = cosine_similarity_recomendation(X_transformed_df, result3, 0)
print(f"{cos3}\n")

Podobieństwo kosinusowe KNN
[0.951299068728101, 0.9574949129110895]

Podobieństwo kosinusowe Cosine Similarity
[0.9680017413215174, 0.9574949129110895]

Podobieństwo kosinusowe KNN_and_Cosine_Similarity
[0.9680017413215174, 0.9574949129110895]



In [37]:
print("Podobieństwo kosinusowe KNN")
result1 = recommend_songs_KNN()
cos1 = cosine_similarity_recomendation(X_transformed_df, result1, 3)
print(f"{cos1}\n")

print("Podobieństwo kosinusowe Cosine Similarity")
result2 = recommend_songs_Cosine_Similarity()
cos2 = cosine_similarity_recomendation(X_transformed_df, result2, 3)
print(f"{cos2}\n")

print("Podobieństwo kosinusowe KNN_and_Cosine_Similarity")
result3 = recommend_songs_KNN_and_Cosine_Similarity()
cos3 = cosine_similarity_recomendation(X_transformed_df, result3, 3)
print(f"{cos3}\n")

Podobieństwo kosinusowe KNN
[0.8281477536924287, 0.7160812075534444]

Podobieństwo kosinusowe Cosine Similarity
[0.8782489111383804, 0.7160812075534444]

Podobieństwo kosinusowe KNN_and_Cosine_Similarity
[0.8782489111383804, 0.7160812075534444]



In [38]:
def choosing_way_to_recommending(n):
    knn_score, cosine_score, combined_score = 0, 0, 0

    for i in range(n):
        result1 = recommend_songs_KNN(i)
        cos1 = cosine_similarity_recomendation(X_transformed_df, result1, i)

        result2 = recommend_songs_Cosine_Similarity(i)
        cos2 = cosine_similarity_recomendation(X_transformed_df, result2, i)

        result3 = recommend_songs_KNN_and_Cosine_Similarity(i)
        cos3 = cosine_similarity_recomendation(X_transformed_df, result3, i)

        score1 = sum(cos1)
        score2 = sum(cos2)
        score3 = sum(cos3)

        if score1 >= score2 and score1 >= score3:
            knn_score += 1
        elif score2 >= score1 and score2 >= score3:
            cosine_score += 1
        else:
            combined_score += 1

    if knn_score > cosine_score and knn_score > combined_score:
        return "recommend_songs_KNN"
    elif cosine_score > knn_score and cosine_score > combined_score:
        return "recommend_songs_Cosine_Similarity"
    else:
        return "recommend_songs_KNN_and_Cosine_Similarity"

In [39]:
choosing_way_to_recommending(10)

'recommend_songs_KNN'

In [40]:
choosing_way_to_recommending(100)

'recommend_songs_KNN'

In [41]:
choosing_way_to_recommending(1000)

'recommend_songs_KNN'

# The most effective approach for making recommendations in my project is through the use of KNN.