In [2]:
import pandas as pd
import re
import numpy as np


In [3]:
# Load the dataset
tracks = pd.read_csv('../Data/tracks.csv')

In [3]:
column_names = list(tracks.columns)
print(column_names)


['id', 'name', 'popularity', 'duration_ms', 'explicit', 'artists', 'id_artists', 'release_date', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']


In [4]:
tracks.dtypes

id                   object
name                 object
popularity            int64
duration_ms           int64
explicit              int64
artists              object
id_artists           object
release_date         object
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
dtype: object

In [5]:
english_only = re.compile(r'^[a-zA-Z0-9 ]+$')
tracks = tracks[tracks['name'].apply(lambda x: bool(english_only.match(str(x))))]
tracks['year'] = pd.to_datetime(tracks['release_date']).dt.year
tracks = tracks[tracks['year'] >= 1960]
tracks

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,year
33498,4Hhv2vrOTy89HFRcjU3QOx,At Last,77,179693,0,['Etta James'],['0iOVhN3tnSvgDbcg25JoJb'],1960,0.274,0.348,...,-8.631,1,0.0293,0.547,0.013300,0.3340,0.328,87.430,3,1960
33499,0zGLlXbHlrAyBN1x6sY0rb,A Sunday Kind Of Love,71,196133,0,['Etta James'],['0iOVhN3tnSvgDbcg25JoJb'],1960,0.421,0.285,...,-9.430,0,0.0293,0.829,0.000002,0.2330,0.402,85.861,4,1960
33501,4yjz1aazw6R8ZURpGbCAkp,Bring It On Home To Me,65,165280,0,['Sam Cooke'],['6hnWRPzGGKiapVX1UCdEAC'],1960-01-01,0.482,0.569,...,-5.897,1,0.0328,0.617,0.000000,0.1550,0.609,70.843,4,1960
33503,0BFEyqJ9DJXS7gKg0Kj46R,You Send Me,64,165560,0,['Sam Cooke'],['6hnWRPzGGKiapVX1UCdEAC'],1960-01-01,0.553,0.291,...,-10.426,0,0.0301,0.878,0.000000,0.1290,0.407,96.217,4,1960
33504,3oAWTk92mZBxKBOKf8mR5v,Summertime Blues,64,119360,0,['Eddie Cochran'],['1p0t3JtUTayV2wb1RGN9mO'],1960-05-01,0.714,0.886,...,-8.629,0,0.0554,0.116,0.184000,0.1800,0.954,156.351,4,1960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586662,4Zp3rm12p5PiHToYJflmyy,Meet Again,57,273587,0,['KIMSEJEONG'],['1lFLniFTaPjYCtQZvDXpqu'],2020-12-20,0.476,0.440,...,-8.508,1,0.0488,0.679,0.000000,0.0926,0.241,135.814,4,2020
586666,1ZwZsVZUiyFwIHMNpI3ERt,Skyscraper,4,106002,0,['Emilie Chin'],['4USdOnfLczwUglA3TrdHs2'],2020-02-08,0.626,0.530,...,-13.117,0,0.0284,0.113,0.856000,0.1040,0.215,120.113,4,2020
586668,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,0,['ROLE MODEL'],['1dy5WNgIKQU6ezkpZs4y8z'],2020-10-21,0.765,0.663,...,-5.223,1,0.0652,0.141,0.000297,0.0924,0.686,150.091,4,2020
586670,45XJsGpFTyzbzeWK8VzR8S,A Day At A Time,58,142003,0,"['Gentle Bones', 'Clara Benin']","['4jGPdu95icCKVF31CcFKbS', '5ebPSE9YI5aLeZ1Z2g...",2021-03-05,0.696,0.615,...,-6.212,1,0.0345,0.206,0.000003,0.3050,0.438,90.029,4,2021


In [6]:
# List of Colums to convert
col_to_convert= ['popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness','liveness', 'valence', 'tempo', 'time_signature']

# iterate through columns and change data types
for col in col_to_convert:
    tracks[col] = tracks[col].astype("int64")
# print data types to confirm changes
print(tracks.dtypes)

id                  object
name                object
popularity           int64
duration_ms          int64
explicit             int64
artists             object
id_artists          object
release_date        object
danceability         int64
energy               int64
key                  int64
loudness             int64
mode                 int64
speechiness          int64
acousticness         int64
instrumentalness     int64
liveness             int64
valence              int64
tempo                int64
time_signature       int64
year                 int64
dtype: object


In [7]:
tracks = tracks.drop(['id','id_artists'], axis=1)
tracks = tracks.reset_index(drop=True)

In [8]:
# Checking columns & data types in DataFrame
tracks.dtypes

name                object
popularity           int64
duration_ms          int64
explicit             int64
artists             object
release_date        object
danceability         int64
energy               int64
key                  int64
loudness             int64
mode                 int64
speechiness          int64
acousticness         int64
instrumentalness     int64
liveness             int64
valence              int64
tempo                int64
time_signature       int64
year                 int64
dtype: object

In [9]:
# convert date column to datetime type
tracks['release_date'] = pd.to_datetime(tracks['release_date'])

# extract year component from date column
tracks['release_date'] = tracks['release_date'].dt.year

# print updated DataFrame
tracks

Unnamed: 0,name,popularity,duration_ms,explicit,artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,year
0,At Last,77,179693,0,['Etta James'],1960,0,0,5,-8,1,0,0,0,0,0,87,3,1960
1,A Sunday Kind Of Love,71,196133,0,['Etta James'],1960,0,0,2,-9,0,0,0,0,0,0,85,4,1960
2,Bring It On Home To Me,65,165280,0,['Sam Cooke'],1960,0,0,0,-5,1,0,0,0,0,0,70,4,1960
3,You Send Me,64,165560,0,['Sam Cooke'],1960,0,0,4,-10,0,0,0,0,0,0,96,4,1960
4,Summertime Blues,64,119360,0,['Eddie Cochran'],1960,0,0,11,-8,0,0,0,0,0,0,156,4,1960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259867,Meet Again,57,273587,0,['KIMSEJEONG'],2020,0,0,3,-8,1,0,0,0,0,0,135,4,2020
259868,Skyscraper,4,106002,0,['Emilie Chin'],2020,0,0,5,-13,0,0,0,0,0,0,120,4,2020
259869,blind,72,153293,0,['ROLE MODEL'],2020,0,0,0,-5,1,0,0,0,0,0,150,4,2020
259870,A Day At A Time,58,142003,0,"['Gentle Bones', 'Clara Benin']",2021,0,0,10,-6,1,0,0,0,0,0,90,4,2021


In [10]:
# Drop rows with null values in column 'popularity'
tracks.dropna(subset=['explicit']).shape

# Print the resulting dataframe


(259872, 19)

In [11]:
# #export to new csv
# tracks.to_csv('sd_tracks_cleaned.csv', index=True)


In [12]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

# select features and split data into training and testing sets
features = ['popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
X = tracks[features]

y = tracks['popularity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train the KNN model
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)

# evaluate the model on the test set
y_pred = knn.predict(X_test)
test_score = knn.score(X_test, y_test)
r2 = r2_score(y_test, y_pred)

print('Test score:', test_score)
print('R-squared value:', r2)



Test score: 0.9997407080473247
R-squared value: 0.9997407080473247


In [13]:
# import pickle

# # save the trained KNN model in pickle format
# with open('sd_model.pkl', 'wb') as f:
#     pickle.dump(knn, f)

In [14]:
# # load the trained KNN model from pickle file
# with open('knn_model.pkl', 'rb') as f:
#     knn = pickle.load(f)

# # define the recommendation function
# def recommend_songs(song_name, k=10):
#     # find the index of the input song in the X matrix
#     idx = tracks[tracks['name'] == song_name].index[0]
#     # find the k nearest neighbor songs using the trained KNN model
#     _, indices = knn.kneighbors(X.iloc[idx].values.reshape(1, -1), n_neighbors=k+1)
#     # return the names of the k nearest neighbor songs
#     # Print the recommended songs
#     print('Songs similar to {}:'.format(song_name))
#     for index in indices[0]:
#          print('- {}'.format(tracks.loc[index, 'name']))


# recommend_songs('Despacito')

In [15]:
# from flask import Flask, render_template, request

# app = Flask(__name__)

# @app.route('/')
# def home():
#     return render_template('index.html')

# @app.route('/recommend', methods=['POST'])
# def recommend():
#     # Get the input song name from the web form
#     song_name = request.form['song_name']
#     # Call the recommend_songs function to get the recommended songs
#     recommended_songs = recommend_songs(song_name)
#     # Render the results template with the recommended songs
#     return render_template('results.html', song_name=song_name, recommended_songs=recommended_songs)

# if __name__ == '__main__':
#     app.run(debug=True)