# Term Project Machine Learning

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

### Load Data

In [2]:
#load data
df = pd.read_csv('../data/spotify-2023.csv',encoding='latin-1')
df.head()

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6


### Clean Data

In [3]:
df.dtypes

track_name              object
artist(s)_name          object
artist_count             int64
released_year            int64
released_month           int64
released_day             int64
in_spotify_playlists     int64
in_spotify_charts        int64
streams                 object
in_apple_playlists       int64
in_apple_charts          int64
in_deezer_playlists     object
in_deezer_charts         int64
in_shazam_charts        object
bpm                      int64
key                     object
mode                    object
danceability_%           int64
valence_%                int64
energy_%                 int64
acousticness_%           int64
instrumentalness_%       int64
liveness_%               int64
speechiness_%            int64
dtype: object

In [4]:
#Coverting streams and in_deezer_playlists into float
df['streams'] = pd.to_numeric(df['streams'], errors='coerce')
df['in_deezer_playlists'] = pd.to_numeric(df['in_deezer_playlists'], errors='coerce')

#replace commas in in_shazam_charts so it's possible to convert it into int
df['in_shazam_charts'] = df['in_shazam_charts'].str.replace(',', '', regex=True).astype(float)

In [5]:
# missing values
missing_values = df.isnull().sum()
print(missing_values)

track_name               0
artist(s)_name           0
artist_count             0
released_year            0
released_month           0
released_day             0
in_spotify_playlists     0
in_spotify_charts        0
streams                  1
in_apple_playlists       0
in_apple_charts          0
in_deezer_playlists     79
in_deezer_charts         0
in_shazam_charts        50
bpm                      0
key                     95
mode                     0
danceability_%           0
valence_%                0
energy_%                 0
acousticness_%           0
instrumentalness_%       0
liveness_%               0
speechiness_%            0
dtype: int64


In [6]:
# delete rows with missing values for 'key' and 'streams' to maintain the integrity and quality of dataset
# because we think these values are very important for the prediction
df = df.dropna(subset=['key'])
df = df.dropna(subset=['streams'])

In [7]:
missing_values = df.isnull().sum()
print(missing_values)

track_name               0
artist(s)_name           0
artist_count             0
released_year            0
released_month           0
released_day             0
in_spotify_playlists     0
in_spotify_charts        0
streams                  0
in_apple_playlists       0
in_apple_charts          0
in_deezer_playlists     75
in_deezer_charts         0
in_shazam_charts        41
bpm                      0
key                      0
mode                     0
danceability_%           0
valence_%                0
energy_%                 0
acousticness_%           0
instrumentalness_%       0
liveness_%               0
speechiness_%            0
dtype: int64


In [8]:
df = df.fillna(-1)
missing_values = df.isnull().sum()
print(missing_values)

track_name              0
artist(s)_name          0
artist_count            0
released_year           0
released_month          0
released_day            0
in_spotify_playlists    0
in_spotify_charts       0
streams                 0
in_apple_playlists      0
in_apple_charts         0
in_deezer_playlists     0
in_deezer_charts        0
in_shazam_charts        0
bpm                     0
key                     0
mode                    0
danceability_%          0
valence_%               0
energy_%                0
acousticness_%          0
instrumentalness_%      0
liveness_%              0
speechiness_%           0
dtype: int64


In [9]:
df['streams'] = df['streams'].astype('int64')
df['in_deezer_playlists'] = df['in_deezer_playlists'].astype('int64')
df['in_shazam_charts'] = df['in_shazam_charts'].astype('int64')

In [10]:
# Combine 'released_year', 'released_month', and 'released_day' into a single datetime column
df['release_date'] = pd.to_datetime(df['released_year'].astype(str) + '-' + 
                                      df['released_month'].astype(str) + '-' + 
                                      df['released_day'].astype(str), errors='coerce')

df = df.drop('released_year', axis=1)
df = df.drop('released_month', axis=1)
df = df.drop('released_day', axis=1)

# Check the new 'release_date' column
print(df['release_date'].head())

0   2023-07-14
1   2023-03-23
2   2023-06-30
3   2019-08-23
4   2023-05-18
Name: release_date, dtype: datetime64[ns]


In [11]:
# One-Hot Encoding for 'mode', keep the original 'mode' column untouched
mode_encoded = pd.get_dummies(df['mode'], prefix='mode')

# concat one-hot encoded df to main df
df = pd.concat([df, mode_encoded], axis=1)

In [12]:
# new column for encoded 'key'
df['key_encoded'] = LabelEncoder().fit_transform(df['key'].astype(str))

# check encoded values
print(df[['mode', *mode_encoded.columns, 'key', 'key_encoded']].head(10))

    mode  mode_Major  mode_Minor key  key_encoded
0  Major        True       False   B            2
1  Major        True       False  C#            3
2  Major        True       False   F            7
3  Major        True       False   A            0
4  Minor       False        True   A            0
5  Major        True       False  C#            3
6  Minor       False        True   F            7
7  Major        True       False   F            7
8  Minor       False        True  C#            3
9  Minor       False        True   D            4


In [13]:
df['danceability_%'] = df['danceability_%'] / 100.0
df['valence_%'] = df['valence_%'] / 100.0
df['energy_%'] = df['energy_%'] / 100.0
df['acousticness_%'] = df['acousticness_%'] / 100.0
df['instrumentalness_%'] = df['instrumentalness_%'] / 100.0
df['liveness_%'] = df['liveness_%'] / 100.0
df['speechiness_%'] = df['speechiness_%'] / 100.0

In [14]:
df.dtypes

track_name                      object
artist(s)_name                  object
artist_count                     int64
in_spotify_playlists             int64
in_spotify_charts                int64
streams                          int64
in_apple_playlists               int64
in_apple_charts                  int64
in_deezer_playlists              int64
in_deezer_charts                 int64
in_shazam_charts                 int64
bpm                              int64
key                             object
mode                            object
danceability_%                 float64
valence_%                      float64
energy_%                       float64
acousticness_%                 float64
instrumentalness_%             float64
liveness_%                     float64
speechiness_%                  float64
release_date            datetime64[ns]
mode_Major                        bool
mode_Minor                        bool
key_encoded                      int32
dtype: object

In [15]:
df.describe()

Unnamed: 0,artist_count,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,in_apple_charts,in_deezer_playlists,in_deezer_charts,in_shazam_charts,bpm,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%,release_date,key_encoded
count,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857,857.0
mean,1.551925,5204.997666,11.95916,513355400.0,67.383897,51.044341,96.751459,2.619603,54.81797,122.842474,0.67273,0.512194,0.643396,0.266441,0.016114,0.18175,0.103967,2018-10-01 15:14:04.340723200,5.158693
min,1.0,31.0,0.0,2762.0,0.0,0.0,-1.0,0.0,-1.0,65.0,0.23,0.04,0.14,0.0,0.0,0.03,0.02,1930-01-01 00:00:00,0.0
25%,1.0,859.0,0.0,139193800.0,13.0,7.0,8.0,0.0,0.0,100.0,0.57,0.32,0.53,0.05,0.0,0.1,0.04,2020-06-28 00:00:00,3.0
50%,1.0,2226.0,3.0,284908300.0,34.0,38.0,30.0,0.0,2.0,121.0,0.7,0.51,0.66,0.17,0.0,0.12,0.06,2022-04-08 00:00:00,5.0
75%,2.0,5542.0,16.0,674072700.0,85.0,85.0,99.0,2.0,35.0,142.0,0.78,0.7,0.76,0.42,0.0,0.24,0.12,2022-11-04 00:00:00,8.0
max,8.0,52898.0,147.0,3703895000.0,672.0,275.0,965.0,46.0,1451.0,206.0,0.96,0.97,0.97,0.97,0.91,0.97,0.64,2023-07-14 00:00:00,10.0
std,0.864634,7944.032885,19.194211,571485500.0,86.562405,50.407409,169.396393,5.770938,150.414177,28.196567,0.146532,0.235946,0.160563,0.256881,0.085707,0.135695,0.100909,,3.227441


In [16]:
df.head(10)

Unnamed: 0,track_name,artist(s)_name,artist_count,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,in_apple_charts,in_deezer_playlists,in_deezer_charts,...,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%,release_date,mode_Major,mode_Minor,key_encoded
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,553,147,141381703,43,263,45,10,...,0.89,0.83,0.31,0.0,0.08,0.04,2023-07-14,True,False,2
1,LALA,Myke Towers,1,1474,48,133716286,48,126,58,14,...,0.61,0.74,0.07,0.0,0.1,0.04,2023-03-23,True,False,3
2,vampire,Olivia Rodrigo,1,1397,113,140003974,94,207,91,14,...,0.32,0.53,0.17,0.0,0.31,0.06,2023-06-30,True,False,7
3,Cruel Summer,Taylor Swift,1,7858,100,800840817,116,207,125,12,...,0.58,0.72,0.11,0.0,0.11,0.15,2019-08-23,True,False,0
4,WHERE SHE GOES,Bad Bunny,1,3133,50,303236322,84,133,87,15,...,0.23,0.8,0.14,0.63,0.11,0.06,2023-05-18,False,True,0
5,Sprinter,"Dave, Central Cee",2,2186,91,183706234,67,213,88,17,...,0.66,0.58,0.19,0.0,0.08,0.24,2023-06-01,True,False,3
6,Ella Baila Sola,"Eslabon Armado, Peso Pluma",2,3090,50,725980112,34,222,43,13,...,0.83,0.76,0.48,0.0,0.08,0.03,2023-03-16,False,True,7
7,Columbia,Quevedo,1,714,43,58149378,25,89,30,13,...,0.26,0.71,0.37,0.0,0.11,0.04,2023-07-07,True,False,7
8,fukumean,Gunna,1,1096,83,95217315,60,210,48,11,...,0.22,0.62,0.12,0.0,0.28,0.09,2023-05-15,False,True,3
9,La Bebe - Remix,"Peso Pluma, Yng Lvcas",2,2953,44,553634067,49,110,66,13,...,0.56,0.48,0.21,0.0,0.08,0.33,2023-03-17,False,True,4
