### In this notebook, we incorporate textual features - artist and genre. They're first converted to vectors using TF-IDF algorithm. This textual data along with numerical features are used to train and test 3 classifiers- KNN,Decision Tree and Random Forest

In [1]:
def normalize(X):
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    return pd.DataFrame(scaler.fit_transform(X))

In [2]:
def tfidf(feature):
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer()
    feature_csr = vectorizer.fit_transform(feature)
    feature_tfidf=feature_csr.toarray()
    return feature_tfidf

In [3]:
def pca(k,X):
    from sklearn.decomposition import PCA
    pca = PCA(n_components=k)
    X_pca=pca.fit_transform(X)
    X_pca_df=pd.DataFrame(X_pca)
    return X_pca_df

In [4]:
def knn(n,X_train,X_test,y_train,y_test):
    from sklearn.metrics import classification_report
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    from sklearn.metrics import accuracy_score
    return accuracy_score(y_test, y_pred)

In [5]:
def DT(X_train, X_test, y_train, y_test):
    from sklearn.tree import DecisionTreeClassifier
    clf = DecisionTreeClassifier( random_state = 42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    from sklearn.metrics import accuracy_score
    return accuracy_score(y_test, y_pred)

In [6]:
def RF(X_train, X_test, y_train, y_test):
    from sklearn.metrics import accuracy_score
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(n_estimators = 100) 
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return accuracy_score(y_test, y_pred)

### Preprocessing step

In [7]:
import pandas as pd
data=pd.read_csv("spotify.csv")
data=data.dropna()
data["Popularity- The higher the value the more popular the song is"] = pd.cut(x=data['Popularity- The higher the value the more popular the song is'], bins = [-1, 50, 75, 99], labels =[0, 1, 2])
data.head()

Unnamed: 0,title,artist,the genre of the track,year,Beats.Per.Minute -The tempo of the song,"Energy- The energy of a song - the higher the value, the more energtic","Danceability - The higher the value, the easier it is to dance to this song","Loudness/dB - The higher the value, the louder the song","Liveness - The higher the value, the more likely the song is a live recording","Valence - The higher the value, the more positive mood for the song",Length - The duration of the song,Acousticness - The higher the value the more acoustic the song is,Speechiness - The higher the value the more spoken word the song contains,Popularity- The higher the value the more popular the song is
0,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,2
1,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,2
2,TiK ToK,Kesha,dance pop,2010,120,84,76,-3,29,71,200,10,14,2
3,Bad Romance,Lady Gaga,dance pop,2010,119,92,70,-4,8,71,295,0,4,2
4,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,2


In [8]:
Y=data['Popularity- The higher the value the more popular the song is']

In [9]:
X_num=data.iloc[:,4:-1]

In [10]:
X_num=normalize(X_num)
X_num

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.470874,0.908163,0.690722,0.965517,0.108108,0.816327,0.286207,0.191919,0.083333
1,0.422330,0.948980,0.773196,0.948276,0.702703,0.653061,0.444828,0.242424,0.479167
2,0.582524,0.857143,0.783505,0.982759,0.391892,0.724490,0.227586,0.101010,0.291667
3,0.577670,0.938776,0.721649,0.965517,0.108108,0.724490,0.555172,0.000000,0.083333
4,0.529126,0.857143,0.659794,0.948276,0.121622,0.438776,0.300000,0.020202,0.083333
...,...,...,...,...,...,...,...,...,...
598,0.504854,0.673469,0.628866,0.913793,0.270270,0.163265,0.144828,0.010101,0.062500
599,0.461165,0.806122,0.773196,0.931034,0.094595,0.622449,0.248276,0.212121,0.250000
600,0.660194,0.775510,0.546392,0.948276,0.121622,0.663265,0.434483,0.070707,0.708333
601,0.553398,0.806122,0.618557,0.931034,0.567568,0.244898,0.286207,0.010101,0.145833


### TF-IDF step

In [11]:
artist_tfidf=tfidf(data['artist'])
artist_pca=pca(2,artist_tfidf)
artist_pca
X_new1=X_num.join(pd.DataFrame(artist_pca),lsuffix='r')
X_new1

Unnamed: 0,0r,1r,2,3,4,5,6,7,8,0,1
0,0.470874,0.908163,0.690722,0.965517,0.108108,0.816327,0.286207,0.191919,0.083333,-0.016094,-0.015070
1,0.422330,0.948980,0.773196,0.948276,0.702703,0.653061,0.444828,0.242424,0.479167,-0.017083,-0.016112
2,0.582524,0.857143,0.783505,0.982759,0.391892,0.724490,0.227586,0.101010,0.291667,-0.034273,-0.034632
3,0.577670,0.938776,0.721649,0.965517,0.108108,0.724490,0.555172,0.000000,0.083333,-0.064311,-0.085114
4,0.529126,0.857143,0.659794,0.948276,0.121622,0.438776,0.300000,0.020202,0.083333,-0.051639,-0.060381
...,...,...,...,...,...,...,...,...,...,...,...
598,0.504854,0.673469,0.628866,0.913793,0.270270,0.163265,0.144828,0.010101,0.062500,-0.018201,-0.017314
599,0.461165,0.806122,0.773196,0.931034,0.094595,0.622449,0.248276,0.212121,0.250000,-0.038468,-0.041431
600,0.660194,0.775510,0.546392,0.948276,0.121622,0.663265,0.434483,0.070707,0.708333,-0.019329,-0.018559
601,0.553398,0.806122,0.618557,0.931034,0.567568,0.244898,0.286207,0.010101,0.145833,-0.018201,-0.017314


In [12]:
genre_tfidf=tfidf(data['the genre of the track'])
genre_pca=pca(2,genre_tfidf)
X_new2=X_new1.join(pd.DataFrame(genre_pca),lsuffix='g')
X_new2

Unnamed: 0,0r,1r,2,3,4,5,6,7,8,0,1,0.1,1.1
0,0.470874,0.908163,0.690722,0.965517,0.108108,0.816327,0.286207,0.191919,0.083333,-0.016094,-0.015070,0.612633,-0.242834
1,0.422330,0.948980,0.773196,0.948276,0.702703,0.653061,0.444828,0.242424,0.479167,-0.017083,-0.016112,0.628435,-0.243370
2,0.582524,0.857143,0.783505,0.982759,0.391892,0.724490,0.227586,0.101010,0.291667,-0.034273,-0.034632,-0.390023,-0.059332
3,0.577670,0.938776,0.721649,0.965517,0.108108,0.724490,0.555172,0.000000,0.083333,-0.064311,-0.085114,-0.390023,-0.059332
4,0.529126,0.857143,0.659794,0.948276,0.121622,0.438776,0.300000,0.020202,0.083333,-0.051639,-0.060381,0.051772,0.579183
...,...,...,...,...,...,...,...,...,...,...,...,...,...
598,0.504854,0.673469,0.628866,0.913793,0.270270,0.163265,0.144828,0.010101,0.062500,-0.018201,-0.017314,-0.390023,-0.059332
599,0.461165,0.806122,0.773196,0.931034,0.094595,0.622449,0.248276,0.212121,0.250000,-0.038468,-0.041431,0.051772,0.579183
600,0.660194,0.775510,0.546392,0.948276,0.121622,0.663265,0.434483,0.070707,0.708333,-0.019329,-0.018559,-0.390023,-0.059332
601,0.553398,0.806122,0.618557,0.931034,0.567568,0.244898,0.286207,0.010101,0.145833,-0.018201,-0.017314,-0.390023,-0.059332


### Model training and testing

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_new2, Y, test_size=0.3,random_state=42)

In [14]:
print("KNN Accuracy:",knn(8,X_train, X_test, y_train, y_test))

KNN Accuracy: 0.6132596685082873


In [15]:
print("Decision Tree Accuracy:" ,DT(X_train, X_test, y_train, y_test))

Decision Tree Accuracy: 0.4530386740331492


In [16]:
print("Random Forest Accuracy",RF(X_train, X_test, y_train, y_test))

Random Forest Accuracy 0.6353591160220995
