In [None]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Load the main table and the three aggregated tables (by genre, year, artist).
df = pd.read_csv("../input/spotify-dataset-19212020-160k-tracks/data.csv")
df_genre = pd.read_csv("../input/spotify-dataset-19212020-160k-tracks/data_by_genres.csv")
df_year = pd.read_csv("../input/spotify-dataset-19212020-160k-tracks/data_by_year.csv")
df_artist = pd.read_csv("../input/spotify-dataset-19212020-160k-tracks/data_by_artist.csv")

**I just found this fancy EDA tool. It integrates the describe, isna, corr, head, tail and duplicated functions from pandas. We can easily take a detailed look at the dataset, its variables, interactions, correlations, missing values, sample rows and duplicate rows. Sweetviz and PandasGUI are two other EDA tools serving similar functions.**

1. We noticed there are several strong positive/negative correlations between variables. 
2. We can find patterns in certain variables, e.g. acousticness shows a U-shaped pattern and danceability shows a uniform pattern. 

In [None]:
# Build an exploratory profiling report for the main table. Ending the cell
# with the bare `profile` expression renders the report as the cell output.
# NOTE(review): pandas_profiling has been renamed to ydata-profiling upstream;
# confirm which package the runtime provides before changing this import.
from pandas_profiling import ProfileReport
profile = ProfileReport(df, title='Spotify Music Pandas Profiling Report', explorative=True)
profile

In [None]:
# To further explore the relationship between popularity and the other features,
# extract the "popularity" column of the correlation matrix and plot it.
import plotly
import plotly.express as px

# Correlate numeric columns only: the frame also holds string columns, and
# DataFrame.corr() raises on those in pandas >= 2.0 instead of silently
# dropping them as older versions did.
df_popu_corr = (
    df.select_dtypes(np.number)
    .corr()["popularity"]
    .reset_index()  # feature names move into a column called "index"
    .sort_values(by="popularity")
)
fig = px.bar(df_popu_corr, x="index", y="popularity", color="popularity", width=800, height=400)
fig.show()

In [None]:
# From the above chart we can see that year, loudness, energy, instrumentalness and acousticness are important variables. 
# This does not mean we should exclude the other features, because the recommendation system intends to find similar music taste rather than predict popularity.
# First, we create a decade variable for future use. 
def get_decade(year):
    """Return the decade label for a year, e.g. 1995 -> "1990s"."""
    # Dividing by ten and truncating drops the last digit of the year.
    return f"{int(year / 10) * 10}s"
# Label every row with its decade, then plot how many rows fall in each decade
# (sorted so the decades appear in chronological order).
df["decade"] = df["year"].map(get_decade)
fig = px.histogram(
    df.sort_values("decade"),
    x="decade",
    color="decade",
    width=800,
    height=400,
)
fig.show()

In [None]:
# Mean popularity per decade. The column is selected *before* aggregating:
# averaging every column first would raise on the string columns in
# pandas >= 2.0 and wastes work on columns that are never used.
popu_year_df = df.groupby("decade")["popularity"].mean().reset_index()
fig = px.bar(popu_year_df, y="decade", x="popularity", color="popularity", width=800, height=400)
fig.show()
# So the popular songs are mostly from the 1960s to the 2020s (considering the 2020s data is not complete).

In [None]:
# Columns of the per-year table to draw, one line per feature.
sound_features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'valence',
]
fig = px.line(
    df_year,
    x='year',
    y=sound_features,
    width=800,
    height=400,
)
fig.show()

In [None]:
# Unfortunately the data does not provide a more general classification of songs,
# so we use a clustering method to derive broader classes:
# K-Means groups the genres into 20 clusters.
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

# Number of distinct genre labels. Printed explicitly because a bare expression
# in the middle of a cell is evaluated but never displayed.
print(df_genre["genres"].nunique())

# Get the numerical data. Any label column left over from a previous run of this
# cell is excluded so that re-running does not feed old labels back in as a feature.
X = df_genre.drop(columns="general_genre", errors="ignore").select_dtypes(np.number)

# Features are standardised first so that no single column dominates the
# Euclidean distances K-Means relies on. random_state makes the clusters
# reproducible across runs; n_init is pinned so the result does not depend on
# the library version's default.
cluster_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("kmeans", KMeans(n_clusters=20, n_init=10, random_state=42)),
])
cluster_pipeline.fit(X)
df_genre["general_genre"] = cluster_pipeline.predict(X)

In [None]:
# Next we cluster the individual songs with the same pipeline definition.
# Note that cluster_pipeline.fit(X) re-fits both the scaler and K-Means on the
# song features, replacing the fit that was made on the genre table.

# This X is built from df; the former X was built from df_genre.
# Any "general_genre" column left over from a previous run of this cell is
# excluded, so the label never becomes one of its own input features (X's
# columns are later reused as the distance features).
X = df.drop(columns="general_genre", errors="ignore").select_dtypes(np.number)

cluster_pipeline.fit(X)
cluster_labels = cluster_pipeline.predict(X)
df['general_genre'] = cluster_labels

In [None]:
# Visualise the K-Means outcome: standardise the features, project them onto
# three principal components and colour each point by its cluster label.
from sklearn.decomposition import PCA

pca_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('PCA', PCA(n_components=3)),
])
genre_PCA = pca_pipeline.fit_transform(X)

projection = pd.DataFrame(genre_PCA, columns=["P1", "P2", "P3"])
projection['cluster'] = df['general_genre']

fig = px.scatter_3d(
    projection,
    x="P1",
    y="P2",
    z="P3",
    color='cluster',
    template="plotly_dark",
    width=600,
    height=600,
)
fig.show()
# K-Means is doing a great job. Honestly, this is one of the best results I have had from clustering methods; most t-SNE and K-Means runs lead to a "mess" on real-world data.

## Building the recommendation system
We will build our recommendation system based on the genre we get and the distance from numerical features.
The logic is:
1. We filter songs with the same genre cluster.
2. If there is only one song, we return this song.
3. If there is more than one song, we choose the one with smallest distance.

In [None]:
# The features used for the distance computation are the columns of X.
distance_cols = X.columns.tolist()

# Min-max scale every distance feature: (value - min) / (max - min) per column.
feature_min = df[distance_cols].min()
feature_span = df[distance_cols].max() - feature_min
df[distance_cols] = (df[distance_cols] - feature_min) / feature_span

# Keep only the first row for each song name.
df.drop_duplicates(subset="name", keep="first", inplace=True)

In [None]:
def get_recommendations(song_name, amount=1):
    """Print the songs closest to ``song_name`` within its genre cluster.

    Reads the module-level ``df`` (needs the columns "name", "artists",
    "general_genre" and every column listed in ``distance_cols``) and the
    module-level ``distance_cols`` list.

    Parameters
    ----------
    song_name : str
        Exact value of the "name" column identifying the seed song.
    amount : int, default 1
        Maximum number of recommendations to print. Fewer are printed when the
        seed song's cluster contains fewer other songs.

    Returns
    -------
    None
        The recommendations are printed, one "<name> by <artists>" per line.

    Raises
    ------
    ValueError
        If no row of ``df`` has the given name.
    """
    matches = df[df["name"] == song_name]
    if matches.empty:
        raise ValueError(f"Song {song_name!r} was not found in the dataset.")

    # Use the first match as the seed; a one-row frame keeps the features 2-D.
    seed_row = matches.iloc[[0]]
    genre = int(seed_row["general_genre"].iloc[0])
    point_a = seed_row[distance_cols].to_numpy(dtype=float)

    # Candidates: every other song in the same cluster. Copy so that adding the
    # distance column never writes into (a view of) the global frame.
    same_cluster = (df["name"] != song_name) & (df["general_genre"] == genre)
    candidates = df.loc[same_cluster].copy()

    # Euclidean distance from the seed to all candidates in one vectorised step.
    points_b = candidates[distance_cols].to_numpy(dtype=float)
    candidates["distance"] = np.linalg.norm(points_b - point_a, axis=1)

    # head() returns at most `amount` rows, so a small cluster cannot cause an
    # out-of-range lookup.
    rec_song = candidates.sort_values(by="distance", ascending=True).head(amount)

    # Now we print out the song recommendation
    print("Based on your preference, we recommend ")
    for name, artists in zip(rec_song["name"], rec_song["artists"]):
        print(name + " by " + artists)
        
# Let's try this out: ask for ten recommendations for one song.
get_recommendations("Keep A Song In Your Soul", amount=10)

**This is really amazing. If you check out "Keep A Song In Your Soul" and "Let's Do It" by Eddie Heywood & His Orchestra, they are really similar in style, beat, and rhythm. They are both light jazz from the 1930s to 1950s.**

**There is a recommendation system on one music platform that I find fantastic: Netease Music. The platform recommends around 20 songs to users every day. I personally think it is user-based and pre-filtered. It only recommends songs that I have never listened to, yet they always end up in my favorites list.**