## In this part, several ML approaches are used to build models that predict the popularity of songs

In [11]:
# General tools
import os
from pathlib import Path
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# To save the models
from joblib import dump, load

# For transformations and predictions
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import pairwise_distances
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor

# For Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# For scoring
from sklearn.metrics import mean_squared_error as mse
from sklearn import metrics

# For validation
from sklearn.model_selection import train_test_split as train_test_split

# For plotting the tree graph
from sklearn import tree

# package used to calculate Shap values
import shap 

In [2]:
def Data_import():
    """Read ``data_cleaned.csv`` from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    csv_path = Path.cwd() / 'data_cleaned.csv'
    return pd.read_csv(csv_path)

In [3]:
def Data_preprossesing(df, Optimization):
    """Split the data into train/test features and the ``popularity`` target.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain ``popularity`` and the identifier columns
        dropped below.
    Optimization : str
        ``"year_dropped"`` additionally removes the ``year`` column from the
        features. Every other value (including ``"No"``) keeps it.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as produced by
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=0``.
    """
    # Seeds NumPy's *global* RNG; anything drawing from it afterwards is affected.
    np.random.seed(10)

    # Random row removal is currently switched off (zero rows are drawn).
    n_rows_to_remove = 0
    rows_to_remove = np.random.choice(df.index, n_rows_to_remove, replace=False)
    remaining = df.drop(rows_to_remove)

    # Target and identifier columns never serve as features.
    non_feature_columns = ['popularity', 'artists', 'id_artists', 'id', 'name', 'release_date']
    if Optimization == "year_dropped":
        non_feature_columns.append('year')

    features = remaining.drop(non_feature_columns, axis=1)
    target = remaining['popularity']
    return train_test_split(features, target, test_size=0.2, random_state=0)

In [4]:
def Lineare_Regression(X_train, y_train, X_eval=None):
    """Fit a linear regression and predict on the evaluation features.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``LinearRegression.fit``.
    X_eval : optional
        Features to predict on. When omitted, the module-level ``X_test`` is
        used, which is what existing two-argument callers rely on.

    Returns
    -------
    tuple
        ``(predictions clipped to [0, 100], "Lineare_Regression", fitted model)``.
    """
    if X_eval is None:
        # Backward-compatible fallback to the notebook-global test features.
        X_eval = X_test
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    return regressor.predict(X_eval).clip(0, 100), Lineare_Regression.__name__, regressor

In [5]:
def KNN(X_train, y_train, X_eval=None, n_neighbors=100):
    """Fit a k-nearest-neighbours regressor and predict on the evaluation features.

    The previous implementation fitted one model for every k in 5, 10, ..., 100,
    collected train/test RMSE values that were never returned, and handed back
    only the last model (k = 100). The discarded sweep is removed; the default
    ``n_neighbors=100`` reproduces the model that was returned before.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``KNeighborsRegressor.fit``.
    X_eval : optional
        Features to predict on. When omitted, the module-level ``X_test`` is
        used, which is what existing two-argument callers rely on.
    n_neighbors : int, default 100
        Number of neighbours used by the regressor.

    Returns
    -------
    tuple
        ``(predictions clipped to [0, 100], "KNN", fitted model)``.
    """
    if X_eval is None:
        # Backward-compatible fallback to the notebook-global test features.
        X_eval = X_test
    knn = KNeighborsRegressor(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    return knn.predict(X_eval).clip(0, 100), KNN.__name__, knn

In [6]:
def Decision_Tree(X_train, y_train, X_eval=None):
    """Fit a size-limited regression tree and predict on the evaluation features.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``DecisionTreeRegressor.fit``.
    X_eval : optional
        Features to predict on. When omitted, the module-level ``X_test`` is
        used, which is what existing two-argument callers rely on.

    Returns
    -------
    tuple
        ``(predictions clipped to [0, 100], "Decision_Tree", fitted model)``.
    """
    if X_eval is None:
        # Backward-compatible fallback to the notebook-global test features.
        X_eval = X_test
    # Named `regressor` rather than `tree` so the imported `sklearn.tree`
    # module is not shadowed inside this function.
    regressor = DecisionTreeRegressor(max_leaf_nodes=41, min_samples_split=2000)
    regressor.fit(X_train, y_train)
    # The train/test RMSE values computed here before were never used or
    # returned, and required a global `y_test`; they have been removed.
    return regressor.predict(X_eval).clip(0, 100), Decision_Tree.__name__, regressor

In [7]:
def XGBoost(X_train, y_train, X_eval=None):
    """Fit a gradient-boosted regressor and predict on the evaluation features.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``XGBRegressor.fit``.
    X_eval : optional
        Features to predict on. When omitted, the module-level ``X_test`` is
        used, which is what existing two-argument callers rely on.

    Returns
    -------
    tuple
        ``(predictions clipped to [0, 100], "XGBoost", fitted model)``.
    """
    if X_eval is None:
        # Backward-compatible fallback to the notebook-global test features.
        X_eval = X_test
    xgb_regressor = XGBRegressor(n_estimators=100, max_depth=200, learning_rate=0.01)
    xgb_regressor.fit(X_train, y_train)
    # Clip to [0, 100] like the other model functions in this notebook so all
    # algorithms are scored on predictions restricted to the same range.
    return xgb_regressor.predict(X_eval).clip(0, 100), XGBoost.__name__, xgb_regressor

In [8]:
def Scoring(y_test, y_pred):
    """Return ``(MAE, MSE, RMSE)`` of the predictions against the true values."""
    mean_abs_err = metrics.mean_absolute_error(y_test, y_pred)
    mean_sq_err = metrics.mean_squared_error(y_test, y_pred)
    # RMSE is the square root of the MSE computed above.
    return mean_abs_err, mean_sq_err, np.sqrt(mean_sq_err)

In [9]:
# Saves the fitted model and its predictions for later analysis
def save_model(str_Verfahren, Optimierung, y_pred, y_test, estimator=None):
    """Persist a fitted model and a table of predicted vs. true values.

    Two files are written to the current working directory:
    ``<str_Verfahren>_<Optimierung>.joblib`` (the model) and
    ``y_pred_<str_Verfahren>_<Optimierung>.xlsx`` (columns ``y_pred``/``y_test``).

    Parameters
    ----------
    str_Verfahren : str
        Algorithm name used in the file names.
    Optimierung : str
        Name of the preprocessing modification used in the file names.
    y_pred
        Predicted values.
    y_test
        True values; their index is reset so rows pair up with ``y_pred`` by position.
    estimator : optional
        The fitted model to dump. When omitted, the module-level ``model`` is
        used, which is what existing four-argument callers rely on.

    Returns
    -------
    None
    """
    if estimator is None:
        # Backward-compatible fallback to the notebook-global fitted model.
        estimator = model

    base_dir = Path.cwd()
    dump(estimator, str(base_dir / (str_Verfahren + '_' + Optimierung + '.joblib')))

    predictions = pd.Series(y_pred, name="y_pred").to_frame()
    actuals = pd.Series(y_test).reset_index(drop=True).rename("y_test").to_frame()
    merged_df = predictions.merge(actuals, left_index=True, right_index=True)
    merged_df.to_excel(str(base_dir / ('y_pred_' + str_Verfahren + '_' + Optimierung + '.xlsx')))

    print(str_Verfahren + ' with modification ' + Optimierung +" saved")
    return None

### This cell trains, saves and scores every ML algorithm for each preprocessing variant

In [41]:
# Model functions to evaluate; each returns (predictions, name, fitted model).
list_algo = [Lineare_Regression, KNN, Decision_Tree, XGBoost]

# Preprocessing variants understood by Data_preprossesing.
list_modifications = ["No", "year_dropped"]

# One [algorithm, modification, MAE, MSE, RMSE] row per trained model.
temp_list = []

df = Data_import()
for modification in list_modifications:
    # NOTE: the names X_test, y_test and model must not be renamed here. When
    # called as below, the model functions read X_test (and some read y_test)
    # from module scope, and save_model reads `model` from module scope.
    X_train, X_test, y_train, y_test = Data_preprossesing(df, modification)
    for Algorithmen in list_algo:
        y_pred, str_Verfahren, model = Algorithmen(X_train, y_train) 
        save_model(str_Verfahren, modification, y_pred, y_test)
        MAE, MSE, RMSE = Scoring(y_test, y_pred)
        temp_list.append([str_Verfahren, modification, MAE, MSE, RMSE])

# Transposed for display: one column per (algorithm, modification) run.
Scoring_Table = pd.DataFrame(temp_list, columns=['Algorithm', 'Modification', 'MAE', 'MSE', 'RMSE'])
Scoring_Table.transpose()

Lineare_Regression with modificationNo saved
KNN with modificationNo saved
Decision_Tree with modificationNo saved
XGBoost with modificationNo saved
Lineare_Regression with modificationyear_dropped saved
KNN with modificationyear_dropped saved
Decision_Tree with modificationyear_dropped saved
XGBoost with modificationyear_dropped saved


Unnamed: 0,0,1,2,3,4,5,6,7
Algorithm,Lineare_Regression,KNN,Decision_Tree,XGBoost,Lineare_Regression,KNN,Decision_Tree,XGBoost
Modification,No,No,No,No,year_dropped,year_dropped,year_dropped,year_dropped
MAE,11.078659,13.205403,10.518684,13.873164,13.038703,14.78789,12.571853,15.071534
MSE,207.45764,261.238083,191.522931,325.616319,259.157865,321.808185,245.514395,371.879879
RMSE,14.40339,16.162861,13.839181,18.044842,16.098381,17.939013,15.668899,19.284187


## Deeper analysis of models

In [10]:
def Model_import(str_Verfahren, Optimierung):
    """Load ``<str_Verfahren>_<Optimierung>.joblib`` from the current working directory.

    Note: joblib files are pickle-based, so only load files from a trusted source.

    Returns
    -------
    object
        Whatever object was stored in the joblib file.
    """
    model_path = Path.cwd() / (str_Verfahren + '_' + Optimierung + '.joblib')
    return load(str(model_path))

In [6]:
# Reload the saved XGBoost model for the 'year_dropped' variant and rebuild the
# matching train/test split; the analysis cells below read these globals.
model = Model_import('XGBoost', 'year_dropped')
data = Data_import()
X_train, X_test, y_train, y_test = Data_preprossesing(data, 'year_dropped')

In [None]:
# Compute SHAP values for all training rows using a tree explainer built from `model`.
shap_values = shap.TreeExplainer(model).shap_values(X_train)

In [None]:
# Bar-type summary plot of the SHAP values computed on the training features.
shap.summary_plot(shap_values, X_train, plot_type="bar", plot_size=(15,15), auto_size_plot=True)

In [None]:
# Large, bold fonts for the exported figure (rcParams are changed globally).
plt.rcParams.update({'font.size': 40, 'font.weight': 1000})
# show=False so the figure is still available to savefig below.
shap.summary_plot(shap_values, X_train, plot_size=(17, 13), show=False)
plt.savefig('SHAP_impact_on_output.pdf', bbox_inches='tight', transparent=True)

In [None]:
%matplotlib inline
from pylab import rcParams
shap.initjs()
j =5 
explainerModel = shap.TreeExplainer(model)
shap_values_Model = explainerModel.shap_values(S)
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'font.size': 10})
plt.rcParams.update({'font.weight': 1000})
plt.rcParams['axes.titlesize'] = 20
plt.rcParams.update({"figure.figsize": (20,10)})
fig = shap.force_plot(explainerModel.expected_value.round(2), shap_values_Model[j,:].round(2), S.iloc[j,:].round(3), show=True,matplotlib=True,figsize=(19, 6),contribution_threshold=0.1) 
plt.savefig('SHAP_strahl.pdf',bbox_inches='tight',transparent=True)

In [42]:
# Used for the feature-importance analysis: one line per feature column
print('Feature importances:')
importances = model.feature_importances_
for position, column in enumerate(X_train.columns):
    print(f'{column:12}: {importances[position]:.3f}')


Feature importances:
duration_ms : 0.003
explicit    : 0.914
danceability: 0.004
energy      : 0.004
key         : 0.003
loudness    : 0.010
mode        : 0.004
speechiness : 0.004
acousticness: 0.028
instrumentalness: 0.006
liveness    : 0.005
valence     : 0.007
tempo       : 0.005
time_signature: 0.004


In [None]:
# Saves the decision tree as a PDF
# `model` holds the XGBoost regressor loaded earlier in this notebook, which
# tree.plot_tree cannot draw. Load the saved decision tree for the same
# 'year_dropped' variant explicitly instead of relying on whatever `model`
# happens to be.
decision_tree_model = Model_import('Decision_Tree', 'year_dropped')
features_names = X_train.columns
plt.subplots(figsize=(70, 30))
# Only the top levels (max_depth=2) are drawn to keep the figure readable.
tree.plot_tree(decision_tree_model, max_depth=2,feature_names = features_names, filled=True,precision=2,fontsize=55)
plt.savefig('Decision_tree_grafisch_Jahr_gedroppt.pdf')