### Creating a base model for Decision Tree and Linear Regression


In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, learning_curve
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression 
from sklearn.tree import DecisionTreeRegressor


In [3]:
# Path of the playlist CSV (relative, so it resolves against the working directory).
data_path = 'playlist_2010to2022.csv'

# Read the whole file into a DataFrame using pandas' default parsing options.
playlist_df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows as the cell's output.
playlist_df.head(n=5)

Unnamed: 0,playlist_url,year,track_id,track_name,track_popularity,album,artist_id,artist_name,artist_genres,artist_popularity,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,https://open.spotify.com/playlist/37i9dQZF1DWU...,2000,3AJwUDP919kvQ9QcozQPxg,Yellow,91,Parachutes,4gzpq5DPGxSnKTe4SA8HAU,Coldplay,"['permanent wave', 'pop']",86,...,-7.227,1.0,0.0281,0.00239,0.000121,0.234,0.285,173.372,266773.0,4.0
1,https://open.spotify.com/playlist/37i9dQZF1DWU...,2000,2m1hi0nfMR9vdGC8UcrnwU,All The Small Things,84,Enema Of The State,6FBDaR13swtiWwGhX1WQsP,blink-182,"['alternative metal', 'modern rock', 'pop punk...",75,...,-4.918,1.0,0.0488,0.0103,0.0,0.612,0.684,148.726,167067.0,4.0
2,https://open.spotify.com/playlist/37i9dQZF1DWU...,2000,3y4LxiYMgDl4RethdzpmNe,Breathe,69,Breathe,25NQNriVT2YbSW80ILRWJa,Faith Hill,"['contemporary country', 'country', 'country d...",61,...,-9.007,1.0,0.029,0.173,0.0,0.251,0.278,136.859,250547.0,4.0
3,https://open.spotify.com/playlist/37i9dQZF1DWU...,2000,60a0Rd6pjrkxjPbaKzXjfq,In the End,88,Hybrid Theory (Bonus Edition),6XyY86QOPPrYVGvF9ch6wz,Linkin Park,"['alternative metal', 'nu metal', 'post-grunge...",83,...,-5.87,0.0,0.0584,0.00958,0.0,0.209,0.4,105.143,216880.0,4.0
4,https://open.spotify.com/playlist/37i9dQZF1DWU...,2000,62bOmKYxYg7dhrC6gH9vFn,Bye Bye Bye,74,No Strings Attached,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,"['boy band', 'dance pop', 'pop']",65,...,-4.843,0.0,0.0479,0.031,0.0012,0.0821,0.861,172.638,200400.0,4.0


In [5]:
# Remove every row that contains at least one missing value, modifying the frame in place.
playlist_df.dropna(axis=0, how='any', inplace=True)

# Show the column labels of the cleaned frame.
playlist_df.columns

Index(['playlist_url', 'year', 'track_id', 'track_name', 'track_popularity',
       'album', 'artist_id', 'artist_name', 'artist_genres',
       'artist_popularity', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'time_signature'],
      dtype='object')

In [6]:
# Text / identifier columns (not directly usable as numeric model inputs).
cat_cols = ['playlist_url', 'album', 'artist_name', 'artist_genres', 'artist_id', 'track_name', 'track_id']

# Numeric columns. 'artist_popularity' is numeric and belongs here; 'artist_id' is a
# string identifier that is already listed in cat_cols, so it must not appear in both.
num_cols = ['year', 'track_popularity', 'artist_popularity', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'time_signature']


In [7]:
# Keep only the categorical columns, in the order given by cat_cols.
playlist_df_cat = playlist_df.loc[:, cat_cols]

# Preview the first five rows of the categorical subset.
playlist_df_cat.head(n=5)

Unnamed: 0,playlist_url,album,artist_name,artist_genres,artist_id,track_name,track_id
0,https://open.spotify.com/playlist/37i9dQZF1DWU...,Parachutes,Coldplay,"['permanent wave', 'pop']",4gzpq5DPGxSnKTe4SA8HAU,Yellow,3AJwUDP919kvQ9QcozQPxg
1,https://open.spotify.com/playlist/37i9dQZF1DWU...,Enema Of The State,blink-182,"['alternative metal', 'modern rock', 'pop punk...",6FBDaR13swtiWwGhX1WQsP,All The Small Things,2m1hi0nfMR9vdGC8UcrnwU
2,https://open.spotify.com/playlist/37i9dQZF1DWU...,Breathe,Faith Hill,"['contemporary country', 'country', 'country d...",25NQNriVT2YbSW80ILRWJa,Breathe,3y4LxiYMgDl4RethdzpmNe
3,https://open.spotify.com/playlist/37i9dQZF1DWU...,Hybrid Theory (Bonus Edition),Linkin Park,"['alternative metal', 'nu metal', 'post-grunge...",6XyY86QOPPrYVGvF9ch6wz,In the End,60a0Rd6pjrkxjPbaKzXjfq
4,https://open.spotify.com/playlist/37i9dQZF1DWU...,No Strings Attached,*NSYNC,"['boy band', 'dance pop', 'pop']",6Ff53KvcvAj5U7Z1vojB5o,Bye Bye Bye,62bOmKYxYg7dhrC6gH9vFn


Let's look into artist_genres a bit more and see if we can turn it into a numerical feature for our analysis. I expect some genres to be more popular than others, such as country vs. pop.

In [9]:
# Placeholder for the collected genres; nothing is appended to it yet.
genres = []

# Peek at the raw 'artist_genres' value of the first row only, to see its format.
for genre_cell in playlist_df_cat['artist_genres'].head(1):
    print(genre_cell)

# TODO: find a way to get all unique genres and assign each one a number
# so they can be used as a category.
    

['permanent wave', 'pop']


In [13]:
# Drop the categorical columns but keep 'track_id' as the row identifier.
# The columns are selected by name rather than by position in cat_cols, and
# errors='ignore' makes the cell safe to re-run: columns that were already
# removed are skipped instead of raising KeyError.
cols_to_drop = [col for col in cat_cols if col != 'track_id']
playlist_df = playlist_df.drop(columns=cols_to_drop, errors='ignore')

# Preview the remaining columns.
playlist_df.head()

Unnamed: 0,year,track_id,track_popularity,artist_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,2000,3AJwUDP919kvQ9QcozQPxg,91,86,0.429,0.661,11.0,-7.227,1.0,0.0281,0.00239,0.000121,0.234,0.285,173.372,266773.0,4.0
1,2000,2m1hi0nfMR9vdGC8UcrnwU,84,75,0.434,0.897,0.0,-4.918,1.0,0.0488,0.0103,0.0,0.612,0.684,148.726,167067.0,4.0
2,2000,3y4LxiYMgDl4RethdzpmNe,69,61,0.529,0.496,7.0,-9.007,1.0,0.029,0.173,0.0,0.251,0.278,136.859,250547.0,4.0
3,2000,60a0Rd6pjrkxjPbaKzXjfq,88,83,0.556,0.864,3.0,-5.87,0.0,0.0584,0.00958,0.0,0.209,0.4,105.143,216880.0,4.0
4,2000,62bOmKYxYg7dhrC6gH9vFn,74,65,0.61,0.926,8.0,-4.843,0.0,0.0479,0.031,0.0012,0.0821,0.861,172.638,200400.0,4.0


In [14]:
# Hold out 20% of the rows for testing.
# Features: every remaining column except the target ('track_popularity') and the
# identifier ('track_id'). Target: 'track_popularity'.
# A fixed random_state makes the split, and therefore every score computed from
# it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    playlist_df.drop(['track_popularity', 'track_id'], axis=1),
    playlist_df['track_popularity'],
    test_size=0.2,
    random_state=42,
)

In [15]:
# Report the feature-matrix and target shapes of each split as a sanity check.
for shape_label, feature_part, target_part in (
    ('Train Shapes:', X_train, y_train),
    ('Test Shapes:', X_test, y_test),
):
    print(shape_label, feature_part.shape, target_part.shape)

Train Shapes: (1839, 15) (1839,)
Test Shapes: (460, 15) (460,)


#### Creating a Base Model for Linear Regression and Decision Tree

In [16]:
# Baseline model: linear regression with default settings.
lin_reg = LinearRegression()

# Evaluate with 3-fold cross-validation on the training split; the cell shows
# one score per fold (the estimator's default scorer is used).
cross_val_score(estimator=lin_reg, X=X_train, y=y_train, cv=3)

array([0.07133817, 0.10767511, 0.09865124])

In [17]:
# Baseline model: decision tree regressor with default hyperparameters.
# random_state is fixed because tree construction involves randomness, so
# unseeded runs can give different scores each time.
tree_reg = DecisionTreeRegressor(random_state=42)

# Evaluate with 3-fold cross-validation on the training split; the cell shows
# one score per fold (the estimator's default scorer is used).
cross_val_score(tree_reg, X_train, y_train, cv=3)

array([-0.67316935, -0.37539043, -0.58830187])

In [18]:
# Hyperparameter tuning: find a way to significantly improve the model