In [1]:
import os
from distributed import Client
import modin.pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

os.environ["MODIN_ENGINE"] = "dask"
client = Client(n_workers=2)

In [2]:
data = pd.read_csv("data/tracks.csv")

In [3]:
data.head(5)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [4]:
data.isnull().sum()

id                   0
name                71
popularity           0
duration_ms          0
explicit             0
artists              0
id_artists           0
release_date         0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
dtype: int64

In [5]:
data[data['popularity'] == 0].value_counts().sum()

To request implementation, send an email to feature_requests@modin.org.


44631

In [6]:
data = data[data['popularity'] > 0]

In [7]:
data[data['popularity'] == 0].value_counts().sum()



0

In [8]:
data[data['duration_ms'] == 0].value_counts().sum()

0

In [9]:
data.describe()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,541982.0,541982.0,541982.0,541982.0,541982.0,541982.0,541982.0,541982.0,541982.0,541982.0,541982.0,541982.0,541982.0,541982.0,541982.0
mean,29.843386,230303.7,0.046647,0.566804,0.55735,5.225443,-9.9573,0.65588,0.101282,0.423207,0.094577,0.213852,0.554954,118.99647,3.881042
std,17.247144,117925.3,0.210882,0.164606,0.246495,3.521705,4.918093,0.475081,0.173532,0.336427,0.244916,0.18572,0.256752,29.545297,0.455705
min,1.0,14708.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16.0,176853.0,0.0,0.458,0.371,2.0,-12.553,0.0,0.0336,0.0879,0.0,0.0974,0.349,96.047,4.0
50%,29.0,217827.0,0.0,0.58,0.567,5.0,-9.036,1.0,0.0436,0.383,1.6e-05,0.138,0.566,117.976,4.0
75%,42.0,265172.8,0.0,0.687,0.758,8.0,-6.377,1.0,0.0743,0.738,0.00442,0.278,0.772,136.97275,4.0
max,100.0,5621218.0,1.0,0.991,1.0,11.0,5.376,1.0,0.971,0.996,1.0,1.0,1.0,246.381,5.0


In [10]:
data['year'] = pd.DatetimeIndex(data['release_date']).year

In [11]:
nums = [i for i in data.columns if data[i].dtype == int or data[i].dtype == float]
nums 

['danceability',
 'energy',
 'loudness',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo']

In [12]:
import numpy as np 
X = data[nums].values
y = data['popularity'].values.ravel()

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [37]:
lr_pipe = Pipeline([('scaler', StandardScaler()), ('Linear Regression', RandomForestRegressor())])

In [38]:
cross_val_score(lr_pipe, X, y, cv=10)

KeyboardInterrupt: 

In [35]:
lr_pipe.fit(X_train, y_train)
y_pred = lr_pipe.predict(X_test)
lr_pipe.score(X_test, y_pred)

1.0

In [28]:
print(f"{mean_squared_error(y_test, y_pred)}\n{mean_absolute_error(y_test, y_pred)}")

246.32079748149448
12.6872718873227


In [14]:
import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn

import logging

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)


def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

In [22]:
with mlflow.start_run():
        lr = RandomForestRegressor()
        lr.fit(X_train, y_train)

        predicted_qualities = lr.predict(X_test)

        (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)

        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

NameError: name 'alpha' is not defined

In [29]:
plt.plot(data[data['year'].astype(int) > 2018]['year'] , data[data['year'].astype(int) > 2018]['energy'])
plt.show()

ValueError: Inputs contain futures that were created by another client.

In [21]:
client.retire_workers()

{'tcp://127.0.0.1:61343': {'type': 'Worker',
  'id': 1,
  'host': '127.0.0.1',
  'resources': {},
  'local_directory': 'c:\\Users\\Aidan\\Spotify-Exploration\\dask-worker-space\\worker-6zui0ovz',
  'name': 1,
  'nthreads': 6,
  'memory_limit': 8276770816,
  'last_seen': 1624383068.9945853,
  'services': {'dashboard': 61344},
  'metrics': {'executing': 0,
   'in_memory': 4,
   'ready': 0,
   'in_flight': 0,
   'bandwidth': {'total': 100000000, 'workers': {}, 'types': {}},
   'spilled_nbytes': 0,
   'cpu': 0.0,
   'memory': 211931136,
   'time': 1624383068.989495,
   'read_bytes': 0.0,
   'write_bytes': 0.0},
  'nanny': 'tcp://127.0.0.1:61334'}}