In [1]:
# import dependencies
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# Read the CSV

In [2]:
# import the data (obtained csv from Kaggle), describe data
data = pd.read_csv('data.csv')
data.describe()

Unnamed: 0.1,Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,target
count,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0
mean,1008.0,0.18759,0.618422,246306.2,0.681577,0.133286,5.342588,0.190844,-7.085624,0.612295,0.092664,121.603272,3.96827,0.496815,0.505702
std,582.402066,0.259989,0.161029,81981.81,0.210273,0.273162,3.64824,0.155453,3.761684,0.487347,0.089931,26.685604,0.255853,0.247195,0.500091
min,0.0,3e-06,0.122,16042.0,0.0148,0.0,0.0,0.0188,-33.097,0.0,0.0231,47.859,1.0,0.0348,0.0
25%,504.0,0.00963,0.514,200015.0,0.563,0.0,2.0,0.0923,-8.394,0.0,0.0375,100.189,4.0,0.295,0.0
50%,1008.0,0.0633,0.631,229261.0,0.715,7.6e-05,6.0,0.127,-6.248,1.0,0.0549,121.427,4.0,0.492,1.0
75%,1512.0,0.265,0.738,270333.0,0.846,0.054,9.0,0.247,-4.746,1.0,0.108,137.849,4.0,0.691,1.0
max,2016.0,0.995,0.984,1004627.0,0.998,0.976,11.0,0.969,-0.307,1.0,0.816,219.331,5.0,0.992,1.0


In [3]:
# show the data in a dataframe
data.head()
# data.dtypes

Unnamed: 0.1,Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,target,song_title,artist
0,0,0.0102,0.833,204600,0.434,0.0219,2,0.165,-8.795,1,0.431,150.062,4.0,0.286,1,Mask Off,Future
1,1,0.199,0.743,326933,0.359,0.00611,1,0.137,-10.401,1,0.0794,160.083,4.0,0.588,1,Redbone,Childish Gambino
2,2,0.0344,0.838,185707,0.412,0.000234,2,0.159,-7.148,1,0.289,75.044,4.0,0.173,1,Xanny Family,Future
3,3,0.604,0.494,199413,0.338,0.51,5,0.0922,-15.236,1,0.0261,86.468,4.0,0.23,1,Master Of None,Beach House
4,4,0.18,0.678,392893,0.561,0.512,5,0.439,-11.648,0,0.0694,174.004,4.0,0.904,1,Parallel Lines,Junior Boys


In [4]:
# Assign the data to X and y
# Note: Sklearn requires a two-dimensional array of values
# so we use reshape to create this

y = data["target"]
X = data.drop(columns=["target", 'song_title', 'artist'])


print("Shape: ", X.shape, y.shape)

Shape:  (2017, 14) (2017,)


# Create a Train Test Split

In [5]:
# Use sklearn's `train_test_split` to split the data into training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


## Pre-Processing

In [6]:
# scale the data using the MinMaxScaler

from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

  return self.partial_fit(X, y)


## Create and Train the Logistic Regression Model

In [7]:
# Create and fit the model

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [8]:
# Print the scores

print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.9358465608465608
Testing Data Score: 0.9485148514851485


In [None]:
# plot the model scores


In [9]:
# fit the scaled model

model2 = LogisticRegression()
model2.fit(X_train_scaled, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [10]:
# print the scaled scores

print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

Training Data Score: 0.9854497354497355
Testing Data Score: 0.9900990099009901


In [11]:
# plot the model scores


In [12]:
# Use our model to predict a value
predicted = model.predict(X_test[:10])
predicted

array([0, 1, 1, 0, 1, 0, 1, 1, 0, 1], dtype=int64)

In [13]:
X_test[:10]

Unnamed: 0.1,Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
1555,1555,0.748,0.52,341667,0.0748,0.633,8,0.102,-24.477,1,0.0497,107.327,4.0,0.134
526,526,0.0726,0.739,386907,0.526,0.0,4,0.215,-7.384,0,0.101,143.948,4.0,0.374
393,393,0.0021,0.646,219754,0.892,0.00626,10,0.117,-5.015,1,0.0331,119.998,4.0,0.935
1788,1788,0.789,0.664,145707,0.32,0.0,4,0.152,-7.356,1,0.0322,141.916,3.0,0.71
433,433,0.0198,0.517,245013,0.491,1e-06,11,0.0786,-13.742,0,0.0341,104.996,4.0,0.331
1159,1159,0.281,0.718,214867,0.609,3.3e-05,0,0.234,-4.699,0,0.0429,122.948,4.0,0.47
1090,1090,0.27,0.639,307910,0.869,0.0,11,0.0802,-4.024,1,0.147,169.801,4.0,0.766
429,429,0.0571,0.502,287827,0.632,0.054,4,0.226,-9.971,0,0.271,82.738,4.0,0.207
1801,1801,0.0637,0.406,224848,0.638,0.0,9,0.107,-6.085,0,0.13,177.916,4.0,0.404
530,530,7.8e-05,0.351,141305,0.931,0.129,6,0.226,-5.746,0,0.0539,126.394,4.0,0.604


In [14]:
y_test[:10]

1555    0
526     1
393     1
1788    0
433     1
1159    0
1090    0
429     1
1801    0
530     1
Name: target, dtype: int64

In [15]:
predicted = model2.predict(X_test[:20])
predicted

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      dtype=int64)

In [16]:
y_test[:20]

1555    0
526     1
393     1
1788    0
433     1
1159    0
1090    0
429     1
1801    0
530     1
1208    0
1454    0
1910    0
1619    0
1022    0
678     1
1477    0
124     1
1900    0
462     1
Name: target, dtype: int64

# Hyperparameter Tuning

In [17]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'penalty': ["l1", "l2"]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [18]:
grid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=1, penalty=l1 .................................................
[CV] ........ C=1, penalty=l1, score=0.9940476190476191, total=   0.0s
[CV] C=1, penalty=l1 .................................................
[CV] ........ C=1, penalty=l1, score=0.9940476190476191, total=   0.0s
[CV] C=1, penalty=l1 .................................................
[CV] ........ C=1, penalty=l1, score=0.9821428571428571, total=   0.0s




[CV] C=1, penalty=l2 .................................................
[CV] ........ C=1, penalty=l2, score=0.9464285714285714, total=   0.0s
[CV] C=1, penalty=l2 .................................................
[CV] ........ C=1, penalty=l2, score=0.9325396825396826, total=   0.0s
[CV] C=1, penalty=l2 .................................................
[CV] ........ C=1, penalty=l2, score=0.9246031746031746, total=   0.0s
[CV] C=5, penalty=l1 .................................................
[CV] ........ C=5, penalty=l1, score=0.9940476190476191, total=   0.0s
[CV] C=5, penalty=l1 .................................................
[CV] ....................... C=5, penalty=l1, score=1.0, total=   0.0s
[CV] C=5, penalty=l1 .................................................
[CV] ........ C=5, penalty=l1, score=0.9880952380952381, total=   0.0s
[CV] C=5, penalty=l2 .................................................
[CV] ........ C=5, penalty=l2, score=0.9464285714285714, total=   0.0s
[CV] 



[CV] ....... C=10, penalty=l1, score=0.9880952380952381, total=   0.0s
[CV] C=10, penalty=l2 ................................................
[CV] ....... C=10, penalty=l2, score=0.9464285714285714, total=   0.0s
[CV] C=10, penalty=l2 ................................................
[CV] ....... C=10, penalty=l2, score=0.9325396825396826, total=   0.0s
[CV] C=10, penalty=l2 ................................................
[CV] ....... C=10, penalty=l2, score=0.9246031746031746, total=   0.0s


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    0.5s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [19]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 5, 'penalty': 'l1'}
0.9940476190476191
