# Importing the Datasets

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

In [2]:
#Supress warnings

import warnings
warnings.filterwarnings('ignore')


pd.set_option('display.max_columns',None)

# Reading the Data into dataframe

In [3]:
data=pd.read_csv(r'C:\Users\Saurabh\Data\Spotify.csv',header=0 , low_memory=False)

data.shape   #(855969, 73)


(170653, 19)

# EDA

In [4]:
data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [5]:
data.columns

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo'],
      dtype='object')

In [6]:
neworder = ['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'release_date',
       'speechiness', 'tempo', 'popularity']
data=data.reindex(columns=neworder)

In [7]:
data['popularity'].value_counts()

0      27892
43      3136
44      3117
41      3078
40      3051
       ...  
94         4
93         4
99         1
97         1
100        1
Name: popularity, Length: 100, dtype: int64

# Checking Missing Values

In [8]:
total = data.isnull().sum().sort_values(ascending=False)
percent= (data.isnull().sum()/data.isnull().count()  * 100).sort_values(ascending=False)
missing_data = pd.concat([total,percent],axis=1,keys=['Total','Percent'])
missing_data.head(32)

Unnamed: 0,Total,Percent
popularity,0,0.0
id,0,0.0
year,0,0.0
acousticness,0,0.0
artists,0,0.0
danceability,0,0.0
duration_ms,0,0.0
energy,0,0.0
explicit,0,0.0
instrumentalness,0,0.0


# Converting categorical variables into numerical ones

In [9]:
colname=[]
for x in data.columns:
    if data[x].dtype=='object':
        colname.append(x)
colname

['artists', 'id', 'name', 'release_date']

In [10]:
from sklearn import preprocessing

le=preprocessing.LabelEncoder()

for x in colname:
    data[x]=le.fit_transform(data[x])

In [11]:
data

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,release_date,speechiness,tempo,popularity
0,0.0594,1921,0.98200,26839,0.279,831667,0.211,0,96623,0.878000,10,0.6650,-20.096,1,83631,0,0.0366,80.954,4
1,0.9630,1921,0.73200,7382,0.819,180533,0.341,0,169794,0.000000,7,0.1600,-12.441,1,20291,0,0.4150,60.936,5
2,0.0394,1921,0.96100,16378,0.328,500062,0.166,0,43559,0.913000,3,0.1010,-14.850,1,38094,0,0.0339,110.339,5
3,0.1650,1921,0.96700,10077,0.275,210000,0.309,0,85809,0.000028,5,0.3810,-9.316,1,24147,0,0.0354,100.109,3
4,0.2530,1921,0.95700,23719,0.418,166693,0.193,0,105991,0.000002,3,0.2290,-10.096,1,123247,0,0.0380,101.665,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170648,0.6080,2020,0.08460,2159,0.786,301714,0.808,0,8059,0.000289,7,0.0822,-3.702,1,19540,11119,0.0881,105.029,72
170649,0.7340,2020,0.20600,2498,0.717,150654,0.753,0,9542,0.000000,7,0.1010,-6.020,1,41982,11219,0.0605,137.936,68
170650,0.6370,2020,0.10100,19497,0.634,211280,0.858,0,96709,0.000009,4,0.2580,-2.226,0,3324,11228,0.0809,91.688,76
170651,0.1950,2020,0.00998,8943,0.671,337147,0.623,1,123331,0.000008,2,0.6430,-7.161,1,24295,11038,0.3080,75.055,70


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  int32  
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  int32  
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  int32  
 15  release_date      170653 non-null  int32  
 16  speechiness       17

# Naming the independent and dependant variables

In [13]:
X=data.values[:,:-1]
Y=data.values[:,-1]
Y=Y.astype(int)


# Scaling

In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)

X=scaler.transform(X)

# Spliting the data into train and test

In [15]:
from sklearn.model_selection import train_test_split

#Split the data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,
random_state=10)

# Implementing Linear Regression

In [22]:
from sklearn.linear_model import LinearRegression

LR = LinearRegression()
LR.fit(X, Y)
print(LR.intercept_)
print(LR.coef_)

31.43179434290634
[ 1.36677102e-01  1.82015406e+01 -1.52060040e+00 -3.50980965e-01
  4.68698130e-01 -6.25413221e-02 -4.66848746e-01  3.27277633e-01
  6.93967959e-01 -1.26305744e+00  2.32140825e-03 -5.20027737e-01
  1.65829755e-01 -9.72081562e-02 -1.01733096e-01 -9.62451046e-01
 -1.14560389e+00  5.44711794e-02]


In [23]:
Y_pred= LR.predict(X_test)
print(Y_pred)

[60.3508078  13.61485631 20.46798325 ... 41.7455372  25.81779056
 52.07422679]


In [24]:
from sklearn.metrics import r2_score,mean_squared_error
import numpy as np

r2=r2_score(Y_test,Y_pred)
print(r2)

rmse=np.sqrt(mean_squared_error(Y_test,Y_pred))
print(rmse)

adjusted_r_squared = 1 - (1-r2)*(len(Y)-1)/(len(Y)-X.shape[1]-1)
print(adjusted_r_squared)

0.7578527041490722
10.731754060426939
0.7578271602872082


# Implementing ElasticNetCV

In [26]:
from sklearn.linear_model import ElasticNetCV
model = ElasticNetCV()
model.fit(X_train, Y_train)
Y_pred= model.predict(X_test)
print(Y_pred)

[60.66871911 13.74847601 19.4392065  ... 41.8291175  25.24766923
 52.67727198]


In [27]:
from sklearn.metrics import r2_score,mean_squared_error
import numpy as np

r2=r2_score(Y_test,Y_pred)
print(r2)

rmse=np.sqrt(mean_squared_error(Y_test,Y_pred))
print(rmse)

adjusted_r_squared = 1 - (1-r2)*(len(Y)-1)/(len(Y)-X.shape[1]-1)
print(adjusted_r_squared)

0.7560489957075924
10.771649287002445
0.7560232615744346


# Implementing Ridge Regression

In [30]:
from sklearn.linear_model import Ridge
#create a model object
lm = Ridge()

#train the model object
lm.fit(X_train,Y_train)
Y_pred=lm.predict(X_test)

In [31]:


from sklearn.metrics import r2_score,mean_squared_error
import numpy as np

r2=r2_score(Y_test,Y_pred)
print(r2)

rmse=np.sqrt(mean_squared_error(Y_test,Y_pred))
print(rmse)

adjusted_r_squared = 1 - (1-r2)*(len(Y)-1)/(len(Y)-X.shape[1]-1)
print(adjusted_r_squared)

0.7577897427718076
10.733149166033435
0.7577641922682145


# Implementing DT Regressor

In [16]:
from sklearn.tree import DecisionTreeRegressor

model_DecisionTree = DecisionTreeRegressor(
                                            random_state=10)

model_DecisionTree.fit(X_train,Y_train)

DecisionTreeRegressor(random_state=10)

In [17]:
Y_pred= model_DecisionTree.predict(X_test)
print(Y_pred)

[64.  0. 31. ... 57. 30. 41.]


In [18]:
from sklearn.metrics import r2_score,mean_squared_error
import numpy as np

r2=r2_score(Y_test,Y_pred)
print(r2)

rmse=np.sqrt(mean_squared_error(Y_test,Y_pred))
print(rmse)

adjusted_r_squared = 1 - (1-r2)*(len(Y)-1)/(len(Y)-X.shape[1]-1)
print(adjusted_r_squared)

0.6466400614443769
12.964015900115893
0.6466027858785811
