In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle

In [2]:
# Read the Spotify tracks CSV into a DataFrame and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='../input/spotify/spotify final datasets.csv')
data.head(n=5)

### Data structure

# EDA

In [3]:
# Show the Python type of the object returned by the loader.
type(data)

In [4]:
# Dimensions of the raw frame as a (rows, columns) tuple.
data.shape

In [5]:
# dtype of each column in the raw frame.
data.dtypes

In [6]:
# Concise summary of the raw frame: columns, non-null counts, dtypes and memory use.
data.info()

In [7]:
# Rows that are exact repeats of an earlier row.
data.loc[data.duplicated()]

In [8]:
# Drop fully duplicated rows (the first occurrence is kept), then confirm none remain.
data1 = data.drop_duplicates(keep='first')
data1.duplicated().sum()

In [9]:
# Remove the unnecessary 'uri' column.
data_clean = data1.drop(columns=['uri'])
data_clean.head()

In [10]:
# Remove the 'track_number' column as well.
data2 = data_clean.drop(columns=['track_number'])
data2.head()

In [11]:
# Count the missing values in each column.
data2.isna().sum()

In [12]:
# Impute missing values with each column's most frequent value (first row of mode()).
column_modes = data2.mode().iloc[0]
df = data2.fillna(column_modes)

In [13]:
# Re-count missing values per column after imputation.
df.isna().sum()

In [14]:
# Pairwise correlation between the numeric columns.
# Text columns (such as the track name) cannot be correlated: pandas >= 2.0 raises
# instead of silently skipping them, so select the numeric/bool columns explicitly.
# This form also works on older pandas versions that lack `numeric_only`.
corr = df.select_dtypes(include=['number', 'bool']).corr()
corr

In [15]:
# Annotated heat map of the correlation matrix, with both axes labelled by column name.
column_labels = corr.columns
sns.heatmap(corr, annot=True, xticklabels=column_labels, yticklabels=column_labels)

In [16]:
# Summary statistics for the cleaned frame.
df.describe()

In [17]:
# Number of distinct values in each column.
df.nunique()

###  Visualization 

In [18]:
# Grid of pairwise plots across the columns of df.
sns.pairplot(df)

In [19]:
# Histogram of the 'popularity' column.
df['popularity'].hist()

In [20]:
# Box plot of 'popularity' to inspect its spread and outliers.
df.boxplot(column = ['popularity'])

# Model Building

In [21]:
# Preview the cleaned frame before building the recommender.
df.head()

In [22]:
# Mean popularity per track name, kept as a one-column DataFrame indexed by name.
popu = df.groupby('name')['popularity'].mean().to_frame()
popu.head()

In [23]:
# Album-by-track matrix of popularity; pivot_table averages duplicate (album, name) pairs.
matrics = pd.pivot_table(df, values='popularity', index='album', columns='name')
matrics

In [24]:
# Replace the empty (album, track) combinations with 0 so every cell is numeric.
matrix_2 = matrics.fillna(0)
matrix_2

In [25]:
# Ten tracks with the highest mean popularity.
popu.sort_values('popularity', ascending = False).head(10)

In [26]:
# Column of the album-by-track matrix for the reference track used as the query.
a = matrix_2["That's The Power - Live"]

In [27]:
# Correlate every track column of matrix_2 with the reference column `a`.
similar_to_a = matrix_2.corrwith(a)
similar_to_a

In [28]:
# Wrap the correlations in a single-column frame and discard undefined (NaN) entries.
corr_a = pd.DataFrame(similar_to_a, columns=['corr']).dropna()
corr_a.head()

In [29]:
# Attach each track's mean popularity to its correlation score (joined on the index).
corr_a_2 = corr_a.join(popu['popularity'])
corr_a_2

In [30]:
# Keep tracks with mean popularity above 20 and rank them by correlation, strongest first.
popular_enough = corr_a_2['popularity'] > 20
corr_a_2.loc[popular_enough].sort_values(by='corr', ascending=False)

In [31]:
# Persist the cleaned frame as a plain dict in 'spotify.pkl'.
# The context manager guarantees the file is flushed and closed even if dumping fails.
with open('spotify.pkl', 'wb') as pkl_file:
    pickle.dump(df.to_dict(), pkl_file)

# Model Evaluation

In [32]:
# Create the binary target 'rating': 1 when popularity is above 5, otherwise 0.
# It is derived from the cleaned frame `df` rather than the raw `data`, so imputed
# values are used. The 'popularity' column is left untouched: the earlier chained
# assignment overwrote it with the 0/1 labels, making earlier views of df stale.
df['rating'] = (df['popularity'] > 5).astype(int)
df.head()

In [33]:
# Remove 'popularity' now that the target 'rating' has been derived from it.
df1 = df.drop(columns=['popularity'])
df1.head()

In [34]:
# Class balance of the target.
df1['rating'].value_counts()

In [35]:
# Drop the 'id' column before modelling.
d1 = df1.drop(columns=['id'])
d1

In [36]:
# Separate the target (y) from the feature columns (x).
y = d1['rating']
x = d1.drop(columns=['rating'])

In [37]:
# One-hot encode the listed feature columns.
# NOTE(review): most of these look like continuous audio features; one-hot encoding them
# creates one column per distinct value — confirm this is intended rather than scaling.
x1 = pd.get_dummies(
    x,
    columns=[
        'acousticness',
        'danceability',
        'energy',
        'instrumentalness',
        'liveness',
        'loudness',
        'speechiness',
        'tempo',
        'valence',
        'album',
        'name',
    ],
)
x1.head()

In [38]:
# Class balance of the target vector passed to the split.
y.value_counts()

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x1,
    y,
    test_size=0.25,
    random_state=0,
)

In [41]:
from sklearn.linear_model import LogisticRegression

In [42]:
# Logistic regression classifier with scikit-learn's default settings.
clas = LogisticRegression()

In [43]:
# Fit the classifier on the training split.
clas.fit(x_train,y_train)

In [44]:
# Per-class probabilities for the test rows.
# Note: `y_pred` is rebound to the hard class labels in the following cell.
y_pred = clas.predict_proba(x_test)
y_pred

In [45]:
# Hard class predictions for the test rows; these are what the metrics below use.
y_pred = clas.predict(x_test)
y_pred

In [46]:
from sklearn.metrics import confusion_matrix

#### Confusion Matrix
The confusion matrix is one of the most powerful and commonly used evaluation techniques, as it lets us compute many other metrics for evaluating the performance of a classification model.

In [47]:
# Confusion matrix of true labels against predicted labels on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [48]:
# Show the confusion matrix as an annotated seaborn heat map.
ax = sns.heatmap(cm, annot=True, fmt='.2f')
ax.set_ylabel('True label')
ax.set_xlabel('predicted label')

###### Various types of metrics can be calculated using the confusion matrix and in order to do so, we create a classification report.

In [49]:
from sklearn import metrics

In [50]:
# Text report of per-class metrics for the test predictions, formatted with 2 digits.
print(metrics.classification_report(y_test, y_pred, digits = 2))

#### Accuracy Score
Accuracy Score can be calculated using metrics.

In [51]:
# Fraction of test rows whose predicted label matches the true label.
metrics.accuracy_score(y_test,y_pred)

#### The accuracy score for the logistic regression model comes out to be 0.80

### AUC and ROC

In [52]:
# AUC preparation: per-class probabilities for the test rows as a DataFrame
# (one column per class, in the order the classifier reports them).
predict_proba = pd.DataFrame(clas.predict_proba(x_test))
predict_proba.head()

In [53]:
# Build a table holding the actual values of the target ('rating'), the predicted labels
# and the predicted probabilities. First step: wrap the predicted labels in a DataFrame
# so they can be concatenated with the other pieces.
pred_log = pd.DataFrame(y_pred)

In [54]:
# Reset the index of y_test; the old index is kept as a column, giving a two-column frame
# with a fresh 0..n-1 index.
y1_test = y_test.reset_index()

In [55]:
# Concatenate actual labels, predicted labels and class probabilities column-wise.
predictions = pd.concat([y1_test,pred_log,predict_proba],axis = 1)

In [56]:
# Rename the columns to get the final table used for the AUC score and the ROC curve.
# NOTE(review): the "survived_*" names appear to be carried over from another dataset;
# here they hold the predicted probabilities of the two 'rating' classes.
predictions.columns = ["index","actual","prediction","survived_0","survived_1"]

In [57]:
# Preview the assembled predictions table.
predictions.head()

In [58]:
# Area under the ROC curve, computed from the actual labels and the second probability column.
auc_score = metrics.roc_auc_score(
    y_true=predictions['actual'],
    y_score=predictions['survived_1'],
)
round(float(auc_score), 2)

In [59]:
# Compute the false positive rate, true positive rate and thresholds for the ROC curve
# (Receiver Operating Characteristic), keeping every threshold.
fpr, tpr, threshold = metrics.roc_curve(
    y_true=y_test,
    y_score=predictions.survived_1,
    drop_intermediate=False,
)
roc_auc = metrics.auc(x=fpr, y=tpr)

In [60]:
# Plot the ROC curve (blue) against the chance diagonal (red dashed).
plt.plot(fpr, tpr, 'b', label='ROC curve (area = %0.2f)' % auc_score)
plt.plot([0, 1], [0, 1], 'r--')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
# Both axes are rates, so clamp them to the unit interval.
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

As the ROC curve shows, the best trade-off between the false positive rate and the true positive rate appears at a sensitivity (true positive rate) of about 0.8. We will now compute the probability cut-off that corresponds to this sensitivity and see how the accuracy of our model changes.

In [61]:
# Pick the threshold whose true positive rate is closest to the 0.8 target sensitivity.
closest_idx = np.abs(tpr - 0.8).argmin()
cutoff_prob = threshold[closest_idx]
round(float(cutoff_prob), 2)

###### The cut-off probability that gives a sensitivity (true positive rate) of about 0.8, while keeping 1 - specificity (false positive rate) low, comes out to be 0.28

In [62]:
# Re-label the test rows using the cut-off computed from the ROC curve instead of a
# hard-coded 0.28, so the labels stay consistent if the data or model changes.
# The cut-off is rounded to two decimals to match the value reported in the notebook.
cutoff_rounded = round(float(cutoff_prob), 2)
predictions['new_labels'] = (predictions['survived_1'] >= cutoff_rounded).astype(int)
predictions.head()

In [63]:
# Confusion matrix of the actual labels against the re-thresholded labels.
# `labels` must be passed by keyword: current scikit-learn rejects it positionally.
# With labels=[1, 0] the first row/column is the positive class, so the tick labels
# are ordered ["Yes", "No"] to match (they were previously reversed).
cm1 = metrics.confusion_matrix(
    predictions.actual,
    predictions.new_labels,
    labels=[1, 0],
)
sns.heatmap(cm1, annot=True, fmt='.2f', xticklabels=["Yes", "No"], yticklabels=["Yes", "No"])
plt.ylabel('True label', fontsize=12)
plt.xlabel('Predicted label', fontsize=12)

In [64]:
# Accuracy with the new cut-off.
# accuracy_score takes no label list: the stray [1,0] was bound to `normalize` on old
# scikit-learn (truthy, so harmless by accident) and raises TypeError on current versions.
metrics.accuracy_score(predictions.actual, predictions.new_labels)

### We notice a slight increase in our accuracy from roughly 0.80 to 0.86

### Deployment