# Libraries

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualisation
import seaborn as sns # for data visualisation
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, ttest_ind

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# Preparation

In [None]:
# Read the CSV into a pandas DataFrame. The file is decoded as ISO-8859-1
# and its first column is used as the row index.
filename = '/kaggle/input/top50spotify2019/top50.csv'
df = pd.read_csv(
    filename,
    encoding='ISO-8859-1',
    index_col=0,
)
# Preview the first rows.
df.head()

Check the number of rows and columns in the dataset.

In [None]:
# Print the dataset dimensions as a (rows, columns) tuple.
print(df.shape)

The non-numeric variables can be used as class identifiers (labels), and the numeric variables can be used as features to determine which class a track belongs to.
In this analysis, the variable used as the class of a track is the Genre variable.

In [None]:
# Print the data type of every column to tell numeric columns from non-numeric ones.
print(df.dtypes)

Rename the columns to names that are easier to type.

In [None]:
# Map the original CSV headers to short snake_case names used in the rest of
# the notebook.
column_names = {
    'Track.Name': 'track_name',
    'Artist.Name': 'artist_name',
    'Genre': 'genre',
    'Beats.Per.Minute': 'bpm',
    'Energy': 'energy',
    'Danceability': 'danceability',
    'Loudness..dB..': 'loudness',
    'Liveness': 'liveness',
    'Valence.': 'valence',
    'Length.': 'length',
    'Acousticness..': 'acousticness',
    'Speechiness.': 'speechiness',
    'Popularity': 'popularity',
}
# Reassign the renamed frame rather than mutating with inplace=True; headers
# that are absent (e.g. when the cell is re-run) are simply left untouched.
df = df.rename(columns=column_names)
df.head()

Check for empty records
If any entries in the dataset are empty, they are filled with 0 to avoid unnecessary errors in further analysis.

In [None]:
# Count the missing values per column before anything is filled.
null_counts = df.isnull().sum()
# fillna returns a new DataFrame, so the result must be assigned back for the
# fill to take effect.
df = df.fillna(0)
# Display the per-column counts (only the last expression of a cell is shown).
null_counts

# Descriptive statistics of the numeric variables

In [None]:
# Display floats with two decimal places. The fully qualified option key is
# used because the abbreviated 'precision' pattern does not resolve to a single
# option in recent pandas versions and raises an OptionError.
pd.set_option('display.precision', 2)
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

# Data Visualisation

View the most popular genres, ranked by the number of songs/tracks in the dataset.

In [None]:
# Bar chart of the number of tracks per genre, largest count first.
fig = plt.figure(figsize=(16, 9))
songs_per_genre = df.groupby('genre')['track_name'].agg(len)
songs_per_genre.sort_values(ascending=False).plot(kind='bar')
plt.xlabel('Genre', fontsize=20)
plt.ylabel('Count of songs', fontsize=20)
plt.title('Genre vs Songs', fontsize=30)

View the most popular artists, ranked by the number of songs/tracks in the dataset.

In [None]:
# Bar chart of the number of tracks per artist, largest count first.
fig = plt.figure(figsize=(16, 9))
songs_per_artist = df.groupby('artist_name')['track_name'].agg(len)
songs_per_artist.sort_values(ascending=False).plot(kind='bar')
plt.xlabel('Artist Name', fontsize=20)
plt.ylabel('Count of songs', fontsize=20)
plt.title('Artist vs Songs', fontsize=30)

# Correlation among the variables

In [None]:
# Heatmap of the pairwise correlations between the numeric variables.
plt.figure(figsize=(10,10))
plt.title('Correlation between variables')
# Correlate the numeric columns only: the frame also holds string columns
# (track name, artist, genre), and DataFrame.corr() raises on non-numeric data
# in pandas >= 2.0 instead of silently dropping it.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(),linewidth=3.1,annot=True,center=1)

In [None]:
# Draw pairwise relationship plots for the variables in df.
sns.pairplot(df)

Correlation between loudness and energy

In [None]:
sns.set_style("whitegrid")

# Use the mean energy as a threshold and label each track as above it
# ('energized') or not ('without energy').
intensity = sum(df.energy)/len(df.energy)
df['energy_level'] = df.energy.map(
    lambda energy: 'energized' if energy > intensity else 'without energy'
)

# Line plot of energy against loudness, one line per energy level.
sns.relplot(
    x='loudness',
    y='energy',
    data=df,
    kind='line',
    style='energy_level',
    hue='energy_level',
    markers=True,
    dashes=False,
    ci='sd',
)
plt.xlabel('Loudness (dB)', fontsize=20)
plt.ylabel('Energy', fontsize=20)
plt.title('Connection between the Loudness (dB) and Energy', fontsize=25)
# Interpretation of the plot: loudness and energy appear to be positively
# related, i.e. the louder the song, the more energetic it tends to be.

In [None]:
# Point plot of energy for each loudness value, split by energy level.
sns.catplot(
    x='loudness',
    y='energy',
    data=df,
    kind='point',
    hue='energy_level',
)
plt.xlabel('Loudness (dB)', fontsize=20)
plt.ylabel('Energy', fontsize=20)
plt.title('Connection between the Loudness (dB) and Energy', fontsize=25)

# Regression between variables

* Linear Regression (assuming the numeric variables affect the popularity score of the song)

In [None]:
# Training data: the numeric audio features are the predictors and the
# popularity column is the target.
regression_features = [
    'bpm',
    'energy',
    'danceability',
    'loudness',
    'liveness',
    'valence',
    'length',
    'acousticness',
    'speechiness',
]
independent_var = df[regression_features]
dependent_var = df['popularity']

In [None]:
# Fit a scikit-learn linear regression of popularity on the feature columns.
result = linear_model.LinearRegression()
result.fit(independent_var, dependent_var)

intercept = result.intercept_
reg_coef = result.coef_
print('Label: bpm(x1), energy(x2), danceability(x3), loudness(x4), liveness(x5), valence(x6), length(x7), acousticness(x8), speechiness(x9)')
print('\nIntercept value (a): %0.3f' % intercept)
# Build the equation from the fitted coefficients so every term has the same
# "coef*Xi" form; the previous hand-written format string left stray commas
# after X4..X8 and was tied to exactly nine coefficients.
equation_terms = ' + '.join(
    '%0.3f*X%d' % (coef, position)
    for position, coef in enumerate(reg_coef, start=1)
)
print('\nRegression Equation: ŷ = %0.3f + %s' % (intercept, equation_terms))

* Linear Regression using OLS regression from statsmodels

In [None]:
# Add a constant (intercept) column to the predictors, then fit an ordinary
# least squares model of popularity on them with statsmodels.
x_var = sm.add_constant(independent_var)
ols_model = sm.OLS(dependent_var, x_var)
model = ols_model.fit()
# Predictions for the same rows the model was fitted on.
predictions = model.predict(x_var)
print(model.summary())

From the result above, the **coefficient of determination**, or **R-squared**, is **0.198** (**19.8%**), and the p-value column shows that all values are higher than 0.05, which means that none of the variables has a statistically significant relationship with the **Popularity**.
We can therefore conclude that the **Song Popularity** score is not meaningfully explained by the independent variables: the given variables account for only **19.8%** of the variation in the **Popularity** score, and the remaining **80.2%** is due to other factors that remain unknown.

# Class Classification using KNN

* Preparation

In [None]:
# Features (numeric audio attributes) and target (genre) for the classifier.
knn_features = [
    'bpm',
    'energy',
    'danceability',
    'loudness',
    'liveness',
    'valence',
    'length',
    'acousticness',
    'speechiness',
]
X = df[knn_features]
y = df['genre']
# k-nearest-neighbours classifier that votes among the 6 nearest tracks.
knn = KNeighborsClassifier(n_neighbors=6)

In [None]:
# Fit the classifier on the full feature matrix and genre labels (no hold-out yet).
knn.fit(X, y)

In [None]:
# Show the (rows, columns) shape of the feature matrix.
X.shape

In [None]:
# Show the shape of the label vector; its length should match the rows of X.
y.shape

In [None]:
# Predict genres for the same rows the classifier was fitted on, then count
# how often each genre is predicted.
predict = knn.predict(X)
predicted_genres = pd.Series(predict)
predicted_genres.value_counts()

* Testing the classification model

In [None]:
# Hold out 30% of the tracks for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=21,
)
# New classifier that votes among the 20 nearest training tracks.
knn = KNeighborsClassifier(n_neighbors=20)

In [None]:
# Fit on the training split only, then predict genres for the held-out tracks.
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
report_template = "Test set predictions:\n {}"
print(report_template.format(y_pred))

In [None]:
# Mean accuracy of the classifier on the held-out test split.
knn.score(X_test, y_test)

# Conclusion
From the score above, we know that the accuracy of the classification algorithm is around **13.3%** with **20** nearest neighbors, which means that a classifier trained on the given data from the dataset cannot give a strong prediction for new data entered in the future.
This conclusion is the same as the conclusion of the regression test before, because there is no correlation between the independent variables (the data) and the targeted class name (Genres).

# Thank you!