In [19]:
import sklearn
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from scipy.stats import multivariate_normal # MVN not univariate

# --- Load the track table, keep 1950-2019, and build per-row decade labels ---
file_path = "data.csv"
FIRST_YEAR = 1950  # first release year kept; also the origin used for decade numbering

df = pd.read_csv(file_path, index_col='year')

# Remove these four columns so they are not part of the feature table used below.
df.drop(['artists','id', 'name', 'release_date' ], axis = 1, inplace = True)

### Only look at decades from 50s to 10s (2020 not included)
l_drop = np.append(np.arange(1921, FIRST_YEAR), 2020)
# errors='ignore': a listed year that has no rows in the file must not abort
# the cell with a KeyError; only years that are actually present get dropped.
df = df.drop(labels=l_drop, axis=0, errors='ignore')

enc = LabelEncoder()
labels = df.index
standardized_labels = np.array(labels)  # per-row release year, kept for later cells
enc.fit(df.index.unique())

# y: encoded year of every row; Y: the distinct encoded years.
y = enc.transform(standardized_labels)
Y = enc.transform(np.unique(standardized_labels))

# Derive the decade from the year value itself. The previous `y // 10` used the
# LabelEncoder rank, which only equals (year - 1950) when no year is missing
# from the data; a single absent year would shift every later decade boundary.
y_decade = (standardized_labels - FIRST_YEAR) // 10
Y_decade = np.unique(y_decade)

# Re-use the encoder to turn the 'explicit' column into integer codes.
enc.fit(df['explicit'].unique())
df['explicit'] = enc.transform(df['explicit'])

# From here on the frame is indexed by decade label instead of by year.
df.set_index(y_decade, inplace=True)

# Per-decade sample counts (sorted by decade label) and the matching class priors.
aa = df.index.value_counts().sort_index().to_numpy()
priors = aa/len(df.index)
check = np.sum(priors)
assert np.isclose(check, 1.0), "class priors must sum to 1"
class_priors = np.diag(priors)  # diagonal matrix so priors can be applied with .dot()
num_classes = len(Y_decade)
N = len(df)
print(y_decade)
print(aa)
print(df.index.value_counts())

[0 0 0 ... 6 6 6]
[19950 20000 20000 20000 20000 20000 19900]
5    20000
4    20000
3    20000
2    20000
1    20000
0    19950
6    19900
dtype: int64


In [20]:
# for l in range(num_classes):
#     print(X[y_decade == l])


In [21]:
def regularized_cov(X, lambda_reg):
    """Return the sample covariance of X with a ridge term added to the diagonal.

    Parameters
    ----------
    X : array-like
        Passed straight to ``np.cov``, so each ROW is treated as one variable
        and each column as one observation. A 1-D input is treated as a
        single variable.
    lambda_reg : float
        Value added to every diagonal entry of the covariance matrix.

    Returns
    -------
    numpy.ndarray
        Square 2-D array ``cov(X) + lambda_reg * I``.
    """
    # np.cov collapses to a 0-d array when there is only one variable;
    # promote it so the result is always a square 2-D matrix and the identity
    # term below can be added without a broadcasting error.
    sigma = np.atleast_2d(np.cov(X))
    # Size the identity from the covariance itself rather than from X.shape[0],
    # which would be the observation count for 1-D input.
    n = sigma.shape[0]
    # Selecting the regularization parameter should be performed using CV and a separate data subset
    # As I only went by training set performance (overfitting) in this problem, I settled on lambda=1/n
    sigma = sigma + lambda_reg * np.eye(n)
    return sigma

# Standardize every column of df: subtract the column mean, divide by the column
# standard deviation. NOTE: `covariance` holds df.std(), i.e. per-column standard
# deviations, not a covariance matrix.
mean = df.mean()
covariance = df.std()
X = (df - mean) / covariance

# One mean vector per index value (the decade label), ordered by label.
mu = X.groupby(level=0).mean().to_numpy()
n = mu.shape[1]

# One regularized covariance matrix per class; rows of X are selected by class
# and transposed so that each feature becomes a row, as regularized_cov expects.
Sigma = np.stack([
    regularized_cov(X[y_decade == class_id].T, (1/n))
    for class_id in range(num_classes)
])
# Sigma = np.array([np.cov(X[y_decade == l].T) for l in range(num_classes)])
print(mu)
print(Sigma)


[[ 1.18310513e+00 -3.76180175e-01 -1.33658892e-01 -9.41176488e-01
  -2.70298539e-01  4.24471697e-01 -4.70573783e-02  3.02265961e-02
  -7.66491659e-01  3.68776328e-03 -1.49315785e+00  1.71383679e-01
  -2.58376021e-01 -2.18169919e-01]
 [ 5.65145585e-01 -2.63535735e-01 -2.27088451e-01 -4.48153314e-01
  -2.99322830e-01  1.04392675e-01 -2.57446803e-02  2.86152410e-02
  -3.84891301e-01  9.40046411e-02 -6.32968851e-01 -1.86726689e-01
  -9.80689451e-02  8.80361734e-02]
 [-8.09641263e-02 -8.94161788e-02  1.64464576e-01  2.18387635e-02
  -2.89915282e-01 -4.37424021e-02 -2.31879553e-02  9.32223570e-02
  -1.34354301e-01  6.90618842e-02 -1.16268506e-01 -1.69335844e-01
   5.50425254e-02  2.06296778e-01]
 [-3.80376582e-01  4.13842440e-02  1.41156736e-01  2.66625399e-01
  -2.13388494e-01 -2.65793433e-02  2.94521701e-02  5.84384594e-03
  -8.62619839e-02 -2.48077289e-04 -9.19522988e-03 -1.48046678e-01
   9.98656911e-02  1.38279104e-01]
 [-3.63352415e-01  1.68693391e-01  1.15084469e-01  2.36338582e-01
  

In [25]:
C = len(priors)

# Class-conditional likelihoods p(x | y_j): one entry per class j, each evaluated
# on every row of X under that class's Gaussian (mean mu[j], covariance Sigma[j]).
class_cond_likelihoods = np.array([multivariate_normal.pdf(X, mu[j], Sigma[j]) for j in range(C)])
print(np.max(class_cond_likelihoods))

# Class Posterior
# P(yj | x) = p(x | yj) * P(yj) / p(x)
# The evidence p(x) is the same for every class, so it is not divided out here:
# the argmax over classes is unaffected by that common factor.
class_posteriors = class_priors.dot(class_cond_likelihoods)

# Pick, for every sample, the class with the largest prior-weighted likelihood.
decisions = np.argmax(class_posteriors, axis=0)

# Number of samples whose TRUE label is each decade. This must be counted on
# y_decade; counting on `y` (the encoded year, 0..69) would return the sizes of
# the first few individual years instead of the decade class sizes and would
# mis-scale conf_mat below.
sample_class_counts = np.array([np.count_nonzero(y_decade == j) for j in Y_decade])


conf_mat = np.zeros((C, C))
display_mat = np.zeros((C,C))
for i in range(C): # Each decision option
    for j in range(C): # Each class label
        hits = np.count_nonzero((decisions==Y_decade[i]) & (y_decade==Y_decade[j]))
        display_mat[i, j] = hits # Raw count: decided i, true class j
        conf_mat[i, j] = hits/sample_class_counts[j] # Average over class sample count

print("Confusion matrix:")
print(display_mat.astype(int))
print(np.sum(display_mat))

# Correct decisions sit on the diagonal (decision == true label).
correct_class_samples = np.sum(np.diag(display_mat))
print("Total Mumber of Misclassified Samples: {}".format(N - correct_class_samples))

prob_error = 1 - (correct_class_samples / N)
print("Empirically Estimated Probability of Error: {:.4f}".format(prob_error))

2.7054717415764655e-05
Confusion matrix:
[[12383  2433   329   282   183    85    44]
 [ 6657 11813  5400  3217  2427  1462   522]
 [  752  4967 11571  9669  8017  4782  1465]
 [   46   317  1459  4548  2876  1352   305]
 [   61    26   127   435  1063   356   108]
 [   24   199   617  1266  3447  8030  5187]
 [   27   245   497   583  1987  3933 12269]]
139850.0
Total Mumber of Misclassified Samples: 78173.0
Empirically Estimated Probability of Error: 0.5590


In [24]:
# Show the total number of samples N (set to len(df) in the data-preparation cell).
print(N)

139850


In [None]:
# Per-year feature means. The data-preparation cell replaced the 'year' index
# with decade labels (set_index(..., inplace=True)), so grouping by the name
# 'year' raises KeyError. Group by the per-row year array saved before that
# replacement instead; rows of `means` are ordered by ascending year.
means = df.groupby(standardized_labels).mean().to_numpy()
print(means)

In [None]:

# covariance = df.std()
# means = df.mean()



# X = (df-df.mean())/df.std()
# # print(X)
# indexer = df.index.values
# # print(df.head())



# columns = df.columns

# mu = []

# labels = set()
# for ax in df.index:
#     labels.add(ax)
# # print(labels)


    
# for index in indexer:
#     mu.append([np.mean(X[feature][index]) for feature in df])
# mu = np.array(mu)
# n = mu.shape[1]
# Sigma = []
# # print(mu)
# for index in indexer:
#     Sigma.append(np.cov([X[feature][index] for feature in df]))
# print(df.head())