# PCA Analysis

In [61]:
import pandas as pd

In [102]:
# Relative location of the FMA feature CSV. Forward slashes are used because
# they resolve on Windows as well as on POSIX systems, whereas a
# backslash-separated path only works on Windows.
root_dir = '../datasets/fma/fma_small_932_features.csv'
df = pd.read_csv(root_dir)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,autocorelation_00_kurtosis,autocorelation_00_max,autocorelation_00_mean,autocorelation_00_median,autocorelation_00_min,autocorelation_00_skew,autocorelation_00_std,autocorelation_00_sum,chroma_cens_00_kurtosis,chroma_cens_00_max,...,dtempo_changes,onset_count,low_energy_rate,harmonic_to_noise_rate,dynamic_range,swing_ratio,syncopation,roughness,warmth,Genre
0,141.7685,12364.426,0.010876,-2.887901,-1593.9122,170.36818,3.016389,7194.268,1.221379,0.379768,...,6,169,184.0,-14.303785,0.21849,1.109646,0.100849,191.356035,0.701712,Electronic
1,31.6862,2798.3555,0.00215,-0.39869,-1224.2454,88.129974,0.790848,1422.519,-0.882889,0.659657,...,5,120,364.0,1.258749,0.147046,1.139929,0.312308,57.557036,0.605086,Electronic
2,11.78795,65289.016,0.065771,-5.728954,-35123.434,4982.0063,0.158547,43475.65,-0.937324,0.564553,...,0,205,278.0,0.19759,0.661439,1.154552,0.112588,1374.684072,0.611777,Electronic
3,4.609666,60351.85,1.028323,-2.132175,-29979.537,5215.5747,0.071497,679714.4,-0.49561,0.188383,...,11,211,191.0,0.258297,0.50459,1.079204,0.06999,2111.869221,0.401475,Electronic
4,113.69997,22080.594,74.39601,34.28389,-5716.523,437.94373,4.025559,49174990.0,-0.863015,0.472208,...,9,205,88.0,0.470797,0.28195,1.155421,0.107491,284.493888,0.461459,Electronic


## Data Preprocessing

In [103]:
# Number of missing 'dtempo_00_kurtosis' values within each genre.
df['dtempo_00_kurtosis'].isna().groupby(df['Genre']).sum()

Genre
Electronic       220
Experimental      49
Folk              39
Hip-Hop          117
Instrumental      79
International     92
Pop              110
Rock              68
Name: dtempo_00_kurtosis, dtype: int64

In [104]:
def impute_by_genre(df, strategy='mean'):
    """Fill missing feature values with a per-genre statistic.

    Rows are grouped by the ``Genre`` column, and every NaN in a numeric
    column is replaced by that column's mean or median computed within the
    row's own genre. Non-numeric feature columns are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Genre`` column; every other column is treated as a
        feature column.
    strategy : {'mean', 'median'}, default 'mean'
        Statistic used as the fill value.

    Returns
    -------
    pandas.DataFrame
        Rows ordered by genre. ``reset_index()`` turns the group key and the
        original row index into leading columns: ``Genre`` followed by the old
        index (named ``level_1`` when the input index is unnamed), then the
        imputed feature columns.

    Raises
    ------
    ValueError
        If ``strategy`` is neither ``'mean'`` nor ``'median'``.
    """
    if strategy not in ('mean', 'median'):
        raise ValueError("Unsupported strategy. Use 'mean' or 'median'.")

    # Select the feature columns explicitly so the string grouping column is
    # never handed to mean()/median(); this avoids depending on how a given
    # pandas version treats grouping columns inside GroupBy.apply.
    feature_cols = [col for col in df.columns if col != 'Genre']
    grouped = df.groupby('Genre', group_keys=True)[feature_cols]

    def fill_group(group):
        # numeric_only=True keeps other non-numeric columns from raising.
        fill_values = getattr(group, strategy)(numeric_only=True)
        return group.fillna(fill_values)

    return grouped.apply(fill_group).reset_index()

# Replace missing feature values with the per-genre mean (the default strategy).
df = impute_by_genre(df, strategy='mean')

In [106]:
# Discard the helper "level_1" column (the former row index that reset_index()
# in impute_by_genre turned into a regular column), then preview the result.
df = df.drop(columns=["level_1"])
df.head()

Unnamed: 0,Genre,autocorelation_00_kurtosis,autocorelation_00_max,autocorelation_00_mean,autocorelation_00_median,autocorelation_00_min,autocorelation_00_skew,autocorelation_00_std,autocorelation_00_sum,chroma_cens_00_kurtosis,...,beat_count,dtempo_changes,onset_count,low_energy_rate,harmonic_to_noise_rate,dynamic_range,swing_ratio,syncopation,roughness,warmth
0,Electronic,141.7685,12364.426,0.010876,-2.887901,-1593.9122,170.36818,3.016389,7194.268,1.221379,...,64,6,169,184.0,-14.303785,0.21849,1.109646,0.100849,191.356035,0.701712
1,Electronic,31.6862,2798.3555,0.00215,-0.39869,-1224.2454,88.129974,0.790848,1422.519,-0.882889,...,41,5,120,364.0,1.258749,0.147046,1.139929,0.312308,57.557036,0.605086
2,Electronic,11.78795,65289.016,0.065771,-5.728954,-35123.434,4982.0063,0.158547,43475.65,-0.937324,...,66,0,205,278.0,0.19759,0.661439,1.154552,0.112588,1374.684072,0.611777
3,Electronic,4.609666,60351.85,1.028323,-2.132175,-29979.537,5215.5747,0.071497,679714.4,-0.49561,...,94,11,211,191.0,0.258297,0.50459,1.079204,0.06999,2111.869221,0.401475
4,Electronic,113.69997,22080.594,74.39601,34.28389,-5716.523,437.94373,4.025559,49174990.0,-0.863015,...,72,9,205,88.0,0.470797,0.28195,1.155421,0.107491,284.493888,0.461459


## Train/Test Split, Scaling, and PCA

In [107]:
from sklearn.preprocessing import StandardScaler

# Target labels are the genres; the feature matrix is every remaining column.
y = df["Genre"]
X = df.drop(columns=["Genre"])

In [108]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 split so each genre keeps the same share in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to the held-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [109]:
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share (here 95%) of the variance.
pca = PCA(n_components=0.95)

In [110]:
# Fit the PCA on the scaled training split only; the fitted estimator is the cell output.
pca.fit(X_train)

In [111]:
# Project both splits onto the components fitted on the training data.
X_train, X_test = map(pca.transform, (X_train, X_test))