# PCA PLAYGROUND

In [7]:
import os
from pathlib import Path

import pandas as pd

# Location of the annotation table. The default is the original author's local
# path; set the ANNOT_CSV environment variable to point the notebook at the
# file on another machine without editing this cell.
ANNOT_CSV = Path(os.environ.get(
    'ANNOT_CSV',
    '/Users/Paul/Paul/Desktop/My_projects/Bioacoustics/Maputo_Dash/datasets/tables/annot_new.csv',
))

# Raw annotation table; later cells copy it rather than modifying it.
df_annot = pd.read_csv(ANNOT_CSV)

In [8]:
import numpy as np
import pandas as pd
import plotly_express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from plotly.express.colors import sample_colorscale
from sklearn.preprocessing import minmax_scale



In [9]:
# Column-name groups used to choose which columns of the annotation table are
# fed to the PCA. Each list holds plain column labels.

# Always included, whichever feature option is selected.
GENERIC_FEATURES = [
    'min_t', 'max_t', 'min_f', 'max_f',
    'dt', 'df',
    'min_y', 'min_x', 'max_y', 'max_x',
    'centroid_x', 'duration_x', 'bandwidth_y', 'area_xy',
    'centroid_f', 'centroid_t', 'duration_t', 'bandwidth_f', 'area_tf',
]

# 'shp_002' ... 'shp_048': zero-padded, three-digit suffix, 47 columns in all.
SHAPE_FEATURES = [f'shp_{number:03d}' for number in range(2, 49)]

SPECTRAL_FEATURES = [
    'MEANf', 'VARf', 'SKEWf', 'KURTf', 'NBPEAKS', 'LEQf',
    'ENRf', 'BGNf', 'SNRf', 'Hf',
    'EAS', 'ECU', 'ECV', 'EPS', 'EPS_KURT', 'EPS_SKEW',
    'ACI', 'NDSI', 'ROU',
]

TEMPORAL_FEATURES = [
    'ZCR', 'MEANt', 'VARt', 'SKEWt', 'KURTt',
    'BGNt', 'SNRt', 'MED', 'Ht',
]



In [11]:
# Peek at the sound_id column. Read it from df_annot: `df` is only created by a
# later cell, so referencing it here raises NameError on Restart & Run All.
df_annot['sound_id']

0      449420
1      449420
2      449420
3      449420
4      449420
        ...  
949    705357
950    705357
951    457592
952    444840
953     59739
Name: sound_id, Length: 954, dtype: int64

In [4]:
# --- Run parameters ----------------------------------------------------------
features_options = 'basic'   # key of feature_sets below
dimensions = 2               # number of leading PCs shown in the scatter matrix
color = "species"            # column used to colour the points

# Option -> list of input columns. A lookup replaces the chain of independent
# `if`s so that a misspelt option fails loudly instead of leaving `features`
# undefined (or silently reusing the value from a previous run).
feature_sets = {
    'basic': GENERIC_FEATURES,
    'shapes': SHAPE_FEATURES + GENERIC_FEATURES,
    'spectral': SPECTRAL_FEATURES + GENERIC_FEATURES,
    'temporal': TEMPORAL_FEATURES + GENERIC_FEATURES,
    'all': GENERIC_FEATURES + SPECTRAL_FEATURES + TEMPORAL_FEATURES + SHAPE_FEATURES,
}
if features_options not in feature_sets:
    raise ValueError(
        f"Unknown features_options {features_options!r}; "
        f"expected one of {sorted(feature_sets)}"
    )
features = feature_sets[features_options]

# --- Data --------------------------------------------------------------------
df = df_annot.copy()
df = df.reset_index()

# Drop rows with a missing value in any PCA input column. The previous
# dropna(how='all') could never drop anything (reset_index() adds an 'index'
# column that is never NaN), and PCA raises on NaN input. Re-number the rows so
# they line up positionally with the `components` array.
df = df.dropna(subset=features).reset_index(drop=True)
X = df.loc[:, features]

# normalize data (zero mean, unit variance per column)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# compute PCA, explained variance and weights
pca = PCA()
components = pca.fit_transform(X)
explained_pct = pca.explained_variance_ratio_ * 100
# Variance explained by the PCs that are actually plotted. Summing over every
# component of a full PCA is 100% by construction, which made the title useless.
total_var = explained_pct[:dimensions].sum()

# Not used in this cell; kept unchanged in case other cells refer to it.
indexPC = ['PC - '+str(i) for i in range(dimensions)]

# One row per principal component, one column per input feature.
df_feature_weights = pd.DataFrame(
    pca.components_,
    columns=features,
    index=[f'PC-{i + 1}' for i in range(pca.components_.shape[0])],
)

# figure-----------------------
# Keys are the stringified column positions of `components`, which is how
# plotly express names the columns of an array passed as the data frame.
# The former " -- sound_id = {df.loc[:,'']}" suffix looked up a column named ''
# (KeyError) and would at best have pasted a whole Series into every label; the
# sound id is exposed per point through hover_name below instead.
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(explained_pct)
}

# One colour per distinct value of the colour column, spread over 'Rainbow'.
colors_ = np.linspace(0, 1, len(df[color].unique()))
discrete_colors = sample_colorscale('Rainbow', minmax_scale(colors_))

fig = px.scatter_matrix(
    components,
    height=1000,
    labels=labels,
    dimensions=range(dimensions),
    color=df[color],
    hover_name=df['sound_id'],
    color_discrete_sequence=discrete_colors,
    title=f'Principal component analysis of ROIs ---Total Explained Variance: {total_var:.2f}%',
)
fig.update_traces(diagonal_visible=False)

fig2 = px.bar(df_feature_weights, title='Weights of features for each PC')




  dims = [


In [5]:
# Display the axis-label mapping that was passed to px.scatter_matrix.
labels

{'0': 'PC 1 (32.2%)',
 '1': 'PC 2 (29.9%)',
 '2': 'PC 3 (18.3%)',
 '3': 'PC 4 (10.1%)',
 '4': 'PC 5 (5.5%)',
 '5': 'PC 6 (2.2%)',
 '6': 'PC 7 (1.1%)',
 '7': 'PC 8 (0.5%)',
 '8': 'PC 9 (0.2%)',
 '9': 'PC 10 (0.0%)',
 '10': 'PC 11 (0.0%)',
 '11': 'PC 12 (0.0%)',
 '12': 'PC 13 (0.0%)',
 '13': 'PC 14 (0.0%)',
 '14': 'PC 15 (0.0%)',
 '15': 'PC 16 (0.0%)',
 '16': 'PC 17 (0.0%)',
 '17': 'PC 18 (0.0%)',
 '18': 'PC 19 (0.0%)'}

In [None]:
from sklearn import preprocessing

# Standardise the selected feature columns of the raw annotation table and keep
# the column labels so the PCA weights can be reported per feature.
feature_frame = df_annot[features]
data_scaled = pd.DataFrame(preprocessing.scale(feature_frame), columns=feature_frame.columns)

# PCA
pca = PCA(n_components=3)
# Only the fitted model is needed here; the projected data was being computed
# and thrown away by fit_transform().
pca.fit(data_scaled)

# Dump components relations with features:
# absolute weight of every feature (columns) in every component (rows). The row
# labels are derived from the fitted model so they stay valid if n_components
# above is changed.
weights = pd.DataFrame(
    abs(pca.components_),
    columns=data_scaled.columns,
    index=[f'PC-{i + 1}' for i in range(pca.components_.shape[0])],
)


In [None]:
# Copy of the weights table with its row labels moved into an 'index' column and
# rows re-numbered from 0; the plotting cells below facet on 'index' and pick
# rows by position from this copy.
test = weights.reset_index()

In [None]:
# Display the reset-index copy of the weights table.
test

In [None]:
# Grouped bar chart with one facet column per distinct value of 'index'; the
# columns listed in `features` are passed together as the x argument.
px.bar(test,x=features, facet_col='index', barmode='group')

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# One stacked row per plotted principal component.
fig = make_subplots(
    cols=1,
    rows=dimensions,
    subplot_titles=[f'PC-{i+1}: abs weight of features' for i in range(dimensions)],
)

# Row i of `test` holds the weights of component i+1 (rows are positional after
# reset_index); each bar is labelled with its feature name.
for i in range(dimensions):
    # add_trace replaces the deprecated append_trace.
    fig.add_trace(
        go.Bar(x=features, y=test[features].loc[i, :], text=features, textposition="inside"),
        row=i + 1,
        col=1,
    )

# Layout and axis settings do not depend on the loop variable, so apply them
# once after all traces are in place: sort bars by descending value and hide
# the x axes (the feature names are already printed inside the bars).
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
fig.update_xaxes({'categoryorder':'total descending'})
fig.update_xaxes(visible=False)

fig.update_layout(height=800, width=1000, showlegend=False)
fig.show()

In [None]:
# Scratch check: print every value produced by range(dimensions), one per line.
for i in range(dimensions):
    print(i)

In [None]:
# Largest absolute value in each column, grouped by the former row labels
# (moved into the 'index' column by reset_index).
weights.abs().reset_index().groupby('index').max()

In [None]:
weights = weights.transpose()

In [None]:
abs(weights)

In [None]:
px.bar(weights, barmode='group')

In [None]:
# Sum of the explained-variance ratios of the currently fitted `pca`, as a
# percentage. This rebinds the total_var computed in the scatter-matrix cell.
total_var = pca.explained_variance_ratio_.sum() * 100


In [None]:
# Display the explained-variance percentage.
total_var

In [None]:
# Show the row labels of `weights` as a column. `weights` only gains a stored
# 'index' column in the following cell, so `weights['index']` raised KeyError on
# a top-to-bottom run; building it on the fly works both before and after that
# cell has executed (reset_index() leaves an existing 'index' column untouched).
weights.reset_index()['index']

In [None]:
# Move the row labels into an 'index' column (used as facet_col further down).
# Guarded so that re-running the cell does not stack an extra 'level_0' column
# onto the table.
if 'index' not in weights.columns:
    weights = weights.reset_index()
# Same labels under a second column name, 'PC'.
weights['PC'] = weights['index']

In [None]:
# Replace the feature columns with their absolute values.
weights[features] = weights[features].abs()

In [None]:
# Grouped bars of every feature column, all placed at one constant x position,
# with one facet column per value of 'index'.
fig2 = px.bar(
    weights,
    y=features,
    x=px.Constant('Relative features'),
    facet_col='index',
    barmode='group',
    title='Weights of features for each PC',
)

In [None]:
# Switch every y axis of fig2 to a categorical axis.
# NOTE(review): the y values come from the feature-weight columns, which look
# numeric; a category axis would not be spaced by magnitude — confirm this is
# intended.
fig2.update_yaxes(type='category')


In [None]:
# Render the faceted feature-weight bar chart.
fig2.show()

In [None]:
# Display the current state of the weights table.
weights

In [None]:
# Sorted feature weights of a single row of the weights table.
pc_row = 2  # positional row label after reset_index()
fig3 = px.bar(weights[features].loc[pc_row, :].sort_values())
# Take the title from the plotted row's own label; the hard-coded 'PC-1' did not
# match row 2 (presumably 'PC-3', given the PC-1..PC-3 labelling used when the
# table was built — verify).
fig3.update_layout(title=str(weights.loc[pc_row, 'index']))


In [None]:
# Render the single-row weight chart.
fig3.show()