In [9]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

from sklearn.manifold import MDS

from scipy.spatial.distance import pdist, squareform


In [6]:
# Global plot styling shared by every figure in this notebook.
sns.set_style('whitegrid')
sns.set_palette('pastel')

# Eight-step green ramp; later cells pick individual shades from it by index.
palette = sns.color_palette("Greens", n_colors=8)

# Default figure size and a single-colour line cycle built from palette[3].
plt.rcParams.update({
    'figure.figsize': (6, 4),
    'axes.prop_cycle': plt.cycler(color=[palette[3]]),
})

In [7]:
# Load the four preprocessed CSV splits from ../adults_data.
# NOTE(review): "ohs" / "ohn" presumably name two differently scaled
# one-hot encodings of the same data — confirm against the preprocessing notebook.
adults_dir = Path('..', 'adults_data')

train_ohs = pd.read_csv(adults_dir / 'adults_ohs_train.csv')
test_ohs = pd.read_csv(adults_dir / 'adults_ohs_test.csv')

train_ohn = pd.read_csv(adults_dir / 'adults_ohn_train.csv')
test_ohn = pd.read_csv(adults_dir / 'adults_ohn_test.csv')

In [10]:
# Pairwise Euclidean distances between the rows of train_ohs, in SciPy's
# condensed (1-D, upper-triangle) layout.
# NOTE(review): the square form below needs n_rows**2 floats — confirm the
# training frame is small enough (or subsample) before running on the full split.
distance_matrix = pdist(train_ohs, 'euclidean')

# Expand the condensed vector into the symmetric square matrix that
# MDS(dissimilarity="precomputed") consumes in the following cells.
distance_matrix_square = squareform(distance_matrix, force='tomatrix')

In [12]:
# Two-dimensional MDS on the precomputed distance matrix; the seed is fixed
# so the embedding is reproducible across runs.
mds = MDS(
    n_components=2,
    dissimilarity="precomputed",
    random_state=42,
)
embedding = mds.fit_transform(distance_matrix_square)

print(embedding)


In [None]:
def calculate_stress(dissimilarities, max_dims=10, random_state=42):
    """Fit MDS for 1..max_dims output dimensions and collect each final stress.

    Parameters
    ----------
    dissimilarities : array-like of shape (n_samples, n_samples)
        Precomputed square dissimilarity matrix; it is passed unchanged to
        ``MDS(dissimilarity="precomputed")``.
    max_dims : int, default=10
        Largest number of embedding dimensions to try. Values below 1 yield
        an empty list.
    random_state : int or None, default=42
        Seed forwarded to every ``MDS`` instance so the runs are reproducible.
        The default matches the seed that was previously hard-coded.

    Returns
    -------
    list of float
        ``stress_`` of the fitted model for each dimensionality; element ``i``
        belongs to ``n_components = i + 1``.
    """
    stresses = []
    for dim in range(1, max_dims + 1):
        mds = MDS(
            n_components=dim,
            dissimilarity="precomputed",
            random_state=random_state,
        )
        # Only the fitted stress_ attribute is needed here, so fit() is used
        # instead of fit_transform() whose returned embedding was discarded.
        mds.fit(dissimilarities)
        stresses.append(mds.stress_)
    return stresses

# Compute the stress for each candidate number of dimensions
stresses = calculate_stress(distance_matrix_square)

# Print the results, one line per dimensionality (1-based)
for dim in range(1, len(stresses) + 1):
    stress = stresses[dim - 1]
    print(f"Wymiary: {dim}, Stres: {stress:.4f}")

In [None]:
# Stress visualisation: final MDS stress against the number of dimensions
dimension_axis = range(1, len(stresses) + 1)
plt.plot(dimension_axis, stresses, marker='o')
plt.title('Stres w funkcji liczby wymiarów')
plt.xlabel('Liczba wymiarów')
plt.ylabel('Stres')
plt.grid(True)
plt.show()


In [None]:
# NOTE(review): this cell reads `cumulative_variance`, `explained_variance`,
# `standard_type`, `train`, `test` and `PCA`, none of which are defined or
# imported in the notebook cells visible above — confirm they are created
# earlier, otherwise the cell fails on a fresh kernel (Restart & Run All).

# Font sizes shared by both figures produced in this cell.
label_fontsize = 18
tick_fontsize = 13


plt.figure(figsize=(10, 6))
# 1-based component counts for the x axis.
x_axis = range(1, len(cumulative_variance) + 1)
plt.plot(x_axis, cumulative_variance, marker='o', linestyle='--', color=palette[5], label='Cumulative Explained Variance')
# Flat reference lines marking the 90% and 95% thresholds across the plotted range.
plt.plot(x_axis, [0.9] * len(x_axis), color='red', linestyle='--', label='90% Explained Variance')
plt.plot(x_axis, [0.95] * len(x_axis), color='blue', linestyle='--', label='95% Explained Variance')
#plt.title('Cumulative Explained Variance by Principal Components')
plt.xlabel('Number of Principal Components', 
            fontsize=label_fontsize)
plt.ylabel('Cumulative Explained Variance', 
            fontsize=label_fontsize)
plt.grid(True)
plt.legend()
# Set the tick label size BEFORE tight_layout() so the layout is computed for
# the final label sizes (same order as the bar/step figure later in this cell).
plt.tick_params(axis='both', labelsize=tick_fontsize)
plt.tight_layout()
plt.savefig(Path('..','plots',f'PCA_lines_{standard_type}.pdf'), 
            format='pdf', bbox_inches='tight')
plt.show()

# Smallest number of leading components whose cumulative explained variance
# reaches each threshold (counts are 1-based). next() raises StopIteration
# if a threshold is never reached.
k_90 = next(count for count, cum_var in enumerate(cumulative_variance, start=1) if cum_var >= 0.90)
k_95 = next(count for count, cum_var in enumerate(cumulative_variance, start=1) if cum_var >= 0.95)

print('90% ' + str(k_90), '95% ' + str(k_95))

# Fit PCA with k_95 components on the training split only, then reuse the
# fitted projection for the test split so both share the same axes.
optimal_PCA = PCA(n_components=k_95)

train_scores = optimal_PCA.fit_transform(train)
reduced_train = pd.DataFrame(train_scores)
# NOTE(review): to_csv() is called without index=False, so the row index is
# written as an extra first column — confirm downstream readers expect that.
train_csv = Path('..', 'adults_data', f'PCA_{standard_type}_train.csv')
reduced_train.to_csv(train_csv)

reduced_test = optimal_PCA.transform(test)
reduced_test_df = pd.DataFrame(reduced_test)
test_csv = Path('..', 'adults_data', f'PCA_{standard_type}_test.csv')
reduced_test_df.to_csv(test_csv)

# Per-component variance of the projected test data divided by the total
# variance of the original test features, then accumulated over components.
explained_variance_test = np.var(reduced_test, axis=0) / np.sum(np.var(test, axis=0))
explained_variance_test_ratio = np.cumsum(explained_variance_test)
# The PCA fitted in this cell keeps k_95 components, so report the cumulative
# share over all retained components. The previous index (k_90 - 1) reported
# only the first k_90 components.
# NOTE(review): assumes the k_90 index was a leftover — revert if the 90% cut
# was the intended report.
print(f'Explained variance of a test set is {explained_variance_test_ratio[k_95-1]}.')

# Per-component explained variance (bars) with the running total (step line),
# limited to the first k_95 components.
plt.figure(figsize=(10, 6))
component_axis = range(1, k_95 + 1)
plt.bar(
    component_axis,
    explained_variance[:k_95],
    color=palette[2],
    alpha=0.7,
    label='Explained Variance',
)
plt.step(
    component_axis,
    cumulative_variance[:k_95],
    where='mid',
    color=palette[5],
    linestyle='--',
    label='Cumulative Explained Variance',
)
#plt.title('Explained Variance by Principal Components')
plt.xlabel('Principal Component', fontsize=label_fontsize)
plt.ylabel('Explained Variance Ratio', fontsize=label_fontsize)
plt.grid(True)
plt.legend()
plt.tick_params(axis='both', labelsize=tick_fontsize)
plt.tight_layout()
steps_pdf = Path('..', 'plots', f'PCA_steps_{standard_type}.pdf')
plt.savefig(steps_pdf, format='pdf', bbox_inches='tight')
plt.show()