In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Step 2: Read the dataset from the current working directory
df = pd.read_excel('SPRINGS.xlsx')

# Step 3: Keep only the numeric columns of the dataset
numeric_cols = df.select_dtypes(include=[np.number])

# Step 4: Correlation matrix between every pair of numeric columns
corr_matrix = numeric_cols.corr()

# Step 5: Show the correlation matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# Step 6: Identify pairs of parameters whose absolute correlation exceeds the threshold
threshold = 0.9
abs_corr = corr_matrix.abs()

# The correlation matrix is symmetric with the self-correlations on the
# diagonal. Keeping only the strict lower triangle lists every unordered pair
# exactly once. Unlike de-duplicating on the correlation value, this never
# drops two different pairs that happen to share the same value, and it keeps
# distinct parameters that are perfectly correlated (|r| == 1).
lower_mask = np.tril(np.ones(abs_corr.shape, dtype=bool), k=-1)
corr_pairs = abs_corr.where(lower_mask).unstack()

# Masked-out entries are NaN and NaN > threshold is False, so they are excluded here
high_corr = corr_pairs[corr_pairs > threshold]
high_corr_pairs = high_corr.sort_values(ascending=False)

print(f"Pairs of parameters with correlation above {threshold}:")
for index, value in high_corr_pairs.items():
    print(f"{index[0]} and {index[1]}: correlation = {value:.2f}")

# Step 7: Save the correlation matrix to an Excel file
corr_before_path = 'correlation_before_SPRINGS_FG_PCA.xlsx'
corr_matrix.to_excel(corr_before_path)

# Report the path that was actually written so the message cannot drift from the file name
print(f"Correlation matrix saved to '{corr_before_path}'")
# Manually remove the highly correlated parameters.
# Example for the Springs dataset: 'Conductivity at 20°C [µS/cm]' and 'Calcium [mg/l Ca]'.
cols_to_drop = ['Conductivity at 20°C [µS/cm]', 'Calcium [mg/l Ca]']
numeric_cols = numeric_cols.drop(cols_to_drop, axis="columns")

# Correlation matrix of the remaining parameters
corr_matrix = numeric_cols.corr()

# Heatmap of the updated correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix After Dropping Highly Correlated Parameters')
plt.show()

# Save the new correlation matrix to an Excel file
corr_after_path = 'correlation_after_SPRINGS.xlsx'
corr_matrix.to_excel(corr_after_path)

# Report the path that was actually written so the message cannot drift from the file name
print(f"Correlation matrix after dropping parameters saved to '{corr_after_path}'")


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from factor_analyzer import calculate_kmo, calculate_bartlett_sphericity
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Continues from the previous cell: numeric_cols is the numeric data with the
# manually selected columns already dropped.

# Discard every row that contains at least one missing value
numeric_cols = numeric_cols.dropna(axis=0, how='any')

# KMO test: calculate_kmo returns two values; the second one is reported as the test statistic
kmo_result = calculate_kmo(numeric_cols)
kmo_all, kmo_model = kmo_result
print(f"KMO Test Statistic: {kmo_model:.4f}")

# Bartlett's test of sphericity: chi-square statistic and its p-value
bartlett_result = calculate_bartlett_sphericity(numeric_cols)
chi_square_value, p_value = bartlett_result
print(f"Bartlett's Test: Chi-square {chi_square_value:.4f}, p-value {p_value:.4f}")

# Standardize the data: fit the scaler on numeric_cols and transform it in one step
scaler = StandardScaler()
data_scaled = scaler.fit_transform(numeric_cols)

# Fit a PCA with default settings on the standardized data
pca = PCA().fit(data_scaled)

# Report the explained variance ratio of every principal component
print("\nExplained Variance Ratio of each Principal Component:")
print("\n".join(
    f"PC{i+1}: {variance:.4f}"
    for i, variance in enumerate(pca.explained_variance_ratio_)
))

# Report the eigenvalues, taken from the PCA's explained variance
eigenvalues = pca.explained_variance_
print("\nEigenvalues:")
print("\n".join(
    f"PC{i+1}: {eigenvalue:.4f}"
    for i, eigenvalue in enumerate(eigenvalues)
))

# Save eigenvectors, eigenvalues, and explained variance to an Excel file

# One "PCk" label per fitted component, shared by both tables
pc_labels = [f"PC{i+1}" for i in range(len(pca.components_))]

# Eigenvectors: one row per principal component, one column per parameter
components_df = pd.DataFrame(
    pca.components_, index=pc_labels, columns=numeric_cols.columns
)

# Eigenvalues and explained variance ratio, one row per principal component
eigenvalues_df = pd.DataFrame(
    {
        'Eigenvalue': pca.explained_variance_,
        'Explained Variance Ratio': pca.explained_variance_ratio_,
    },
    index=pc_labels,
)

# Write both tables to one workbook, each on its own sheet
full_pca_path = 'eigenvectors_eigenvalues_all_SPRINGS_FG_PCA.xlsx'
with pd.ExcelWriter(full_pca_path) as writer:
    components_df.to_excel(writer, sheet_name='Eigenvectors')
    eigenvalues_df.to_excel(writer, sheet_name='Eigenvalues')

# Report the path that was actually written so the message cannot drift from the file name
print(f"\nEigenvectors, eigenvalues, and explained variance saved to '{full_pca_path}'")


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from factor_analyzer import calculate_kmo, calculate_bartlett_sphericity
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Continues from the previous cells: numeric_cols must be the numeric data
# with the correlated parameters already dropped.

# Discard rows with missing values
numeric_cols = numeric_cols.dropna(axis=0, how='any')

# Standardize the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(numeric_cols)

# Number of principal components to retain (11 is the example value used here)
n_components = 11

# Fit a PCA restricted to that many components
pca = PCA(n_components=n_components).fit(data_scaled)

# Report the explained variance ratio of the retained components
print(f"\nExplained Variance Ratio of the first {n_components} Principal Components:")
print("\n".join(
    f"PC{i+1}: {variance:.4f}"
    for i, variance in enumerate(pca.explained_variance_ratio_)
))

# Report the eigenvalues of the retained components
eigenvalues = pca.explained_variance_
print(f"\nEigenvalues of the first {n_components} Principal Components:")
print("\n".join(
    f"PC{i+1}: {eigenvalue:.4f}"
    for i, eigenvalue in enumerate(eigenvalues)
))

# Row labels shared by the two result tables
pc_labels = [f"PC{i+1}" for i in range(n_components)]

# Retained eigenvectors: one row per component, one column per parameter
components_df = pd.DataFrame(
    pca.components_, index=pc_labels, columns=numeric_cols.columns
)

# Retained eigenvalues together with their explained variance ratio
eigenvalues_df = pd.DataFrame(
    {
        'Eigenvalue': eigenvalues,
        'Explained Variance Ratio': pca.explained_variance_ratio_,
    },
    index=pc_labels,
)

# Define the function to calculate weights
def calculate_weights_eq12a(pca, n_components, feature_names=None):
    """
    Calculate one weight per parameter from PCA eigenvectors and eigenvalues.

    For parameter i the weight is
        wi = |Σj(sqrt(λj) × eij)| / Σj λj
    where the sums run over the first ``n_components`` components, λj is the
    eigenvalue of component j and eij is the entry of eigenvector j that
    belongs to parameter i.

    Parameters
    ----------
    pca : object
        Fitted PCA-like object exposing ``explained_variance_`` (eigenvalues)
        and ``components_`` (eigenvectors, one row per component).
    n_components : int
        Number of leading components to include in the sums.
    feature_names : sequence of str, optional
        Parameter names, in the column order of the data the PCA was fitted
        on. When omitted, the names are read from the notebook-level
        ``numeric_cols`` DataFrame, which keeps existing two-argument calls
        working.

    Returns
    -------
    pandas.DataFrame
        Columns 'Parameter', 'Weight' and 'Normalized Weight' (weights divided
        by their sum), sorted by 'Normalized Weight' in descending order.

    Raises
    ------
    ValueError
        If no component is selected, or if the number of parameter names does
        not match the number of eigenvector entries.
    """
    eigenvalues = np.asarray(pca.explained_variance_, dtype=float)[:n_components]
    eigenvectors = np.asarray(pca.components_, dtype=float)[:n_components]

    if eigenvectors.shape[0] == 0:
        raise ValueError("At least one principal component is required to compute weights")

    if feature_names is None:
        # Fallback for existing callers: use the notebook's global DataFrame.
        # Passing feature_names explicitly avoids depending on that hidden state.
        feature_names = numeric_cols.columns
    feature_names = list(feature_names)

    if len(feature_names) != eigenvectors.shape[1]:
        raise ValueError(
            f"Got {len(feature_names)} parameter names for "
            f"{eigenvectors.shape[1]} eigenvector entries"
        )

    sum_eigenvalues = eigenvalues.sum()

    # sqrt(λ) @ E computes, for every parameter at once, Σj sqrt(λj) × eij
    weights = np.abs((np.sqrt(eigenvalues) @ eigenvectors) / sum_eigenvalues)

    weights_df = pd.DataFrame({
        'Parameter': feature_names,
        'Weight': weights,
        'Normalized Weight': weights / weights.sum()
    }).sort_values('Normalized Weight', ascending=False)

    return weights_df

# Calculate weights using the function
weights_df = calculate_weights_eq12a(pca, n_components)

# Save all outputs to an Excel file with multiple sheets
pca_results_path = 'PCA_results_SPRINGS.xlsx'
with pd.ExcelWriter(pca_results_path) as writer:
    components_df.to_excel(writer, sheet_name='Eigenvectors')
    eigenvalues_df.to_excel(writer, sheet_name='Eigenvalues')
    weights_df.to_excel(writer, sheet_name='Weights', index=False)

# Report the path that was actually written so the message cannot drift from the file name
print(f"\nEigenvectors, eigenvalues, and weights saved to '{pca_results_path}'")
