In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.stats import gmean

# ---------------------------
# 1. Load and Prepare Data
# ---------------------------
file_path = "D:/RobertSichomba/HomeWork2/data/hw2.csv"

# Load the CSV file with error handling
try:
    data = pd.read_csv(file_path, encoding="utf-8")
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding="ISO-8859-1")

# Check for missing values
data.dropna(inplace=True)  # Drop rows with missing values

# Define element columns (Ensure exact column names match the dataset)
all_elements = [
    'Al (ppb)', 'As (ppb)', 'Ba (ppb)', 'Ca (ppb)', 'Cl (ppb)', 'Cr (ppb)', 
    'Cu (ppb)', 'F (ppb)', 'Fe (ppb)', 'K (ppb)', 'Mg (ppb)', 'Mn (ppb)', 
    'Mo (ppb)', 'Ni (ppb)', 'NO3 (ppb)', 'Pb (ppb)', 'Se (ppb)', 'SO4 (ppb)', 
    'Sr (ppb)', 'U (ppb)', 'V (ppb)', 'Zn (ppb)'
]
subset_elements = ['Cr (ppb)', 'U (ppb)', 'SO4 (ppb)']

# Ensure only columns that exist in the dataset are selected
all_elements = [col for col in all_elements if col in data.columns]
subset_elements = [col for col in subset_elements if col in data.columns]

# Display dataset info
print("Dataset loaded successfully!")
print(f"Columns in dataset: {list(data.columns)}")
print(f"Using element columns: {all_elements}")
print(f"Subset elements: {subset_elements}")



Dataset loaded successfully!
Columns in dataset: ['Latitude', 'Longitude', 'SOURCE', 'SOURCE.NAM', 'OTHER.NAME', 'WELL.ID', 'Al (ppb)', 'As (ppb)', 'Ba (ppb)', 'Ca (ppb)', 'Cl (ppb)', 'Cr (ppb)', 'Cu (ppb)', 'F (ppb)', 'Fe (ppb)', 'K (ppb)', 'Mg (ppb)', 'Mn (ppb)', 'Mo (ppb)', 'Ni (ppb)', 'NO3 (ppb)', 'Pb (ppb)', 'Se (ppb)', 'SO4 (ppb)', 'Sr (ppb)', 'U (ppb)', 'V (ppb) ', 'Zn (ppb)', 'Cr_level']
Using element columns: ['Al (ppb)', 'As (ppb)', 'Ba (ppb)', 'Ca (ppb)', 'Cl (ppb)', 'Cr (ppb)', 'Cu (ppb)', 'F (ppb)', 'Fe (ppb)', 'K (ppb)', 'Mg (ppb)', 'Mn (ppb)', 'Mo (ppb)', 'Ni (ppb)', 'NO3 (ppb)', 'Pb (ppb)', 'Se (ppb)', 'SO4 (ppb)', 'Sr (ppb)', 'U (ppb)', 'Zn (ppb)']
Subset elements: ['Cr (ppb)', 'U (ppb)', 'SO4 (ppb)']


In [2]:
# ---------------------------
# 2. CLR Transformation
# ---------------------------
def clr_transform(df, cols):
    """Apply Centered Log-Ratio (CLR) transformation to compositional data."""
    df_transformed = df[cols] + 1e-9  # Add pseudocount to avoid log(0)
    gm = gmean(df_transformed, axis=1)
    clr_data = np.log(df_transformed.div(gm, axis=0))
    return clr_data

data_clr_all = clr_transform(data, all_elements)
data_clr_subset = clr_transform(data, subset_elements)

# ---------------------------
# 3. K-Means Clustering
# ---------------------------
def perform_kmeans(data, n_clusters=3):
    """Perform K-means clustering on standardized data."""
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    return kmeans.fit_predict(scaled_data)

# Cluster labels for all scenarios
labels_raw_all = perform_kmeans(data[all_elements])
labels_raw_subset = perform_kmeans(data[subset_elements])
labels_clr_all = perform_kmeans(data_clr_all)
labels_clr_subset = perform_kmeans(data_clr_subset)



In [3]:
# Perform k-means clustering on raw data (all elements)
cluster_num = 3
kmeans = KMeans(n_clusters=cluster_num, random_state=42)
kmeans.fit(data[elements_all])  # Using all elements
data['cluster_label'] = kmeans.labels_

# Get the cluster centers
cluster_centers = kmeans.cluster_centers_


NameError: name 'elements_all' is not defined