In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# ----------------------------------------------------------  
# TITLE: Advanced NRW Flood Warning Areas ML Analysis (Steps 1-10)
# ----------------------------------------------------------

# Step 1: Read the raw NRW flood warning CSV into a DataFrame
print("=== Step 1: Loading Dataset for ML Analysis ===")
print("This step loads the NRW flood warning CSV file as basis for machine learning tasks")
# The path is hard-coded to a local drive; adjust it to wherever the CSV lives.
df = pd.read_csv(
    filepath_or_buffer=r"F:/Data Science/Data Set/Enviro_Data/NRW_FLOOD_WARNING.csv",
)
record_count = df.shape[0]  # number of rows read from the file
print(f"✅ Dataset loaded successfully: {record_count:,} records imported\n")

# Step 2: ML-focused feature selection and basic cleaning
print("=== Step 2: Feature Selection & Cleaning ===")
print("This step selects features suitable for ML and handles basic missing values")

# Example feature set: adjust to your schema
# Categorical: region, area, river_sea, parent
# Numerical: example assumed columns 'lat', 'lon' (replace/add your own numeric risk factors)
candidate_cols = ['region', 'area', 'river_sea', 'parent', 'lat', 'lon']
# Keep only the candidates that actually exist in the loaded file.
available_cols = [c for c in candidate_cols if c in df.columns]
if not available_cols:
    # Fail here with a clear message instead of an obscure error later in the pipeline.
    raise ValueError(
        f"None of the candidate feature columns {candidate_cols} are present in the dataset"
    )

# Work on a copy so the cleaning below never mutates the raw dataframe.
df_ml = df[available_cols].copy()
print(f"Using the following features for ML: {available_cols}")

# Simple missing value handling:
#   numeric columns   -> filled with the column median
#   everything else   -> filled with the literal label "Missing"
# The numeric test is used (rather than `dtype == 'object'`) so that text columns
# stored with a pandas string dtype are not sent to `median()`, which would raise.
for col in df_ml.columns:
    if pd.api.types.is_numeric_dtype(df_ml[col]):
        df_ml[col] = df_ml[col].fillna(df_ml[col].median())
    else:
        df_ml[col] = df_ml[col].fillna("Missing")

print("✅ Basic cleaning complete (missing values handled)\n")

# Step 3: Define preprocessing pipeline (encoding + scaling)
print("=== Step 3: Preprocessing Pipeline ===")
print("This step builds a reusable pipeline for encoding categoricals and scaling numerics")

# Split features by whether they are numeric. Anything non-numeric is treated as
# categorical; testing for numeric (rather than `dtype == 'object'`) keeps text
# columns stored with a pandas string dtype out of the StandardScaler branch.
numeric_cols = [c for c in df_ml.columns if pd.api.types.is_numeric_dtype(df_ml[c])]
categorical_cols = [c for c in df_ml.columns if c not in numeric_cols]

print(f"Categorical features: {categorical_cols}")
print(f"Numeric features: {numeric_cols}")

# Dense one-hot output; categories unseen at fit time are encoded as all zeros.
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
numeric_transformer = StandardScaler()

# Each transformer is applied only to its own list of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_cols),
        ("num", numeric_transformer, numeric_cols),
    ]
)

print("✅ Preprocessing pipeline defined (OneHotEncoder + StandardScaler)\n")

# Step 4: Group FWAs into typologies with unsupervised K-Means clustering
print("=== Step 4: K-Means Clustering of FWAs ===")
print("This step clusters FWAs into groups with similar geographic and hydrological characteristics")

# Number of clusters is a modelling choice; start with 4–8 and tune
n_clusters = 6

# Feature matrix: a copy of the cleaned ML frame.
X = df_ml.copy()

# Preprocessing and clustering are chained so the encoder/scaler is fitted
# together with the clusterer on the same data.
kmeans_model = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
kmeans_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("clusterer", kmeans_model),
    ]
)
kmeans_pipeline.fit(X)

# Read the fitted labels off the clustering step and attach them to the raw frame.
cluster_labels = kmeans_pipeline.named_steps["clusterer"].labels_
df["cluster_kmeans"] = cluster_labels

print(f"✅ K-Means clustering complete with {n_clusters} clusters assigned to each FWA")
print("Cluster label distribution:")
label_counts = df["cluster_kmeans"].value_counts()
print(label_counts.sort_index())
print()

# Step 5: Score the clustering with the silhouette coefficient
print("=== Step 5: Cluster Quality Assessment ===")
print("This step evaluates clustering quality using silhouette score (higher is better)")

# Re-use the already fitted preprocessor so the score is computed in the same
# encoded/scaled feature space the clusterer was trained on.
fitted_preprocessor = kmeans_pipeline.named_steps["preprocessor"]
X_transformed = fitted_preprocessor.transform(X)
silhouette = silhouette_score(X_transformed, labels=cluster_labels)
print(f"Silhouette score for K-Means with {n_clusters} clusters: {silhouette:.3f}\n")

# Step 6: Profile each cluster by its region and river/sea composition
print("=== Step 6: Cluster Profiling ===")
print("This step profiles clusters by dominant regions and river/sea types")


def _cluster_composition(frame, column):
    """Return, per cluster, the share of each value of `column` as a flat DataFrame."""
    shares = frame.groupby("cluster_kmeans")[column].value_counts(normalize=True)
    # Name the value column explicitly so reset_index yields a 'proportion' column.
    return shares.rename("proportion").reset_index()


cluster_profile_region = _cluster_composition(df, "region")
cluster_profile_riversea = _cluster_composition(df, "river_sea")

# value_counts orders each group by descending frequency, so the first five
# rows of every group are its five most common values.
print("Top 5 region proportions per cluster:\n")
top_regions = cluster_profile_region.groupby("cluster_kmeans").head(5)
print(top_regions)
print("\nTop 5 river/sea types per cluster:\n")
top_riversea = cluster_profile_riversea.groupby("cluster_kmeans").head(5)
print(top_riversea)
print()

# Step 7: Project the encoded feature space onto two principal components
print("=== Step 7: PCA for 2D Embedding ===")
print("This step reduces feature space to 2D for future plotting / visual analytics")

pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_transformed)

# Store each component as its own column on the raw frame.
for component_idx, column_name in enumerate(("pca_1", "pca_2")):
    df[column_name] = X_pca[:, component_idx]

# Combined share of total variance captured by the two components.
explained_var = pca.explained_variance_ratio_.sum()
print(f"✅ PCA complete. Two components explain {explained_var*100:.1f}% of variance")
print("Columns 'pca_1' and 'pca_2' added to dataframe for plotting\n")

# Step 8: Prototype risk scoring (unsupervised) using cluster properties
print("=== Step 8: Prototype Unsupervised Risk Scoring ===")
print("This step builds a simple relative risk index based on cluster density and spread")

# Example: higher risk for clusters with many FWAs and wide regional spread
clusters = df.groupby("cluster_kmeans")
cluster_stats = pd.DataFrame({
    # size() counts every row in the cluster; counting non-null 'fwa_name'
    # values would undercount clusters containing rows with a missing name.
    "Cluster_Size": clusters.size(),
    "Regions_Covered": clusters["region"].nunique(),
    "Areas_Covered": clusters["area"].nunique(),
})

# Normalise metrics and create a composite risk score.
# Min-max scaling per metric; the small epsilon avoids division by zero when a
# metric is identical across all clusters (that metric then contributes 0).
stat_min = cluster_stats.min()
stat_range = cluster_stats.max() - stat_min
cluster_stats_norm = (cluster_stats - stat_min) / (stat_range + 1e-9)
cluster_stats["Risk_Score"] = (
    0.5 * cluster_stats_norm["Cluster_Size"] +
    0.3 * cluster_stats_norm["Regions_Covered"] +
    0.2 * cluster_stats_norm["Areas_Covered"]
)

# Look up each row's cluster score by label. Assigning the column (instead of
# merging) keeps the row index intact and is safe to re-run: a second run
# overwrites 'Risk_Score' rather than producing suffixed duplicate columns.
df["Risk_Score"] = df["cluster_kmeans"].map(cluster_stats["Risk_Score"])
print("Cluster-level risk scores:\n")
print(cluster_stats[["Cluster_Size", "Regions_Covered", "Areas_Covered", "Risk_Score"]])
print("\nSample of FWAs with assigned cluster and risk score:\n")
print(df[["fwa_name", "region", "area", "cluster_kmeans", "Risk_Score"]].head(15))
print()

# Step 9: Write the enriched dataframe to disk for downstream modelling
print("=== Step 9: Export ML-Ready Dataset ===")
print("This step exports enriched dataset with clusters, PCA coordinates, and risk score")

# Hard-coded local destination; the row index is not written to the file.
output_path = r"F:/Data Science/Data Set/Enviro_Data/NRW_FLOOD_WARNING_ML_READY.csv"
df.to_csv(path_or_buf=output_path, index=False)
print(f"✅ ML-ready dataset saved to: {output_path}\n")

# Step 10: Print guidance on possible follow-on modelling work
next_steps_lines = (
    "=== Step 10: Next Steps for Advanced Modelling ===",
    "Potential follow-on ML tasks on this enriched dataset:",
    "- Supervised learning: train models to predict high-risk FWAs using labelled historical flood impact data",
    "- Time-series modelling: extend data with temporal flood events, rainfall, and river level series",
    "- Geospatial analytics: integrate GIS layers (DEM, land use, population) and build spatial risk models",
    "- Scenario analysis: simulate changes in risk score under climate and land-use change assumptions",
    "✅ Advanced ML feature engineering complete, dataset ready for downstream risk modelling and visualisation",
)
# One print call per message, in order, so the output is line-for-line unchanged.
for message in next_steps_lines:
    print(message)


=== Step 1: Loading Dataset for ML Analysis ===
✅ Dataset loaded successfully: 353 records imported

=== Step 2: Feature Selection & Cleaning ===
This step selects features suitable for ML and handles basic missing values
Using the following features for ML: ['region', 'area', 'river_sea', 'parent']
✅ Basic cleaning complete (missing values handled)

=== Step 3: Preprocessing Pipeline ===
This step builds a reusable pipeline for encoding categoricals and scaling numerics
Categorical features: ['region', 'area', 'river_sea', 'parent']
Numeric features: []
✅ Preprocessing pipeline defined (OneHotEncoder + StandardScaler)

=== Step 4: K-Means Clustering of FWAs ===
This step clusters FWAs into groups with similar geographic and hydrological characteristics




✅ K-Means clustering complete with 6 clusters assigned to each FWA
Cluster label distribution:
cluster_kmeans
0    143
1     15
2     46
3     78
4     63
5      8
Name: count, dtype: int64

=== Step 5: Cluster Quality Assessment ===
This step evaluates clustering quality using silhouette score (higher is better)
Silhouette score for K-Means with 6 clusters: 0.166

=== Step 6: Cluster Profiling ===
This step profiles clusters by dominant regions and river/sea types
Top 5 region proportions per cluster:

   cluster_kmeans region  proportion
0               0  Wales         1.0
1               1  Wales         1.0
2               2  Wales         1.0
3               3  Wales         1.0
4               4  Wales         1.0
5               5  Wales         1.0

Top 5 river/sea types per cluster:

    cluster_kmeans              river_sea  proportion
0                0              Irish Sea    0.251748
1                0                   Tawe    0.076923
2                0               