In [1]:
# ✅ System & Utility Imports
import os
import shutil
import json
import datetime
import time
import random
import itertools
import argparse
import pickle
import joblib
import fnmatch
import io
import base64

# ✅ Data Handling & Processing
import pandas as pd
import numpy as np
import csv

# ✅ Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# ✅ Machine Learning & Model Evaluation
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.tree import plot_tree
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, average_precision_score, confusion_matrix, classification_report
)
import sklearn.preprocessing as SKP
import sklearn.metrics as SKM

# ✅ Deep Learning (TensorFlow / Keras)
import tensorflow as tf
import keras
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Concatenate
from tensorflow.keras.optimizers import Adam, RMSprop, SGD, Adagrad
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, Callback
from tensorflow.keras.metrics import Precision, Recall, AUC, CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

# ✅ Feature Importance (SHAP)
import shap

# ✅ Geographic Data (if used for spatial analysis)
import geopandas as gpd
from shapely.geometry import Point

# ✅ Logging & Experiment Tracking
import mlflow
import mlflow.tensorflow

# ✅ Optimization & Hyperparameter Tuning
import optuna

# ✅ Math & Statistics
from scipy import stats
import scipy as SCP

# ✅ Progress Bars & Performance Monitoring
from tqdm import tqdm
import psutil
import gc

import optuna
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam, RMSprop, SGD, Adagrad
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import Precision, Recall, AUC, CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score
import numpy as np


In [None]:
# Load the training dataset (output of the "3preprocessing" step; depending on
# the core data, some additional cleaning may still be needed).
# SMOD is not used for training, but the SMOD-related code and a few other
# unused attributes are kept in this notebook in case they are needed later.
training_path = r"all_for_training.parquet"
ML_df = pd.read_parquet(training_path)
# Replace whatever index the file carried with plain positions 0..n-1.
ML_df.index = list(range(len(ML_df)))
ML_df

In [3]:
from collections import Counter
import random

In [None]:
# Keep only the rows whose L1_5_class, cast to str and stripped, is non-empty.
has_class_label = ML_df["L1_5_class"].astype(str).str.strip() != ""
ML_df = ML_df[has_class_label]
# Class distribution after the filter (shown as the cell output).
Counter(ML_df["L1_5_class"])

In [5]:
def augment_building(row):
    """Return randomly perturbed feature values for one building.

    ``row`` is read by attribute (``area_in_meters``, ``SMOD_id``,
    ``building_perimeter_in_meters_new``, ``radius_m``, ``road_density_fixed``,
    ``building_density_100``, ``nearest_city_distance_km``).

    Returns a dict mapping column name to the augmented value. The random
    draws happen in a fixed order, so seeding ``random`` makes the result
    reproducible.
    """
    def _scale_up(value):
        # Grow a size-like quantity by a random factor in [1.001, 1.1].
        return value * random.uniform(1.001, 1.1)

    area = _scale_up(row.area_in_meters)

    # Only SMOD class 5 may be bumped, by 0 or 1; every other class is kept.
    smod = row.SMOD_id + random.randint(0, 1) if row.SMOD_id == 5 else row.SMOD_id

    perimeter = _scale_up(row.building_perimeter_in_meters_new)
    ratio = perimeter / area
    radius = _scale_up(row.radius_m)

    # Densities are multiplied by 1.01 times either 1 or the SMOD id.
    road_density = row.road_density_fixed * (random.choice([1, row.SMOD_id]) * 1.01)
    building_density = row.building_density_100 * (random.choice([1, row.SMOD_id]) * 1.01)

    # The distance to the nearest city shrinks by up to 10 %.
    city_distance = row.nearest_city_distance_km * random.uniform(0.9, 1)

    return {
        'area_in_meters': area,
        'SMOD_id': smod,
        'building_perimeter_in_meters_new': perimeter,
        'perimeter_to_area': ratio,
        'radius_m': radius,
        'normalized_perimeter_to_area_ratio': ratio / 6.5,
        'road_density_fixed': road_density,
        'building_density_100': building_density,
        'nearest_city_distance_km': city_distance,
    }

def augment_df(df):
    """Augment every row of ``df`` in place and return the same frame.

    The index is first replaced by positions 0..n-1. Each row is then passed
    to ``augment_building`` and the returned values are written back into
    that row, one column at a time.
    """
    df.index = list(range(len(df)))
    for building in df.itertuples():
        for column, new_value in augment_building(building).items():
            df.at[building.Index, column] = new_value
    return df

In [None]:
# ML_df = pd.concat([ML_df_residential_above_1500, reduced_df, ML_df_residential_above_200, augmented_df, ML_df_nonresidential, ML_df_industrial])
# Keep only the rows flagged for training. `.copy()` makes the result an
# independent frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
ML_df = ML_df[ML_df.use_for_training == 'Yes'].copy()
# Printed explicitly: a bare expression in the middle of a cell is not shown.
print(Counter(ML_df.L1_5_class))
ML_df


In [None]:
def assign_label(idx):
    """Map a row position to a dataset split using its last character.

    The last character of ``str(idx)`` selects the split: 0, 1, 2, 6, 7, 8, 9
    give ``'train'``, 3 and 4 give ``'validation'``, 5 gives ``'test'``.
    For consecutive positions that is a 70 / 20 / 10 split. Any other
    character yields ``None``.
    """
    split_for_digit = {
        '0': 'train', '1': 'train', '2': 'train',
        '3': 'validation', '4': 'validation',
        '5': 'test',
        '6': 'train', '7': 'train', '8': 'train', '9': 'train',
    }
    return split_for_digit.get(str(idx)[-1])

# Work on an independent copy so the column assignments below never write
# into a slice of an earlier, filtered frame.
ML_df = ML_df.copy()

data_len = len(ML_df)
ML_df['index_column'] = list(range(data_len))
ML_df['image_ML_type'] = "initval"

# Sort once by footprint area. The per-class position in this order is what
# decides the split, so every split receives the full range of building sizes.
# The sort does not depend on the class, so it is done outside the loop.
ML_df = ML_df.sort_values('area_in_meters', ascending=True)

for ml_class in ML_df['L1_5_class'].unique():
    class_row_labels = ML_df.index[ML_df['L1_5_class'] == ml_class]
    for rank_in_class, df_idx in enumerate(class_row_labels):
        ML_df.at[df_idx, 'image_ML_type'] = assign_label(rank_in_class)

# Rows per (split, class) and their share of the whole dataset.
split_result = (
    ML_df[['image_ML_type', 'L1_5_class', 'index_column']]
    .groupby(['image_ML_type', 'L1_5_class'])
    .count()
)
split_result['split in %'] = round(100 * split_result['index_column'] / data_len, 3)
print(split_result)

In [None]:

# Row count for every (split, class) combination, shown as the cell output.
ML_df[['image_ML_type', 'L1_5_class']].groupby(['image_ML_type', 'L1_5_class']).agg({'L1_5_class': ['count']})

In [None]:
# The same (split, class) counts as a flat table with an explicit "count" column.
split_class_columns = ['image_ML_type', 'L1_5_class']
gML = (
    ML_df[split_class_columns]
    .reset_index()
    .groupby(split_class_columns)["L1_5_class"]
    .count()
    .reset_index(name="count")
)
gML = pd.DataFrame(gML)
gML

In [None]:
base_path = os.getcwd()

models_dir = os.path.join(base_path, "models")
checkpoints_and_metadata = os.path.join(base_path, "model_checkpoints_and_metadata")
artifact_dir = os.path.join(base_path, "mlruns")
my_artifacts = os.path.join(base_path, "artifacts")

# Start every session with empty output directories (but NOT `mlruns/`, since
# MLflow manages it). Each directory is handled on its own, so a missing one
# no longer prevents the remaining ones from being cleared.
for output_dir in (models_dir, checkpoints_and_metadata, my_artifacts):
    try:
        shutil.rmtree(output_dir)
    except FileNotFoundError:
        pass  # nothing to delete on a first run
    os.makedirs(output_dir, exist_ok=True)



## Create the train / validation / test datasets (or load a previously saved pickle file)

In [11]:
# Integer class index for every L1_5_class label; the position in the tuple
# is the index.
get_class_number = {
    label: index
    for index, label in enumerate(('nonresidential', 'residential', 'industrial'))
}

In [12]:
# Divisors used to scale raw feature values before they are fed to the model.
# In the code visible in this notebook only normalize_area, normalize_radius
# and normalize_density_100 are referenced by active (non-commented) code; the
# other values and the commented-out entries appear to be earlier candidates
# kept for reference.
normalize_area = 20_000
#normalize_height = 20
#normalize_smod = 6 
normalize_int_t=3300
normalize_int_distance=180
#avg_range_k5_capped=1100
normalize_road_count=30
normalize_road_density=5000
normalize_perim_to_area=7
#normalize_nearest_city=400
normalize_road_distance=10100
normalize_radius=100
#Anormalize_height_mean=13
#normalize_density_100=110
#normalize_density_500=1900
normalize_density_100=200
#normalize_perimeter = 500


#normalize_int_t=4691
#normalize_ranges_k5=2351



In [13]:
# Dataset storage for the numeric features.

# Every split contains the same three classes.
_class_names = ['nonresidential', 'residential', 'industrial']
folders_tree = {split: list(_class_names) for split in ('train', 'test', 'validation')}

# Per-split accumulators: feature vectors and their integer class labels.
train_numeric, train_labels = [], []
validation_numeric, validation_labels = [], []
test_numeric, test_labels = [], []

# Function to extract only numerical features and label
def create_learning_sample(row):
    """Build the model input vector and the class index for one building.

    ``row`` is read by attribute. Returns ``(numeric_features, output)``:
    a list of 10 scaled feature values and the integer looked up in
    ``get_class_number`` for ``row.L1_5_class``.

    Candidate inputs that are currently disabled: SMOD id, building
    perimeter, internet-tower count / distance / range, ``road_density_fixed``,
    ``roads_nearby_fixed``, nearest-city distance and mean height.
    """
    class_index = get_class_number[row.L1_5_class]

    # Position in the list == input index of the model.
    scaled_features = [
        row.area_in_meters / normalize_area,                  # 0
        row.normalized_perimeter_to_area_ratio,               # 1 (no divisor applied here)
        row.radius_m / normalize_radius,                      # 2
        row.distance_to_1 / 5000,                             # 3
        row.distance_to_2 / 4000,                             # 4
        row.distance_to_3 / 3000,                             # 5
        row.distance_to_4 / 2000,                             # 6
        row.road_density_for_4_fixed / 60_000,                # 7
        row.road_density_for_5_fixed / 75_000,                # 8
        row.building_density_100 / normalize_density_100,     # 9
    ]
    return scaled_features, class_index


# Route every sample to the accumulator pair that belongs to its split.
split_buckets = {
    "train": (train_numeric, train_labels),
    "validation": (validation_numeric, validation_labels),
    "test": (test_numeric, test_labels),
}

for type_folder, class_folders in folders_tree.items():
    bucket = split_buckets.get(type_folder)

    for classfolder in class_folders:
        in_split_and_class = (ML_df.image_ML_type == type_folder) & (ML_df.L1_5_class == classfolder)

        for _, row in ML_df[in_split_and_class].iterrows():
            numeric_features, output = create_learning_sample(row)

            if bucket is not None:
                features_store, labels_store = bucket
                features_store.append(numeric_features)
                labels_store.append(output)

# Lists -> NumPy arrays; integer labels (0, 1, 2) -> one-hot vectors.
train_numeric = np.array(train_numeric)
validation_numeric = np.array(validation_numeric)
test_numeric = np.array(test_numeric)

train_labels = to_categorical(np.array(train_labels), num_classes=3)
validation_labels = to_categorical(np.array(validation_labels), num_classes=3)
test_labels = to_categorical(np.array(test_labels), num_classes=3)



## Train the model 

In [None]:
# Point MLflow at the local tracking server. If the UI takes too long to load
# or crashes, (re)start it from a terminal with `mlflow ui`.
experiment_name = "Experiment 1"
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# set_experiment() activates the experiment and creates it first when it does
# not exist yet, so no create/except fallback is needed. Dropping the broad
# `except MlflowException` also stops unrelated failures (e.g. an unreachable
# server) from being treated as "experiment already exists".
active_experiment = mlflow.set_experiment(experiment_name)
experiment_id = active_experiment.experiment_id
active_experiment

<Experiment: artifact_location='mlflow-artifacts:/700709927831304081', creation_time=1753082545835, experiment_id='700709927831304081', last_update_time=1753082545835, lifecycle_stage='active', name='Experiment NN 1 kenya', tags={}>

In [None]:
# MLflow run name; the embedded timestamp is taken when this line executes.
run_name = f"Run_Numerical_Model_NN_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}_AllInputs_LastDatasetVersion_3categories"

class LogLearningCurvesCallback(keras.callbacks.Callback):
    """Plot training vs. validation curves once training ends and log the
    image to MLflow as an artifact."""

    def on_train_end(self, logs=None):
        """Called at the end of training to log learning curves."""
        plt.figure(figsize=(16, 16))

        # One panel per metric: (history key, panel title).
        panels = [
            ("accuracy", "Training and Validation Accuracy"),
            ("loss", "Training and Validation Loss"),
            ('prc', "Training and Validation AUC-Prec-Recall"),
        ]
        recorded = self.model.history.history

        for position, (metric, title) in enumerate(panels, start=1):
            plt.subplot(3, 1, position)

            # Epochs are numbered from 1 on the x axis.
            epochs = range(1, len(recorded[metric]) + 1)

            plt.plot(epochs, recorded[metric], label=f'Training {metric}')
            plt.plot(epochs, recorded[f'val_{metric}'], label=f'Validation {metric}')

            plt.legend(loc='lower right')
            plt.xlabel("Epoch")
            plt.ylabel(metric.capitalize())
            plt.xticks(epochs)
            plt.title(title)

        learning_curve_path = os.path.join("artifacts", "learning_curves", "learning_curve.png")
        os.makedirs(os.path.dirname(learning_curve_path), exist_ok=True)

        plt.savefig(learning_curve_path)
        plt.close()

        mlflow.log_artifact(learning_curve_path, artifact_path="learning_curves")

class LogMetricsPerEpochCallback(keras.callbacks.Callback):
    """Log every Keras metric, plus derived F1 scores, to MLflow after each epoch.

    Metrics are logged at ``step = epoch + 1``.
    """

    @staticmethod
    def _f1(precision, recall):
        """Harmonic mean of precision and recall; 0 when both are 0."""
        total = precision + recall
        return (2 * precision * recall) / total if total != 0 else 0

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        step = epoch + 1

        for metric_name, metric_value in logs.items():
            if metric_value is not None:
                mlflow.log_metric(metric_name, float(metric_value), step=step)

        # F1 is logged once per epoch. It used to sit inside the loop above,
        # which logged both scores again for every entry in `logs`.
        if logs:
            f1_train = self._f1(logs.get("precision", 0), logs.get("recall", 0))
            mlflow.log_metric("f1", f1_train, step=step)

            f1_val = self._f1(logs.get("val_precision", 0), logs.get("val_recall", 0))
            mlflow.log_metric("f1_val", f1_val, step=step)

        print(f"✅ MLflow Metrics Logged for Epoch {epoch}: {logs.keys()}")

class MemoryUsageCallback(keras.callbacks.Callback):
    """Print and log the resident memory of this process at the start and
    at the end of every epoch."""

    def _report(self, epoch, moment):
        # Resident set size of the current process, in GB (3 decimals).
        memory_usage = round(psutil.Process(os.getpid()).memory_info().rss / (1024**3), 3)
        print(f"Memory usage on epoch {epoch} {moment}: {memory_usage} GB")
        mlflow.log_metric(f"memory_usage_{moment}", memory_usage, step=epoch+1)

    def on_epoch_begin(self, epoch, logs=None):
        self._report(epoch, "start")

    def on_epoch_end(self, epoch, logs=None):
        self._report(epoch, "end")

class ClearMemory(keras.callbacks.Callback):
    """Run the garbage collector after every epoch and log the count that
    ``gc.collect()`` returns."""

    def on_epoch_end(self, epoch, logs=None):
        freed_count = gc.collect()
        print(f"Epoch {epoch}: garbage collector collected {freed_count} objects.")
        mlflow.log_metric("gc_collected_objects", freed_count, step=epoch+1)
        
# Define Neural Network
def build_model():
    """Build and compile the dense classifier for the numeric features.

    The input width is taken from ``train_numeric.shape[1]``. Seven ReLU
    hidden layers (the first four followed by dropout) lead into a 3-way
    softmax. The model is compiled with Adam (learning rate 0.001),
    categorical cross-entropy and accuracy / precision / recall / ROC-AUC /
    PR-AUC metrics.
    """
    numeric_input = Input(shape=(train_numeric.shape[1], ), name="numeric_input")

    # (units, dropout rate or None) for each hidden layer, in order.
    hidden_stack = [
        (100, 0.2),
        (100, 0.2),
        (64, 0.1),
        (32, 0.1),
        (16, None),
        (8, None),
        (4, None),
    ]

    x = numeric_input
    for units, dropout_rate in hidden_stack:
        x = Dense(units, activation='relu')(x)
        if dropout_rate is not None:
            x = Dropout(dropout_rate)(x)

    output = Dense(3, activation='softmax')(x)
    model = Model(inputs=numeric_input, outputs=output)

    tracked_metrics = [
        CategoricalAccuracy(name="accuracy"),
        Precision(name="precision"),
        Recall(name="recall"),
        AUC(name="auc"),
        AUC(name="prc", curve="PR"),
    ]
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=tracked_metrics,
    )

    return model

# Initialize Model
model = build_model()

with mlflow.start_run(run_name=run_name):
    # Multiply the learning rate by 0.3 whenever val_accuracy has not improved
    # by at least 0.0005 for one epoch. (Early stopping is currently disabled.)
    reduce_learning_rate = ReduceLROnPlateau(monitor='val_accuracy', factor=0.3, patience=1, verbose=1, min_delta=0.0005, min_lr=1e-14)

    history = model.fit(
        train_numeric, train_labels,
        epochs=25, batch_size=32,
        validation_data=(validation_numeric, validation_labels),
        callbacks=[
            reduce_learning_rate,
            LogMetricsPerEpochCallback(),
            LogLearningCurvesCallback(),
            MemoryUsageCallback(),
            ClearMemory(),
        ])

    # One forward pass per split; the class predictions are derived from the
    # probabilities instead of running `predict` a second time.
    y_train_pred_proba = model.predict(train_numeric)
    y_val_pred_proba = model.predict(validation_numeric)
    y_train_pred = np.argmax(y_train_pred_proba, axis=1)
    y_val_pred = np.argmax(y_val_pred_proba, axis=1)

    # Integer versions of the true labels under NEW names. The one-hot arrays
    # stay untouched, so this cell can be re-run without rebuilding the
    # datasets (model.fit needs the one-hot form).
    train_labels_int = np.argmax(train_labels, axis=1)
    validation_labels_int = np.argmax(validation_labels, axis=1)
    test_labels_int = np.argmax(test_labels, axis=1)

    # Final macro-averaged scores, logged at step 0. Several names match the
    # per-epoch metrics logged during training (steps 1..N).
    metrics = {
        "accuracy": accuracy_score(train_labels_int, y_train_pred),
        "val_accuracy": accuracy_score(validation_labels_int, y_val_pred),
        "precision": precision_score(train_labels_int, y_train_pred, average="macro"),
        "val_precision": precision_score(validation_labels_int, y_val_pred, average="macro"),
        "recall": recall_score(train_labels_int, y_train_pred, average="macro"),
        "val_recall": recall_score(validation_labels_int, y_val_pred, average="macro"),
        "f1": f1_score(train_labels_int, y_train_pred, average="macro"),
        "f1_val": f1_score(validation_labels_int, y_val_pred, average="macro"),
        "loss": history.history['loss'][-1],
        "val_loss": history.history['val_loss'][-1]
    }

    for metric_name, metric_value in metrics.items():
        mlflow.log_metric(metric_name, metric_value, step=0)

    # Save Model in both formats and attach each file to the run.
    os.makedirs("models", exist_ok=True)
    for model_filename in ("neural_network_3cat.h5", "neural_network_3cat.keras"):
        model_save_path = os.path.join("models", model_filename)
        model.save(model_save_path)
        mlflow.log_artifact(model_save_path, artifact_path="models")

    def log_confusion_matrix(y_true, y_pred, filename, title):
        """Save a confusion-matrix heatmap under artifacts/ and log it to MLflow."""
        class_labels = ['nonresidential', 'residential', 'industrial']
        # Fix the label set so the matrix is always 3x3 and lines up with the
        # tick labels, even when a class is absent from y_true and y_pred.
        cf_mtx = confusion_matrix(y_true, y_pred, labels=list(range(len(class_labels))))

        plt.figure(figsize=(7, 5))
        sns.heatmap(cf_mtx, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
        plt.xlabel("Predicted Label")
        plt.ylabel("True Label")
        plt.title(title)

        cm_path = os.path.join("artifacts", filename)
        os.makedirs(os.path.dirname(cm_path), exist_ok=True)
        plt.savefig(cm_path)
        plt.close()
        mlflow.log_artifact(cm_path, artifact_path="confusion_matrices")

    log_confusion_matrix(validation_labels_int, y_val_pred, "nn_3cat_confusion_matrix.png", "Confusion Matrix - NN Model")

    # Same order as the feature vector built by create_learning_sample.
    feature_names = ["area_in_meters", "normalized_perimeter_to_area_ratio", "radius_m", "distance_to_1", "distance_to_2", "distance_to_3","distance_to_4", "road_density_for_4_fixed", 'road_density_for_5_fixed', 'building_density_100' ]

    def log_shap_feature_importance(feature_names):
        """Compute SHAP values on a training subset, save the summary plot
        and log it to MLflow."""
        explained_sample = train_numeric[:100]  # Sample subset
        explainer = shap.Explainer(model, train_numeric)
        shap_values = explainer(explained_sample)

        # The feature values shown in the plot must belong to the same rows
        # the SHAP values were computed for. Validation rows were passed here
        # before, which paired each SHAP value with an unrelated feature value.
        shap.summary_plot(
            shap_values,
            features=explained_sample,
            feature_names=feature_names,
            show=False
        )

        feature_importance_path = os.path.join("artifacts", "nn_shap_feature_importance.png")
        os.makedirs(os.path.dirname(feature_importance_path), exist_ok=True)
        plt.savefig(feature_importance_path)
        plt.close()

        mlflow.log_artifact(feature_importance_path, artifact_path="feature_importance")

    log_shap_feature_importance(feature_names)

    # Test Set Predictions (single forward pass)
    y_test_pred_proba = model.predict(test_numeric)
    y_test_pred = np.argmax(y_test_pred_proba, axis=1)

    test_metrics = {
        'test_accuracy': accuracy_score(test_labels_int, y_test_pred),
        'test_precision': precision_score(test_labels_int, y_test_pred, average="macro"),
        'test_recall': recall_score(test_labels_int, y_test_pred, average="macro"),
        'test_f1': f1_score(test_labels_int, y_test_pred, average="macro"),
        'test_auc': roc_auc_score(test_labels_int, y_test_pred_proba, multi_class="ovo")
    }

    for metric_name, metric_value in test_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    log_confusion_matrix(test_labels_int, y_test_pred, "nn_3cat_confusion_matrix_TEST.png", "Confusion Matrix TEST - NN Model")

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

print("Training complete. Metrics, model, and confusion matrix logged in MLflow.")


In [None]:
# Training vs. validation loss recorded by the last `model.fit` call.
val_loss = history.history["val_loss"]
loss = history.history["loss"]
# Take the x axis from the history itself, so the plot still works when the
# number of trained epochs is not 25.
epochs = range(len(loss))
plt.plot(epochs, val_loss, "b--",
         label = "Validation loss")
plt.plot(epochs, loss, "g--",
         label = "Training loss")
plt.title("Validation loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
# `plt.legend` without parentheses only referenced the function; call it so
# the legend is actually drawn.
plt.legend()
plt.show()



In [18]:
# Explicitly end any MLflow run that is still active in this kernel. The
# training cell opens its run in a `with` block, so this is presumably a
# safety net for interrupted executions.
mlflow.end_run()

In [19]:
# Scaling divisors for the real-data features. These repeat the values defined
# before training (this copy also defines normalize_smod); the two lists must
# stay identical, otherwise inference inputs are scaled differently from the
# training inputs.
normalize_area = 20_000
#normalize_height = 20
normalize_smod = 6 
normalize_int_t=3300
normalize_int_distance=180
#avg_range_k5_capped=1100
normalize_road_count=30
normalize_road_density=5000
normalize_perim_to_area=7
#normalize_nearest_city=400
normalize_road_distance=10100
normalize_radius=100
#Anormalize_height_mean=13
#normalize_density_100=110
#normalize_density_500=1900
normalize_density_100=200


In [None]:
# Load the dataset with real data used for validating the trained model.
real_data_path = r"all_for_validation.parquet"
df_real_data = pd.read_parquet(real_data_path)
df_real_data.columns

In [22]:

# Model inputs in the exact order of the training feature vector, each with
# the divisor applied at training time (None = the column is used as-is).
feature_divisors = {
    'area_in_meters': normalize_area,
    'normalized_perimeter_to_area_ratio': None,
    'radius_m': normalize_radius,
    'distance_to_1': 5000,
    'distance_to_2': 4000,
    'distance_to_3': 3000,
    'distance_to_4': 2000,
    'road_density_for_4_fixed': 60_000,
    'road_density_for_5_fixed': 75_000,
    'building_density_100': normalize_density_100,
}

# NOTE: df_real_data is scaled in place, and later cells export these scaled
# values. Re-running this cell without reloading the parquet file would
# divide the columns a second time.
for column, divisor in feature_divisors.items():
    if divisor is not None:
        df_real_data[column] = df_real_data[column] / divisor

print("Real data shape:", df_real_data.shape)

# Select the inputs by name instead of dropping every other column. The
# column order is then guaranteed to match the order the model was trained
# on, and extra columns in the file can no longer leak into the model input.
df_real_dropped = df_real_data[list(feature_divisors)]





        #feature 1

        #feature 2
        #row.normalized_perimeter_to_area_ratio,
        # row.internet_towers_nearby_capped/normalize_int_t,
        #row.nearest_internet_tower_distance_km_capped/normalize_int_distance,
        # row.avg_range_k_nearest_with_5/avg_range_k5_capped,
        #feature 3
        # row.road_density_fixed/normalize_road_density,
        #feature 4


        # row.roads_nearby_fixed/normalize_road_count,
        # row.nearest_city_distance_km/normalize_nearest_city,
        #row.height_mean_cappedNEW/normalize_height_mean,
        #feature 10


Real data shape: (400, 41)


In [23]:
print("Real data shape:", df_real_dropped.shape)
print("Model input shape:", model.input_shape)

# Stop here with a clear message if the feature count differs from what the
# model expects, rather than failing later inside `predict`.
assert df_real_dropped.shape[1] == model.input_shape[1], (
    f"Expected {model.input_shape[1]} feature columns, got {df_real_dropped.shape[1]}"
)

# Last expression of the cell, so the column list is actually displayed.
df_real_dropped.columns

Real data shape: (400, 10)
Model input shape: (None, 10)


In [None]:
experiment_name = "Experiment 1"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    raise RuntimeError(f"MLflow experiment {experiment_name!r} was not found on the tracking server.")

# Get the latest run (only the newest row is requested from the server).
runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["start_time desc"],
    max_results=1,
)
if runs.empty:
    raise RuntimeError(f"MLflow experiment {experiment_name!r} has no runs yet.")
latest_run = runs.iloc[0]

# Get the model URI for the latest run
run_id = latest_run.run_id
model_uri = f"runs:/{run_id}/models/neural_network_3cat.keras"
print(f"Model URI: {model_uri}")

Model URI: runs:/e286f510addc471f945ae11a0ab1da0c/models/neural_network_3cat.keras


In [None]:

# The in-memory `model` is used for scoring. To score with a saved copy instead:
#local_model_path = "neural_network_3cat.keras"
#model = mlflow.keras.load_model(local_model_path)
mlflow.set_tracking_uri("http://127.0.0.1:5000")

predictions = model.predict(df_real_dropped)

# Most probable class per row, translated to a display label. The label order
# follows the class indices 0 / 1 / 2.
predicted_classes = np.argmax(predictions, axis=1)
class_labels = ['Non-Residential', 'Residential', 'Industrial']
predicted_labels = [class_labels[i] for i in predicted_classes]

for i, label in enumerate(predicted_labels):
    print(f"Instance {i}: Predicted Label: {label}")

# Write a real CSV. The file used to contain a JSON array despite its `.csv`
# extension. The directory is created first so the cell works on its own.
predictions_path = "artifacts/predictions.csv"
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
pd.DataFrame(
    {"instance": range(len(predicted_labels)), "predicted_label": predicted_labels}
).to_csv(predictions_path, index=False)

# NOTE(review): with no run active, log_artifact starts a new MLflow run
# implicitly, so the file is not attached to the training run. Confirm that
# this is intended.
mlflow.log_artifact(predictions_path, artifact_path="predictions")

In [26]:
# Store the predicted label of every row next to its (scaled) features.
df_real_data["prediction"] = predicted_labels


In [None]:
#upload validation dataset for merge with prediction
# Load the validation dataset again; it is merged with the predictions below.
# This is the same file that df_real_data was read from.
df_real_class = pd.read_parquet(r"all_for_validation.parquet")
df_real_class.columns


In [None]:
# Attach the reference class of every building, matched on "id".
# df_real_data already has an L1_5_class column of its own, so the merge
# produces L1_5_class_x (left) and L1_5_class_y (right). The suffixes are
# spelled out because the analysis below reads L1_5_class_y.
class_lookup = df_real_class[["id", "L1_5_class"]]
df_real_class = class_lookup
df_real_data = df_real_data.merge(class_lookup, on="id", how="left", suffixes=("_x", "_y"))
df_real_data

In [None]:
# Export features, predictions and reference classes for comparison, then end
# whichever MLflow run is still active.
df_real_data.to_csv("to_compare_v1.csv")
mlflow.end_run()

In [None]:
# Quick analysis: how often does the prediction agree with the reference class?

# Compare on a normalized form (lower case, letters and digits only), so that
# e.g. "Non-Residential" and "nonresidential" count as the same label.
for source_column in ("prediction", "L1_5_class_y"):
    df_real_data[f"{source_column}_normalized"] = (
        df_real_data[source_column]
        .str.lower()
        .str.replace(r"[^a-z0-9]", "", regex=True)
    )

df_real_data["match"] = (
    df_real_data["prediction_normalized"] == df_real_data["L1_5_class_y_normalized"]
)

# The denominator is the actual row count, not a hard-coded 400.
total_count = len(df_real_data)
true_count = int(df_real_data["match"].sum())
false_count = total_count - true_count
match_share = true_count / total_count if total_count else 0.0

print(f"Percentage: {true_count}/{total_count} = {match_share:.2%}")
print(f"True: {true_count}")
# This counts every mismatch, whatever the class.
print(f"False residential without mixed use: {false_count}")
Counter(df_real_data.prediction)

Percentage: 326/400 = 81.50%
True: 326
False residential without mixed use: 74


Counter({'Residential': 211, 'Non-Residential': 135, 'Industrial': 54})