In [6]:
import numpy as np
import pandas as pd
import time
import psutil
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from joblib import dump, load
import os
from sklearn.multioutput import MultiOutputRegressor

# Load the dataset that both the floor and the coordinate models are built from.
actual_data = pd.read_csv('original_CMKL1.csv')

# Create the directory the trained models are dumped into.
# exist_ok=True replaces the separate exists() check: there is no gap between
# checking and creating, and re-running the cell is a harmless no-op.
os.makedirs('original_NEW_senior_saved_model', exist_ok=True)

# Candidate floor classifiers. Every entry pairs an unfitted estimator ('model')
# with the hyperparameter grid ('params') that the grid search iterates over.
classification_models = dict(
    RandomForestClassifier=dict(
        model=RandomForestClassifier(random_state=42),
        params=dict(
            n_estimators=[100, 150, 200],
            max_depth=[10, 15, 20],
            min_samples_split=[2, 5, 10],
        ),
    ),
    LogisticRegression=dict(
        model=LogisticRegression(random_state=42, max_iter=1000),
        params=dict(
            C=[0.01, 0.1, 1, 10, 100],
        ),
    ),
    DecisionTreeClassifier=dict(
        model=DecisionTreeClassifier(random_state=42),
        params=dict(
            max_depth=[10, 15, 20],
            min_samples_split=[2, 5, 10],
        ),
    ),
    GradientBoostingClassifier=dict(
        model=GradientBoostingClassifier(random_state=42),
        params=dict(
            n_estimators=[100, 150, 200],
            learning_rate=[0.01, 0.1, 0.2],
            max_depth=[3, 5, 7],
        ),
    ),
    KNeighborsClassifier=dict(
        model=KNeighborsClassifier(),
        params=dict(
            n_neighbors=[3, 5, 7, 10],
            weights=['uniform', 'distance'],
        ),
    ),
)

# Candidate coordinate regressors, laid out like the classifier table: an unfitted
# estimator under 'model' and its search grid under 'params'.
# Estimators wrapped in MultiOutputRegressor address the inner model's
# hyperparameters through the 'estimator__' prefix; the KNN entry is not wrapped,
# so its grid uses the bare parameter names.
regression_models = dict(
    RandomForestRegressor=dict(
        model=MultiOutputRegressor(RandomForestRegressor(random_state=42)),
        params=dict(
            estimator__n_estimators=[100, 150, 200],
            estimator__max_depth=[10, 15, 20],
            estimator__min_samples_split=[2, 5, 10],
        ),
    ),
    LinearRegression=dict(
        model=MultiOutputRegressor(LinearRegression()),
        params=dict(
            estimator__fit_intercept=[True, False],
        ),
    ),
    DecisionTreeRegressor=dict(
        model=MultiOutputRegressor(DecisionTreeRegressor(random_state=42)),
        params=dict(
            estimator__max_depth=[10, 15, 20],
            estimator__min_samples_split=[2, 5, 10],
        ),
    ),
    GradientBoostingRegressor=dict(
        model=MultiOutputRegressor(GradientBoostingRegressor(random_state=42)),
        params=dict(
            estimator__n_estimators=[100, 150, 200],
            estimator__learning_rate=[0.01, 0.1, 0.2],
            estimator__max_depth=[3, 5, 7],
        ),
    ),
    KNeighborsRegressor=dict(
        model=KNeighborsRegressor(),
        params=dict(
            n_neighbors=[3, 5, 7, 10],
            weights=['uniform', 'distance'],
        ),
    ),
)

# Function to perform grid search and train the model
def train_model_with_grid_search(X_train, y_train, model_config):
    """Run a 5-fold grid search for one configured model and return the winner.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed straight to ``GridSearchCV.fit``.
    model_config : dict
        Must provide ``'model'`` (the estimator to tune) and ``'params'``
        (the hyperparameter grid to search).

    Returns
    -------
    tuple
        ``(best_estimator, best_cv_mse, best_params)``. The search is scored
        with ``'neg_mean_squared_error'`` for every model passed in, so the
        second element is the sign-flipped best score, i.e. a plain MSE.
    """
    search = GridSearchCV(
        estimator=model_config['model'],
        param_grid=model_config['params'],
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    # The scorer reports negated MSE; undo the negation for the caller.
    best_cv_mse = -search.best_score_
    return search.best_estimator_, best_cv_mse, search.best_params_

# Function to calculate Mean Distance Error
def mean_distance_error(y_true, y_pred):
    """Return the mean Euclidean distance between true and predicted points.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_coordinates)
        One point per row. DataFrames, arrays and nested lists are accepted.

    Returns
    -------
    float
        Average over rows of ``sqrt(sum((y_true - y_pred) ** 2))``.

    Notes
    -----
    Both inputs are converted to plain arrays before subtracting, so rows are
    always paired by position. Without the conversion, two pandas objects are
    aligned on their index/column labels instead, which yields NaN whenever the
    labels differ (for example a shuffled test split against freshly indexed
    predictions).
    """
    true_points = np.asarray(y_true, dtype=float)
    predicted_points = np.asarray(y_pred, dtype=float)
    per_row_distance = np.sqrt(np.sum((true_points - predicted_points) ** 2, axis=1))
    return np.mean(per_row_distance)

# Train and evaluate models
results = {}

# Floor classification training and evaluation
X_train, X_test, y_train, y_test = train_test_split(actual_data[[f'RSSI{i+1}' for i in range(18)]], actual_data['z'], test_size=0.3, random_state=42)

# One handle to the current process is enough for the whole loop.
# The RSS deltas below cover this process only: the grid search is configured with
# n_jobs=-1, so memory used by any worker processes it starts is not included.
process = psutil.Process()
for model_name, model_config in classification_models.items():
    initial_memory = process.memory_info().rss / (1024 ** 2)  # Convert to MB
    # perf_counter() is a monotonic, high-resolution clock meant for timing short
    # intervals; time.time() can be too coarse for millisecond-scale measurements.
    start_time = time.perf_counter()

    best_model, best_score, best_params = train_model_with_grid_search(X_train, y_train, model_config)

    training_time = time.perf_counter() - start_time
    final_memory = process.memory_info().rss / (1024 ** 2)
    training_memory = max(0, final_memory - initial_memory)  # Ensure memory usage is not negative

    initial_memory = process.memory_info().rss / (1024 ** 2)
    start_time = time.perf_counter()
    y_pred = best_model.predict(X_test)
    prediction_time = time.perf_counter() - start_time
    final_memory = process.memory_info().rss / (1024 ** 2)
    prediction_memory = max(0, final_memory - initial_memory)  # Ensure memory usage is not negative

    # The floor label 'z' is compared numerically, so this is the average
    # absolute difference between the true and predicted label values.
    mae = mean_absolute_error(y_test, y_pred)

    results[f'original_{model_name}_floor'] = {
        'Mean Absolute Error': mae,
        'Training Time (s)': training_time,
        'Prediction Time (s)': prediction_time,
        'Memory Usage During Training (MB)': training_memory,
        'Memory Usage During Prediction (MB)': prediction_memory,
        'Best Parameters': best_params
    }
    dump(best_model, f'original_NEW_senior_saved_model/original_{model_name}_floor.joblib')

# Coordinate regression training and evaluation
X_train, X_test, y_train, y_test = train_test_split(actual_data[[f'RSSI{i+1}' for i in range(18)]], actual_data[['x', 'y']], test_size=0.3, random_state=42)

# One handle to the current process is enough for the whole loop.
# The RSS deltas below cover this process only: the grid search is configured with
# n_jobs=-1, so memory used by any worker processes it starts is not included.
process = psutil.Process()
for model_name, model_config in regression_models.items():
    initial_memory = process.memory_info().rss / (1024 ** 2)  # Convert to MB
    # perf_counter() is a monotonic, high-resolution clock meant for timing short
    # intervals; time.time() can be too coarse for millisecond-scale measurements.
    start_time = time.perf_counter()

    best_model, best_score, best_params = train_model_with_grid_search(X_train, y_train, model_config)

    training_time = time.perf_counter() - start_time
    final_memory = process.memory_info().rss / (1024 ** 2)
    training_memory = max(0, final_memory - initial_memory)  # Ensure memory usage is not negative

    initial_memory = process.memory_info().rss / (1024 ** 2)
    start_time = time.perf_counter()
    y_pred = best_model.predict(X_test)
    prediction_time = time.perf_counter() - start_time
    final_memory = process.memory_info().rss / (1024 ** 2)
    prediction_memory = max(0, final_memory - initial_memory)  # Ensure memory usage is not negative

    # Average straight-line distance between the true and predicted (x, y) points.
    mde = mean_distance_error(y_test, y_pred)

    results[f'original_{model_name}_coord'] = {
        'Mean Distance Error (meters)': mde,
        'Training Time (s)': training_time,
        'Prediction Time (s)': prediction_time,
        'Memory Usage During Training (MB)': training_memory,
        'Memory Usage During Prediction (MB)': prediction_memory,
        'Best Parameters': best_params
    }
    dump(best_model, f'original_NEW_senior_saved_model/original_{model_name}_coord.joblib')

# Print results: one header line per trained model, then its metrics indented beneath.
for run_name in results:
    report_lines = [f'Results for {run_name}:']
    report_lines.extend(
        f'  {metric_name}: {value}'
        for metric_name, value in results[run_name].items()
    )
    print('\n'.join(report_lines))

# Load and test models using the whole actual data
def load_and_predict_with_model(model_path, test_data, is_coordinate_model=False):
    """Load a saved model, predict on ``test_data`` and report error, time and memory.

    Parameters
    ----------
    model_path : str
        Path of the joblib file to load.
    test_data : pandas.DataFrame
        Must provide the feature columns ``RSSI1`` .. ``RSSI18`` and the target
        columns: ``x`` and ``y`` for a coordinate model, ``z`` otherwise.
    is_coordinate_model : bool, default False
        When True the predictions are scored with ``mean_distance_error``
        against ``x``/``y``; otherwise with ``mean_absolute_error`` against ``z``.

    Returns
    -------
    dict
        Keys, in order: ``'Predictions'``, the error metric
        (``'Mean Distance Error (meters)'`` or ``'Mean Absolute Error'``),
        ``'Prediction Time (s)'`` and ``'Memory Usage During Prediction (MB)'``.
    """
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
    # Only point this at model files you produced yourself or otherwise trust.
    model = load(model_path)
    X_test = test_data[[f'RSSI{i+1}' for i in range(18)]]

    process = psutil.Process()
    initial_memory = process.memory_info().rss / (1024 ** 2)  # Convert to MB

    # perf_counter() is a monotonic, high-resolution clock meant for timing short
    # intervals; time.time() can be too coarse for millisecond-scale predictions.
    start_time = time.perf_counter()
    predictions = model.predict(X_test)
    prediction_time = time.perf_counter() - start_time

    final_memory = process.memory_info().rss / (1024 ** 2)
    prediction_memory = max(0, final_memory - initial_memory)  # Ensure memory usage is not negative

    # Only the error metric differs between the two model kinds.
    if is_coordinate_model:
        metric_name = 'Mean Distance Error (meters)'
        metric_value = mean_distance_error(test_data[['x', 'y']], predictions)
    else:
        metric_name = 'Mean Absolute Error'
        metric_value = mean_absolute_error(test_data['z'], predictions)

    return {
        'Predictions': predictions,
        metric_name: metric_value,
        'Prediction Time (s)': prediction_time,
        'Memory Usage During Prediction (MB)': prediction_memory
    }

# Example usage of the load_and_predict_with_model function
prediction_results = {}
model_directory = 'original_NEW_senior_saved_model'
classification_model_names = list(classification_models.keys())
regression_model_names = list(regression_models.keys())

# No split is made here: every row of actual_data is scored, including the rows
# the models were trained on, so these errors are not held-out estimates.
test_data = actual_data

# Load and predict using each stored model
for model_name in classification_model_names:
    model_path = f'{model_directory}/original_{model_name}_floor.joblib'
    # Same 'original_' prefix as the training results and the coordinate keys below
    # (previously misspelled 'orignal_', which made the floor keys inconsistent).
    key = f'original_{model_name}_floor'
    prediction_results[key] = load_and_predict_with_model(model_path, test_data)
    print(f"Results for {key}:")
    for metric_name, value in prediction_results[key].items():
        print(f"  {metric_name}: {value}")

for model_name in regression_model_names:
    model_path = f'{model_directory}/original_{model_name}_coord.joblib'
    key = f'original_{model_name}_coord'
    prediction_results[key] = load_and_predict_with_model(model_path, test_data, is_coordinate_model=True)
    print(f"Results for {key}:")
    for metric_name, value in prediction_results[key].items():
        print(f"  {metric_name}: {value}")

# Display best parameters in a separate cell
best_params_results = {key: value['Best Parameters'] for key, value in results.items()}
best_params_results


Results for original_RandomForestClassifier_floor:
  Mean Absolute Error: 0.0006802721088435374
  Training Time (s): 4.309059143066406
  Prediction Time (s): 0.003997325897216797
  Memory Usage During Training (MB): 1.84375
  Memory Usage During Prediction (MB): 0.05859375
  Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Results for original_LogisticRegression_floor:
  Mean Absolute Error: 0.0006802721088435374
  Training Time (s): 0.0829167366027832
  Prediction Time (s): 0.00101470947265625
  Memory Usage During Training (MB): 0.59375
  Memory Usage During Prediction (MB): 0
  Best Parameters: {'C': 0.01}
Results for original_DecisionTreeClassifier_floor:
  Mean Absolute Error: 0.0020408163265306124
  Training Time (s): 0.06354570388793945
  Prediction Time (s): 0.0010001659393310547
  Memory Usage During Training (MB): 0
  Memory Usage During Prediction (MB): 0
  Best Parameters: {'max_depth': 15, 'min_samples_split': 2}
Results for original_Gradient

{'original_RandomForestClassifier_floor': {'max_depth': 10,
  'min_samples_split': 2,
  'n_estimators': 100},
 'original_LogisticRegression_floor': {'C': 0.01},
 'original_DecisionTreeClassifier_floor': {'max_depth': 15,
  'min_samples_split': 2},
 'original_GradientBoostingClassifier_floor': {'learning_rate': 0.1,
  'max_depth': 3,
  'n_estimators': 100},
 'original_KNeighborsClassifier_floor': {'n_neighbors': 3,
  'weights': 'uniform'},
 'original_RandomForestRegressor_coord': {'estimator__max_depth': 20,
  'estimator__min_samples_split': 2,
  'estimator__n_estimators': 150},
 'original_LinearRegression_coord': {'estimator__fit_intercept': True},
 'original_DecisionTreeRegressor_coord': {'estimator__max_depth': 20,
  'estimator__min_samples_split': 2},
 'original_GradientBoostingRegressor_coord': {'estimator__learning_rate': 0.1,
  'estimator__max_depth': 7,
  'estimator__n_estimators': 200},
 'original_KNeighborsRegressor_coord': {'n_neighbors': 7,
  'weights': 'distance'}}