In [5]:
import pandas as pd
import os

# Load the original dataset and the synthetic dataset from CSV files in the
# current working directory.
original_data = pd.read_csv('original_CMKL1.csv')
synthetic_data = pd.read_csv('synthetic_data_RandomSeaerch_Ensemble.csv')

# Create directory for saving models.
# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('5KNN_senior_saved_model', exist_ok=True)

# Combine the datasets with a 90:10 ratio of original to synthetic data
original_ratio = 0.9
synthetic_ratio = 0.1

# Calculate the number of samples to take from each dataset.
# Both counts are derived from len(original_data), so the combined frame has
# (up to int() truncation) as many rows as the original dataset.
n_original = int(len(original_data) * original_ratio)
n_synthetic = int(len(original_data) * synthetic_ratio)

# Sampling is done without replacement, so the synthetic pool must be large
# enough; fail early with an explicit message instead of a generic pandas error.
if n_synthetic > len(synthetic_data):
    raise ValueError(
        f"Need {n_synthetic} synthetic rows but the synthetic dataset only "
        f"has {len(synthetic_data)}."
    )

# Sample the data (fixed random_state so the draw is reproducible)
combined_data = pd.concat(
    [
        original_data.sample(n=n_original, random_state=42),
        synthetic_data.sample(n=n_synthetic, random_state=42),
    ],
    axis=0,
    ignore_index=True,
)

# Selecting only the columns related to 18 RSSI values, x, y, and z
selected_columns = ['RSSI1', 'RSSI2', 'RSSI3', 'RSSI4', 'RSSI5', 'RSSI6', 'RSSI7', 'RSSI8', 'RSSI9',
                    'RSSI10', 'RSSI11', 'RSSI12', 'RSSI13', 'RSSI14', 'RSSI15', 'RSSI16', 'RSSI17', 'RSSI18',
                    'x', 'y', 'z']

# Filter the combined data to include only the selected columns
# (raises KeyError if either CSV lacks one of them).
filtered_data = combined_data[selected_columns]

filtered_data.head()

Unnamed: 0,RSSI1,RSSI2,RSSI3,RSSI4,RSSI5,RSSI6,RSSI7,RSSI8,RSSI9,RSSI10,...,RSSI12,RSSI13,RSSI14,RSSI15,RSSI16,RSSI17,RSSI18,x,y,z
0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-65.0,-100.0,-66.0,...,-100.0,-85.0,-86.0,-100.0,-100.0,-88.0,-100.0,1.2,33.599999,0.0
1,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-59.0,-100.0,-59.0,...,-100.0,-100.0,-84.0,-84.0,-100.0,-100.0,-100.0,0.0,33.599999,0.0
2,-52.0,-79.0,-75.0,-79.0,-100.0,-81.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,8.4,33.599998,1.0
3,-100.0,-100.0,-100.0,-88.0,-82.0,-80.0,-100.0,-100.0,-100.0,-86.0,...,-78.0,-100.0,-66.0,-66.0,-71.0,-71.0,-77.0,8.4,14.4,0.0
4,-100.0,-84.0,-100.0,-100.0,-100.0,-70.0,-76.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,9.3,17.4,1.0


In [8]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
import joblib
import numpy as np

# Define the mean distance error function
def mean_distance_error(y_true, y_pred):
    """Return the mean Euclidean distance between true and predicted points.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_dims)
        Ground-truth coordinates (ndarray, DataFrame, or nested list).
    y_pred : array-like of shape (n_samples, n_dims)
        Predicted coordinates, row-aligned with ``y_true`` by position.

    Returns
    -------
    float
        Average over samples of the per-row L2 distance.
    """
    # Coerce to plain arrays first: subtracting two pandas objects aligns on
    # index/column labels rather than position, which silently produces NaNs
    # (and therefore a wrong score) when the labels differ.
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    return np.mean(np.sqrt(np.sum((true_arr - pred_arr) ** 2, axis=1)))

# Create a custom scorer for GridSearchCV.
# greater_is_better=False tells scikit-learn this is a loss, so it is negated
# internally and the grid search picks the smallest distance error.
mean_distance_error_scorer = make_scorer(mean_distance_error, greater_is_better=False)

# Splitting the filtered data into features and target.
# Select by column name rather than position so the split does not depend on
# the column order of filtered_data.
X = filtered_data.drop(columns=['x', 'y', 'z'])  # 18 RSSI values
y = filtered_data[['x', 'y']]  # Target values: x, y coordinates

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Defining the parameter grid for hyperparameter tuning
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 30, 50]
}

# Tune KNN with GridSearchCV and the mean distance error.
# The search is deterministic for fixed data (KNN has no randomness and cv=5
# uses unshuffled folds), so running it once is enough: repeating it per
# ensemble member only reproduced the same best estimator five times.
knn = KNeighborsRegressor()
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring=mean_distance_error_scorer)
grid_search.fit(X_train, y_train)
best_knn = grid_search.best_estimator_

# Create the 5 ensemble members from the tuned hyperparameters. Each member is
# a fresh, unfitted estimator; VotingRegressor fits its own copies below.
# NOTE(review): all 5 members are identical, so the "ensemble" predicts exactly
# what a single tuned KNN would. If diversity is intended, each member needs
# different data (e.g. bootstrap samples) or different hyperparameters.
knn_models = [
    ('knn_' + str(i), KNeighborsRegressor(**grid_search.best_params_))
    for i in range(5)
]

print(knn_models)

# Creating an ensemble of 5 tuned KNN models using VotingRegressor wrapped in MultiOutputRegressor
# (MultiOutputRegressor fits one VotingRegressor per target column: x and y).
multi_output_ensemble_model = MultiOutputRegressor(VotingRegressor(estimators=knn_models))
multi_output_ensemble_model.fit(X_train, y_train)

# Predicting using the multi-output ensemble model
y_pred = multi_output_ensemble_model.predict(X_test)

# Calculating the mean distance error on the held-out split.
# NOTE(review): the test split is drawn from the combined data, so it also
# contains synthetic rows — confirm that is the intended evaluation set.
mean_dist_error = mean_distance_error(y_test.values, y_pred)

mean_dist_error

[('knn_0', KNeighborsRegressor(algorithm='ball_tree', leaf_size=50, n_neighbors=3,
                    weights='distance')), ('knn_1', KNeighborsRegressor(algorithm='ball_tree', leaf_size=50, n_neighbors=3,
                    weights='distance')), ('knn_2', KNeighborsRegressor(algorithm='ball_tree', leaf_size=50, n_neighbors=3,
                    weights='distance')), ('knn_3', KNeighborsRegressor(algorithm='ball_tree', leaf_size=50, n_neighbors=3,
                    weights='distance')), ('knn_4', KNeighborsRegressor(algorithm='ball_tree', leaf_size=50, n_neighbors=3,
                    weights='distance'))]


0.8965521525189176

In [9]:
# Save the ensemble model to the specified directory.
# The file name ends in '.pkl' (the previous literal ended in a stray non-ASCII
# character instead of the extension, producing an oddly named file).
# NOTE: joblib files are pickle-based; only load them from trusted sources.
model_save_path = '5KNN_senior_saved_model/ensemble_knn_multioutput_model.pkl'
joblib.dump(multi_output_ensemble_model, model_save_path)

print(f"Model saved to: {model_save_path}")

Model saved to: 5KNN_senior_saved_model/ensemble_knn_multioutput_model.pkl
