In [1]:
import pandas as pd

# Location of the combined RSSI export to analyse
file_path = r'D:\Prab\IPS-Project\synthetic_data_point\Noon-RSSI\Dataset_cloudSourcing\Combine_All_source.csv'

# Read the CSV, trim stray whitespace from the header names, then drop the
# timestamp column (looked up under its already-trimmed name).
data = (
    pd.read_csv(file_path)
    .rename(columns=str.strip)
    .drop(columns='Date & Time')
)
data.head()

Unnamed: 0,Latitude,Longitude,Floor,CMKL-Guest_2a:3f:0b:56:e6:18,CMKL-Guest_2a:3f:0b:56:e8:f1,CMKL-Guest_2a:3f:0b:56:e8:f7,CMKL-Guest_2a:3f:0b:56:e9:00,CMKL-Guest_2a:3f:0b:56:e9:03,CMKL-Guest_2a:3f:0b:56:e9:15,CMKL-Guest_2a:3f:0b:56:e9:2a,...,_12:3f:1b:57:fa:37,_da:55:a8:26:73:ee,_da:55:a8:26:73:ef,_da:55:b8:26:73:ef,eduroam_34:1e:6b:05:fa:f2,eduroam_9c:50:ee:83:b2:92,eduroam_9c:50:ee:83:b4:52,eduroam_9c:50:ee:83:b4:72,eduroam_9c:50:ee:83:b8:32,guest_b6:fb:e4:a4:60:11
0,13.727892,100.778333,6,-73,-100,-68,-77,-78,-79,-82,...,-100,-100,-80,-100,-100,-59,-100,-56,-73,-100
1,13.727892,100.778333,6,-80,-100,-60,-81,-100,-87,-76,...,-100,-100,-100,-100,-100,-66,-100,-67,-80,-100
2,13.727904,100.778324,6,-79,-100,-61,-78,-85,-100,-71,...,-100,-100,-100,-100,-100,-67,-100,-62,-77,-100
3,13.727908,100.778325,6,-79,-100,-61,-78,-85,-87,-71,...,-100,-100,-100,-100,-100,-67,-100,-62,-77,-100
4,13.727906,100.77833,6,-68,-100,-54,-78,-100,-79,-81,...,-100,-100,-100,-100,-100,-64,-100,-62,-80,-100


In [2]:
import folium

# Create a map centered around the average coordinates
map_center = [data['Latitude'].mean(), data['Longitude'].mean()]
# NOTE(review): zoom_start=30 is well beyond the zoom levels that standard web
# tile servers provide — confirm the intended initial zoom.
map_ips = folium.Map(location=map_center, zoom_start=30)

# Add each data point as a marker on the map. Only the two coordinate columns
# are walked; iterrows() would build a full row object per sample just to read
# two values from it.
for latitude, longitude in zip(data['Latitude'], data['Longitude']):
    folium.CircleMarker(
        location=[latitude, longitude],
        radius=3,
        fill=True,
        fill_color='blue',
        fill_opacity=0.7,
        # Stroke transparency is the `opacity` path option; `line_opacity` is
        # not an option name, so the intended 0.2 was never applied.
        opacity=0.2
    ).add_to(map_ips)

map_ips


In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import randint, uniform
import numpy as np
from sklearn.base import clone

# Targets are the three position columns; every other column is a feature.
# NOTE(review): the data.head() preview lists an 'Unnamed: 0' column — if that
# is a saved row index it ends up in X as a feature; confirm.
y = data[['Latitude', 'Longitude', 'Floor']]
X = data.drop(columns=list(y.columns))

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Define models
# Base (unwrapped) regressors. The dictionary keys are the names used to look
# up each model's search space in the parameter dictionaries below. The
# estimators that accept a seed are created with random_state=42.
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'KNeighbors': KNeighborsRegressor()
}

# Parameters for Grid Search
# Exhaustive candidate values per model, keyed like `models`. Every combination
# is evaluated, so the grid sizes are: LinearRegression 2, DecisionTree 36,
# RandomForest 324, GradientBoosting 108, KNeighbors 16.
grid_params = {
    'LinearRegression': {
        'fit_intercept': [True, False]
    },
    'DecisionTree': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 10, 20],
        'min_samples_leaf': [1, 5, 10]
    },
    'RandomForest': {
        'n_estimators': [100, 200, 300],
        'max_features': ['sqrt', 'log2', None],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 10, 20],
        'min_samples_leaf': [1, 5, 10]
    },
    'GradientBoosting': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 10],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 5]
    },
    'KNeighbors': {
        'n_neighbors': [3, 5, 10, 15],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    }
}

# Parameters for Randomized Search
# Sampling distributions per model, keyed like `models`.
# - scipy's randint(low, high) draws integers from [low, high): the upper bound
#   is exclusive, so e.g. randint(100, 300) never yields 300.
# - scipy's uniform(loc, scale) draws from [loc, loc + scale], which is why the
#   learning-rate scale is written as `0.2 - 0.01` to end the range at 0.2.
# - Plain lists are sampled uniformly from their entries.
dist_params = {
    'LinearRegression': {
        # Only two possible settings exist for this model.
        'fit_intercept': [True, False]
    },
    'DecisionTree': {
        'max_depth': randint(1, 30),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 10)
    },
    'RandomForest': {
        'n_estimators': randint(100, 300),
        'max_features': ['sqrt', 'log2', None],
        'max_depth': randint(1, 30),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 10)
    },
    'GradientBoosting': {
        'n_estimators': randint(50, 300),
        'learning_rate': uniform(0.01, 0.2 - 0.01),
        'max_depth': randint(3, 10),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 10)
    },
    'KNeighbors': {
        'n_neighbors': randint(3, 20),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    }
}

# Randomized-search distributions for models wrapped in MultiOutputRegressor.
# Same spaces as the single-target distributions, with every hyper-parameter
# name prefixed by 'estimator__' so it is routed to the wrapped regressor.
multi_dist_params = {
    'LinearRegression': {
        'estimator__fit_intercept': [True, False]
    },
    'DecisionTree': {
        'estimator__max_depth': randint(1, 30),
        'estimator__min_samples_split': randint(2, 20),
        'estimator__min_samples_leaf': randint(1, 10)
    },
    'RandomForest': {
        'estimator__n_estimators': randint(100, 300),
        'estimator__max_features': ['sqrt', 'log2', None],
        'estimator__max_depth': randint(1, 30),
        'estimator__min_samples_split': randint(2, 20),
        'estimator__min_samples_leaf': randint(1, 10)
    },
    'GradientBoosting': {
        'estimator__n_estimators': randint(50, 300),
        # uniform(loc, scale) spans [loc, loc + scale]; the scale must be
        # 0.2 - 0.01 for the range to end at 0.2. A scale of 0.2 sampled up to
        # 0.21 and disagreed with the single-target search space.
        'estimator__learning_rate': uniform(0.01, 0.2 - 0.01),
        'estimator__max_depth': randint(3, 10),
        'estimator__min_samples_split': randint(2, 20),
        'estimator__min_samples_leaf': randint(1, 10)
    },
    'KNeighbors': {
        'estimator__n_neighbors': randint(3, 20),
        'estimator__weights': ['uniform', 'distance'],
        'estimator__metric': ['euclidean', 'manhattan']
    }
}

# Grid-search spaces for models wrapped in MultiOutputRegressor: the candidate
# values of grid_params, with each hyper-parameter name prefixed by
# 'estimator__' so it is routed to the wrapped regressor. The candidate lists
# are copied so the two dictionaries stay independent of each other.
multi_grid_params = {
    model_name: {
        'estimator__' + param_name: list(candidates)
        for param_name, candidates in search_space.items()
    }
    for model_name, search_space in grid_params.items()
}

# Ensure MultiOutputRegressor is properly applied
# Each base regressor is cloned before being wrapped, so the wrappers do not
# share estimator objects with `models`.
# NOTE(review): compare_approaches builds its own wrappers from `models`; this
# dictionary is not referenced by the code visible in this section.
multi_models = {name: MultiOutputRegressor(clone(model)) for name, model in models.items()}


# Running searches function
def _finite_space_size(dist_params):
    """Return the number of combinations in an all-list search space, else None.

    None is returned when any entry is a sampling distribution (has `.rvs`),
    when the space is not a plain dict, or when its size cannot be determined.
    """
    if not isinstance(dist_params, dict):
        return None
    size = 1
    for candidates in dist_params.values():
        if hasattr(candidates, 'rvs'):
            return None
        try:
            size *= len(candidates)
        except TypeError:
            return None
    # An empty candidate list is left for scikit-learn to report.
    return size or None


def run_searches(model, params, dist_params, X_train, y_train, X_valid, y_valid, n_iter=10):
    """Tune `model` with grid search and randomized search and score both.

    Both searches use 5-fold cross-validation with negative mean squared error
    as the selection score; the randomized search is seeded with 42. After
    fitting, each search predicts on the validation data and the RMSE of those
    predictions is recorded. With the default settings of mean_squared_error a
    multi-column `y_valid` yields one RMSE averaged over all columns.

    Parameters
    ----------
    model : estimator passed to both searches.
    params : parameter grid for GridSearchCV.
    dist_params : parameter distributions for RandomizedSearchCV.
    X_train, y_train : data the searches are fitted on.
    X_valid, y_valid : data the RMSE is computed on.
    n_iter : upper limit on the number of randomized-search candidates.

    Returns
    -------
    dict with the keys 'Grid Search' and 'Randomized Search', each mapping to
    {'Best Parameters': <dict>, 'RMSE': <float>}.
    """
    # When every distribution is a plain list the space is finite. Asking for
    # more iterations than it holds makes RandomizedSearchCV emit a warning and
    # evaluate every combination anyway, so cap n_iter to get the same
    # candidates without the warning.
    space_size = _finite_space_size(dist_params)
    if space_size is not None:
        n_iter = min(n_iter, space_size)

    searches = {
        'Grid Search': GridSearchCV(model, params, cv=5, scoring='neg_mean_squared_error'),
        'Randomized Search': RandomizedSearchCV(
            model, dist_params, n_iter=n_iter, cv=5,
            scoring='neg_mean_squared_error', random_state=42,
        ),
    }

    results = {}
    for label, search in searches.items():
        search.fit(X_train, y_train)
        predictions = search.predict(X_valid)
        rmse = np.sqrt(mean_squared_error(y_valid, predictions))
        results[label] = {'Best Parameters': search.best_params_, 'RMSE': rmse}

    return results

# Running and comparing for each model and approach
def compare_approaches(models, multi_params, multi_dist_params, sep_params, sep_dist_params, X_train, y_train, X_valid, y_valid, targets=('Latitude', 'Longitude')):
    """Run the hyper-parameter searches for a joint model and for per-target models.

    MultiOutput approach: each model is cloned, wrapped in MultiOutputRegressor
    and tuned against `y_train` exactly as passed in, i.e. against all of its
    columns. Its RMSE therefore covers every column of `y_valid`.

    Separate approach: each model is cloned and tuned once per column listed in
    `targets`.

    NOTE(review): when `y_train` has columns that are not in `targets`, the two
    approaches are scored on different sets of outputs — keep that in mind when
    comparing their RMSE values.

    Parameters
    ----------
    models : dict of name -> unwrapped estimator.
    multi_params, multi_dist_params : per-model grid / distributions for the
        wrapped estimators.
    sep_params, sep_dist_params : per-model grid / distributions for the
        unwrapped estimators.
    X_train, y_train, X_valid, y_valid : training and validation data; the
        targets must be indexable by the names in `targets`.
    targets : column names tuned individually in the separate approach.

    Returns
    -------
    dict with the keys 'MultiOutput Approach' (name -> search results) and
    'Separate Approach' (name -> target -> search results).
    """
    # MultiOutput Approach
    multi_results = {}
    for name, model in models.items():
        print(f"Running MultiOutput Approach for {name}")
        wrapped_model = MultiOutputRegressor(clone(model))
        multi_results[name] = run_searches(wrapped_model, multi_params[name], multi_dist_params[name], X_train, y_train, X_valid, y_valid)

    # Separate Approach: one independent search per requested target column
    separate_results = {}
    for name, model in models.items():
        print(f"Running Separate Approach for {name}")
        sep_results = {}
        for target in targets:
            print(f"  Model: {name}, Target: {target}")
            sep_results[target] = run_searches(clone(model), sep_params[name], sep_dist_params[name], X_train, y_train[target], X_valid, y_valid[target])
        separate_results[name] = sep_results

    return {
        'MultiOutput Approach': multi_results,
        'Separate Approach': separate_results,
    }



# Run every search for both approaches; relies on the models, parameter
# dictionaries and train/validation split defined above.
results = compare_approaches(
    models=models,
    multi_params=multi_grid_params,
    multi_dist_params=multi_dist_params,
    sep_params=grid_params,
    sep_dist_params=dist_params,
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
)


Running MultiOutput Approach for LinearRegression




Running MultiOutput Approach for DecisionTree
Running MultiOutput Approach for RandomForest
Running MultiOutput Approach for GradientBoosting
Running MultiOutput Approach for KNeighbors
Running Separate Approach for LinearRegression
  Model: LinearRegression, Target: Latitude




  Model: LinearRegression, Target: Longitude




Running Separate Approach for DecisionTree
  Model: DecisionTree, Target: Latitude
  Model: DecisionTree, Target: Longitude
Running Separate Approach for RandomForest
  Model: RandomForest, Target: Latitude
  Model: RandomForest, Target: Longitude
Running Separate Approach for GradientBoosting
  Model: GradientBoosting, Target: Latitude
  Model: GradientBoosting, Target: Longitude
Running Separate Approach for KNeighbors
  Model: KNeighbors, Target: Latitude
  Model: KNeighbors, Target: Longitude


In [4]:
# Displaying the comparison results
def display_comparison_results(results):
    """Print the RMSE and best parameters stored in a comparison-results dict.

    `results` maps an approach name to {model name: payload}. For the approach
    named 'Separate Approach' the payload is {target: {search type: metrics}};
    for any other approach it is {search type: metrics}. Each metrics dict must
    provide 'RMSE' and 'Best Parameters'. Two blank lines follow every model.
    """
    for approach_name, per_model in results.items():
        print(f"Results for {approach_name}:")
        per_target = approach_name == 'Separate Approach'
        for model_label, payload in per_model.items():
            print(f"  Model: {model_label}")
            if not per_target:
                # Flat layout: search type -> metrics
                for kind, stats in payload.items():
                    print(f"    {kind}: RMSE: {stats['RMSE']:.16f}, Best Params: {stats['Best Parameters']}")
            else:
                # Nested layout: target -> search type -> metrics
                for target_name, searches in payload.items():
                    print(f"    Target: {target_name}")
                    for kind, stats in searches.items():
                        print(f"      {kind}: RMSE: {stats['RMSE']:.16f}, Best Params: {stats['Best Parameters']}")
            print("\n")
            
# Print the RMSE and best parameters found for every approach and model
display_comparison_results(results)

Results for MultiOutput Approach:
  Model: LinearRegression
    Grid Search: RMSE: 0.0000406632052921, Best Params: {'estimator__fit_intercept': False}
    Randomized Search: RMSE: 0.0000406632052921, Best Params: {'estimator__fit_intercept': False}


  Model: DecisionTree
    Grid Search: RMSE: 0.0000399020432775, Best Params: {'estimator__max_depth': 20, 'estimator__min_samples_leaf': 5, 'estimator__min_samples_split': 2}
    Randomized Search: RMSE: 0.0000426502915567, Best Params: {'estimator__max_depth': 26, 'estimator__min_samples_leaf': 3, 'estimator__min_samples_split': 12}


  Model: RandomForest
    Grid Search: RMSE: 0.0000306367825907, Best Params: {'estimator__max_depth': None, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 300}
    Randomized Search: RMSE: 0.0000313395932230, Best Params: {'estimator__max_depth': 20, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 3, 'est

In [8]:
import copy
# Deep copy of the search results, so that later in-place changes to `results`
# do not alter this snapshot.
results_backup = copy.deepcopy(results)
results_backup

{'MultiOutput Approach': {'LinearRegression': {'Grid Search': {'Best Parameters': {'estimator__fit_intercept': False},
    'RMSE': 4.06632052920617e-05},
   'Randomized Search': {'Best Parameters': {'estimator__fit_intercept': False},
    'RMSE': 4.06632052920617e-05}},
  'DecisionTree': {'Grid Search': {'Best Parameters': {'estimator__max_depth': 20,
     'estimator__min_samples_leaf': 5,
     'estimator__min_samples_split': 2},
    'RMSE': 3.9902043277474925e-05},
   'Randomized Search': {'Best Parameters': {'estimator__max_depth': 26,
     'estimator__min_samples_leaf': 3,
     'estimator__min_samples_split': 12},
    'RMSE': 4.2650291556721826e-05}},
  'RandomForest': {'Grid Search': {'Best Parameters': {'estimator__max_depth': None,
     'estimator__max_features': 'sqrt',
     'estimator__min_samples_leaf': 1,
     'estimator__min_samples_split': 2,
     'estimator__n_estimators': 300},
    'RMSE': 3.0636782590682615e-05},
   'Randomized Search': {'Best Parameters': {'estimator__m

In [11]:
# Per-target searches (separate approach only), including the Floor target
def compare_floor_prediction(models, multi_params, multi_dist_params, sep_params, sep_dist_params, X_train, y_train, X_valid, y_valid, targets=('Latitude', 'Longitude', 'Floor')):
    """Tune one independent model per target column for every model in `models`.

    Despite its name, the function searches every column listed in `targets`
    (Latitude, Longitude and Floor by default), not only Floor. Pass
    `targets=('Floor',)` to search the floor alone.

    Parameters
    ----------
    models : dict of name -> unwrapped estimator; each is cloned per target.
    multi_params, multi_dist_params : accepted so the signature mirrors
        compare_approaches; not used by this function.
    sep_params, sep_dist_params : per-model grid / distributions.
    X_train, y_train, X_valid, y_valid : training and validation data; the
        targets must be indexable by the names in `targets`.
    targets : column names to run the searches for.

    Returns
    -------
    dict with the single key 'Separate Approach', mapping
    name -> target -> search results.
    """
    separate_results = {}
    for name, model in models.items():
        print(f"Running Separate Approach for {name}")
        sep_results = {}
        for target in targets:
            print(f"  Model: {name}, Target: {target}")
            sep_results[target] = run_searches(clone(model), sep_params[name], sep_dist_params[name], X_train, y_train[target], X_valid, y_valid[target])
        separate_results[name] = sep_results

    return {'Separate Approach': separate_results}

# Run the per-target searches and keep the returned results
floor_prediction_results = compare_floor_prediction(
    models=models,
    multi_params=multi_grid_params,
    multi_dist_params=multi_dist_params,
    sep_params=grid_params,
    sep_dist_params=dist_params,
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
)

Running Separate Approach for LinearRegression
  Model: LinearRegression, Target: Latitude




  Model: LinearRegression, Target: Longitude




  Model: LinearRegression, Target: Floor




Running Separate Approach for DecisionTree
  Model: DecisionTree, Target: Latitude
  Model: DecisionTree, Target: Longitude
  Model: DecisionTree, Target: Floor
Running Separate Approach for RandomForest
  Model: RandomForest, Target: Latitude
  Model: RandomForest, Target: Longitude
  Model: RandomForest, Target: Floor
Running Separate Approach for GradientBoosting
  Model: GradientBoosting, Target: Latitude
  Model: GradientBoosting, Target: Longitude
  Model: GradientBoosting, Target: Floor
Running Separate Approach for KNeighbors
  Model: KNeighbors, Target: Latitude
  Model: KNeighbors, Target: Longitude
  Model: KNeighbors, Target: Floor


In [12]:
# Print the per-target results (Latitude, Longitude and Floor) for each model
display_comparison_results(floor_prediction_results)

Results for Separate Approach:
  Model: LinearRegression
    Target: Latitude
      Grid Search: RMSE: 0.0000547577234539, Best Params: {'fit_intercept': False}
      Randomized Search: RMSE: 0.0000547577234539, Best Params: {'fit_intercept': False}
    Target: Longitude
      Grid Search: RMSE: 0.0000442953780435, Best Params: {'fit_intercept': False}
      Randomized Search: RMSE: 0.0000442953780435, Best Params: {'fit_intercept': False}
    Target: Floor
      Grid Search: RMSE: 0.0000000000000000, Best Params: {'fit_intercept': True}
      Randomized Search: RMSE: 0.0000000000000000, Best Params: {'fit_intercept': True}


  Model: DecisionTree
    Target: Latitude
      Grid Search: RMSE: 0.0000582996822389, Best Params: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 20}
      Randomized Search: RMSE: 0.0000622827270444, Best Params: {'max_depth': 26, 'min_samples_leaf': 3, 'min_samples_split': 12}
    Target: Longitude
      Grid Search: RMSE: 0.0000407159997224, Be

In [13]:
# In-place merge: floor_prediction_results only holds the 'Separate Approach'
# key, so that whole entry of `results` is replaced by the three-target
# version while 'MultiOutput Approach' is left as it was.
results.update(floor_prediction_results)

In [14]:
# Print the merged results (multi-output plus three-target separate searches)
display_comparison_results(results)

Results for MultiOutput Approach:
  Model: LinearRegression
    Grid Search: RMSE: 0.0000406632052921, Best Params: {'estimator__fit_intercept': False}
    Randomized Search: RMSE: 0.0000406632052921, Best Params: {'estimator__fit_intercept': False}


  Model: DecisionTree
    Grid Search: RMSE: 0.0000399020432775, Best Params: {'estimator__max_depth': 20, 'estimator__min_samples_leaf': 5, 'estimator__min_samples_split': 2}
    Randomized Search: RMSE: 0.0000426502915567, Best Params: {'estimator__max_depth': 26, 'estimator__min_samples_leaf': 3, 'estimator__min_samples_split': 12}


  Model: RandomForest
    Grid Search: RMSE: 0.0000306367825907, Best Params: {'estimator__max_depth': None, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 300}
    Randomized Search: RMSE: 0.0000313395932230, Best Params: {'estimator__max_depth': 20, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 3, 'est

In [15]:
import os
from joblib import dump
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor

# Define a function to create a model instance from parameters
def create_model(name, params):
    """Build an unfitted regressor of the named kind with the given parameters.

    Parameters
    ----------
    name : one of 'LinearRegression', 'DecisionTree', 'RandomForest',
        'GradientBoosting' or 'KNeighbors'.
    params : dict of keyword arguments passed to the estimator constructor.
        DecisionTree, RandomForest and GradientBoosting additionally receive
        random_state=42.

    Returns
    -------
    The newly constructed estimator.

    Raises
    ------
    ValueError
        If `name` is not a supported model name. Without this, an unknown name
        fell through every branch and returned None, which only failed later
        with an unrelated AttributeError on `.fit`.
    """
    if name == 'LinearRegression':
        return LinearRegression(**params)
    if name == 'DecisionTree':
        return DecisionTreeRegressor(random_state=42, **params)
    if name == 'RandomForest':
        return RandomForestRegressor(random_state=42, **params)
    if name == 'GradientBoosting':
        return GradientBoostingRegressor(random_state=42, **params)
    if name == 'KNeighbors':
        return KNeighborsRegressor(**params)
    raise ValueError(
        f"Unknown model name {name!r}; expected one of 'LinearRegression', "
        "'DecisionTree', 'RandomForest', 'GradientBoosting', 'KNeighbors'"
    )

# Directory to save models
model_dir = 'New_noon_saved_models'
# exist_ok=True creates the directory when it is missing and does nothing when
# it is already there, so no separate existence check is needed and the cell
# can be re-run safely.
os.makedirs(model_dir, exist_ok=True)

# Function to process and save models based on search results
def _strip_estimator_prefix(best_params):
    """Return `best_params` with a leading 'estimator__' removed from each key."""
    prefix = 'estimator__'
    return {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in best_params.items()
    }


def process_and_save_models(results):
    """Refit a model for every stored best-parameter set and save it with joblib.

    Reads the notebook-level `X_train`, `y_train` and `model_dir`, which must
    already be defined.

    - 'MultiOutput Approach': the best parameters are un-prefixed, the model is
      wrapped in MultiOutputRegressor, fitted on all of `y_train` and saved as
      '<model>_<search type>_multioutput.joblib'.
    - 'Separate Approach': the model is fitted on `y_train[target]` and saved
      as '<model>_<target>_<search type>.joblib'.
    - Any other approach key is skipped.

    Parameters
    ----------
    results : dict shaped like the output of the comparison functions; every
        search entry must provide 'Best Parameters'.
    """
    # Make the function usable even if the directory cell was not run.
    os.makedirs(model_dir, exist_ok=True)

    for approach, models in results.items():
        if approach == 'MultiOutput Approach':
            for model_name, searches in models.items():
                for search_type, search_details in searches.items():
                    # Only the leading routing prefix is removed. Taking the
                    # second piece of a plain split('__') would cut nested
                    # parameter names short and fail on unprefixed keys.
                    base_params = _strip_estimator_prefix(search_details['Best Parameters'])
                    wrapped_model = MultiOutputRegressor(create_model(model_name, base_params))
                    wrapped_model.fit(X_train, y_train)
                    dump(wrapped_model, os.path.join(model_dir, f'{model_name}_{search_type}_multioutput.joblib'))
        elif approach == 'Separate Approach':
            for model_name, targets in models.items():
                for target, searches in targets.items():
                    for search_type, search_details in searches.items():
                        model = create_model(model_name, search_details['Best Parameters'])
                        model.fit(X_train, y_train[target])
                        dump(model, os.path.join(model_dir, f'{model_name}_{target}_{search_type}.joblib'))

# Refit and save one model file per stored best-parameter set in `results`
process_and_save_models(results)