In [12]:
import pandas as pd

# Load the provided dataset
file_path = r'D:\Prab\IPS-Project\synthetic_data_point\Noon-RSSI\sample1.csv'
data = pd.read_csv(file_path, skiprows=[0])  # Skip the redundant header line
data.columns = [col.strip() for col in data.columns]
data = data.drop('Date & Time', axis=1)
data.head()

Unnamed: 0,Latitude,Longitude,CMKL-Guest_2a:3f:0b:56:e9:15,CMKL_2e:3f:1b:56:e6:18,CMKL-TV_26:3f:0b:56:e6:18,CMKL-TV_26:3f:0b:56:e9:00,CMKL-TV_26:3f:0b:56:e9:03,CMKL-Guest_2a:3f:0b:56:e6:18,CMKL-TV_26:3f:0b:57:fa:37,KMITL-IoT_9c:50:ee:83:b2:84,...,CMKL_2e:3f:1b:56:e8:fb,CMKL-Guest_2a:3f:1b:56:e8:f1,CMKL-Guest_2a:3f:1b:56:e8:f7,CMKL-TV_26:3f:1b:56:e8:f1,CMKL-TV_26:3f:1b:56:e8:f7,_12:3f:1b:56:e6:18,CMKL-Guest_e2:55:a8:26:73:ef,CMKL_2c:3f:0b:56:e9:15,CMKL-Guest_2a:3f:0b:57:fa:37,KMITL-IoT_9c:50:ee:83:b4:44
0,13.727905,100.778287,-100,-68,-100,-100,-100,-87,-100,-78,...,-100,-100,-100,-100,-100,-100,-100,-100,-100,-100
1,13.727904,100.778309,-100,-56,-68,-77,-100,-64,-82,-63,...,-84,-77,-66,-78,-61,-100,-100,-100,-82,-100
2,13.727918,100.778306,-100,-57,-66,-84,-100,-66,-100,-69,...,-100,-100,-63,-85,-70,-100,-100,-100,-100,-100
3,13.727931,100.778282,-100,-41,-63,-100,-75,-60,-100,-73,...,-90,-100,-71,-100,-71,-100,-100,-100,-100,-100
4,13.727931,100.778282,-100,-41,-63,-100,-75,-60,-100,-73,...,-90,-100,-71,-100,-71,-100,-100,-100,-100,-100


In [13]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import randint, uniform
import numpy as np
from sklearn.base import clone

# Define the feature matrix and target vector
X = data.drop(['Latitude', 'Longitude'], axis=1)
y = data[['Latitude', 'Longitude']]

# Splitting the data into training and validation sets (80/20 split)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'KNeighbors': KNeighborsRegressor()
}

# Parameters for Grid Search
grid_params = {
    'LinearRegression': {
        'fit_intercept': [True, False]
    },
    'DecisionTree': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 10, 20],
        'min_samples_leaf': [1, 5, 10]
    },
    'RandomForest': {
        'n_estimators': [100, 200, 300],
        'max_features': ['sqrt', 'log2', None],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 10, 20],
        'min_samples_leaf': [1, 5, 10]
    },
    'GradientBoosting': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 10],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 5]
    },
    'KNeighbors': {
        'n_neighbors': [3, 5, 10, 15],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    }
}

# Parameters for Randomized Search
dist_params = {
    'LinearRegression': {
        'fit_intercept': [True, False]
    },
    'DecisionTree': {
        'max_depth': randint(1, 30),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 10)
    },
    'RandomForest': {
        'n_estimators': randint(100, 300),
        'max_features': ['sqrt', 'log2', None],
        'max_depth': randint(1, 30),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 10)
    },
    'GradientBoosting': {
        'n_estimators': randint(50, 300),
        'learning_rate': uniform(0.01, 0.2),
        'max_depth': randint(3, 10),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 10)
    },
    'KNeighbors': {
        'n_neighbors': randint(3, 20),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    }
}

multi_dist_params = {
    'LinearRegression': {
        'estimator__fit_intercept': [True, False]
    },
    'DecisionTree': {
        'estimator__max_depth': randint(1, 30),
        'estimator__min_samples_split': randint(2, 20),
        'estimator__min_samples_leaf': randint(1, 10)
    },
    'RandomForest': {
        'estimator__n_estimators': randint(100, 300),
        'estimator__max_features': ['sqrt', 'log2', None],
        'estimator__max_depth': randint(1, 30),
        'estimator__min_samples_split': randint(2, 20),
        'estimator__min_samples_leaf': randint(1, 10)
    },
    'GradientBoosting': {
        'estimator__n_estimators': randint(50, 300),
        'estimator__learning_rate': uniform(0.01, 0.2),
        'estimator__max_depth': randint(3, 10),
        'estimator__min_samples_split': randint(2, 20),
        'estimator__min_samples_leaf': randint(1, 10)
    },
    'KNeighbors': {
        'estimator__n_neighbors': randint(3, 20),
        'estimator__weights': ['uniform', 'distance'],
        'estimator__metric': ['euclidean', 'manhattan']
    }
}

# Correct Parameter Grids and Distributions for use with MultiOutputRegressor
multi_grid_params = {
    'LinearRegression': {
        'estimator__fit_intercept': [True, False]
    },
    'DecisionTree': {
        'estimator__max_depth': [None, 10, 20, 30],
        'estimator__min_samples_split': [2, 10, 20],
        'estimator__min_samples_leaf': [1, 5, 10]
    },
    'RandomForest': {
        'estimator__n_estimators': [100, 200, 300],
        'estimator__max_features': ['sqrt', 'log2', None],
        'estimator__max_depth': [None, 10, 20, 30],
        'estimator__min_samples_split': [2, 10, 20],
        'estimator__min_samples_leaf': [1, 5, 10]
    },
    'GradientBoosting': {
        'estimator__n_estimators': [100, 200, 300],
        'estimator__learning_rate': [0.01, 0.1, 0.2],
        'estimator__max_depth': [3, 5, 10],
        'estimator__min_samples_split': [2, 10],
        'estimator__min_samples_leaf': [1, 5]
    },
    'KNeighbors': {
        'estimator__n_neighbors': [3, 5, 10, 15],
        'estimator__weights': ['uniform', 'distance'],
        'estimator__metric': ['euclidean', 'manhattan']
    }
}

# Define VotingRegressor for Latitude and Longitude separately
voting_regressor_lat = VotingRegressor(
    estimators=[(name, clone(model)) for name, model in models.items()]
)
voting_regressor_long = VotingRegressor(
    estimators=[(name, clone(model)) for name, model in models.items()]
)


# Ensure MultiOutputRegressor is properly applied
multi_models = {name: MultiOutputRegressor(clone(model)) for name, model in models.items()}

def run_voting_regressor(voting_regressor, X_train, y_train, X_valid, y_valid):
    voting_regressor.fit(X_train, y_train)
    predictions = voting_regressor.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, predictions))
    return rmse

# Running searches function
def run_searches(model, params, dist_params, X_train, y_train, X_valid, y_valid, n_iter=10):
    results = {}
    # Ensure the correct parameter grids or distributions are being used
    # Initialize the Grid Search
    grid_search = GridSearchCV(model, params, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    best_grid_params = grid_search.best_params_
    grid_predictions = grid_search.predict(X_valid)
    grid_rmse = np.sqrt(mean_squared_error(y_valid, grid_predictions))
    results['Grid Search'] = {'Best Parameters': best_grid_params, 'RMSE': grid_rmse}

    # Initialize the Randomized Search
    random_search = RandomizedSearchCV(model, dist_params, n_iter=n_iter, cv=5, scoring='neg_mean_squared_error', random_state=42)
    random_search.fit(X_train, y_train)
    best_random_params = random_search.best_params_
    random_predictions = random_search.predict(X_valid)
    random_rmse = np.sqrt(mean_squared_error(y_valid, random_predictions))
    results['Randomized Search'] = {'Best Parameters': best_random_params, 'RMSE': random_rmse}

    return results

# Running and comparing for each model and approach
def compare_approaches(models, multi_params, multi_dist_params, sep_params, sep_dist_params, X_train, y_train, X_valid, y_valid):
    comparison_results = {}

    # MultiOutput Approach
    multi_results = {}
    for name, model in models.items():
        print(f"Running MultiOutput Approach for {name}")
        wrapped_model = MultiOutputRegressor(clone(model))
        multi_results[name] = run_searches(wrapped_model, multi_params[name], multi_dist_params[name], X_train, y_train, X_valid, y_valid)
    comparison_results['MultiOutput Approach'] = multi_results
    
    separate_results = {}
    
    # Voting for Latitude
    print("Running Voting Regressor for Latitude")
    lat_rmse = run_voting_regressor(voting_regressor_lat, X_train, y_train['Latitude'], X_valid, y_valid['Latitude'])
    separate_results['Latitude Voting'] = lat_rmse
    
    # Voting for Longitude
    print("Running Voting Regressor for Longitude")
    long_rmse = run_voting_regressor(voting_regressor_long, X_train, y_train['Longitude'], X_valid, y_valid['Longitude'])
    separate_results['Longitude Voting'] = long_rmse
    
    comparison_results['Separate Approach'] = separate_results

    return comparison_results



# Assuming that voting_regressor_lat and voting_regressor_long are properly defined
results = compare_approaches(models, multi_grid_params, multi_dist_params, grid_params, dist_params, X_train, y_train, X_valid, y_valid)


Running MultiOutput Approach for LinearRegression
Running MultiOutput Approach for DecisionTree




Running MultiOutput Approach for RandomForest
Running MultiOutput Approach for GradientBoosting
Running MultiOutput Approach for KNeighbors
Running Voting Regressor for Latitude
Running Voting Regressor for Longitude


In [19]:
print(results)

{'MultiOutput Approach': {'LinearRegression': {'Grid Search': {'Best Parameters': {'estimator__fit_intercept': False}, 'RMSE': 0.845894002885187}, 'Randomized Search': {'Best Parameters': {'estimator__fit_intercept': False}, 'RMSE': 0.845894002885187}}, 'DecisionTree': {'Grid Search': {'Best Parameters': {'estimator__max_depth': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2}, 'RMSE': 2.4547888689619853e-05}, 'Randomized Search': {'Best Parameters': {'estimator__max_depth': 25, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 11}, 'RMSE': 2.430478073945606e-05}}, 'RandomForest': {'Grid Search': {'Best Parameters': {'estimator__max_depth': None, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 10, 'estimator__n_estimators': 300}, 'RMSE': 2.267106155326693e-05}, 'Randomized Search': {'Best Parameters': {'estimator__max_depth': 28, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf

In [15]:
def print_best_cross_validation_results(results):
    for model_name, model_results in results.items():
        print(f"Results for {model_name}:")
        for target, target_results in model_results.items():
            print(f"  Target: {target}")
            grid_info = target_results['Grid Search']
            random_info = target_results['Randomized Search']
            
            # Determine which search method has the lower cross-validated RMSE
            if grid_info['Cross-validated RMSE'] < random_info['Cross-validated RMSE']:
                best_method = 'Grid Search'
                best_params = grid_info['Best Parameters']
                best_rmse = grid_info['Cross-validated RMSE']
            else:
                best_method = 'Randomized Search'
                best_params = random_info['Best Parameters']
                best_rmse = random_info['Cross-validated RMSE']

            print(f"    Best Method: {best_method}")
            print(f"    Best Parameters: {best_params}")
            print(f"    Best Cross-validated RMSE: {best_rmse:16f}")
            print("\n")

# Assuming 'results' is the dictionary with all your search results
print_best_cross_validation_results(results)


Results for MultiOutput Approach:
  Target: LinearRegression


KeyError: 'Cross-validated RMSE'