# **Distance Estimator Model**

## **Project Setup**
### **Import libraries**

In [None]:
import os
import sys
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# Make the directory two levels above the current working directory
# importable, so project packages can be imported from this notebook.
two_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir)
project_root = os.path.abspath(two_levels_up)

# Append only once so re-running the cell does not duplicate the entry
already_on_path = project_root in sys.path
if not already_on_path:
    sys.path.append(project_root)

print(f"Added project root to sys.path: {project_root}")

## **Data Preparation**
### **Load and preprocess the data**

In [None]:
# Project-local preprocessing helpers (resolved through the sys.path entry
# added during project setup)
from data_preparation import preprocess

# Request the regression variant of the prepared data and unpack the seven
# values that prepare_data returns
(
    X_train,
    X_test,
    y_train,
    y_test,
    feature_names,
    raw_data,
    processed_data,
) = preprocess.prepare_data('regression')

# Report the shape of each split followed by the feature names
shape_report = [
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    f"Feature names: {feature_names}",
]
print("\n".join(shape_report))

### **Exploratory Data Analysis**

In [None]:
# Combine the training features and the target into one DataFrame so the
# exploration cells can refer to columns by name.
df_train = pd.DataFrame(X_train, columns=feature_names)

# Assign the target by position. If y_train is a pandas Series whose index
# labels do not match df_train's index, a plain `df_train['Range'] = y_train`
# aligns on labels and silently yields NaN or misplaced targets. Converting
# to a NumPy array keeps row i of X_train paired with element i of y_train.
df_train['Range'] = np.asarray(y_train)

# Summary statistics (the trailing expression is what the notebook displays)
print("Summary statistics:")
df_train.describe()

In [None]:
# Histogram of the target variable (Range) in the training frame.
# KDE is left off to avoid errors.
range_fig, range_ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_train['Range'], kde=False, ax=range_ax)
range_ax.set_title('Distribution of Range Values')
range_ax.set_xlabel('Range')
range_ax.set_ylabel('Count')
plt.show()

In [None]:
# Box plot of Range grouped by the NLOS flag; skipped entirely when the
# training frame has no 'NLOS' column.
if 'NLOS' in df_train:
    nlos_fig, nlos_ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df_train, x='NLOS', y='Range', ax=nlos_ax)
    nlos_ax.set_title('Range Distribution by NLOS Flag')
    nlos_ax.set_xlabel('NLOS (0=LOS, 1=NLOS)')
    nlos_ax.set_ylabel('Range')
    plt.show()

In [None]:
# One scatter panel per feature (feature on x, Range on y), two panels per row
num_features = len(feature_names)
full_rows, leftover = divmod(num_features, 2)
num_rows = full_rows + leftover  # an odd feature count needs one extra row

fig, axes = plt.subplots(num_rows, 2, figsize=(15, 4 * num_rows))
axes = axes.flatten()

# Pair each feature with its panel; zip stops at the last feature
for panel, feature in zip(axes, feature_names):
    sns.scatterplot(data=df_train, x=feature, y='Range', alpha=0.5, ax=panel)
    panel.set_title(f'{feature} vs Range')
    panel.set_xlabel(feature)
    panel.set_ylabel('Range')

# Drop the trailing panel(s) that received no feature
for spare_panel in axes[num_features:]:
    fig.delaxes(spare_panel)

plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of the
# training frame (features plus Range)
corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
correlation_matrix = df_train.corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    ax=corr_ax,
)
corr_ax.set_title('Correlation Matrix')
plt.show()

## **Model Training and Evaluation**
### **Train various regression models**

In [None]:
# Candidate regressors, keyed by the display name used in every later cell
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'SVR': SVR(kernel='rbf', C=1.0, epsilon=0.1),
}

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Models listed here are trained and evaluated on the standardized features;
# every other model sees the unscaled features.
scaled_model_names = {'Linear Regression', 'Ridge Regression', 'Lasso Regression', 'SVR'}

# Per-model metrics, feature importances and test-set predictions
results = {}

for name, model in models.items():
    print(f"Training {name}...")

    # Select the train/test feature matrices that match this model
    if name in scaled_model_names:
        fit_features, eval_features = X_train_scaled, X_test_scaled
    else:
        fit_features, eval_features = X_train, X_test

    model.fit(fit_features, y_train)
    y_pred = model.predict(eval_features)

    # Evaluation metrics on the test split
    mse = mean_squared_error(y_test, y_pred)
    rmse = math.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Importance proxy: prefer feature_importances_, fall back to the
    # absolute coefficients, and record None when the model exposes neither.
    if hasattr(model, 'feature_importances_'):
        feature_importance = model.feature_importances_
    else:
        feature_importance = np.abs(model.coef_) if hasattr(model, 'coef_') else None

    # Keep everything the later visualization cells read
    results[name] = dict(
        mse=mse,
        rmse=rmse,
        mae=mae,
        r2=r2,
        feature_importance=feature_importance,
        y_pred=y_pred,
    )

    print(f"  MSE: {mse:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")

### **Visualize model performance**

In [None]:
# Performance comparison table: one row per metric, one column per model
metrics = ['mse', 'rmse', 'mae', 'r2']
labels = ['MSE', 'RMSE', 'MAE', 'R²']
comparison_df = pd.DataFrame(
    {name: [scores[metric] for metric in metrics] for name, scores in results.items()},
    index=labels,
)

# DataFrame.plot() opens its own figure when no `ax` is given, so calling
# plt.figure(figsize=...) first only leaves an empty extra figure behind and
# the requested size is ignored. Pass figsize to plot() and style the Axes
# it returns instead.

# Plot MSE, RMSE, MAE (first three rows)
error_ax = comparison_df.iloc[:3].plot(kind='bar', figsize=(12, 8))
error_ax.set_title('Model Error Metrics Comparison')
error_ax.set_ylabel('Error')
error_ax.set_xlabel('Metric')
error_ax.legend(title='Model')
error_ax.grid(axis='y')
plt.show()

# Plot R² (last row) separately, since it is on a different scale
r2_ax = comparison_df.iloc[3:].plot(kind='bar', figsize=(12, 6))
r2_ax.set_title('Model R² Comparison')
r2_ax.set_ylabel('R²')
r2_ax.set_xlabel('Metric')
r2_ax.legend(title='Model')
r2_ax.grid(axis='y')
plt.show()

In [None]:
# Actual vs Predicted plots for each model.
# The grid is sized from the number of models so that none is silently
# dropped if more than six are trained; six models still give the original
# 2x3 layout at 18x12 inches.
n_cols = 3
n_rows = max(1, math.ceil(len(results) / n_cols))
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(18, 6 * n_rows),
    squeeze=False,  # always a 2-D array, whatever the grid size
)
axes = axes.flatten()

# End points of the y = x reference line, computed once for all panels
range_min, range_max = y_test.min(), y_test.max()

for panel, (name, model_result) in zip(axes, results.items()):
    panel.scatter(y_test, model_result['y_pred'], alpha=0.5)
    panel.plot([range_min, range_max], [range_min, range_max], 'k--')
    panel.set_title(f'Actual vs Predicted - {name}')
    panel.set_xlabel('Actual Range')
    panel.set_ylabel('Predicted Range')
    # Metric summary in the top-left corner, positioned in axes coordinates
    panel.text(
        0.05,
        0.95,
        f"R² = {model_result['r2']:.3f}\nRMSE = {model_result['rmse']:.3f}",
        transform=panel.transAxes,
        fontsize=12,
        verticalalignment='top',
    )

# Remove any unused axes
for spare_panel in axes[len(results):]:
    fig.delaxes(spare_panel)

plt.tight_layout()
plt.show()

### **Feature importance analysis**

In [None]:
# Select the model with the highest R² and chart its feature importances
best_model_name = max(results, key=lambda model_name: results[model_name]['r2'])
print(f"Best performing model based on R²: {best_model_name}")

best_importance = results[best_model_name]['feature_importance']
if best_importance is not None:
    # Feature/importance table, most important feature first
    importance_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': best_importance}
    ).sort_values('Importance', ascending=False)

    # Horizontal bar chart of the sorted importances
    importance_fig, importance_ax = plt.subplots(figsize=(12, 8))
    sns.barplot(data=importance_df, x='Importance', y='Feature', ax=importance_ax)
    importance_ax.set_title(f'Feature Importance ({best_model_name})')
    plt.tight_layout()
    plt.show()

### **Error Analysis**

In [None]:
# Residuals of the model with the highest R² on the test split
best_model_name = max(results, key=lambda model_name: results[model_name]['r2'])
y_pred = results[best_model_name]['y_pred']
errors = y_test - y_pred  # positive values mean the model under-predicted

# Actual values, predictions and residuals side by side
error_df = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred, Error=errors))

# Histogram of the residuals
error_fig, error_hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(error_df['Error'], kde=False, ax=error_hist_ax)
error_hist_ax.set_title(f'Error Distribution for {best_model_name}')
error_hist_ax.set_xlabel('Error (Actual - Predicted)')
error_hist_ax.set_ylabel('Count')
error_hist_ax.grid(True)
plt.show()

In [None]:
# Residuals plotted against the actual Range, with a zero-error reference line
residual_fig, residual_ax = plt.subplots(figsize=(10, 6))
residual_ax.scatter(error_df['Actual'], error_df['Error'], alpha=0.5)
residual_ax.axhline(y=0, color='r', linestyle='-')
residual_ax.set_title('Error vs Actual Range')
residual_ax.set_xlabel('Actual Range')
residual_ax.set_ylabel('Error')
residual_ax.grid(True)
plt.show()

## **Conclusion and Recommendations**

In [None]:
# Rank the trained models two ways: lowest RMSE and highest R²
best_model_rmse = min(results, key=lambda model_name: results[model_name]['rmse'])
best_model_r2 = max(results, key=lambda model_name: results[model_name]['r2'])

print(f"Best model by RMSE: {best_model_rmse} ({results[best_model_rmse]['rmse']:.4f})")
print(f"Best model by R²: {best_model_r2} ({results[best_model_r2]['r2']:.4f})")

# Recommendations derived from the results above
print("\nRecommendations:")
print(f"1. Use {best_model_r2} for range prediction due to its high R² and low error.")
print("2. Important features for range prediction (from most to least important):")
# importance_df and errors come from the feature-importance and
# error-analysis cells; the top five rows are listed when the best model
# recorded importances.
if results[best_model_r2]['feature_importance'] is not None:
    for feature, importance in importance_df.iloc[:5].to_numpy():
        print(f"   - {feature}: {importance:.4f}")
print("3. Focus on these features when optimizing the range prediction model.")
print(f"4. Error analysis shows that the model has a mean error of {errors.mean():.4f} with a standard deviation of {errors.std():.4f}.")