In [1]:
import pandas as pd
from helpers.misc import display_dict
from helpers.modelComparison import evaluate_model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Folder (relative to the notebook's working directory) holding the input
# workbook; the trailing slash matters because the path is built by plain
# string concatenation further down.
input_path = 'Data/'

In [3]:
# Load the full lap dataset from the Excel workbook.
df = pd.read_excel(f"{input_path}Data.xlsx")

# Target column vs. everything else.
y = df['LapTime']
X = df.drop(columns='LapTime')

# Model inputs: X without year / round / DriverNumber (presumably identifier
# columns rather than predictors -- confirm against the data dictionary).
# `drop` already returns a new frame, so X itself is left untouched.
features = X.drop(columns=['year', 'round', 'DriverNumber'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.2,
    random_state=42,
)

# Linear Regression

In [4]:
# Baseline: ordinary least squares on the training split. `fit` returns the
# estimator itself, so construction and training can be chained.
lr_model = LinearRegression().fit(X_train, y_train)

# Score the baseline on the held-out rows.
lr_pred = lr_model.predict(X_test)
evaluate_model(y_test, lr_pred, "Linear Regression")

Linear Regression Metrics:
	- Mean Absolute Error: 3.9319
	- R-squared Score: 0.7397


# Decision Tree

## Hyperparameter Tuning

In [5]:
# Candidate settings for the tree; GridSearchCV tries every combination.
dt_param_grid = {
    'max_depth': [5, 10, 15, 20, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# 5-fold cross-validated search, ranking candidates by (negated) mean
# absolute error. `fit` returns the search object, so it is chained here.
dt_grid_search = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=dt_param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
).fit(X_train, y_train)

display_dict(dt_grid_search.best_params_, title="Best Decision Tree Parameters:")

Best Decision Tree Parameters:
	- max_depth: 20.0000
	- min_samples_leaf: 2.0000
	- min_samples_split: 10.0000


In [6]:
# Take the winning tree from the grid search and score it on the held-out
# test rows.
dt_best_model = dt_grid_search.best_estimator_

dt_pred = dt_best_model.predict(X_test)

evaluate_model(
    y_test,
    dt_pred,
    "Decision Tree",
)

Decision Tree Metrics:
	- Mean Absolute Error: 0.9487
	- R-squared Score: 0.9478


## Feature Importance

In [7]:
# Pair every model input column with the importance score the tuned tree
# assigned to it, ranked from most to least important.
dt_feature_importance = pd.DataFrame({
    'feature': features.columns,
    'importance': dt_best_model.feature_importances_
}).sort_values('importance', ascending=False)

# Build the ordered feature -> importance mapping in a single vectorised step
# instead of a per-row `.iloc` loop. The result is deliberately NOT named
# `dict`: rebinding the built-in at kernel scope would make any later
# `dict(...)` call in the notebook fail.
dt_importance_by_feature = (
    dt_feature_importance
    .set_index('feature')['importance']
    .to_dict()
)
display_dict(
    dt_importance_by_feature,
    title="Decision Tree Feature Importance"
)

Decision Tree Feature Importance
	- TrackLength: 0.5367
	- ElevationSD: 0.1957
	- CurvatureSD: 0.0533
	- NumberOfCorners: 0.0359
	- Compound_HARD: 0.0316
	- LapsLeft: 0.0199
	- SurfaceGripIndex: 0.0188
	- Rain: 0.0185
	- WindChillFactor: 0.0177
	- TotalCurvature: 0.0099
	- TrackTemp: 0.0098
	- Compound_MEDIUM: 0.0093
	- Compound_INTERMEDIATE: 0.0093
	- Compound_SOFT: 0.0091
	- TotalElevationChange: 0.0068
	- BestLapTimeDelta: 0.0061
	- AirTemp: 0.0040
	- PointsAtStart: 0.0034
	- MaxCurvature: 0.0027
	- HumidityWindInteraction: 0.0015
	- MinElevation: 0.0000
	- Compound_WET: 0.0000


# Random Forest

## Hyperparameter Tuning

In [8]:
# Candidate settings for the forest; GridSearchCV tries every combination.
# NOTE(review): every min_samples_split candidate (<= 100) is below twice the
# smallest min_samples_leaf (2 * 100), so min_samples_split likely never
# binds and those three values should give identical fits -- confirm and
# consider trimming or rescaling the grid.
rf_param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10, 15],
    'min_samples_split': [20, 50, 100],
    'min_samples_leaf': [100, 200, 400],
}

# 5-fold cross-validated search, ranking candidates by (negated) mean
# absolute error. `fit` returns the search object, so it is chained here.
rf_grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=rf_param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
).fit(X_train, y_train)

display_dict(rf_grid_search.best_params_, title="Best Random Forest Parameters:")

Best Random Forest Parameters:
	- max_depth: 15.0000
	- min_samples_leaf: 100.0000
	- min_samples_split: 20.0000
	- n_estimators: 100.0000


In [9]:
# Take the winning forest from the grid search and score it on the held-out
# test rows.
rf_best_model = rf_grid_search.best_estimator_

rf_pred = rf_best_model.predict(X_test)

evaluate_model(
    y_test,
    rf_pred,
    "Random Forest",
)

Random Forest Metrics:
	- Mean Absolute Error: 1.6796
	- R-squared Score: 0.8922


## Feature Importance

In [10]:
# Pair every model input column with the importance score the tuned forest
# assigned to it, ranked from most to least important.
rf_feature_importance = pd.DataFrame({
    'feature': features.columns,
    'importance': rf_best_model.feature_importances_
}).sort_values('importance', ascending=False)

# Build the ordered feature -> importance mapping in a single vectorised step
# instead of a per-row `.iloc` loop. The result is deliberately NOT named
# `dict`: rebinding the built-in at kernel scope would make any later
# `dict(...)` call in the notebook fail.
rf_importance_by_feature = (
    rf_feature_importance
    .set_index('feature')['importance']
    .to_dict()
)
display_dict(
    rf_importance_by_feature,
    title="Random Forest Feature Importance"
)

Random Forest Feature Importance
	- TrackLength: 0.5912
	- TotalElevationChange: 0.1177
	- ElevationSD: 0.0959
	- CurvatureSD: 0.0501
	- NumberOfCorners: 0.0473
	- SurfaceGripIndex: 0.0147
	- WindChillFactor: 0.0138
	- LapsLeft: 0.0124
	- Compound_INTERMEDIATE: 0.0110
	- AirTemp: 0.0098
	- Rain: 0.0092
	- TrackTemp: 0.0072
	- Compound_HARD: 0.0046
	- Compound_MEDIUM: 0.0041
	- TotalCurvature: 0.0035
	- MinElevation: 0.0021
	- MaxCurvature: 0.0020
	- Compound_SOFT: 0.0018
	- BestLapTimeDelta: 0.0008
	- PointsAtStart: 0.0005
	- HumidityWindInteraction: 0.0004
	- Compound_WET: 0.0000
