In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import os
import pickle


In [3]:

# Load the pickled dataframe
data_dir = 'C:/Users/ckunde/Desktop/Scripts/A1/ball_size_ml/data'
pickle_path = os.path.join(data_dir, 'combined_data.pkl')

with open(pickle_path, 'rb') as f:
    df = pickle.load(f)

# Define the features to use
features = [
    'Op10Diameter1_4Measurement',
    'Op10Diameter2_5Measurement',
    'Op10Diameter3_6Measurement',
    'Op20Diameter1_4Measurement',
    'Op20Diameter2_5Measurement',
    'Op20Diameter3_6Measurement',
    'Op10MasterTempChangeMeasurement',
    'Op20MasterTempChangeMeasurement',
    'Op20PickedBallSize'
]

# Select the features and target
X = df[features]
y = df['Op40ForceAverage']

# Drop rows with NaN values in X or y
X = X.dropna()
y = y[X.index]


In [4]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4]
}

# Create a random forest regressor model
model = RandomForestRegressor(random_state=42)

# Perform hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='r2', verbose=2)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(f'Best hyperparameters: {grid_search.best_params_}')

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean squared error: {mse}')
print(f'R2 score: {r2}')

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  42.1s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  45.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  39.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.3min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.3min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.4min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 2.1min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 2.4min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 2.2min
[CV] END max_depth=5, min_samples_leaf