In [44]:
import os
import pickle

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import HuberRegressor, Lars
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler


In [45]:

# Load the pickled dataframe.
# The directory can be overridden with the BALL_SIZE_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original location is used unchanged.
data_dir = os.environ.get(
    'BALL_SIZE_DATA_DIR',
    'C:/Users/chris/OneDrive/Projekte/IFA/Scripts/A1/ball_size_ml/data',
)
pickle_path = os.path.join(data_dir, 'combined_data.pkl')

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles that come from a trusted source.
with open(pickle_path, 'rb') as f:
    df = pickle.load(f)

# Uncomment to iterate quickly on a 1 % sample of the rows.
# df = df.sample(frac=0.01, random_state=42)

# Define the features to use
features = [
    'Op10Diameter1_4Measurement',
    'Op10Diameter2_5Measurement',
    'Op10Diameter3_6Measurement',
    'Op20Diameter1_4Measurement',
    'Op20Diameter2_5Measurement',
    'Op20Diameter3_6Measurement',
    # 'Op10MasterTempChangeMeasurement',
    # 'Op20MasterTempChangeMeasurement',
    'Op20PickedBallSize'
]

# Column the models are trained to predict.
target = 'Op40ForceAverage'

# Drop rows with a NaN in any feature OR in the target. Dropping on the joint
# selection keeps X and y aligned row-for-row and guarantees y is NaN-free;
# dropping on X alone would let rows with a missing target through.
model_data = df[features + [target]].dropna()

# Select the features and target
X = model_data[features]
y = model_data[target]

# NOTE(review): the displayed frames contain rows where the Op20 diameter
# columns are -48000.0. That looks like a sentinel rather than a real
# measurement -- confirm with the data owner and filter before modelling.


In [None]:
# Tune and evaluate a HuberRegressor on an 80/20 train/test split.
# NOTE: the variable names below keep their historical "lars" suffix so that
# any cell referring to them keeps working, but the estimator being tuned is
# HuberRegressor, not Lars.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter grid for HuberRegressor: `epsilon` and `alpha` are both
# constructor parameters of that estimator (5 x 6 = 30 candidates).
param_grid_lars = {
    'epsilon': [5, 6, 7, 8, 9],
    'alpha': [0, 1e-15, 1e-12, 1e-9, 1e-6, 1e-3]
}

# Create the HuberRegressor model
lars_model = HuberRegressor()

# Perform hyperparameter tuning using GridSearchCV (3-fold CV, scored by R2)
# NOTE(review): the features are passed in unscaled here; if the solver emits
# convergence warnings, consider standardising them or raising max_iter.
grid_search_lars = GridSearchCV(lars_model, param_grid_lars, cv=3, scoring='r2', verbose=3, n_jobs=-1)
grid_search_lars.fit(X_train, y_train)

# Print the best hyperparameters
print(f'Best hyperparameters: {grid_search_lars.best_params_}')

# Evaluate the best model on the held-out test split
best_model = grid_search_lars.best_estimator_
y_pred = best_model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Reference RMSE the "improvement" percentage is measured against.
# presumably the RMSE of an earlier baseline model -- TODO confirm its origin.
BASELINE_RMSE = 37.4179

print(f'Root mean squared error: {rmse} (improvement: {int(100 * (1-(rmse/BASELINE_RMSE)))}%)')
print(f'R2 score: {r2}')

# Print the fitted linear coefficients, one per feature.
# Labels come from the estimator itself (feature_names_in_ is set when it is
# fitted on a DataFrame), so they always match the columns that were actually
# used; the global `features` list is only a fallback.
feature_names = getattr(best_model, 'feature_names_in_', features)
print('Feature importances:')
for feature, importance in zip(feature_names, best_model.coef_):
    print(f'{feature}: {importance}')

Fitting 3 folds for each of 30 candidates, totalling 90 fits


In [38]:
# Display the feature matrix X (a bare last expression is rendered by the notebook).
X

Unnamed: 0,Op10Diameter1_4Measurement,Op10Diameter2_5Measurement,Op10Diameter3_6Measurement,Op20Diameter1_4Measurement,Op20Diameter2_5Measurement,Op20Diameter3_6Measurement
0,1.0,3.0,4.0,-5.0,-10.0,-14.0
1,-2.0,6.0,8.0,3.0,5.0,2.0
2,-12.0,-7.0,-15.0,3.0,-10.0,-9.0
3,5.0,14.0,14.0,1.0,-6.0,-1.0
4,-11.0,-2.0,-7.0,12.0,-5.0,2.0
...,...,...,...,...,...,...
639987,-5.0,-7.0,-7.0,8.0,18.0,2.0
639988,13.0,9.0,9.0,19.0,6.0,2.0
639989,-12.0,-2.0,-11.0,-48000.0,-48000.0,-48000.0
639990,2.0,-4.0,-1.0,3.0,6.0,6.0


In [3]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Remember the column labels now: scaling below replaces the DataFrames with
# plain arrays, and labelling importances from the columns that were actually
# split keeps names and values aligned even if the global `features` list has
# changed since X was built. Falls back to `features` if X has no columns.
feature_names = list(getattr(X_train, 'columns', features))

# Scale the data (statistics are fitted on the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the hyperparameters to tune (3 x 3 x 3 x 3 = 81 candidates)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4]
}

# Create a gradient boosting regressor model
model = GradientBoostingRegressor(random_state=42)

# Perform hyperparameter tuning using GridSearchCV (3-fold CV, scored by R2)
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='r2', verbose=3, n_jobs= -1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(f'Best hyperparameters: {grid_search.best_params_}')

# Evaluate the best model on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
# mean_squared_error must be imported in the notebook's import cell;
# without it this line raises NameError on a fresh kernel.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean squared error: {mse}')
print(f'R2 score: {r2}')

# Print feature importances
print('Feature importances:')
for feature, importance in zip(feature_names, best_model.feature_importances_):
    print(f'{feature}: {importance}')

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best hyperparameters: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 100}
Mean squared error: 14.721951068777516
R2 score: 0.4111787673151007
Feature importances:
Op10Diameter1_4Measurement: 0.028979986637317462
Op10Diameter2_5Measurement: 0.0251081013831695
Op10Diameter3_6Measurement: 0.023403104140223438
Op20Diameter1_4Measurement: 0.5008143014711505
Op20Diameter2_5Measurement: 0.1507582838052862
Op20Diameter3_6Measurement: 0.13205974798256626
Op10MasterTempChangeMeasurement: 0.0
Op20MasterTempChangeMeasurement: 0.037603250569199734
Op40ForceAverage: 0.10127322401108683


In [24]:
# Display the dataframe `df` (a bare last expression is rendered by the notebook).
df

Unnamed: 0,Op10Diameter1_4Measurement,Op10Diameter2_5Measurement,Op10Diameter3_6Measurement,Op10DiameterAvgMeasurement,Op10MasterTempChangeMeasurement,Op20Diameter1_4Measurement,Op20Diameter2_5Measurement,Op20Diameter3_6Measurement,Op20DiameterAvgMeasurement,Op20MasterTempChangeMeasurement,Op20PickedBallSize,Op20PickedBallSize1,Op35AAssembly1BallSize,Op35BAssembly2BallSize,Op40ForceTestNotok,Op50ForceTestOk2,Op50ForcePlusData,Op40ForceMinusData,Op40ForceAverage,DiameterAvgDiff,Diameter1_4Diff,Diameter2_5Diff,Diameter3_6Diff
0,1.0,3.0,4.0,2.0,0.0,-5.0,-10.0,-14.0,-7.5,0.045,14.0,14.0,0.0,0.0,0.0,1.0,108.4270,0.0,117.3480,-9.5,-6.0,-13.0,-18.0
1,-2.0,6.0,8.0,2.0,0.0,3.0,5.0,2.0,4.0,0.006,0.0,0.0,0.0,0.0,0.0,1.0,111.0130,0.0,118.5240,2.0,5.0,-1.0,-6.0
2,-12.0,-7.0,-15.0,-13.5,0.0,3.0,-10.0,-9.0,-3.0,0.045,16.0,16.0,0.0,0.0,0.0,1.0,44.2395,0.0,53.9975,10.5,15.0,-3.0,6.0
3,5.0,14.0,14.0,9.5,0.0,1.0,-6.0,-1.0,0.0,-0.045,4.0,4.0,0.0,0.0,0.0,1.0,67.0011,0.0,75.6956,-9.5,-4.0,-20.0,-15.0
4,-11.0,-2.0,-7.0,-9.0,0.0,12.0,-5.0,2.0,7.0,0.051,6.0,6.0,0.0,0.0,0.0,1.0,42.4102,0.0,50.5628,16.0,23.0,-3.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
639987,-5.0,-7.0,-7.0,-6.0,0.0,8.0,18.0,2.0,13.0,-0.006,6.0,6.0,0.0,0.0,0.0,1.0,87.8355,0.0,99.4246,19.0,13.0,25.0,9.0
639988,13.0,9.0,9.0,11.0,0.0,19.0,6.0,2.0,12.5,-0.731,-2.0,-2.0,0.0,0.0,0.0,1.0,56.7378,0.0,77.5359,1.5,6.0,-3.0,-7.0
639989,-12.0,-2.0,-11.0,-11.5,0.0,-48000.0,-48000.0,-48000.0,-48000.0,0.000,0.0,0.0,0.0,0.0,0.0,1.0,23.9329,0.0,30.7488,-47988.5,-47988.0,-47998.0,-47989.0
639990,2.0,-4.0,-1.0,-2.5,0.0,3.0,6.0,6.0,4.5,-0.032,12.0,12.0,0.0,0.0,0.0,1.0,127.2580,0.0,138.0780,7.0,1.0,10.0,7.0
