In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import Lars, HuberRegressor
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import os
import pickle
import numpy as np
import pandas as pd

In [2]:
# Load the pickled dataframe
data_dir = 'C:/Users/ckunde/Desktop/Scripts/A1/ball_size_ml/data'
pickle_path = os.path.join(data_dir, 'combined_data.pkl')

with open(pickle_path, 'rb') as f:
    df = pickle.load(f)

In [10]:
def get_equal_samples(df: pd.DataFrame, datapoints: int = 5000):
    # Ensures equal number of samples for each ball size
    # Get the unique values in the 'Op20PickedBallSize' column
    print('columns: ', df.columns)
    unique_ball_sizes = df['Op20PickedBallSize1'].unique()
    print('unique ball sizes: ', unique_ball_sizes.sort())

    # Check if the number of unique values is greater than 41
    if len(unique_ball_sizes) > 41:
        raise ValueError("The 'Op20PickedBallSize1' column has more than 41 distinct values.")

    # Create a dictionary to store the DataFrames
    dataframes = {}

    # Split the DataFrame into smaller DataFrames based on the unique values
    for ball_size in unique_ball_sizes:
        dataframes[ball_size] = df[df['Op20PickedBallSize1'] == ball_size]
        print(f'Datapoints for {ball_size}: {dataframes[ball_size].shape[0]}')
        if ball_size < -22:
            print(f'ballsize {ball_size}: {dataframes[ball_size].head()}')

    # sample the DataFrames to have equal number of datapoints
    sampled_dataframes = {}
    for ball_size, dataframe in dataframes.items():
        sampled_dataframes[ball_size] = dataframe.sample(n=datapoints, replace=True, random_state=42)
    # Concatenate the sampled DataFrames
    sampled_df = pd.concat(sampled_dataframes.values())
    # shuffle the DataFrame
    sampled_df = sampled_df.sample(frac=1, random_state=42).reset_index(drop=True)
    return sampled_df
    
df_sampled = get_equal_samples(df, 10)

# Define the features to use
features = [
    'Op10Diameter1_4Measurement',
    'Op10Diameter2_5Measurement',
    'Op10Diameter3_6Measurement',
    'Op20Diameter1_4Measurement',
    'Op20Diameter2_5Measurement',
    'Op20Diameter3_6Measurement',
    'Op10DiameterAvgMeasurement',
    'Op20DiameterAvgMeasurement',
    'Op40ForceAverage'
]

# Select the features and target
X = df_sampled[features]
y = df_sampled['Op20PickedBallSize']

# Drop rows with NaN values in X or y
X = X.dropna()
y = y[X.index]

# Define the hyperparameters to tune for the Lars model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


columns:  Index(['Op10Diameter1_4Measurement', 'Op10Diameter2_5Measurement',
       'Op10Diameter3_6Measurement', 'Op10DiameterAvgMeasurement',
       'Op10MasterTempChangeMeasurement', 'Op20Diameter1_4Measurement',
       'Op20Diameter2_5Measurement', 'Op20Diameter3_6Measurement',
       'Op20DiameterAvgMeasurement', 'Op20MasterTempChangeMeasurement',
       'Op20PickedBallSize', 'Op20PickedBallSize1', 'Op35AAssembly1BallSize',
       'Op35AAssembly1BallBinNumber', 'Op35BAssembly2BallSize',
       'Op35BAssembly2BallBinNumber', 'Op40ForceTestNotok', 'Op50ForceTestOk2',
       'Op50ForcePlusData', 'Op40ForceMinusData', 'Op40ForceAverage',
       'DiameterAvgDiff', 'Diameter1_4Diff', 'Diameter2_5Diff',
       'Diameter3_6Diff', 'Op10Diameter1_4Measurement_dev',
       'Op10Diameter2_5Measurement_dev', 'Op10Diameter3_6Measurement_dev',
       'Op20Diameter1_4Measurement_dev', 'Op20Diameter2_5Measurement_dev',
       'Op20Diameter3_6Measurement_dev', 'Op10DiameterAvgMeasurement_dev',
    

In [9]:
df_sampled.head(25)

Unnamed: 0,Op10Diameter1_4Measurement,Op10Diameter2_5Measurement,Op10Diameter3_6Measurement,Op10DiameterAvgMeasurement,Op10MasterTempChangeMeasurement,Op20Diameter1_4Measurement,Op20Diameter2_5Measurement,Op20Diameter3_6Measurement,Op20DiameterAvgMeasurement,Op20MasterTempChangeMeasurement,...,Diameter2_5Diff,Diameter3_6Diff,Op10Diameter1_4Measurement_dev,Op10Diameter2_5Measurement_dev,Op10Diameter3_6Measurement_dev,Op20Diameter1_4Measurement_dev,Op20Diameter2_5Measurement_dev,Op20Diameter3_6Measurement_dev,Op10DiameterAvgMeasurement_dev,Op20DiameterAvgMeasurement_dev
0,47.998,48.001,47.993,47.9955,0.0,48.029,48.013,48.006,48.021,-0.141,...,12.0,13.0,-2.0,1.0,-7.0,29.0,13.0,6.0,-4.5,21.0
1,47.991,48.003,47.994,47.9925,0.0,47.986,47.974,47.97,47.98,0.096,...,-29.0,-24.0,-9.0,3.0,-6.0,-14.0,-26.0,-30.0,-7.5,-20.0
2,48.004,48.01,48.01,48.007,0.0,47.992,47.988,47.994,47.993,0.128,...,-22.0,-16.0,4.0,10.0,10.0,-8.0,-12.0,-6.0,7.0,-7.0
3,47.994,47.999,47.994,47.9965,0.0,48.008,48.005,48.007,48.0075,0.25,...,6.0,13.0,-6.0,-1.0,-6.0,8.0,5.0,7.0,-3.5,7.5
4,48.001,48.007,48.003,48.002,0.0,48.011,48.019,48.011,48.015,0.269,...,12.0,8.0,1.0,7.0,3.0,11.0,19.0,11.0,2.0,15.0
5,47.999,48.005,48.003,48.001,0.0,47.998,47.999,47.992,47.9985,0.276,...,-6.0,-11.0,-1.0,5.0,3.0,-2.0,-1.0,-8.0,1.0,-1.5
6,47.987,47.993,47.991,47.989,0.0,47.99,47.987,47.992,47.991,-0.09,...,-6.0,1.0,-13.0,-7.0,-9.0,-10.0,-13.0,-8.0,-11.0,-9.0
7,47.997,47.997,47.996,47.9965,0.0,48.02,48.012,48.014,48.017,-0.154,...,15.0,18.0,-3.0,-3.0,-4.0,20.0,12.0,14.0,-3.5,17.0
8,47.991,47.998,47.994,47.9925,0.0,48.001,47.994,47.994,47.9975,0.128,...,-4.0,0.0,-9.0,-2.0,-6.0,1.0,-6.0,-6.0,-7.5,-2.5
9,47.999,48.0,47.998,47.9985,0.0,48.017,48.013,48.02,48.0185,0.064,...,13.0,22.0,-1.0,0.0,-2.0,17.0,13.0,20.0,-1.5,18.5


In [12]:

# Define the parameter space
param_dist = {
    'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)],
    'max_depth': [int(x) for x in np.linspace(10, 100, num=10)] + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'max_features': ['auto', 'sqrt']
}

# Create a random forest classifier
rf = RandomForestClassifier()

# Set up random search
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    verbose=3,
    n_iter=100,
    cv=5,
    random_state=42,
    # n_jobs=-1
)

# Fit the random search
random_search.fit(X_train, y_train)

# Get best parameters and score
best_params = random_search.best_params_
best_score = random_search.best_score_

Fitting 5 folds for each of 100 candidates, totalling 500 fits




[CV 1/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1000;, score=0.226 total time=  29.4s
[CV 2/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1000;, score=0.229 total time=  27.9s
[CV 3/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1000;, score=0.224 total time=  26.6s
[CV 4/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1000;, score=0.225 total time=  27.6s
[CV 5/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1000;, score=0.223 total time=  27.6s
[CV 1/5] END bootstrap=True, max_depth=60, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.216 total time=  42.3s
[CV 2/5] END bootstrap=True, max_depth=60, max_features=sqrt, min_sample

In [None]:


param_grid_lars = {
    'epsilon': [5, 6, 7, 8, 9],
    'alpha': [0, 1e-15, 1e-12, 1e-9, 1e-6, 1e-3]
}

# Create a Lars model
lars_model = HuberRegressor()

# Perform hyperparameter tuning using GridSearchCV
grid_search_lars = GridSearchCV(lars_model, param_grid_lars, cv=3, scoring='r2', verbose=3, n_jobs=-1)
grid_search_lars.fit(X_train, y_train)

# Print the best hyperparameters
print(f'Best hyperparameters: {grid_search_lars.best_params_}')

# Evaluate the best model
best_model = grid_search_lars.best_estimator_
y_pred = best_model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Root mean squared error: {rmse} (improvement: {int(100 * (1-(rmse/37.4179)))}%)')
print(f'R2 score: {r2}')

# Print feature importances
print('Feature importances:')
for feature, importance in zip(features, best_model.coef_):
    print(f'{feature}: {importance}')

Fitting 3 folds for each of 30 candidates, totalling 90 fits


In [38]:
X

Unnamed: 0,Op10Diameter1_4Measurement,Op10Diameter2_5Measurement,Op10Diameter3_6Measurement,Op20Diameter1_4Measurement,Op20Diameter2_5Measurement,Op20Diameter3_6Measurement
0,1.0,3.0,4.0,-5.0,-10.0,-14.0
1,-2.0,6.0,8.0,3.0,5.0,2.0
2,-12.0,-7.0,-15.0,3.0,-10.0,-9.0
3,5.0,14.0,14.0,1.0,-6.0,-1.0
4,-11.0,-2.0,-7.0,12.0,-5.0,2.0
...,...,...,...,...,...,...
639987,-5.0,-7.0,-7.0,8.0,18.0,2.0
639988,13.0,9.0,9.0,19.0,6.0,2.0
639989,-12.0,-2.0,-11.0,-48000.0,-48000.0,-48000.0
639990,2.0,-4.0,-1.0,3.0,6.0,6.0


In [3]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4]
}

# Create a random forest regressor model
model = GradientBoostingRegressor(random_state=42)

# Perform hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='r2', verbose=3, n_jobs= -1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(f'Best hyperparameters: {grid_search.best_params_}')

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean squared error: {mse}')
print(f'R2 score: {r2}')

# Print feature importances
print('Feature importances:')
for feature, importance in zip(features, best_model.feature_importances_):
    print(f'{feature}: {importance}')

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best hyperparameters: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 100}
Mean squared error: 14.721951068777516
R2 score: 0.4111787673151007
Feature importances:
Op10Diameter1_4Measurement: 0.028979986637317462
Op10Diameter2_5Measurement: 0.0251081013831695
Op10Diameter3_6Measurement: 0.023403104140223438
Op20Diameter1_4Measurement: 0.5008143014711505
Op20Diameter2_5Measurement: 0.1507582838052862
Op20Diameter3_6Measurement: 0.13205974798256626
Op10MasterTempChangeMeasurement: 0.0
Op20MasterTempChangeMeasurement: 0.037603250569199734
Op40ForceAverage: 0.10127322401108683


In [24]:
df

Unnamed: 0,Op10Diameter1_4Measurement,Op10Diameter2_5Measurement,Op10Diameter3_6Measurement,Op10DiameterAvgMeasurement,Op10MasterTempChangeMeasurement,Op20Diameter1_4Measurement,Op20Diameter2_5Measurement,Op20Diameter3_6Measurement,Op20DiameterAvgMeasurement,Op20MasterTempChangeMeasurement,Op20PickedBallSize,Op20PickedBallSize1,Op35AAssembly1BallSize,Op35BAssembly2BallSize,Op40ForceTestNotok,Op50ForceTestOk2,Op50ForcePlusData,Op40ForceMinusData,Op40ForceAverage,DiameterAvgDiff,Diameter1_4Diff,Diameter2_5Diff,Diameter3_6Diff
0,1.0,3.0,4.0,2.0,0.0,-5.0,-10.0,-14.0,-7.5,0.045,14.0,14.0,0.0,0.0,0.0,1.0,108.4270,0.0,117.3480,-9.5,-6.0,-13.0,-18.0
1,-2.0,6.0,8.0,2.0,0.0,3.0,5.0,2.0,4.0,0.006,0.0,0.0,0.0,0.0,0.0,1.0,111.0130,0.0,118.5240,2.0,5.0,-1.0,-6.0
2,-12.0,-7.0,-15.0,-13.5,0.0,3.0,-10.0,-9.0,-3.0,0.045,16.0,16.0,0.0,0.0,0.0,1.0,44.2395,0.0,53.9975,10.5,15.0,-3.0,6.0
3,5.0,14.0,14.0,9.5,0.0,1.0,-6.0,-1.0,0.0,-0.045,4.0,4.0,0.0,0.0,0.0,1.0,67.0011,0.0,75.6956,-9.5,-4.0,-20.0,-15.0
4,-11.0,-2.0,-7.0,-9.0,0.0,12.0,-5.0,2.0,7.0,0.051,6.0,6.0,0.0,0.0,0.0,1.0,42.4102,0.0,50.5628,16.0,23.0,-3.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
639987,-5.0,-7.0,-7.0,-6.0,0.0,8.0,18.0,2.0,13.0,-0.006,6.0,6.0,0.0,0.0,0.0,1.0,87.8355,0.0,99.4246,19.0,13.0,25.0,9.0
639988,13.0,9.0,9.0,11.0,0.0,19.0,6.0,2.0,12.5,-0.731,-2.0,-2.0,0.0,0.0,0.0,1.0,56.7378,0.0,77.5359,1.5,6.0,-3.0,-7.0
639989,-12.0,-2.0,-11.0,-11.5,0.0,-48000.0,-48000.0,-48000.0,-48000.0,0.000,0.0,0.0,0.0,0.0,0.0,1.0,23.9329,0.0,30.7488,-47988.5,-47988.0,-47998.0,-47989.0
639990,2.0,-4.0,-1.0,-2.5,0.0,3.0,6.0,6.0,4.5,-0.032,12.0,12.0,0.0,0.0,0.0,1.0,127.2580,0.0,138.0780,7.0,1.0,10.0,7.0
