In [9]:
# Load the pickled dataframe
import os
import pickle
from sklearn.preprocessing import StandardScaler
import pandas as pd


In [10]:
data_dir = 'C:/Users/chris/OneDrive/Projekte/IFA/Scripts/A1/ball_size_ml/data'
pickle_path = os.path.join(data_dir, 'combined_data.pkl')

with open(pickle_path, 'rb') as f:
    df = pickle.load(f)

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, HuberRegressor, Ridge, Lasso, LassoLars, ElasticNet, BayesianRidge, Lars, OrthogonalMatchingPursuit, PassiveAggressiveRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor
from xgboost import XGBRegressor

from sklearn.metrics import root_mean_squared_error, r2_score

import time


In [33]:
test_features = [
    [
        'DiameterAvgDiff',
        'Op20PickedBallSize'
    ],
    [
        'Diameter1_4Diff',
        'Diameter2_5Diff',
        'Diameter3_6Diff',
        'Op20PickedBallSize'
    ],
    [
        'Op10Diameter1_4Measurement',
        'Op10Diameter2_5Measurement',
        'Op10Diameter3_6Measurement',
        'Op20Diameter1_4Measurement',
        'Op20Diameter2_5Measurement',
        'Op20Diameter3_6Measurement',
        # 'Diameter1_4Diff',
        # 'Diameter2_5Diff',
        # 'Diameter3_6Diff',
        # 'DiameterAvgDiff',
        # 'Op10MasterTempChangeMeasurement',
        # 'Op20MasterTempChangeMeasurement',
        'Op20PickedBallSize'
    ]
]

# List of models to compare
models = [
    # ('Linear Regression', LinearRegression()),
    # ('Ridge Regression', Ridge()),
    # ('Lasso Regression', Lasso()),
    # ('Bayesian Ridge Regression', BayesianRidge()),
    # ('Lars Regression', Lars()),
    ('Random Forest Regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Support Vector Regressor', SVR()),
    ('Gradient Boosting Regressor', GradientBoostingRegressor(n_estimators=100, random_state=42)),
    # ('K-Nearest Neighbors Regressor', KNeighborsRegressor(n_neighbors=5)),
    # ('Decision Tree Regressor', DecisionTreeRegressor(random_state=42)),
    ('AdaBoost Regressor', AdaBoostRegressor(n_estimators=100, random_state=42)),
    ('MLP Regressor', MLPRegressor(random_state=42)),
    # ('ElasticNet Regression', ElasticNet(random_state=42)),
    # ('BayesianRidge Regression', BayesianRidge()),
    # ('LassoLars Regression', LassoLars()),
    # ('OrthogonalMatchingPursuit', OrthogonalMatchingPursuit()),
    # ('PassiveAggressiveRegressor', PassiveAggressiveRegressor()),
    # ('Kernel Ridge Regression', KernelRidge()),
    # ('Gaussian Process Regression', GaussianProcessRegressor()),
    ('XGBoost Regressor', XGBRegressor()),
]

# List of models to compare
linear_models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('ElasticNet Regression', ElasticNet()),
    ('Bayesian Ridge Regression', BayesianRidge()),
    ('Lars Regression', Lars()),
    ('BayesianRidge Regression', BayesianRidge()),
    ('LassoLars Regression', LassoLars()),
    ('OrthogonalMatchingPursuit', OrthogonalMatchingPursuit()),
    ('PassiveAggressiveRegressor', PassiveAggressiveRegressor()),
    # ('Quantile Regressor', QuantileRegressor()),
    ('Huber Regressor', HuberRegressor()),
]

In [40]:

# Take a 1% sample of the data
df_sample = df.sample(frac=0.01, random_state=42)
results = []

# Select the features and target
for idx, features in enumerate(test_features):
    X = df[features]
    y = df['Op40ForceAverage']
    

    # Drop rows with NaN values in X or y
    X = X.dropna()
    y = y[X.index]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Scale the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train and evaluate the models
    for name, model in linear_models:
        start_time = time.time()
        # Train the model
        model.fit(X_train, y_train)

        # Make predictions on the testing set
        y_pred = model.predict(X_test)

        # Evaluate the model
        mse = root_mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Append the results to the dataframe
        duration = time.time() - start_time
        result_dict = {'Feature Set': idx,'Model': name, 'MSE': mse, 'R2': r2, 'Duration': duration}
        print(result_dict)
        results.append(result_dict)
    
results_df = pd.DataFrame(results)

{'Feature Set': 0, 'Model': 'Linear Regression', 'MSE': 35.90489979007643, 'R2': 0.045649889449033365, 'Duration': 0.01904010772705078}
{'Feature Set': 0, 'Model': 'Ridge Regression', 'MSE': 35.90489965732554, 'R2': 0.04564989650605522, 'Duration': 0.01130533218383789}
{'Feature Set': 0, 'Model': 'Lasso Regression', 'MSE': 35.925513454829286, 'R2': 0.04455375490654989, 'Duration': 0.016084671020507812}
{'Feature Set': 0, 'Model': 'ElasticNet Regression', 'MSE': 36.02045546647034, 'R2': 0.03949707823488213, 'Duration': 0.017313003540039062}
{'Feature Set': 0, 'Model': 'Bayesian Ridge Regression', 'MSE': 35.90489477216617, 'R2': 0.045650156200530034, 'Duration': 0.026546955108642578}
{'Feature Set': 0, 'Model': 'Lars Regression', 'MSE': 35.904899790076435, 'R2': 0.045649889449033254, 'Duration': 0.013611555099487305}
{'Feature Set': 0, 'Model': 'BayesianRidge Regression', 'MSE': 35.90489477216617, 'R2': 0.045650156200530034, 'Duration': 0.02628183364868164}
{'Feature Set': 0, 'Model': 'L

In [41]:
results_df.sort_values(by='R2', ascending=False)

Unnamed: 0,Feature Set,Model,MSE,R2,Duration
32,2,Huber Regressor,33.544867,0.166986,2.484486
31,2,PassiveAggressiveRegressor,33.722177,0.158156,1.25535
26,2,Bayesian Ridge Regression,34.476233,0.120087,0.08401
28,2,BayesianRidge Regression,34.476233,0.120087,0.085693
22,2,Linear Regression,34.476237,0.120086,0.055519
27,2,Lars Regression,34.476237,0.120086,0.029846
23,2,Ridge Regression,34.495466,0.119105,0.031324
16,1,Lars Regression,35.464117,0.068938,0.018113
11,1,Linear Regression,35.464117,0.068938,0.031594
15,1,Bayesian Ridge Regression,35.464118,0.068938,0.050202


In [6]:
measurements = [
    'Op10Diameter1_4Measurement',
    'Op10Diameter2_5Measurement',
    'Op10Diameter3_6Measurement',
    'Op20Diameter1_4Measurement',
    'Op20Diameter2_5Measurement',
    'Op20Diameter3_6Measurement',
    'Op10DiameterAvgMeasurement',
    'Op20DiameterAvgMeasurement'
]
for measure in measurements:
    df[measure] = (df[measure] - 48) * 1000

with open(pickle_path, 'wb') as f:
    pickle.dump(df, f)