In [3]:
# Load the pickled dataframe
import os
import pickle
from sklearn.preprocessing import StandardScaler
import pandas as pd


In [14]:
data_dir = 'C:/Users/ckunde/Desktop/Scripts/A1/ball_size_ml/data'
pickle_path = os.path.join(data_dir, 'combined_data.pkl')

with open(pickle_path, 'rb') as f:
    df = pickle.load(f)

In [23]:
df.columns

Index(['Op10Diameter1_4Measurement', 'Op10Diameter2_5Measurement',
       'Op10Diameter3_6Measurement', 'Op10DiameterAvgMeasurement',
       'Op10MasterTempChangeMeasurement', 'Op20Diameter1_4Measurement',
       'Op20Diameter2_5Measurement', 'Op20Diameter3_6Measurement',
       'Op20DiameterAvgMeasurement', 'Op20MasterTempChangeMeasurement',
       'Op20PickedBallSize', 'Op20PickedBallSize1', 'Op35AAssembly1BallSize',
       'Op35BAssembly2BallSize', 'Op40ForceTestNotok', 'Op50ForceTestOk2',
       'Op50ForcePlusData', 'Op40ForceMinusData', 'Op40ForceAverage',
       'DiameterAvgDiff', 'Diameter1_4Diff', 'Diameter2_5Diff',
       'Diameter3_6Diff', 'Op10Diameter1_4Measurement_dev',
       'Op10Diameter2_5Measurement_dev', 'Op10Diameter3_6Measurement_dev',
       'Op20Diameter1_4Measurement_dev', 'Op20Diameter2_5Measurement_dev',
       'Op20Diameter3_6Measurement_dev', 'Op10DiameterAvgMeasurement_dev',
       'Op20DiameterAvgMeasurement_dev'],
      dtype='object')

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression, HuberRegressor, Ridge, Lasso, LassoLars, ElasticNet, BayesianRidge, Lars, OrthogonalMatchingPursuit, PassiveAggressiveRegressor
from sklearn.svm import SVR, SVC, LinearSVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor
from xgboost import XGBRegressor
from sklearn.naive_bayes import GaussianNB, BernoulliNB

from sklearn.metrics import root_mean_squared_error, r2_score

import time


In [20]:
test_features = [
    [
        'DiameterAvgDiff',
        'Op40ForceAverage'
    ],
    [
        'Diameter1_4Diff',
        'Diameter2_5Diff',
        'Diameter3_6Diff',
        'Op40ForceAverage'
    ],
    [
        'Op10Diameter1_4Measurement',
        'Op10Diameter2_5Measurement',
        'Op10Diameter3_6Measurement',
        'Op20Diameter1_4Measurement',
        'Op20Diameter2_5Measurement',
        'Op20Diameter3_6Measurement',
        'Op10DiameterAvgMeasurement',
        'Op20DiameterAvgMeasurement',
        'Op40ForceAverage'
    ],
    [
        'Op10Diameter1_4Measurement_dev',
        'Op10Diameter2_5Measurement_dev',
        'Op10Diameter3_6Measurement_dev',
        'Op20Diameter1_4Measurement_dev',
        'Op20Diameter2_5Measurement_dev',
        'Op20Diameter3_6Measurement_dev',
        'Op40ForceAverage'
    ],
    [
        'Diameter1_4Diff',
        'Diameter2_5Diff',
        'Diameter3_6Diff',
        'DiameterAvgDiff',
        # 'Op10MasterTempChangeMeasurement',
        # 'Op20MasterTempChangeMeasurement',
        'Op40ForceAverage'
    ]
]

# List of models to compare
regression_models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('Bayesian Ridge Regression', BayesianRidge()),
    ('Lars Regression', Lars()),
    ('Random Forest Regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Support Vector Regressor', SVR()),
    ('Gradient Boosting Regressor', GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ('K-Nearest Neighbors Regressor', KNeighborsRegressor(n_neighbors=5)),
    ('Decision Tree Regressor', DecisionTreeRegressor(random_state=42)),
    ('AdaBoost Regressor', AdaBoostRegressor(n_estimators=100, random_state=42)),
    ('MLP Regressor', MLPRegressor(random_state=42)),
    ('ElasticNet Regression', ElasticNet(random_state=42)),
    ('BayesianRidge Regression', BayesianRidge()),
    ('LassoLars Regression', LassoLars()),
    ('OrthogonalMatchingPursuit', OrthogonalMatchingPursuit()),
    ('PassiveAggressiveRegressor', PassiveAggressiveRegressor()),
    ('Kernel Ridge Regression', KernelRidge()),
    ('Gaussian Process Regression', GaussianProcessRegressor()),
    ('XGBoost Regressor', XGBRegressor()),
]

# List of models to compare
linear_models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('ElasticNet Regression', ElasticNet()),
    ('Bayesian Ridge Regression', BayesianRidge()),
    ('Lars Regression', Lars()),
    ('BayesianRidge Regression', BayesianRidge()),
    ('LassoLars Regression', LassoLars()),
    ('OrthogonalMatchingPursuit', OrthogonalMatchingPursuit()),
    ('PassiveAggressiveRegressor', PassiveAggressiveRegressor()),
    ('Huber Regressor', HuberRegressor()),
]

# Define the list of classification models
classification_models = [
    ('Logistic Regression', LogisticRegression()),
    ('SVC', SVC()),
    ('LinearSVC', LinearSVC()),
    ('KNeighbors Classifier', KNeighborsClassifier()),
    ('Decision Tree Classifier', DecisionTreeClassifier(random_state=42)),
    ('Random Forest Classifier', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting Classifier', GradientBoostingClassifier(random_state=42)),
    ('GaussianNB', GaussianNB()),
    ('BernoulliNB', BernoulliNB()),
    ('MLPClassifier', MLPClassifier(random_state=42)),
    ('AdaBoostClassifier', AdaBoostClassifier(random_state=42))
]

In [21]:

# Take a 1% sample of the data
df_s = df.sample(frac=0.01, random_state=42)
results = []

# Select the features and target
for idx, features in enumerate(test_features):
    X = df_s[features]
    y = df_s['Op20PickedBallSize']
    

    # Drop rows with NaN values in X or y
    X = X.dropna()
    y = y[X.index]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Scale the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train and evaluate the models
    for name, model in classification_models:
        start_time = time.time()
        # Train the model
        model.fit(X_train, y_train)

        # Make predictions on the testing set
        y_pred = model.predict(X_test)

        # Evaluate the model
        mse = root_mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Append the results to the dataframe
        duration = time.time() - start_time
        result_dict = {'Feature Set': idx,'Model': name, 'MSE': mse, 'R2': r2, 'Duration': duration}
        print(result_dict)
        results.append(result_dict)
    
results_df = pd.DataFrame(results)

{'Feature Set': 0, 'Model': 'Logistic Regression', 'MSE': 5.020436323318325, 'R2': -0.008093973136834132, 'Duration': 0.142411470413208}
{'Feature Set': 0, 'Model': 'SVC', 'MSE': 5.0223405283264215, 'R2': -0.008858839580792255, 'Duration': 1.9518046379089355}
{'Feature Set': 0, 'Model': 'LinearSVC', 'MSE': 5.03153402665177, 'R2': -0.012555694059923184, 'Duration': 0.03356504440307617}
{'Feature Set': 0, 'Model': 'KNeighbors Classifier', 'MSE': 6.3512084366345345, 'R2': -0.6133582857890456, 'Duration': 0.06911182403564453}
{'Feature Set': 0, 'Model': 'Decision Tree Classifier', 'MSE': 5.947445800827853, 'R2': -0.4147479658412474, 'Duration': 0.04040265083312988}
{'Feature Set': 0, 'Model': 'Random Forest Classifier', 'MSE': 5.843101288641393, 'R2': -0.3655415579466068, 'Duration': 0.9433324337005615}
{'Feature Set': 0, 'Model': 'Gradient Boosting Classifier', 'MSE': 5.096674957021477, 'R2': -0.03894358637647932, 'Duration': 7.323171138763428}
{'Feature Set': 0, 'Model': 'GaussianNB', 'M

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Feature Set': 2, 'Model': 'Logistic Regression', 'MSE': 5.049555224782425, 'R2': -0.019821925277525576, 'Duration': 0.2909576892852783}
{'Feature Set': 2, 'Model': 'SVC', 'MSE': 5.014083750454684, 'R2': -0.005544418323640166, 'Duration': 2.2299065589904785}
{'Feature Set': 2, 'Model': 'LinearSVC', 'MSE': 5.02138851608617, 'R2': -0.008476406358813193, 'Duration': 0.10949373245239258}
{'Feature Set': 2, 'Model': 'KNeighbors Classifier', 'MSE': 6.78103754004665, 'R2': -0.8391213644973579, 'Duration': 0.06422972679138184}
{'Feature Set': 2, 'Model': 'Decision Tree Classifier', 'MSE': 5.3513970324769575, 'R2': -0.14538749982732102, 'Duration': 0.04215264320373535}
{'Feature Set': 2, 'Model': 'Random Forest Classifier', 'MSE': 4.413308377697324, 'R2': 0.2209835268286301, 'Duration': 1.2952420711517334}
{'Feature Set': 2, 'Model': 'Gradient Boosting Classifier', 'MSE': 4.284689689703537, 'R2': 0.2657282138001815, 'Duration': 11.369055271148682}
{'Feature Set': 2, 'Model': 'GaussianNB', 'MSE



{'Feature Set': 2, 'Model': 'MLPClassifier', 'MSE': 4.976758733356465, 'R2': 0.00937047733354357, 'Duration': 7.9029669761657715}
{'Feature Set': 2, 'Model': 'AdaBoostClassifier', 'MSE': 4.624204464799423, 'R2': 0.14475183791413504, 'Duration': 0.23708057403564453}
{'Feature Set': 3, 'Model': 'Logistic Regression', 'MSE': 5.05302560146727, 'R2': -0.021224180424782135, 'Duration': 0.3163599967956543}
{'Feature Set': 3, 'Model': 'SVC', 'MSE': 5.0258296970178264, 'R2': -0.010261094728048814, 'Duration': 2.1672613620758057}
{'Feature Set': 3, 'Model': 'LinearSVC', 'MSE': 5.04545079305896, 'R2': -0.018164714648949642, 'Duration': 0.0597681999206543}
{'Feature Set': 3, 'Model': 'KNeighbors Classifier', 'MSE': 6.817600844498041, 'R2': -0.8590078920402695, 'Duration': 0.05952811241149902}
{'Feature Set': 3, 'Model': 'Decision Tree Classifier', 'MSE': 5.387309863904351, 'R2': -0.16081230644714362, 'Duration': 0.03639674186706543}
{'Feature Set': 3, 'Model': 'Random Forest Classifier', 'MSE': 4.



{'Feature Set': 3, 'Model': 'MLPClassifier', 'MSE': 4.961364675392354, 'R2': 0.015489408885208666, 'Duration': 7.791938066482544}
{'Feature Set': 3, 'Model': 'AdaBoostClassifier', 'MSE': 5.383758961707788, 'R2': -0.15928257355922715, 'Duration': 0.2095937728881836}
{'Feature Set': 4, 'Model': 'Logistic Regression', 'MSE': 5.020436323318325, 'R2': -0.008093973136834132, 'Duration': 0.19989490509033203}
{'Feature Set': 4, 'Model': 'SVC', 'MSE': 5.0410268984835875, 'R2': -0.01638002627971402, 'Duration': 1.535480260848999}
{'Feature Set': 4, 'Model': 'LinearSVC', 'MSE': 5.024244011634607, 'R2': -0.009623706024750378, 'Duration': 0.05208992958068848}
{'Feature Set': 4, 'Model': 'KNeighbors Classifier', 'MSE': 6.3577289287507455, 'R2': -0.6166727070461975, 'Duration': 0.04810500144958496}
{'Feature Set': 4, 'Model': 'Decision Tree Classifier', 'MSE': 6.140973358097278, 'R2': -0.5083166274854605, 'Duration': 0.03186535835266113}
{'Feature Set': 4, 'Model': 'Random Forest Classifier', 'MSE': 

In [22]:
results_df.sort_values(by='R2', ascending=False)

Unnamed: 0,Feature Set,Model,MSE,R2,Duration
28,2,Gradient Boosting Classifier,4.28469,0.265728,11.369055
39,3,Gradient Boosting Classifier,4.352222,0.2424,11.100575
27,2,Random Forest Classifier,4.413308,0.220984,1.295242
38,3,Random Forest Classifier,4.492402,0.192811,1.037464
40,3,GaussianNB,4.541794,0.174964,0.004673
32,2,AdaBoostClassifier,4.624204,0.144752,0.237081
7,0,GaussianNB,4.709235,0.11301,0.0065
29,2,GaussianNB,4.787101,0.083435,0.004271
42,3,MLPClassifier,4.961365,0.015489,7.791938
31,2,MLPClassifier,4.976759,0.00937,7.902967


In [17]:
results_df.sort_values(by='R2', ascending=False)

Unnamed: 0,Feature Set,Model,MSE,R2,Duration
6,0,Gradient Boosting Classifier,4.28469,0.265728,13.624242
17,1,Gradient Boosting Classifier,4.352222,0.2424,11.533186
5,0,Random Forest Classifier,4.413308,0.220984,1.533731
16,1,Random Forest Classifier,4.492402,0.192811,1.149933
18,1,GaussianNB,4.541794,0.174964,0.007539
10,0,AdaBoostClassifier,4.624204,0.144752,0.282164
7,0,GaussianNB,4.787101,0.083435,0.008567
20,1,MLPClassifier,4.961365,0.015489,9.153794
9,0,MLPClassifier,4.976759,0.00937,10.07403
8,0,BernoulliNB,4.984438,0.006311,0.006005
