In [1]:
# Imports for file access, unpickling, and feature scaling
import os
import pickle
from sklearn.preprocessing import StandardScaler


In [2]:
# Directory that holds the input data. The default is the original location;
# set the BALL_SIZE_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
data_dir = os.environ.get(
    'BALL_SIZE_DATA_DIR',
    'C:/Users/ckunde/Desktop/Scripts/A1/ball_size_ml/data',
)
pickle_path = os.path.join(data_dir, 'combined_data.pkl')

# NOTE(security): unpickling can execute arbitrary code embedded in the file,
# so only load pickles that come from a trusted source.
with open(pickle_path, 'rb') as f:
    df = pickle.load(f)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

In [4]:

# Columns used as model inputs.
features = [
    'Op10Diameter1_4Measurement',
    'Op10Diameter2_5Measurement',
    'Op10Diameter3_6Measurement',
    'Op20Diameter1_4Measurement',
    'Op20Diameter2_5Measurement',
    'Op20Diameter3_6Measurement',
    'Op10MasterTempChangeMeasurement',
    'Op20MasterTempChangeMeasurement',
    'Op40ForceAverage'
]

# Column the models are trained to predict.
target = 'Op20PickedBallSize'

# Drop every row that has a NaN in any feature OR in the target. Doing this once
# on the combined frame keeps X and y aligned row-for-row: it also removes rows
# whose target is missing, and it avoids a label-based re-lookup of y, which
# would duplicate rows if the index contained repeated labels.
model_data = df[features + [target]].dropna()
X = model_data[features]
y = model_data[target]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [5]:
# Show the training feature matrix as a visual sanity check. This is the
# unscaled frame: the scaling cell further down rebinds X_train to the
# scaler's output.
X_train

Unnamed: 0,Op10Diameter1_4Measurement,Op10Diameter2_5Measurement,Op10Diameter3_6Measurement,Op20Diameter1_4Measurement,Op20Diameter2_5Measurement,Op20Diameter3_6Measurement,Op10MasterTempChangeMeasurement,Op20MasterTempChangeMeasurement,Op40ForceAverage
210011,47.994,47.998,47.998,48.009,48.002,47.996,0.0,0.000,60.6442
307948,47.998,48.004,48.005,48.000,47.975,47.987,0.0,0.109,96.1445
194537,47.992,47.996,47.997,47.998,47.997,48.001,0.0,0.154,142.0070
479675,47.998,48.000,47.998,48.008,48.000,48.005,0.0,0.077,55.5393
262981,47.998,47.995,47.992,48.006,48.005,48.004,0.0,-0.109,129.3630
...,...,...,...,...,...,...,...,...,...
112534,48.005,48.008,48.005,48.007,48.003,47.992,0.0,-0.224,89.3699
265843,47.987,47.993,47.993,48.004,48.008,48.005,0.0,0.090,82.3554
374552,48.000,48.001,48.003,48.007,48.008,48.014,0.0,0.096,78.7515
134701,48.005,48.005,48.007,48.013,48.009,48.020,0.0,0.237,106.0570


In [6]:
# Standardize the features. The scaler learns its statistics from the training
# split only and then applies that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Candidate regressors as (display name, estimator) pairs.
# NOTE(review): kernel SVR fit time grows faster than linearly with the number
# of samples, so it is presumably the slowest of the three - confirm it
# completes on the full training set.
models = [
    ('Linear Regression', LinearRegression()),
    (
        'Random Forest Regressor',
        RandomForestRegressor(n_estimators=100, random_state=42),
    ),
    ('Support Vector Regressor', SVR()),
]

# Fit each candidate on the training split and report held-out metrics.
for name, model in models:
    # fit() returns the estimator itself, so training and prediction chain.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Score the predictions against the test targets.
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # One report per model, followed by a blank separator line.
    print(
        f'{name}:',
        f'Mean squared error: {mse}',
        f'R2 score: {r2}',
        '',
        sep='\n',
    )

Linear Regression:
Mean squared error: 23.63308343093876
R2 score: 0.06747708470328073

Random Forest Regressor:
Mean squared error: 12.416556828867884
R2 score: 0.5100629248892161

