In [32]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

# Load and preprocess the dataset
df = pd.read_csv("scheduling_results_20241219_test_version.csv", header=None)
df = df[0].str.split(';', expand=True)

# Assign proper column names
df.columns = ['instance', 'jobs', 'machines', 'strategy', 'tw', 'compression', 
              'overlapping', 'interrupted_calls', 'makespan', 'gap_to_opt_percent', 'opt_class']

# Convert numeric columns to float
#numeric_cols = ['jobs', 'machines', 'tw', 'interrupted_calls', 'gap_to_opt_percent']
#df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Handle categorical columns using one-hot encoding
x = df[['jobs', 'machines', 'strategy', 'tw', 'compression', 'overlapping', 'interrupted_calls']]
x = pd.get_dummies(x)

x = x.astype(float)
#print(x)

# Add a constant term for the intercept
x = sm.add_constant(x)

# Target variable
y = df['gap_to_opt_percent']

# Ensure target variable is numeric
y = pd.to_numeric(y, errors='coerce')

# Drop rows with missing or invalid data
#x = x.dropna()
#y = y[x.index]  # Ensure matching indices between x and y

# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Train the OLS model using statsmodels
model = sm.OLS(y_train, x_train).fit()

# Print the model summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:     gap_to_opt_percent   R-squared:                       0.257
Model:                            OLS   Adj. R-squared:                  0.252
Method:                 Least Squares   F-statistic:                     47.64
Date:                Mon, 13 Jan 2025   Prob (F-statistic):          2.78e-216
Time:                        18:08:28   Log-Likelihood:                -19762.
No. Observations:                3744   AIC:                         3.958e+04
Df Residuals:                    3716   BIC:                         3.975e+04
Df Model:                          27                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 1.209e+13 