In [None]:
# preprocessing_and_training.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

In [2]:
# Read the raw manufacturing samples; the path is resolved relative to the
# current working directory.
df = pd.read_csv(filepath_or_buffer="manufacturing_dataset_1000_samples.csv")


In [3]:
# Sanity report, printed in order: (rows, columns), per-column dtypes,
# and the number of missing values per column.
for report in (df.shape, df.dtypes, df.isna().sum()):
    print(report)


(1000, 19)
Timestamp                      object
Injection_Temperature         float64
Injection_Pressure            float64
Cycle_Time                    float64
Cooling_Time                  float64
Material_Viscosity            float64
Ambient_Temperature           float64
Machine_Age                   float64
Operator_Experience           float64
Maintenance_Hours               int64
Shift                          object
Machine_Type                   object
Material_Grade                 object
Day_of_Week                    object
Temperature_Pressure_Ratio    float64
Total_Cycle_Time              float64
Efficiency_Score              float64
Machine_Utilization           float64
Parts_Per_Hour                float64
dtype: object
Timestamp                      0
Injection_Temperature          0
Injection_Pressure             0
Cycle_Time                     0
Cooling_Time                   0
Material_Viscosity            20
Ambient_Temperature           20
Machine_Age           

In [5]:
# Separate the regression target from the candidate predictor columns.
target = "Parts_Per_Hour"
y = df[target]                  # label vector
X = df.drop(columns=[target])   # every remaining column
print(X.shape, y.shape)

(1000, 18) (1000,)


In [6]:
# Column groups consumed by the preprocessing step (adjust the names here if
# the dataset schema differs).
numeric_features = [
    # process settings
    "Injection_Temperature",
    "Injection_Pressure",
    "Cycle_Time",
    "Cooling_Time",
    # material / environment
    "Material_Viscosity",
    "Ambient_Temperature",
    # machine condition
    "Machine_Age",
    "Maintenance_Hours",
    # remaining numeric columns
    "Temperature_Pressure_Ratio",
    "Total_Cycle_Time",
    "Efficiency_Score",
    "Machine_Utilization",
]
categorical_features = [
    "Shift",
    "Machine_Type",
    "Material_Grade",
    "Day_of_Week",
]


In [7]:
# Route Operator_Experience to the matching feature list: numeric when the
# column has a numeric dtype, categorical otherwise. The membership check
# keeps this cell idempotent -- re-running it must not append the column a
# second time, which would hand the ColumnTransformer a duplicated column.
if "Operator_Experience" in X.columns:
    if pd.api.types.is_numeric_dtype(X["Operator_Experience"]):
        operator_feature_list = numeric_features
    else:
        operator_feature_list = categorical_features
    if "Operator_Experience" not in operator_feature_list:
        operator_feature_list.append("Operator_Experience")

# Preprocessing pipelines
# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. drop="first" removes one level per feature, and
# handle_unknown="ignore" lets categories unseen during fit pass through
# transform without raising.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

# Apply each transformer to its own column group. Columns of X that appear in
# neither list are not forwarded (ColumnTransformer defaults to remainder="drop").
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

In [8]:
# Baseline estimator: shared preprocessing followed by ordinary least squares.
baseline_steps = [
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
]
pipe = Pipeline(steps=baseline_steps)

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# NOTE(review): this reports the shape of the full dataset, not of the
# train/test partitions created above.
print(X.shape, y.shape)

(1000, 18) (1000,)


In [10]:
# Train the baseline pipeline (preprocessing + linear regression) on the training split.
pipe.fit(X=X_train, y=y_train)


In [11]:

# Predict & evaluate the baseline on the held-out test split
y_pred = pipe.predict(X_test)
mse  = mean_squared_error(y_test, y_pred)
# RMSE is derived as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, so the old call fails
# on current releases. For a single target the two forms are the same value.
rmse = float(np.sqrt(mse))
mae  = mean_absolute_error(y_test, y_pred)
r2   = r2_score(y_test, y_pred)

print(f"Baseline LinearRegression -> RMSE: {rmse:.3f}, MSE: {mse:.3f}, MAE: {mae:.3f}, R2: {r2:.3f}")


Baseline LinearRegression -> RMSE: 3.508, MSE: 12.305, MAE: 2.717, R2: 0.906




In [12]:
# Tune Ridge's regularisation strength (alpha) with a 5-fold grid search,
# scored by negative RMSE and run on all available cores.
ridge_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", Ridge()),
])
param_grid = {"regressor__alpha": [0.01, 0.1, 1.0, 10.0, 100.0]}
grid = GridSearchCV(
    estimator=ridge_pipe,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Keep the estimator exposed by the search as `best_estimator_`.
best_model = grid.best_estimator_
print("Best Ridge params:", grid.best_params_)

Best Ridge params: {'regressor__alpha': 0.01}


In [13]:
# Evaluate the tuned Ridge pipeline on the held-out test split
y_pred_best = best_model.predict(X_test)
# sqrt(MSE) replaces `mean_squared_error(..., squared=False)`; the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_best = float(np.sqrt(mean_squared_error(y_test, y_pred_best)))
print("Ridge -> RMSE:", rmse_best)
print("Ridge -> R2:", r2_score(y_test, y_pred_best))


Ridge -> RMSE: 3.5077820347526827
Ridge -> R2: 0.9057072105720209




In [14]:

# Persist the tuned pipeline (preprocessing + Ridge) to disk with joblib.
joblib.dump(value=best_model, filename="model_pipeline.joblib")
print("Saved model_pipeline.joblib")

Saved model_pipeline.joblib
