# Data Science Capstone Project: Manufacturing Equipment Output Prediction with Linear Regression

In [13]:
import os
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, precision_score, recall_score, f1_score
import joblib

In [14]:
pip install "numpy<2.0" --force-reinstall

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement numpy<2.0 (from versions: none)
ERROR: No matching distribution found for numpy<2.0


In [15]:
df=pd.read_csv(r"C:\Users\spandana.s\OneDrive\Desktop\MUFG\manufacturing_dataset_1000_samples.csv")
df

Unnamed: 0,Timestamp,Injection_Temperature,Injection_Pressure,Cycle_Time,Cooling_Time,Material_Viscosity,Ambient_Temperature,Machine_Age,Operator_Experience,Maintenance_Hours,Shift,Machine_Type,Material_Grade,Day_of_Week,Temperature_Pressure_Ratio,Total_Cycle_Time,Efficiency_Score,Machine_Utilization,Parts_Per_Hour
0,2023-01-01 00:00:00,221.0,136.0,28.7,13.6,375.5,28.0,3.8,11.2,64,Evening,Type_B,Economy,Thursday,1.625,42.3,0.063,0.510,36.5
1,2023-01-01 01:00:00,213.3,128.9,34.5,14.0,215.8,22.6,6.8,6.3,58,Night,Type_A,Standard,Wednesday,1.655,48.5,0.037,0.389,29.9
2,2023-01-01 02:00:00,222.8,115.9,19.9,9.5,307.0,25.3,4.2,9.6,47,Day,Type_A,Standard,Monday,1.922,29.4,0.061,0.551,56.9
3,2023-01-01 03:00:00,233.3,105.3,39.2,13.1,137.8,26.0,9.2,8.6,49,Evening,Type_A,Premium,Saturday,2.215,52.3,0.054,0.293,31.0
4,2023-01-01 04:00:00,212.2,125.5,45.0,9.9,298.2,23.6,6.2,23.0,49,Night,Type_B,Premium,Monday,1.691,54.9,0.145,0.443,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2023-02-11 11:00:00,211.6,131.1,39.2,10.2,400.0,26.2,10.4,3.3,32,Day,Type_C,Standard,Saturday,1.615,49.5,0.023,0.257,15.0
996,2023-02-11 12:00:00,236.6,114.6,22.8,11.3,348.3,21.2,12.0,67.6,43,Evening,Type_B,Standard,Thursday,2.064,34.2,0.442,0.159,49.6
997,2023-02-11 13:00:00,222.7,101.8,21.5,8.0,254.9,22.7,8.4,57.5,46,Evening,Type_C,Standard,Saturday,2.188,29.5,0.369,0.337,44.2
998,2023-02-11 14:00:00,300.0,112.6,26.7,10.2,122.6,21.1,9.3,120.0,48,Day,Type_A,Standard,Monday,1.849,36.8,0.760,0.291,57.7


In [16]:
for ts_col in ['Timestamp','timestamp','Time','Datetime','Date']:
    if ts_col in df.columns:
        print('Dropping timestamp-like column:', ts_col)
        df = df.drop(columns=[ts_col])


TARGET = 'Parts_Per_Hour'



categorical_cols = [c for c in ['Shift','Machine_Type','Material_Grade','Day_of_Week'] if c in df.columns]
print('Categorical columns detected:', categorical_cols)



numeric_cols = [c for c in df.select_dtypes(include=['int64','float64']).columns.tolist() if c != TARGET]
numeric_cols = [c for c in numeric_cols if c not in categorical_cols]
print('Numeric columns detected:', numeric_cols)

Dropping timestamp-like column: Timestamp
Categorical columns detected: ['Shift', 'Machine_Type', 'Material_Grade', 'Day_of_Week']
Numeric columns detected: ['Injection_Temperature', 'Injection_Pressure', 'Cycle_Time', 'Cooling_Time', 'Material_Viscosity', 'Ambient_Temperature', 'Machine_Age', 'Operator_Experience', 'Maintenance_Hours', 'Temperature_Pressure_Ratio', 'Total_Cycle_Time', 'Efficiency_Score', 'Machine_Utilization']


In [17]:

X = df.drop(columns=[TARGET])
y = df[TARGET]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

Train shape: (800, 17) Test shape: (200, 17)


In [18]:
 
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])





def make_pipeline(estimator):
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('model', estimator)
])

In [19]:

models = {
    'LinearRegression': make_pipeline(LinearRegression()),
    'Ridge': make_pipeline(Ridge(random_state=42)),
    'Lasso': make_pipeline(Lasso(random_state=42, max_iter=10000)),
    'RandomForest': make_pipeline(RandomForestRegressor(random_state=42, n_jobs=-1))
}
try:
    import xgboost as xgb
    HAS_XGB = True
except ImportError:
    HAS_XGB = False

In [20]:

results = {}
for name, pipe in models.items():
    print('Training model:', name)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    cv_scores = cross_val_score(pipe, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_rmse = np.sqrt(-cv_scores).mean()
    results[name] = {
        'pipeline': pipe,
        'rmse': rmse,
        'mse': mse,
        'mae': mae,
        'r2': r2,
        'cv_rmse': cv_rmse
}
print(f"{name} -> RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}, CV_RMSE: {cv_rmse:.4f}")

Training model: LinearRegression
Training model: Ridge
Training model: Lasso
Training model: RandomForest
RandomForest -> RMSE: 4.3049, MAE: 3.3474, R2: 0.8580, CV_RMSE: 4.3261


In [21]:
DATA_PATH = '/mnt/data/manufacturing_dataset_1000_samples.csv'
OUTPUT_DIR = '/mnt/data/capstone_output'
MODEL_PATH = os.path.join(OUTPUT_DIR, 'best_model_pipeline.joblib')
FEATURES_PATH = os.path.join(OUTPUT_DIR, 'feature_columns.npy')
REPORT_PATH = os.path.join(OUTPUT_DIR, 'model_evaluation_report.txt')
DEPLOY_DIR = os.path.join(OUTPUT_DIR, 'deployment')


os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(DEPLOY_DIR, exist_ok=True)

In [22]:

best_model_name = min(results.keys(), key=lambda k: results[k]['rmse'])
best = results[best_model_name]
print('Best model:', best_model_name, 'with test RMSE =', best['rmse'])



joblib.dump(best['pipeline'], MODEL_PATH)
print('Saved best pipeline to', MODEL_PATH)

Best model: Ridge with test RMSE = 3.5072468199549522
Saved best pipeline to /mnt/data/capstone_output\best_model_pipeline.joblib


In [23]:
try:

    fitted_preprocessor = preprocessor.fit(X)
    num_features = numeric_cols
    if len(categorical_cols) > 0:
        ohe = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
        ohe_features = ohe.get_feature_names_out(categorical_cols).tolist()
    else:
        ohe_features = []
    final_features = num_features + ohe_features
    np.save(FEATURES_PATH, np.array(final_features))
    print('Saved final feature list to', FEATURES_PATH)
except Exception as e:
    print('Could not extract final feature names automatically:', e)

Saved final feature list to /mnt/data/capstone_output\feature_columns.npy


In [24]:

threshold = y_train.quantile(0.25)
print('Underperformance threshold (25th percentile):', threshold)



y_pred_test = best['pipeline'].predict(X_test)
y_test_binary = (y_test < threshold).astype(int)
y_pred_binary = (y_pred_test < threshold).astype(int)
precision = precision_score(y_test_binary, y_pred_binary, zero_division=0)
recall = recall_score(y_test_binary, y_pred_binary, zero_division=0)
f1 = f1_score(y_test_binary, y_pred_binary, zero_division=0)
print(f'Underperformance detection -> Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')

Underperformance threshold (25th percentile): 17.7
Underperformance detection -> Precision: 0.9677, Recall: 0.5660, F1: 0.7143


In [25]:

with open(REPORT_PATH, 'w') as f:
    f.write('Capstone Project 1 - Model Evaluation Report')
    f.write('Dataset: ' + DATA_PATH + '')
    f.write('Target: ' + TARGET + '')
    for name, res in results.items():
        f.write(f"Model: {name}")
        f.write(f" Test RMSE: {res['rmse']:.4f}, MSE: {res['mse']:.4f}, MAE: {res['mae']:.4f}, R2: {res['r2']:.4f}, CV_RMSE: {res['cv_rmse']:.4f}")
        f.write('Best model: ' + best_model_name + '')
        f.write(f'Underperformance threshold: {threshold:.4f}')
        f.write(f'Underperformance detection metrics -> Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')
        f.write('Recommendations:\n- Try hyperparameter tuning for RandomForest/XGBoost.\n- Consider feature engineering and interaction terms.\n- Monitor model performance in production and add drift detection.')
print('Wrote evaluation report to', REPORT_PATH)

Wrote evaluation report to /mnt/data/capstone_output\model_evaluation_report.txt


In [26]:

if best_model_name in ['RandomForest', 'XGBoost']:
    try:
        model = best['pipeline'].named_steps['model']
       
        if 'final_features' in locals():
            features = final_features
        else:
            features = [f'f{i}' for i in range(len(model.feature_importances_))]
        importances = model.feature_importances_
        feat_imp = pd.DataFrame({'feature': features, 'importance': importances}).sort_values('importance', ascending=False)
        print('Top feature importances:')
        print(feat_imp.head(10))
    except Exception as e:
        print('Could not compute feature importances:', e)
else:
    
    try:
        model = best['pipeline'].named_steps['model']
        if hasattr(model, 'coef_'):
            coefs = model.coef_
            if 'final_features' in locals():
                feat_df = pd.DataFrame({'feature': final_features, 'coef': coefs})
                print('Top positive coefficients:')
                print(feat_df.sort_values('coef', ascending=False).head(10))
                print('Top negative coefficients:')
                print(feat_df.sort_values('coef').head(10))
    except Exception as e:
        print('Could not display coefficients:', e)

Top positive coefficients:
                   feature      coef
11        Efficiency_Score  2.600924
3             Cooling_Time  2.522229
0    Injection_Temperature  2.286050
7      Operator_Experience  1.287217
1       Injection_Pressure  1.202363
16     Machine_Type_Type_A  0.966229
13               Shift_Day  0.811482
20  Material_Grade_Premium  0.605940
28   Day_of_Week_Wednesday  0.381171
26    Day_of_Week_Thursday  0.086685
Top negative coefficients:
                       feature      coef
10            Total_Cycle_Time -9.525092
6                  Machine_Age -1.815188
2                   Cycle_Time -1.070874
4           Material_Viscosity -0.946394
15                 Shift_Night -0.943929
9   Temperature_Pressure_Ratio -0.890353
18         Machine_Type_Type_C -0.848605
19      Material_Grade_Economy -0.644941
22          Day_of_Week_Friday -0.255952
17         Machine_Type_Type_B -0.213038
