# Manufacturing Quality Prediction Model Building

This notebook focuses on building and training models for:
1. Failure Prediction (Classification)
2. Root Cause Analysis (Error Pattern Detection)
3. Time Series Analysis (Sequential Patterns)

In [5]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [6]:
# Load the data
main_unit_df = pd.read_csv('main_unit_assembly_data.csv')
component_df = pd.read_csv('component_assembly_data.csv')

# Convert datetime columns
for df in [main_unit_df, component_df]:
    for col in ['TRNDATE', 'INSERTTIME']:
        df[col] = pd.to_datetime(df[col])

print("Data loaded successfully!")
print(f"Main unit data shape: {main_unit_df.shape}")
print(f"Component data shape: {component_df.shape}")


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



Data loaded successfully!
Main unit data shape: (1000, 14)
Component data shape: (1000, 15)


In [7]:
# Feature Engineering
def create_features(df):
    # Temporal features
    df['hour'] = df['TRNDATE'].dt.hour
    df['day'] = df['TRNDATE'].dt.day
    df['month'] = df['TRNDATE'].dt.month
    df['day_of_week'] = df['TRNDATE'].dt.dayofweek
    
    # Time between operations
    df['time_diff'] = (df['TRNDATE'] - df['INSERTTIME']).dt.total_seconds()
    
    # Categorical encoding
    le = LabelEncoder()
    categorical_cols = ['LINE', 'WORKSTATION', 'STAGE', 'VENDOR']
    
    for col in categorical_cols:
        if col in df.columns:
            df[f'{col}_encoded'] = le.fit_transform(df[col])
    
    # Error code features
    if 'A_ERRORCODE' in df.columns:
        df['has_error'] = df['A_ERRORCODE'].notna().astype(int)
    
    # Pass/Fail encoding
    df['target'] = (df['RESULTFLAG'] == 'F').astype(int)
    
    return df

# Create features for both datasets
main_unit_df = create_features(main_unit_df)
component_df = create_features(component_df)

# Select features for modeling
feature_cols = [
    'hour', 'day', 'month', 'day_of_week', 'time_diff',
    'LINE_encoded', 'WORKSTATION_encoded', 'STAGE_encoded', 'VENDOR_encoded',
    'has_error'
]

# Prepare features and target
X = main_unit_df[feature_cols]
y = main_unit_df['target']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [8]:
# 1. Random Forest Model
# Hyperparameter tuning
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf_model, rf_params, cv=5, scoring='f1', n_jobs=-1)
rf_grid.fit(X_train, y_train)

print("Best Random Forest Parameters:")
print(rf_grid.best_params_)

# Evaluate Random Forest
rf_pred = rf_grid.predict(X_test)
print("\nRandom Forest Performance:")
print(classification_report(y_test, rf_pred))

# Feature importance
rf_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_grid.best_estimator_.feature_importances_
}).sort_values('importance', ascending=False)

fig = px.bar(rf_importance, x='feature', y='importance',
             title='Random Forest Feature Importance')
fig.show()

Best Random Forest Parameters:
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

Random Forest Performance:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       181
           1       0.00      0.00      0.00        19

    accuracy                           0.91       200
   macro avg       0.45      0.50      0.48       200
weighted avg       0.82      0.91      0.86       200




Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# 2. XGBoost Model
# Hyperparameter tuning
xgb_params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200, 300],
    'min_child_weight': [1, 3, 5]
}

xgb_model = xgb.XGBClassifier(random_state=42)
xgb_grid = GridSearchCV(xgb_model, xgb_params, cv=5, scoring='f1', n_jobs=-1)
xgb_grid.fit(X_train, y_train)

print("Best XGBoost Parameters:")
print(xgb_grid.best_params_)

# Evaluate XGBoost
xgb_pred = xgb_grid.predict(X_test)
print("\nXGBoost Performance:")
print(classification_report(y_test, xgb_pred))

In [None]:
# 3. LSTM Model for Sequential Patterns
def prepare_sequences(X, y, timesteps=5):
    X_seq, y_seq = [], []
    for i in range(len(X) - timesteps):
        X_seq.append(X[i:(i + timesteps)])
        y_seq.append(y[i + timesteps])
    return np.array(X_seq), np.array(y_seq)

# Prepare sequential data
X_seq_train, y_seq_train = prepare_sequences(X_train, y_train)
X_seq_test, y_seq_test = prepare_sequences(X_test, y_test)

# Build LSTM model
lstm_model = Sequential([
    LSTM(64, input_shape=(X_seq_train.shape[1], X_seq_train.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

# Train LSTM
history = lstm_model.fit(
    X_seq_train, y_seq_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# Evaluate LSTM
lstm_pred = (lstm_model.predict(X_seq_test) > 0.5).astype(int)
print("\nLSTM Performance:")
print(classification_report(y_seq_test, lstm_pred))

In [None]:
# Model Comparison and Visualization
def plot_model_comparison():
    models = ['Random Forest', 'XGBoost', 'LSTM']
    predictions = [rf_pred, xgb_pred, lstm_pred.flatten()]
    y_tests = [y_test, y_test, y_seq_test]
    
    metrics = []
    for model, pred, y_true in zip(models, predictions, y_tests):
        metrics.append({
            'Model': model,
            'Accuracy': accuracy_score(y_true, pred),
            'F1-Score': f1_score(y_true, pred),
            'ROC-AUC': roc_auc_score(y_true, pred)
        })
    
    metrics_df = pd.DataFrame(metrics)
    
    fig = make_subplots(rows=1, cols=3,
                        subplot_titles=('Accuracy', 'F1-Score', 'ROC-AUC'))
    
    for i, metric in enumerate(['Accuracy', 'F1-Score', 'ROC-AUC']):
        fig.add_trace(
            go.Bar(x=metrics_df['Model'],
                   y=metrics_df[metric],
                   name=metric),
            row=1, col=i+1
        )
    
    fig.update_layout(height=400, width=1200,
                      title_text="Model Performance Comparison")
    fig.show()

plot_model_comparison()

# Save the best performing model
import joblib

# Save models and scaler
joblib.dump(rf_grid.best_estimator_, 'rf_model.joblib')
joblib.dump(xgb_grid.best_estimator_, 'xgb_model.joblib')
lstm_model.save('lstm_model.h5')
joblib.dump(scaler, 'scaler.joblib')

print("Models saved successfully!")

# Save feature importance for deployment
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_grid.best_estimator_.feature_importances_
})
feature_importance.to_csv('feature_importance.csv', index=False)

In [9]:
# Save models and scaler for dashboard
import joblib

# Save Random Forest model
joblib.dump(rf_grid.best_estimator_, 'manufacturing_quality_model.joblib')
joblib.dump(scaler, 'feature_scaler.joblib')

# Save feature columns for reference
import json
with open('feature_columns.json', 'w') as f:
    json.dump(feature_cols, f)

print("Models and features saved successfully!")
print("Saved files:")
print("1. manufacturing_quality_model.joblib")
print("2. feature_scaler.joblib")
print("3. feature_columns.json")

Models and features saved successfully!
Saved files:
1. manufacturing_quality_model.joblib
2. feature_scaler.joblib
3. feature_columns.json
