In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import joblib
import os

# Locations of the artefacts produced earlier: the time-series forecasts
# table and the pickled traditional model.
time_series_forecasts_path = '/Users/minu/Desktop/R24-066/Component 04/Backend/Dataset/worker_forecasts_dataset/time_series_forecasts.csv'
traditional_model_path = '/Users/minu/Desktop/R24-066/Component 04/Backend/Save_model/best_traditional_model.pkl'

# Columns removed after the merge, and the defect columns that get shorter names.
fields_to_drop = ['Name', 'Joining_Date']
defect_columns_mapping = {
    'Run_Off_D1': 'Run_Off',
    'Open_Seam_D2': 'Open_Seam',
    'SPI_Errors_D3': 'SPI_Errors',
    'High_Low_D4': 'High_Low'
}

# Read the forecasts (first CSV column becomes the index), then unpickle the model.
time_series_forecasts = pd.read_csv(time_series_forecasts_path, index_col=0)
traditional_model = joblib.load(traditional_model_path)

# Read the two raw tables that are joined into the modelling frame.
demographic_data = pd.read_csv('/Users/minu/Desktop/R24-066/Component 04/Backend/Dataset/demographic_data_dataset.csv')
defect_data = pd.read_csv('/Users/minu/Desktop/R24-066/Component 04/Backend/Dataset/updated_worker_defect_details.csv')

# Parse the date columns; the format is left for pandas to infer.
demographic_data['Joining_Date'] = pd.to_datetime(demographic_data['Joining_Date'])
defect_data['Date'] = pd.to_datetime(defect_data['Date'])

# Join on Worker_ID (default inner join), discard the unused columns and
# apply the shorter defect column names in one pass.
combined_data = (
    defect_data
    .merge(demographic_data, on='Worker_ID')
    .drop(columns=fields_to_drop)
    .rename(columns=defect_columns_mapping)
)

# Prepare the data for the fusion model
def prepare_fusion_data(time_series_forecasts, traditional_model, combined_data, defect_types):
    """Sum the time-series forecasts with the traditional model's predictions.

    Args:
        time_series_forecasts: DataFrame of forecasts; added label-wise to the
            traditional predictions.
        traditional_model: Fitted estimator exposing ``predict``; it is called
            on ``combined_data`` minus the target and identifier columns.
        combined_data: DataFrame holding the feature columns plus the
            ``Run_Off``, ``Open_Seam``, ``SPI_Errors``, ``High_Low``,
            ``defect_count``, ``count``, ``Worker_ID`` and ``Date`` columns.
            It is not modified.
        defect_types: Column labels assigned to the model's prediction outputs.

    Returns:
        DataFrame with the element-wise sum of both inputs. A cell present in
        only one of the two frames is summed with 0.
    """
    # Everything that is a target, an aggregate of the targets or an
    # identifier is kept away from the model's feature matrix.
    non_feature_columns = [
        'Run_Off', 'Open_Seam', 'SPI_Errors', 'High_Low',
        'defect_count', 'count', 'Worker_ID', 'Date',
    ]
    features = combined_data.drop(columns=non_feature_columns)

    # Label the raw prediction array so it lines up with combined_data's rows.
    predicted = pd.DataFrame(
        traditional_model.predict(features),
        index=combined_data.index,
        columns=defect_types,
    )

    # fill_value=0 lets rows/columns found in only one frame pass through.
    return time_series_forecasts.add(predicted, fill_value=0)

# Defect categories handled by the fusion model (one regression output each)
defect_types = ['Run_Off', 'Open_Seam', 'SPI_Errors', 'High_Low']

# Model inputs: time-series forecasts summed with the traditional predictions
combined_forecasts = prepare_fusion_data(
    time_series_forecasts,
    traditional_model,
    combined_data,
    defect_types,
)

# Targets: the defect columns of the merged data
y_actual = combined_data[defect_types]

# Restrict the inputs to the row labels present in the targets, in that order
combined_forecasts = combined_forecasts.loc[y_actual.index]

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    combined_forecasts,
    y_actual,
    test_size=0.2,
    random_state=42,
)

# One gradient-boosting regressor is fitted per defect type
fusion_model = MultiOutputRegressor(
    estimator=GradientBoostingRegressor(random_state=42)
).fit(X_train, y_train)

# Score the held-out rows: per-output MSE first, then the uniform average
y_pred = fusion_model.predict(X_test)
mse_values = mean_squared_error(y_test, y_pred, multioutput='raw_values')
overall_mse = mean_squared_error(y_test, y_pred)

for defect_type, mse in zip(defect_types, mse_values):
    print(f"Fusion Model Mean Squared Error for {defect_type}: {mse}")
print(f"Fusion Model Overall Mean Squared Error: {overall_mse}")

# Persist the fitted model next to the other saved models
fusion_model_output_path = '/Users/minu/Desktop/R24-066/Component 04/Backend/Save_model/fusion_model.pkl'
joblib.dump(fusion_model, fusion_model_output_path)

Fusion Model Mean Squared Error for Run_Off: 5.292562991796079
Fusion Model Mean Squared Error for Open_Seam: 3.0785639742247186
Fusion Model Mean Squared Error for SPI_Errors: 4.98624927690371
Fusion Model Mean Squared Error for High_Low: 5.542851679266722
Fusion Model Overall Mean Squared Error: 4.725056980547807


  demographic_data['Joining_Date'] = pd.to_datetime(demographic_data['Joining_Date'])
  defect_data['Date'] = pd.to_datetime(defect_data['Date'])


['/Users/minu/Desktop/R24-066/Component 04/Backend/Save_model/fusion_model.pkl']