In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from statsmodels.tsa.arima.model import ARIMA

# Read the data
demographic_data = pd.read_csv('/Users/minu/Desktop/test_heroku/Component 04/Backend/Dataset/demographic_data_dataset.csv')  # Update path
defect_data = pd.read_csv('/Users/minu/Desktop/test_heroku/Component 04/Backend/Dataset/updated_worker_defect_details.csv')  # Update path

# Convert Date columns to datetime
demographic_data['Joining_Date'] = pd.to_datetime(demographic_data['Joining_Date'], infer_datetime_format=True)
defect_data['Date'] = pd.to_datetime(defect_data['Date'], infer_datetime_format=True)

# Combine datasets on Worker_ID
combined_data = pd.merge(defect_data, demographic_data, on='Worker_ID')

# Calculate experience in days
combined_data['Experience'] = (combined_data['Date'] - combined_data['Joining_Date']).dt.days

# Drop unnecessary columns
fields_to_drop = ['Name', 'Joining_Date']
combined_data.drop(columns=fields_to_drop, inplace=True)

# Define categorical features for one-hot encoding
categorical_features = ['Gender', 'Skill_Level', 'Shift']
numerical_features = ['Age', 'Production_Volume', 'Experience']

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ]
)

# Separate features and target variables
X = combined_data.drop(columns=['Run_Off_D1', 'Open_Seam_D2', 'SPI_Errors_D3', 'High_Low_D4', 'defect_count', 'count', 'Worker_ID', 'Date'])
y = combined_data[['Run_Off_D1', 'Open_Seam_D2', 'SPI_Errors_D3', 'High_Low_D4']]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define multiple models
models = {
    'RandomForest': MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42)),
    'GradientBoosting': MultiOutputRegressor(GradientBoostingRegressor(random_state=42)),
    'LinearRegression': MultiOutputRegressor(LinearRegression()),
    'SupportVector': MultiOutputRegressor(SVR())
}

# Evaluate each model using cross-validation
results = {}
for model_name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    results[model_name] = -cv_scores.mean()  # Negate because cross_val_score uses negative MSE
    print(f"{model_name}: Mean CV MSE = {-cv_scores.mean()}")
print()

# Select the best model
best_model_name = min(results, key=results.get)
best_model = models[best_model_name]

print(f"Best model: {best_model_name} with Mean CV MSE = {results[best_model_name]}")
print()

# Train the best model on the full training data
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', best_model)
])
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred_test = pipeline.predict(X_test)

# Evaluate the model and print MSE for each defect type
mse_values = mean_squared_error(y_test, y_pred_test, multioutput='raw_values')
defect_types = ['Run_Off_D1', 'Open_Seam_D2', 'SPI_Errors_D3', 'High_Low_D4']

for defect_type, mse in zip(defect_types, mse_values):
    print(f"Mean Squared Error for {defect_type}: {mse}")
print()

# Overall MSE
overall_mse = mean_squared_error(y_test, y_pred_test)
print(f"Overall Mean Squared Error: {overall_mse}")

# Additional performance metrics
r2 = r2_score(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)

print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print()

def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape = mean_absolute_percentage_error(y_test, y_pred_test)
print(f"Mean Absolute Percentage Error: {mape}")

# Cross-validation scores
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-Validation Mean Squared Error: {-cv_scores.mean()}")

  demographic_data['Joining_Date'] = pd.to_datetime(demographic_data['Joining_Date'], infer_datetime_format=True)
  demographic_data['Joining_Date'] = pd.to_datetime(demographic_data['Joining_Date'], infer_datetime_format=True)
  defect_data['Date'] = pd.to_datetime(defect_data['Date'], infer_datetime_format=True)
  defect_data['Date'] = pd.to_datetime(defect_data['Date'], infer_datetime_format=True)


RandomForest: Mean CV MSE = 6.913667452380953
GradientBoosting: Mean CV MSE = 7.518078092328734
LinearRegression: Mean CV MSE = 4.972604323072917
SupportVector: Mean CV MSE = 5.422002784556483

Best model: LinearRegression with Mean CV MSE = 4.972604323072917

Mean Squared Error for Run_Off_D1: 5.000910758316218
Mean Squared Error for Open_Seam_D2: 3.281276540349336
Mean Squared Error for SPI_Errors_D3: 4.778603083204815
Mean Squared Error for High_Low_D4: 5.5279410378289695

Overall Mean Squared Error: 4.647182854924835
R-squared: -0.03815803820165656
Mean Absolute Error: 1.9113938914943778

Mean Absolute Percentage Error: inf
Cross-Validation Mean Squared Error: 4.89620714860129
