In [6]:
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR

# Dataset directory. The default is the original development location; set the
# DEFECT_DATA_DIR environment variable to run this notebook on another machine
# without editing the cell.
DATA_DIR = Path(os.environ.get(
    'DEFECT_DATA_DIR',
    '/Users/minu/Desktop/test_heroku/Component 04/Backend/Dataset',
))

# Read the data
demographic_data = pd.read_csv(DATA_DIR / 'demographic_data_dataset.csv')
defect_data = pd.read_csv(DATA_DIR / 'updated_worker_defect_details.csv')

# Combine datasets on Worker_ID. pd.merge defaults to an inner join, so rows
# whose Worker_ID appears in only one of the two files are dropped.
combined_data = pd.merge(defect_data, demographic_data, on='Worker_ID')

# Convert Date columns to datetime.
# NOTE(review): the recorded output echoes both of these lines, which looks like
# a pandas date-parsing warning. Once the date layout of the CSV files is
# confirmed, pass an explicit `format=` to make parsing unambiguous.
combined_data['Date'] = pd.to_datetime(combined_data['Date'])
combined_data['Joining_Date'] = pd.to_datetime(combined_data['Joining_Date'])

# Experience = whole days between the joining date and the record date
combined_data['Experience'] = (combined_data['Date'] - combined_data['Joining_Date']).dt.days

# Drop unnecessary columns (reassigned instead of inplace=True so the step
# produces a new frame rather than mutating one in place)
fields_to_drop = ['Name', 'Joining_Date']
combined_data = combined_data.drop(columns=fields_to_drop)

# Columns fed to the model: categorical ones are one-hot encoded, numerical
# ones are standardized.
categorical_features = ['Gender', 'Skill_Level', 'Shift']
numerical_features = ['Age', 'Production_Volume', 'Experience']

# Fail fast with a readable message if the merged frame lacks any feature column
required_columns = categorical_features + numerical_features
missing_columns = [col for col in required_columns if col not in combined_data.columns]
if missing_columns:
    raise ValueError(f"The following columns are missing in the combined dataset: {missing_columns}")

# Define preprocessing steps.
# handle_unknown='ignore': a category that was not present when the encoder was
# fitted (e.g. a rare label missing from one cross-validation fold, or a new
# label sent at prediction time) is encoded as all zeros instead of raising.
# With the default ('error') such a fold fails and cross_val_score records NaN
# for it, which silently corrupts the model comparison.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ]
)

# The four columns the models are trained to predict
target_columns = ['Run_Off_D1', 'Open_Seam_D2', 'SPI_Errors_D3', 'High_Low_D4']

# Select the model inputs explicitly instead of dropping everything else.
# Building X by exclusion lets any unrelated column in the CSVs become part of
# the fitted input schema; selecting by name keeps the training schema equal to
# the six declared feature columns the preprocessor actually consumes.
X = combined_data[categorical_features + numerical_features]
y = combined_data[target_columns]

# Split data into train and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate single-target regressors, keyed by the label used in the report
base_regressors = {
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'LinearRegression': LinearRegression(),
    'SupportVector': SVR(),
}

# Wrap each candidate in MultiOutputRegressor, which fits one copy of the
# estimator per target column.
models = {
    label: MultiOutputRegressor(regressor)
    for label, regressor in base_regressors.items()
}

# Score every candidate with 5-fold cross-validation on the training split.
# `results` maps model name -> mean CV mean-squared-error (lower is better).
results = {}
for model_name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])
    fold_scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    # The scorer returns negated MSE, so flip the sign back once and reuse it
    mean_mse = -fold_scores.mean()
    results[model_name] = mean_mse
    print(f"{model_name}: Mean CV MSE = {mean_mse}")
print()

# Select the best model (lowest mean CV MSE).
# Candidates with a non-finite score are excluded first: cross_val_score records
# NaN for a failed fold by default, and min() over values containing NaN is
# order dependent, so a broken candidate could otherwise be picked as "best".
valid_results = {name: mse for name, mse in results.items() if np.isfinite(mse)}
if not valid_results:
    raise RuntimeError("No model produced a finite cross-validation score")
best_model_name = min(valid_results, key=valid_results.get)
best_model = models[best_model_name]

print(f"Best model: {best_model_name} with Mean CV MSE = {results[best_model_name]}")
print()

# Train the best model on the full training data
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', best_model)
])
pipeline.fit(X_train, y_train)

# Report the error on the held-out split, which took no part in model
# selection. mean_squared_error averages over the target columns by default,
# the same way the 'neg_mean_squared_error' CV scorer does.
test_mse = mean_squared_error(y_test, pipeline.predict(X_test))
print(f"{best_model_name}: Hold-out test MSE = {test_mse}")
print()

# Save the trained pipeline (preprocessing + regressor) as a single artifact.
# NOTE(review): this writes to the kernel's current working directory, while the
# prediction cell loads from an absolute path — confirm both refer to the same file.
joblib.dump(pipeline, 'best_model_pipeline.pkl')


  combined_data['Date'] = pd.to_datetime(combined_data['Date'])
  combined_data['Joining_Date'] = pd.to_datetime(combined_data['Joining_Date'])


RandomForest: Mean CV MSE = 6.913667452380953
GradientBoosting: Mean CV MSE = 7.518078092328734
LinearRegression: Mean CV MSE = 4.972604323072917
SupportVector: Mean CV MSE = 5.422002784556483

Best model: LinearRegression with Mean CV MSE = 4.972604323072917



['best_model_pipeline.pkl']

In [3]:
import joblib
import pandas as pd

import os

# Location of the serialized pipeline. The default is the original development
# path; set the DEFECT_MODEL_PATH environment variable to load the artifact from
# elsewhere without editing this cell.
MODEL_PATH = os.environ.get(
    'DEFECT_MODEL_PATH',
    '/Users/minu/Desktop/test_heroku/Component 04/Backend/best_model_pipeline.pkl',
)

# Load the saved model.
# joblib artifacts are pickles, and unpickling can execute arbitrary code, so
# only load files produced by a trusted training run.
model_pipeline = joblib.load(MODEL_PATH)

def predict(data):
    """Run the loaded pipeline on ``data`` and return its predictions.

    Parameters
    ----------
    data
        Anything ``pd.DataFrame`` accepts, e.g. a mapping of column name to a
        list of values (one entry per row).

    Returns
    -------
    Whatever ``model_pipeline.predict`` returns for the resulting frame.
    """
    # Build the frame and hand it straight to the module-level pipeline
    return model_pipeline.predict(pd.DataFrame(data))

# Example data for prediction: a single worker described as one record.
example_record = {
    'Gender': 'Female',
    # NOTE(review): spelling left exactly as-is. It is a category label passed to
    # the fitted encoder — presumably the dataset uses this spelling; verify
    # before "correcting" it.
    'Skill_Level': 'Beginer',
    'Shift': 'Morning',
    'Age': 30,
    'Production_Volume': 1000,
    'Experience': 256,
}

# predict() passes the mapping to pd.DataFrame, so wrap each value in a
# one-element list to form a one-row frame.
example_data = {column: [value] for column, value in example_record.items()}

# Make predictions.
# NOTE(review): the recorded run printed a negative value for the first target;
# if the targets are counts that is not a meaningful prediction — worth checking
# whether these inputs fall outside the range seen in training.
preds = predict(example_data)
print(preds)

[[-26.13274526  16.55810667  19.56325248   4.35137448]]
