IMPORTING NEEDED LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
import optuna

LOADING THE DATASET

In [2]:
df = pd.read_csv('manufacturing_defect_dataset.csv')
print(df.columns)

Index(['ProductionVolume', 'ProductionCost', 'SupplierQuality',
       'DeliveryDelay', 'DefectRate', 'QualityScore', 'MaintenanceHours',
       'DowntimePercentage', 'InventoryTurnover', 'StockoutRate',
       'WorkerProductivity', 'SafetyIncidents', 'EnergyConsumption',
       'EnergyEfficiency', 'AdditiveProcessTime', 'AdditiveMaterialCost',
       'DefectStatus'],
      dtype='object')


# DATA PREPROCESSING

In [3]:
# Getting the information about datatype of data in the columns .
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3240 entries, 0 to 3239
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ProductionVolume      3240 non-null   int64  
 1   ProductionCost        3240 non-null   float64
 2   SupplierQuality       3240 non-null   float64
 3   DeliveryDelay         3240 non-null   int64  
 4   DefectRate            3240 non-null   float64
 5   QualityScore          3240 non-null   float64
 6   MaintenanceHours      3240 non-null   int64  
 7   DowntimePercentage    3240 non-null   float64
 8   InventoryTurnover     3240 non-null   float64
 9   StockoutRate          3240 non-null   float64
 10  WorkerProductivity    3240 non-null   float64
 11  SafetyIncidents       3240 non-null   int64  
 12  EnergyConsumption     3240 non-null   float64
 13  EnergyEfficiency      3240 non-null   float64
 14  AdditiveProcessTime   3240 non-null   float64
 15  AdditiveMaterialCost 

In [4]:
# Checking if there is any null value in the data.
df.isnull().sum()

ProductionVolume        0
ProductionCost          0
SupplierQuality         0
DeliveryDelay           0
DefectRate              0
QualityScore            0
MaintenanceHours        0
DowntimePercentage      0
InventoryTurnover       0
StockoutRate            0
WorkerProductivity      0
SafetyIncidents         0
EnergyConsumption       0
EnergyEfficiency        0
AdditiveProcessTime     0
AdditiveMaterialCost    0
DefectStatus            0
dtype: int64

In [5]:
# Checking the Duplicate values .
df.duplicated().sum()

0

In [6]:
# Checking all the columns.
df.columns

Index(['ProductionVolume', 'ProductionCost', 'SupplierQuality',
       'DeliveryDelay', 'DefectRate', 'QualityScore', 'MaintenanceHours',
       'DowntimePercentage', 'InventoryTurnover', 'StockoutRate',
       'WorkerProductivity', 'SafetyIncidents', 'EnergyConsumption',
       'EnergyEfficiency', 'AdditiveProcessTime', 'AdditiveMaterialCost',
       'DefectStatus'],
      dtype='object')

In [7]:
# Removing unnecassary columns.
df.drop(['ProductionVolume' , 'ProductionCost' , 'DeliveryDelay' , 'InventoryTurnover' ,'StockoutRate' ,'WorkerProductivity' ,'EnergyEfficiency' , 'AdditiveProcessTime' ,'AdditiveMaterialCost'] , axis = 1 , inplace = True)

In [8]:
# Checking remaining columns .
df.columns

Index(['SupplierQuality', 'DefectRate', 'QualityScore', 'MaintenanceHours',
       'DowntimePercentage', 'SafetyIncidents', 'EnergyConsumption',
       'DefectStatus'],
      dtype='object')

In [9]:
# Checking values in columns.
df.head()

Unnamed: 0,SupplierQuality,DefectRate,QualityScore,MaintenanceHours,DowntimePercentage,SafetyIncidents,EnergyConsumption,DefectStatus
0,86.648534,3.121492,63.463494,9,0.052343,0,2419.616785,1
1,86.310664,0.819531,83.697818,20,4.908328,7,3915.566713,1
2,82.132472,4.514504,90.35055,1,2.464923,2,3392.385362,1
3,87.335966,0.638524,67.62869,8,4.692476,8,4652.400275,1
4,81.989893,3.867784,82.728334,9,2.746726,7,1581.630332,1


In [10]:
y = df.iloc[:, -1]
x = df.iloc[:, :-1]

In [11]:
# Spliting the data into training and testing dataset.
x_train , x_test , y_train , y_test = train_test_split(x, y , random_state = 42 , test_size = 0.2)

In [12]:
# Checking the shapes of datasets .
print("Training Input data shape : " ,x_train.shape)
print("Training Output data shape : " ,y_train.shape)
print("Test Input data shape : " ,x_test.shape)
print("Test Output data shape : " ,y_test.shape)

Training Input data shape :  (2592, 7)
Training Output data shape :  (2592,)
Test Input data shape :  (648, 7)
Test Output data shape :  (648,)


In [13]:
# Since the scale of values are different in datasets , so we have to scale it .

from sklearn.preprocessing import StandardScaler
Scaler = StandardScaler()

x_train = Scaler.fit_transform(x_train)
x_test = Scaler.fit_transform(x_test)

import pickle

# Saving the predictive model.
with open("Scaler.pkl", "wb") as model_file:
    pickle.dump(Scaler, model_file)

print("Scaler saved successfully!")


Scaler saved successfully!


In [14]:
# Printing scaled data.
print("First Training Row : ", x_train[0])
print("First Training output : " , y_train.iloc[0])

First Training Row :  [-0.91367607 -0.65455818  0.75128437  1.09829367 -0.725719    0.82945024
  0.65593582]
First Training output :  1


# Making Model for classification task.

Since the Randomforest is an algorithm which provide the high accuracy for the dataset and handles Non-Linearity and Complex Relationships , so we are using that algorithm for our classification task.

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

We will use optuna library for fine tuning the model's parameters .

In [16]:
# Creating the objective function .
def objective_func(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 250)
    max_depth = trial.suggest_int("max_depth", 5, 30, log=True)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", None])

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1
    )

    # Train the model
    model.fit(x_train, y_train)
    
    # Predict on the validation set
    y_pred = model.predict(x_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    # score = cross_val_score(model , x_train, y_train , cv=3 ,scoring = 'accuracy').mean()
    return  accuracy 

In [17]:
study = optuna.create_study(direction="maximize")  
study.optimize(objective_func, n_trials=50, timeout=600) 

[I 2025-01-23 20:00:57,979] A new study created in memory with name: no-name-47a6d3f4-3ea6-498c-8d25-a61c7232fa08
[I 2025-01-23 20:00:58,536] Trial 0 finished with value: 0.9135802469135802 and parameters: {'n_estimators': 80, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.9135802469135802.
[I 2025-01-23 20:00:59,627] Trial 1 finished with value: 0.9151234567901234 and parameters: {'n_estimators': 130, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.9151234567901234.
[I 2025-01-23 20:01:02,498] Trial 2 finished with value: 0.9120370370370371 and parameters: {'n_estimators': 149, 'max_depth': 22, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': None}. Best is trial 1 with value: 0.9151234567901234.
[I 2025-01-23 20:01:05,063] Trial 3 finished with value: 0.9120370370370371 and parameters: {'n_estimators': 152, 'max_depth': 26, 'min_s

In [18]:
# Finding best model parameters and best accuracy.
best_params = study.best_params
best_accuracy = study.best_value

print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)

Best Hyperparameters: {'n_estimators': 91, 'max_depth': 6, 'min_samples_split': 11, 'min_samples_leaf': 7, 'max_features': 'sqrt'}
Best Accuracy: 0.9166666666666666


In [19]:
# Creating the final model .
best_model = RandomForestClassifier(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    max_features=best_params["max_features"],
    random_state=42,
    n_jobs=-1
)
best_model.fit(x_train, y_train)

# Evaluate final model
final_accuracy = accuracy_score(y_test, best_model.predict(x_test))
print("Final Model Accuracy:", final_accuracy)

Final Model Accuracy: 0.9166666666666666


Testing the model.

In [25]:
with open('Scaler.pkl', 'rb') as f: 
    Scaler = pickle.load(f)
    
X_new = [[86.648534, 3.121492, 63.463494, 9, 0.052343, 1, 2419.616785]]
X_new = Scaler.transform(X_new)
print(X_new)
pred = best_model.predict( X_new )
print(pred)

[[-0.49245282  0.18569816 -1.49017592 -0.37509987 -1.69379592 -1.24496405
  -0.49056541]]
[1]




### Saving the model .

In [21]:
import pickle

# Saving the predictive model.
with open("final_random_forest_model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

print("Model saved successfully!")


Model saved successfully!


# Testing by giving the input data

### First run the app.py for activating the server .

In [3]:
import requests

url = 'http://127.0.0.1:5000/predict_defect_status'

# Chnage the values accordingly .
data = {
    "SupplierQuality":86.648534	 ,
    "DefectRate": 3.121492,
    "QualityScore": 63.463494	,
    "MaintenanceHours":9  ,
    "DowntimePercentage": 0.052343,
    "SafetyIncidents":1 ,
    "EnergyConsumption": 2419.616785,	
}
# headers = {'Content-Type': 'application/json'}

response = requests.post(url,json=data)
if response.status_code == 200:
    response = response.json()
    defecet_status = "NO"
    if response['DefectStatus'] == 1:
         defecet_status = "YES"
    print("Defect Status is :" , defecet_status)
    print("Confidence score is : " , response['ConfidenceScore'])
else:
    print(f"Error: {response.status_code}")

Defect Status is : YES
Confidence score is :  0.8925836770921677
