# Feature Engineering

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures


# Loading csv to Dataframe
df = pd.read_csv('manufacturing_defect_dataset.csv')

# Show first rows of Dataframe
df.head()

Unnamed: 0,ProductionVolume,ProductionCost,SupplierQuality,DeliveryDelay,DefectRate,QualityScore,MaintenanceHours,DowntimePercentage,InventoryTurnover,StockoutRate,WorkerProductivity,SafetyIncidents,EnergyConsumption,EnergyEfficiency,AdditiveProcessTime,AdditiveMaterialCost,DefectStatus,MaintenanceHoursPerDay
0,202,13175.403783,86.648534,1,3.121492,63.463494,9,0.052343,8.630515,0.081322,85.042379,0,2419.616785,0.468947,5.551639,236.439301,1,1.285714
1,535,19770.046093,86.310664,4,0.819531,83.697818,20,4.908328,9.296598,0.038486,99.657443,7,3915.566713,0.119485,9.080754,353.957631,1,2.857143
2,960,19060.820997,82.132472,0,4.514504,90.35055,1,2.464923,5.097486,0.002887,92.819264,2,3392.385362,0.496392,6.562827,396.189402,1,0.142857
3,370,5647.606037,87.335966,5,0.638524,67.62869,8,4.692476,3.577616,0.055331,96.887013,8,4652.400275,0.183125,8.097496,164.13587,1,1.142857
4,206,7472.222236,81.989893,3,3.867784,82.728334,9,2.746726,6.851709,0.068047,88.315554,7,1581.630332,0.263507,6.406154,365.708964,1,1.285714


#### Polynomial Features

In [9]:
# Define key features for polynomial expansion
key_features = ['ProductionVolume', 'SupplierQuality', 'MaintenanceHoursPerDay']

# Initialize PolynomialFeatures for degree 2
poly = PolynomialFeatures(degree=2, include_bias=False)

# Transform the selected features
poly_features = poly.fit_transform(df[key_features])

# Convert to DataFrame and name columns
poly_feature_names = poly.get_feature_names_out(key_features)
poly_df = pd.DataFrame(poly_features, columns=poly_feature_names)

# Concatenate new polynomial features with the original dataset
dataset_with_poly = pd.concat([df, poly_df], axis=1)

# Display the transformed dataset to verify
dataset_with_poly.head()


Unnamed: 0,ProductionVolume,ProductionCost,SupplierQuality,DeliveryDelay,DefectRate,QualityScore,MaintenanceHours,DowntimePercentage,InventoryTurnover,StockoutRate,...,MaintenanceHoursPerDay,ProductionVolume.1,SupplierQuality.1,MaintenanceHoursPerDay.1,ProductionVolume^2,ProductionVolume SupplierQuality,ProductionVolume MaintenanceHoursPerDay,SupplierQuality^2,SupplierQuality MaintenanceHoursPerDay,MaintenanceHoursPerDay^2
0,202,13175.403783,86.648534,1,3.121492,63.463494,9,0.052343,8.630515,0.081322,...,1.285714,202.0,86.648534,1.285714,40804.0,17503.003836,259.714286,7507.968417,111.405258,1.653061
1,535,19770.046093,86.310664,4,0.819531,83.697818,20,4.908328,9.296598,0.038486,...,2.857143,535.0,86.310664,2.857143,286225.0,46176.205033,1528.571429,7449.530653,246.601896,8.163265
2,960,19060.820997,82.132472,0,4.514504,90.35055,1,2.464923,5.097486,0.002887,...,0.142857,960.0,82.132472,0.142857,921600.0,78847.173484,137.142857,6745.743019,11.73321,0.020408
3,370,5647.606037,87.335966,5,0.638524,67.62869,8,4.692476,3.577616,0.055331,...,1.142857,370.0,87.335966,1.142857,136900.0,32314.307573,422.857143,7627.571029,99.812533,1.306122
4,206,7472.222236,81.989893,3,3.867784,82.728334,9,2.746726,6.851709,0.068047,...,1.285714,206.0,81.989893,1.285714,42436.0,16889.918036,264.857143,6722.342616,105.415577,1.653061


#### Interaction Terms

In [14]:
# Check data types
print(dataset_with_poly[['ProductionVolume', 'EnergyConsumption', 'MaintenanceHoursPerDay', 
                         'DowntimePercentage', 'ProductionCost', 'WorkerProductivity']].dtypes)


ProductionVolume            int64
EnergyConsumption         float64
MaintenanceHoursPerDay    float64
DowntimePercentage        float64
ProductionCost            float64
WorkerProductivity        float64
dtype: object


In [15]:
dataset_with_poly['ProductionVolume'] = dataset_with_poly['ProductionVolume'].astype(float)
print(dataset_with_poly[['ProductionVolume', 'EnergyConsumption', 'MaintenanceHoursPerDay', 
                         'DowntimePercentage', 'ProductionCost', 'WorkerProductivity']].dtypes)

ProductionVolume          float64
EnergyConsumption         float64
MaintenanceHoursPerDay    float64
DowntimePercentage        float64
ProductionCost            float64
WorkerProductivity        float64
dtype: object


In [16]:
dataset_with_poly['ProductionEfficiency'] = dataset_with_poly['ProductionVolume'] / dataset_with_poly['EnergyConsumption']
dataset_with_poly['EnergyEfficiency_MultipleFactors'] = dataset_with_poly['ProductionVolume'] / (dataset_with_poly['EnergyConsumption'] + dataset_with_poly['MaintenanceHoursPerDay'] + dataset_with_poly['DowntimePercentage']) 
dataset_with_poly['EnergyEfficiency_costs'] = dataset_with_poly['EnergyConsumption'] / dataset_with_poly['ProductionCost']
dataset_with_poly['EnergyEfficiency_Workforce'] = dataset_with_poly['WorkerProductivity'] / dataset_with_poly['EnergyConsumption']

dataset_with_poly.head()

Unnamed: 0,ProductionVolume,ProductionCost,SupplierQuality,DeliveryDelay,DefectRate,QualityScore,MaintenanceHours,DowntimePercentage,InventoryTurnover,StockoutRate,...,ProductionVolume^2,ProductionVolume SupplierQuality,ProductionVolume MaintenanceHoursPerDay,SupplierQuality^2,SupplierQuality MaintenanceHoursPerDay,MaintenanceHoursPerDay^2,ProductionEfficiency,EnergyEfficiency_MultipleFactors,EnergyEfficiency_costs,EnergyEfficiency_Workforce
0,202.0,13175.403783,86.648534,1,3.121492,63.463494,9,0.052343,8.630515,0.081322,...,40804.0,17503.003836,259.714286,7507.968417,111.405258,1.653061,0.083484,0.083438,0.183646,0.035147
1,535.0,19770.046093,86.310664,4,0.819531,83.697818,20,4.908328,9.296598,0.038486,...,286225.0,46176.205033,1528.571429,7449.530653,246.601896,8.163265,0.136634,0.136364,0.198056,0.025452
2,960.0,19060.820997,82.132472,0,4.514504,90.35055,1,2.464923,5.097486,0.002887,...,921600.0,78847.173484,137.142857,6745.743019,11.73321,0.020408,0.282987,0.282769,0.177977,0.027361
3,370.0,5647.606037,87.335966,5,0.638524,67.62869,8,4.692476,3.577616,0.055331,...,136900.0,32314.307573,422.857143,7627.571029,99.812533,1.306122,0.079529,0.079429,0.823783,0.020825
4,206.0,7472.222236,81.989893,3,3.867784,82.728334,9,2.746726,6.851709,0.068047,...,42436.0,16889.918036,264.857143,6722.342616,105.415577,1.653061,0.130245,0.129914,0.211668,0.055838


In [17]:
# Display the total number of columns and the column names
columns = dataset_with_poly.columns
num_columns = len(columns)

print(f"Total number of columns: {num_columns}")
print("Column names:")
print(columns)


Total number of columns: 28
Column names:
Index(['ProductionVolume', 'ProductionCost', 'SupplierQuality',
       'DeliveryDelay', 'DefectRate', 'QualityScore', 'MaintenanceHours',
       'DowntimePercentage', 'InventoryTurnover', 'StockoutRate',
       'WorkerProductivity', 'SafetyIncidents', 'EnergyConsumption',
       'EnergyEfficiency', 'AdditiveProcessTime', 'AdditiveMaterialCost',
       'DefectStatus', 'MaintenanceHoursPerDay', 'ProductionVolume^2',
       'ProductionVolume SupplierQuality',
       'ProductionVolume MaintenanceHoursPerDay', 'SupplierQuality^2',
       'SupplierQuality MaintenanceHoursPerDay', 'MaintenanceHoursPerDay^2',
       'ProductionEfficiency', 'EnergyEfficiency_MultipleFactors',
       'EnergyEfficiency_costs', 'EnergyEfficiency_Workforce'],
      dtype='object')


#### Feature Scaling

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
columns_to_scale = [
    'ProductionVolume', 'ProductionCost', 'SupplierQuality', 'DowntimePercentage',
    'InventoryTurnover', 'WorkerProductivity', 'EnergyConsumption', 'AdditiveProcessTime',
    'MaintenanceHoursPerDay', 'ProductionVolume^2', 'ProductionVolume SupplierQuality',
    'ProductionVolume MaintenanceHoursPerDay', 'SupplierQuality^2',
    'SupplierQuality MaintenanceHoursPerDay', 'MaintenanceHoursPerDay^2',
    'ProductionEfficiency', 'EnergyEfficiency_MultipleFactors',
    'EnergyEfficiency_costs', 'EnergyEfficiency_Workforce'
]

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the relevant columns
dataset_with_poly[columns_to_scale] = scaler.fit_transform(dataset_with_poly[columns_to_scale])

# Verify scaling by checking descriptive statistics
print(dataset_with_poly[columns_to_scale].describe().round(2))

       ProductionVolume  ProductionCost  SupplierQuality  DowntimePercentage  \
count           3240.00         3240.00          3240.00             3240.00   
mean              -0.00           -0.00             0.00                0.00   
std                1.00            1.00             1.00                1.00   
min               -1.71           -1.72            -1.71               -1.73   
25%               -0.86           -0.86            -0.86               -0.86   
50%                0.00           -0.00            -0.02               -0.03   
75%                0.86            0.86             0.86                0.88   
max                1.72            1.76             1.76                1.73   

       InventoryTurnover  WorkerProductivity  EnergyConsumption  \
count            3240.00             3240.00            3240.00   
mean               -0.00                0.00              -0.00   
std                 1.00                1.00               1.00   
min        

In [22]:
dataset_with_poly.head()

Unnamed: 0,ProductionVolume,ProductionCost,SupplierQuality,DeliveryDelay,DefectRate,QualityScore,MaintenanceHours,DowntimePercentage,InventoryTurnover,StockoutRate,...,ProductionVolume^2,ProductionVolume SupplierQuality,ProductionVolume MaintenanceHoursPerDay,SupplierQuality^2,SupplierQuality MaintenanceHoursPerDay,MaintenanceHoursPerDay^2,ProductionEfficiency,EnergyEfficiency_MultipleFactors,EnergyEfficiency_costs,EnergyEfficiency_Workforce
0,-1.320785,0.174673,-0.553077,1,3.121492,63.463494,9,-1.696637,1.120811,0.081322,...,-1.118988,-1.336137,-0.869406,-0.574153,-0.403911,-0.603542,-0.869439,-0.869703,-0.577786,-0.067387
1,-0.051544,1.705681,-0.611752,4,0.819531,83.697818,20,1.667488,1.406753,0.038486,...,-0.284032,-0.128816,0.858535,-0.630523,1.122884,1.362449,-0.535263,-0.535848,-0.489696,-0.61332
2,1.568358,1.541028,-1.337353,0,4.514504,90.35055,1,-0.025252,-0.39588,0.002887,...,1.877602,1.246836,-1.036324,-1.309413,-1.529522,-1.096581,0.384918,0.387683,-0.612448,-0.505802
3,-0.680447,-1.572975,-0.433695,5,0.638524,67.62869,8,1.51795,-1.048344,0.055331,...,-0.792056,-0.712489,-0.647236,-0.458781,-0.53483,-0.708313,-0.894308,-0.894992,3.335726,-0.873826
4,-1.305539,-1.149373,-1.362114,3,3.867784,82.728334,9,0.169975,0.357189,0.068047,...,-1.113435,-1.361952,-0.862402,-1.331986,-0.471554,-0.603542,-0.575432,-0.576532,-0.406475,1.097702


In [23]:
dataset_with_poly.to_csv('DefectData_Engineered.csv', index=False)