# Data Preprocessing

In [6]:
import pandas as pd
import numpy as np

# Create the data dictionary
data = {
    'Product_ID': ['P001', 'P002', 'P003', 'P004', 'P005', 'P006', 'P007', 'P008', 'P009', 'P010'],
    'Production_Output': [500, 520, np.nan, 485, 510, 700, 480, 515, np.nan, 495],
    'Machine_Hours': [25, 26, 24, 28, 30, 35, 22, 27, np.nan, 25],
    'Labor_Cost': [300, 310, 295, 290, np.nan, 600, 280, 305, np.nan, 300],
    'Defect_Rate (%)': [2.5, 3.1, 2.0, np.nan, 4.5, 10.0, 2.9, 3.0, np.nan, 2.6],
    'Energy_Consumption (kWh)': [1500, 1520, 1480, 1450, 1550, 1700, 1490, 1515, np.nan, 1500]
}

# Convert the data dictionary into a pandas DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
df

Unnamed: 0,Product_ID,Production_Output,Machine_Hours,Labor_Cost,Defect_Rate (%),Energy_Consumption (kWh)
0,P001,500.0,25.0,300.0,2.5,1500.0
1,P002,520.0,26.0,310.0,3.1,1520.0
2,P003,,24.0,295.0,2.0,1480.0
3,P004,485.0,28.0,290.0,,1450.0
4,P005,510.0,30.0,,4.5,1550.0
5,P006,700.0,35.0,600.0,10.0,1700.0
6,P007,480.0,22.0,280.0,2.9,1490.0
7,P008,515.0,27.0,305.0,3.0,1515.0
8,P009,,,,,
9,P010,495.0,25.0,300.0,2.6,1500.0


In [8]:
# 1. Find Null Values
print("Null values in the dataset:\n", df.isnull().sum())

Null values in the dataset:
 Product_ID                  0
Production_Output           2
Machine_Hours               1
Labor_Cost                  2
Defect_Rate (%)             2
Energy_Consumption (kWh)    1
dtype: int64


In [14]:
 #2. Handle Null Values (mean imputation)
df['Production_Output'] = df['Production_Output'].fillna(df['Production_Output'].mean())
df['Machine_Hours'] = df['Machine_Hours'].fillna(df['Machine_Hours'].mean())
df['Labor_Cost'] = df['Labor_Cost'].fillna(df['Labor_Cost'].mean())
df['Defect_Rate (%)'] = df['Defect_Rate (%)'].fillna(df['Defect_Rate (%)'].mean())
df['Energy_Consumption (kWh)'] = df['Energy_Consumption (kWh)'].fillna(df['Energy_Consumption (kWh)'].mean())

In [16]:
# Check if any null values remain
print("\nNull values after imputation:\n", df.isnull().sum())


Null values after imputation:
 Product_ID                  0
Production_Output           0
Machine_Hours               0
Labor_Cost                  0
Defect_Rate (%)             0
Energy_Consumption (kWh)    0
dtype: int64


In [22]:
print("\nData after handling null values:\n")
df


Data after handling null values:



Unnamed: 0,Product_ID,Production_Output,Machine_Hours,Labor_Cost,Defect_Rate (%),Energy_Consumption (kWh)
0,P001,500.0,25.0,300.0,2.5,1500.0
1,P002,520.0,26.0,310.0,3.1,1520.0
2,P003,525.625,24.0,295.0,2.0,1480.0
3,P004,485.0,28.0,290.0,3.825,1450.0
4,P005,510.0,30.0,335.0,4.5,1550.0
5,P006,700.0,35.0,600.0,10.0,1700.0
6,P007,480.0,22.0,280.0,2.9,1490.0
7,P008,515.0,27.0,305.0,3.0,1515.0
8,P009,525.625,26.888889,335.0,3.825,1522.777778
9,P010,495.0,25.0,300.0,2.6,1500.0


In [24]:
# 3. Find Outliers using the IQR method
def find_outliers_IQR(data, column_name):
    Q1 = data[column_name].quantile(0.25)
    Q3 = data[column_name].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[column_name] < lower_bound) | (data[column_name] > upper_bound)]
    return outliers

# Detecting outliers in numeric columns
print("\nOutliers in Production_Output:\n", find_outliers_IQR(df, 'Production_Output'))
print("\nOutliers in Machine_Hours:\n", find_outliers_IQR(df, 'Machine_Hours'))
print("\nOutliers in Labor_Cost:\n", find_outliers_IQR(df, 'Labor_Cost'))
print("\nOutliers in Defect_Rate (%):\n", find_outliers_IQR(df, 'Defect_Rate (%)'))
print("\nOutliers in Energy_Consumption (kWh):\n", find_outliers_IQR(df, 'Energy_Consumption (kWh)'))


Outliers in Production_Output:
   Product_ID  Production_Output  Machine_Hours  Labor_Cost  Defect_Rate (%)  \
5       P006              700.0           35.0       600.0             10.0   

   Energy_Consumption (kWh)  
5                    1700.0  

Outliers in Machine_Hours:
   Product_ID  Production_Output  Machine_Hours  Labor_Cost  Defect_Rate (%)  \
5       P006              700.0           35.0       600.0             10.0   

   Energy_Consumption (kWh)  
5                    1700.0  

Outliers in Labor_Cost:
   Product_ID  Production_Output  Machine_Hours  Labor_Cost  Defect_Rate (%)  \
5       P006              700.0           35.0       600.0             10.0   

   Energy_Consumption (kWh)  
5                    1700.0  

Outliers in Defect_Rate (%):
   Product_ID  Production_Output  Machine_Hours  Labor_Cost  Defect_Rate (%)  \
5       P006              700.0           35.0       600.0             10.0   

   Energy_Consumption (kWh)  
5                    1700.0  

Outl

In [64]:
# 3. Find and Remove Outliers using the IQR method
def remove_outliers_IQR(data, column_name):
    Q1 = data[column_name].quantile(0.25)
    Q3 = data[column_name].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Filter the data to remove outliers
    data_without_outliers = data[(data[column_name] >= lower_bound) & (data[column_name] <= upper_bound)]
    return data_without_outliers

# Remove outliers for all numeric columns
for column in ['Production_Output', 'Machine_Hours', 'Labor_Cost', 'Defect_Rate (%)', 'Energy_Consumption (kWh)']:
    df = remove_outliers_IQR(df, column)
df

Unnamed: 0,Product_ID,Production_Output,Machine_Hours,Labor_Cost,Defect_Rate (%),Energy_Consumption (kWh)
0,P001,500.0,25.0,300.0,2.5,1500.0
1,P002,520.0,26.0,310.0,3.1,1520.0
2,P003,525.625,24.0,295.0,2.0,1480.0
7,P008,515.0,27.0,305.0,3.0,1515.0
9,P010,495.0,25.0,300.0,2.6,1500.0


In [26]:
# Drop the 'Product_ID' column since it's not numeric
df_numeric = df.drop(columns=['Product_ID'])

In [28]:
df_numeric

Unnamed: 0,Production_Output,Machine_Hours,Labor_Cost,Defect_Rate (%),Energy_Consumption (kWh)
0,500.0,25.0,300.0,2.5,1500.0
1,520.0,26.0,310.0,3.1,1520.0
2,525.625,24.0,295.0,2.0,1480.0
3,485.0,28.0,290.0,3.825,1450.0
4,510.0,30.0,335.0,4.5,1550.0
5,700.0,35.0,600.0,10.0,1700.0
6,480.0,22.0,280.0,2.9,1490.0
7,515.0,27.0,305.0,3.0,1515.0
8,525.625,26.888889,335.0,3.825,1522.777778
9,495.0,25.0,300.0,2.6,1500.0


In [34]:
from sklearn.preprocessing import MinMaxScaler
# Initialize scalers
normal_scaler = MinMaxScaler()

In [40]:
# Apply Normal Scaler (MinMaxScaler)
df_normal_scaled = pd.DataFrame(normal_scaler.fit_transform(df_numeric), columns=df_numeric.columns)
print("\nData after MinMaxScaler (Normal Scaling):\n")
df_normal_scaled


Data after MinMaxScaler (Normal Scaling):



Unnamed: 0,Production_Output,Machine_Hours,Labor_Cost,Defect_Rate (%),Energy_Consumption (kWh)
0,0.090909,0.230769,0.0625,0.0625,0.2
1,0.181818,0.307692,0.09375,0.1375,0.28
2,0.207386,0.153846,0.046875,0.0,0.12
3,0.022727,0.461538,0.03125,0.228125,0.0
4,0.136364,0.615385,0.171875,0.3125,0.4
5,1.0,1.0,1.0,1.0,1.0
6,0.0,0.0,0.0,0.1125,0.16
7,0.159091,0.384615,0.078125,0.125,0.26
8,0.207386,0.376068,0.171875,0.228125,0.291111
9,0.068182,0.230769,0.0625,0.075,0.2


In [50]:
from sklearn.preprocessing import StandardScaler

In [52]:
# Initialize scalers
standard_scaler = StandardScaler()

In [56]:
#Applying Standard Scaler
df_standard_scaled = pd.DataFrame(standard_scaler.fit_transform(df_numeric), columns=df_numeric.columns)
print("\nData after StandardScaler:\n")
df_standard_scaled


Data after StandardScaler:



Unnamed: 0,Production_Output,Machine_Hours,Labor_Cost,Defect_Rate (%),Energy_Consumption (kWh)
0,-0.426362,-0.5524841,-0.389249,-0.609655,-0.353769
1,-0.093592,-0.2599925,-0.278035,-0.333585,-0.043143
2,0.0,-0.8449757,-0.444857,-0.839713,-0.664396
3,-0.67594,0.3249907,-0.500464,0.0,-1.130336
4,-0.259977,0.9099738,0.0,0.310579,0.422797
5,2.901343,2.372432,2.947175,2.841221,2.752497
6,-0.759133,-1.429959,-0.611678,-0.425608,-0.509082
7,-0.176784,0.03249907,-0.333642,-0.379596,-0.120799
8,0.0,-1.039139e-15,0.0,0.0,0.0
9,-0.509555,-0.5524841,-0.389249,-0.563643,-0.353769
