In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Manufacturing-FE").getOrCreate()

df = spark.read.csv(
    "D:\Study\Centillion\Pyspark\Datasets\hybrid_manufacturing_categorical.csv",
    header=True,
    inferSchema=True
)

df.show(10)



+------+----------+--------------+-------------+---------------+------------------+--------------------+-------------------+-------------------+-------------------+-------------------+----------+---------------------+
|Job_ID|Machine_ID|Operation_Type|Material_Used|Processing_Time|Energy_Consumption|Machine_Availability|    Scheduled_Start|      Scheduled_End|       Actual_Start|         Actual_End|Job_Status|Optimization_Category|
+------+----------+--------------+-------------+---------------+------------------+--------------------+-------------------+-------------------+-------------------+-------------------+----------+---------------------+
|  J001|       M01|      Grinding|         3.17|             76|             11.42|                  96|2023-03-18 08:00:00|2023-03-18 09:16:00|2023-03-18 08:05:00|2023-03-18 09:21:00| Completed|  Moderate Efficiency|
|  J002|       M01|      Grinding|         3.35|             79|              6.61|                  84|2023-03-18 08:10:00|2023

In [2]:
df = df.withColumn(
    "Planned_Duration_Min",
    (df["Scheduled_End"].cast("long") - df["Scheduled_Start"].cast("long")) / 60
)

df = df.withColumn(
    "Actual_Duration_Min",
    (df["Actual_End"].cast("long") - df["Actual_Start"].cast("long")) / 60
)

df = df.withColumn(
    "Delay_Min",
    df["Actual_Duration_Min"] - df["Planned_Duration_Min"]
)


df.show(10)

+------+----------+--------------+-------------+---------------+------------------+--------------------+-------------------+-------------------+-------------------+-------------------+----------+---------------------+--------------------+-------------------+---------+
|Job_ID|Machine_ID|Operation_Type|Material_Used|Processing_Time|Energy_Consumption|Machine_Availability|    Scheduled_Start|      Scheduled_End|       Actual_Start|         Actual_End|Job_Status|Optimization_Category|Planned_Duration_Min|Actual_Duration_Min|Delay_Min|
+------+----------+--------------+-------------+---------------+------------------+--------------------+-------------------+-------------------+-------------------+-------------------+----------+---------------------+--------------------+-------------------+---------+
|  J001|       M01|      Grinding|         3.17|             76|             11.42|                  96|2023-03-18 08:00:00|2023-03-18 09:16:00|2023-03-18 08:05:00|2023-03-18 09:21:00| Complete

In [3]:
from pyspark.sql.functions import when
from pyspark.sql.functions import col

df = df.withColumn(
    "Is_Delayed",
    when(col("Delay_Min") > 0, 1).otherwise(0)
)

df = df.withColumn(
    "Energy_per_Min",
    col("Energy_Consumption") / col("Processing_Time")
)

df = df.withColumn(
    "High_Machine_Load",
    when(col("Machine_Availability") < 85, 1).otherwise(0)
)

df = df.fillna({
    "Actual_Duration_Min": 0,
    "Delay_Min": 0,
    "Energy_per_Min": 0
})


df.show(10)

+------+----------+--------------+-------------+---------------+------------------+--------------------+-------------------+-------------------+-------------------+-------------------+----------+---------------------+--------------------+-------------------+---------+----------+-------------------+-----------------+
|Job_ID|Machine_ID|Operation_Type|Material_Used|Processing_Time|Energy_Consumption|Machine_Availability|    Scheduled_Start|      Scheduled_End|       Actual_Start|         Actual_End|Job_Status|Optimization_Category|Planned_Duration_Min|Actual_Duration_Min|Delay_Min|Is_Delayed|     Energy_per_Min|High_Machine_Load|
+------+----------+--------------+-------------+---------------+------------------+--------------------+-------------------+-------------------+-------------------+-------------------+----------+---------------------+--------------------+-------------------+---------+----------+-------------------+-----------------+
|  J001|       M01|      Grinding|         3.1

In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

categorical_cols = [
    "Machine_ID",
    "Operation_Type",
    "Job_Status",
    "Optimization_Category"
]

indexers = [
    StringIndexer(inputCol=c, outputCol=f"{c}_Index", handleInvalid="keep")
    for c in categorical_cols
]



feature_cols = [
    "Processing_Time",
    "Energy_Consumption",
    "Machine_Availability",
    "Planned_Duration_Min",
    "Actual_Duration_Min",
    "Delay_Min",
    "Energy_per_Min",
    "High_Machine_Load",
    "Machine_ID_Index",
    "Operation_Type_Index"
]

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)




pipeline = Pipeline(stages=indexers + [assembler])

fe_model = pipeline.fit(df)
final_df = fe_model.transform(df)


final_df=final_df.select(
    "features",
    "Job_Status_Index"
)

final_df.show()




+--------------------+----------------+
|            features|Job_Status_Index|
+--------------------+----------------+
|[76.0,11.42,96.0,...|             0.0|
|[79.0,6.61,84.0,7...|             1.0|
|[56.0,11.11,92.0,...|             2.0|
|[106.0,12.5,95.0,...|             0.0|
|[46.0,8.13,88.0,4...|             0.0|
|[100.0,13.83,86.0...|             0.0|
|[22.0,14.2,87.0,2...|             0.0|
|[79.0,13.86,91.0,...|             0.0|
|[42.0,8.97,81.0,4...|             0.0|
|[27.0,3.66,97.0,2...|             1.0|
|[65.0,9.98,90.0,6...|             0.0|
|[28.0,8.08,88.0,2...|             1.0|
|[43.0,5.55,95.0,4...|             0.0|
|[112.0,2.01,95.0,...|             0.0|
|[104.0,3.67,98.0,...|             0.0|
|[65.0,12.29,96.0,...|             0.0|
|[115.0,9.44,80.0,...|             0.0|
|[55.0,10.95,94.0,...|             0.0|
|[52.0,3.27,82.0,5...|             0.0|
|[42.0,14.19,91.0,...|             0.0|
+--------------------+----------------+
only showing top 20 rows



Training a ML Model


In [14]:
from pyspark.ml.classification import LogisticRegression

train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)

lr = LogisticRegression(
    featuresCol="features",
    labelCol="Job_Status_Index",
    maxIter=50
)

lr_model = lr.fit(train_df)


predictions = lr_model.transform(test_df)

predictions.show(10)


+--------------------+----------------+--------------------+--------------------+----------+
|            features|Job_Status_Index|       rawPrediction|         probability|prediction|
+--------------------+----------------+--------------------+--------------------+----------+
|[20.0,4.29,97.0,2...|             0.0|[33.0492472156852...|[0.72140683577082...|       0.0|
|[20.0,9.81,86.0,2...|             1.0|[33.0594798494235...|[0.79679072503260...|       0.0|
|[20.0,13.63,85.0,...|             2.0|[6.15349473156470...|[5.8722039797885E...|       2.0|
|[21.0,7.47,83.0,2...|             0.0|[33.1197740067122...|[0.78109704005819...|       0.0|
|[21.0,12.69,81.0,...|             2.0|[5.56118813954012...|[1.25634218864955...|       2.0|
|[22.0,9.51,92.0,2...|             2.0|[3.69832922740026...|[1.84589411720965...|       2.0|
|[23.0,3.92,88.0,2...|             1.0|[36.2251680527321...|[0.84187067251876...|       0.0|
|[23.0,8.52,81.0,2...|             2.0|[4.69038905149306...|[1.3552213

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="Job_Status_Index",
    predictionCol="prediction",
    metricName="accuracy"
)
accuracy = evaluator.evaluate(predictions)
print(f"Model Accuracy: {accuracy}")


+--------------------+----------------+--------------------+--------------------+----------+
|            features|Job_Status_Index|       rawPrediction|         probability|prediction|
+--------------------+----------------+--------------------+--------------------+----------+
|[20.0,4.29,97.0,2...|             0.0|[33.0492472156852...|[0.72140683577082...|       0.0|
|[20.0,9.81,86.0,2...|             1.0|[33.0594798494235...|[0.79679072503260...|       0.0|
|[20.0,13.63,85.0,...|             2.0|[6.15349473156470...|[5.8722039797885E...|       2.0|
|[21.0,7.47,83.0,2...|             0.0|[33.1197740067122...|[0.78109704005819...|       0.0|
|[21.0,12.69,81.0,...|             2.0|[5.56118813954012...|[1.25634218864955...|       2.0|
|[22.0,9.51,92.0,2...|             2.0|[3.69832922740026...|[1.84589411720965...|       2.0|
|[23.0,3.92,88.0,2...|             1.0|[36.2251680527321...|[0.84187067251876...|       0.0|
|[23.0,8.52,81.0,2...|             2.0|[4.69038905149306...|[1.3552213

In [21]:
#random_forest_classifier
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="Job_Status_Index",
    numTrees=100
)

rf_mdodel = rf.fit(train_df)
predictions_rf = rf_mdodel.transform(test_df)
predictions_rf.show(5)

accuracy_rf = evaluator.evaluate(predictions_rf)
print(f"Random Forest Model Accuracy: {accuracy_rf}")

+--------------------+----------------+--------------------+--------------------+----------+
|            features|Job_Status_Index|       rawPrediction|         probability|prediction|
+--------------------+----------------+--------------------+--------------------+----------+
|[20.0,4.29,97.0,2...|             0.0|[78.1108861654432...|[0.78110886165443...|       0.0|
|[20.0,9.81,86.0,2...|             1.0|[77.9428202601710...|[0.77942820260171...|       0.0|
|[20.0,13.63,85.0,...|             2.0|[6.65397200352587...|[0.06653972003525...|       2.0|
|[21.0,7.47,83.0,2...|             0.0|[78.0990325192983...|[0.78099032519298...|       0.0|
|[21.0,12.69,81.0,...|             2.0|[8.25501016508196...|[0.08255010165081...|       2.0|
+--------------------+----------------+--------------------+--------------------+----------+
only showing top 5 rows

Random Forest Model Accuracy: 0.7592592592592593
