### AIT 614 - Big Data Essentials <br>
#### Project Title: FHWA Bridge Conditions Analysis Using Big Data Techniques
#### 2. Spark MLib
#### TEAM 4
<hr>

Course Section #: AIT 614 - 003 <br>
#### Team Members
1. Aryan Patel Kolagani - G01517560 <br>
2. Rithvik Madhavaram - G01501806 <br>
3. Chetan Muppavarapu - G01504057 <br>
4. Srivaths Nrusimha Rao Chengal - G01512113 <br>
5. Vaibhav Hasu - G01517039 <br>

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, col
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Loading dataset
df = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/akolagan@gmu.edu/FHWA_Bridge_Conditions_Dataset.csv")

# Cleaning all numeric columns
columns_to_clean = [
    "Age_Years", "Daily_Traffic", "Repair_Cost_USD", "Truck_Percentage", 
    "Deterioration_Rate", "Deck_Condition", "Length_Meters", "Repair_Time_Days"
]

for col_name in columns_to_clean:
    df = df.withColumn(f"{col_name}_clean", regexp_replace(col(col_name), "[$,%]", ""))
    df = df.withColumn(f"{col_name}_clean", col(f"{col_name}_clean").cast("double"))

# Droping rows with nulls in key fields
df_clean = df.dropna(subset=[f"{c}_clean" for c in columns_to_clean] + ["Material"])

# Renaming cleaned columns
df_clean = df_clean \
    .drop(*columns_to_clean) \
    .withColumnRenamed("Age_Years_clean", "Age_Years") \
    .withColumnRenamed("Daily_Traffic_clean", "Daily_Traffic") \
    .withColumnRenamed("Repair_Cost_USD_clean", "Repair_Cost_USD") \
    .withColumnRenamed("Truck_Percentage_clean", "Truck_Percentage") \
    .withColumnRenamed("Deterioration_Rate_clean", "Deterioration_Rate") \
    .withColumnRenamed("Deck_Condition_clean", "Deck_Condition") \
    .withColumnRenamed("Length_Meters_clean", "Length_Meters") \
    .withColumnRenamed("Repair_Time_Days_clean", "Repair_Time_Days")

# Encoding the 'Material' column
indexer = StringIndexer(inputCol="Material", outputCol="Material_Indexed")
df_encoded = indexer.fit(df_clean).transform(df_clean)

# Assembling feature vector
feature_cols = [
    "Age_Years", "Daily_Traffic", "Repair_Cost_USD", "Truck_Percentage", 
    "Deterioration_Rate", "Length_Meters", "Repair_Time_Days", "Material_Indexed"
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_features = assembler.transform(df_encoded).select("Bridge_ID", "Deck_Condition", "features")

# KMeans Clustering
kmeans = KMeans(k=4, seed=42, featuresCol="features")
kmeans_model = kmeans.fit(df_features)
df_clustered = kmeans_model.transform(df_features)

# Saving cluster output 
df_clustered.select("Bridge_ID", "Deck_Condition", "prediction") \
    .write.mode("overwrite").csv("/dbfs/FileStore/bridge_cluster_output.csv", header=True)

# Decision tree regression
df_regression = df_clustered.drop("prediction")

train_dt, test_dt = df_regression.randomSplit([0.8, 0.2], seed=123)
dt = DecisionTreeRegressor(featuresCol="features", labelCol="Deck_Condition")
dt_model = dt.fit(train_dt)
dt_predictions = dt_model.transform(test_dt)

# Evaluating Decision Tree
evaluator = RegressionEvaluator(labelCol="Deck_Condition", predictionCol="prediction")
rmse_dt = evaluator.setMetricName("rmse").evaluate(dt_predictions)
r2_dt = evaluator.setMetricName("r2").evaluate(dt_predictions)

print(f"ðŸŒ² Decision Tree RMSE: {rmse_dt}")
print(f"ðŸŒ² Decision Tree RÂ²: {r2_dt}")

# Saving Decision Tree predictions
dt_predictions.select("Deck_Condition", "prediction") \
    .write.mode("overwrite").csv("/dbfs/FileStore/deck_condition_predictions_dt.csv", header=True)

# Random Forest Regression
train_rf, test_rf = df_regression.randomSplit([0.8, 0.2], seed=123)
rf = RandomForestRegressor(featuresCol="features", labelCol="Deck_Condition", numTrees=50)
rf_model = rf.fit(train_rf)
rf_predictions = rf_model.transform(test_rf)

# Evaluating Random Forest
rmse_rf = evaluator.setMetricName("rmse").evaluate(rf_predictions)
r2_rf = evaluator.setMetricName("r2").evaluate(rf_predictions)

print(f"ðŸŒ²ðŸŒ² Random Forest RMSE: {rmse_rf}")
print(f"ðŸŒ²ðŸŒ² Random Forest RÂ²: {r2_rf}")

# Saving Random Forest predictions
rf_predictions.select("Deck_Condition", "prediction") \
    .write.mode("overwrite").csv("/dbfs/FileStore/deck_condition_predictions_rf.csv", header=True)

# Feature importances
print("ðŸŽ¯ Random Forest Feature Importances:")
print(rf_model.featureImportances)

ðŸŒ² Decision Tree RMSE: 1.7279883119350314
ðŸŒ² Decision Tree RÂ²: -0.06034006342289566
ðŸŒ²ðŸŒ² Random Forest RMSE: 1.6827032018629609
ðŸŒ²ðŸŒ² Random Forest RÂ²: -0.005491982326457423
ðŸŽ¯ Random Forest Feature Importances:
(8,[0,1,2,3,4,5,6,7],[0.12403230123124924,0.13536675037554405,0.12376594697290043,0.11523633305261992,0.16931762920872434,0.12251391006732983,0.1285627380364993,0.08120439105513279])


**Clustered Dataframe**

In [0]:
print("Clustered DataFrame:")
df_clustered.show(5)
print("Random Forest Predictions:")
rf_predictions.select("Deck_Condition", "prediction").show(5)

Clustered DataFrame:
+---------+--------------+--------------------+----------+
|Bridge_ID|Deck_Condition|            features|prediction|
+---------+--------------+--------------------+----------+
|   100000|           6.0|[17.0,12899.0,431...|         0|
|   100001|           6.0|[52.0,47458.0,188...|         1|
|   100002|           3.0|[35.0,68505.0,364...|         2|
|   100003|           4.0|[60.0,10563.0,182...|         1|
|   100004|           8.0|[90.0,157293.0,18...|         1|
+---------+--------------+--------------------+----------+
only showing top 5 rows

Random Forest Predictions:
+--------------+------------------+
|Deck_Condition|        prediction|
+--------------+------------------+
|           3.0| 5.497897332790483|
|           8.0| 5.365861276270184|
|           7.0| 5.386994707482964|
|           4.0| 6.255114718875766|
|           3.0|5.5322452783240115|
+--------------+------------------+
only showing top 5 rows

