In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, lit, when, exp, radians, cos, sin, mean, avg, max as spark_max, min as spark_min,
    sum as spark_sum, lag
)
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.functions import year
import matplotlib.pyplot as plt
from pyspark.sql.functions import count, desc
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Data Loading

In [None]:
# Create (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("wildfire project")
    .getOrCreate()
)

# Directory holding all of the processed_data part files (part-00000-d930..., etc.).
data_dir = "/content/data"

# Read every CSV part under data_dir with a header row and inferred column
# types, and cache the result because the cells below scan it repeatedly.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(data_dir).cache()

# Uncomment to preview the raw rows:
# df.show(10)

In [None]:
# Columns to keep: the weather/derived features plus the raw label column "in_modis".
columns_to_keep = [
    "new_cumulative_precipitation", "et0_fao_evapotranspiration", "temperature_2m_mean", "shortwave_radiation_sum",
    "temperature_2m_min", "temperature_2m_max", "new_dryness_index", "daylight_duration",
    "new_relative_humidity", "new_precip_radiation_ratio", "wind_gusts_10m_max",
    "sunshine_duration", "weather_latitude", "weather_longitude","wind_speed_10m_max","in_modis"
]

# TODO: consider adding a seasonal feature. Wildfire likely depends on the time
# of year, so one option is to extract the month from 'date' and bucket it into
# spring / summer / fall / winter (fires presumably peak in summer and fall).

# Keep only the desired columns that actually exist in the loaded data.
# The loop variable is deliberately not called `col`, so it cannot be confused
# with the imported pyspark `col` function.
available_columns = set(df.columns)
selected_columns = [name for name in columns_to_keep if name in available_columns]

# Make the cell safe to re-run: after a first run the label is already called
# "wildfire" and "in_modis" is gone, so without this the select below would
# silently drop the label column.
if "in_modis" not in available_columns and "wildfire" in available_columns:
    selected_columns.append("wildfire")

df = df.select(selected_columns)

# Rename the label; withColumnRenamed is a no-op when "in_modis" is absent.
df = df.withColumnRenamed("in_modis", "wildfire")

df.show(10)

+----------------------------+--------------------------+-------------------+-----------------------+------------------+------------------+------------------+-----------------+---------------------+--------------------------+------------------+-----------------+----------------+-----------------+------------------+--------+
|new_cumulative_precipitation|et0_fao_evapotranspiration|temperature_2m_mean|shortwave_radiation_sum|temperature_2m_min|temperature_2m_max| new_dryness_index|daylight_duration|new_relative_humidity|new_precip_radiation_ratio|wind_gusts_10m_max|sunshine_duration|weather_latitude|weather_longitude|wind_speed_10m_max|wildfire|
+----------------------------+--------------------------+-------------------+-----------------------+------------------+------------------+------------------+-----------------+---------------------+--------------------------+------------------+-----------------+----------------+-----------------+------------------+--------+
|                     

In [None]:
# Encode the label as an integer: 1 when the flag is true, 0 otherwise (a null flag also maps to 0).
# The explicit cast to boolean leaves a boolean column unchanged and lets the
# cell be re-run once the column already holds 0/1, where a bare
# `when(col("wildfire"), ...)` would fail because the condition is no longer boolean.
df = df.withColumn("wildfire", when(col("wildfire").cast("boolean"), 1).otherwise(0))

In [None]:
# Preview the first 10 rows now that the label has been encoded.
df.show(n=10)

+----------------------------+--------------------------+-------------------+-----------------------+------------------+------------------+------------------+-----------------+---------------------+--------------------------+------------------+-----------------+----------------+-----------------+------------------+--------+
|new_cumulative_precipitation|et0_fao_evapotranspiration|temperature_2m_mean|shortwave_radiation_sum|temperature_2m_min|temperature_2m_max| new_dryness_index|daylight_duration|new_relative_humidity|new_precip_radiation_ratio|wind_gusts_10m_max|sunshine_duration|weather_latitude|weather_longitude|wind_speed_10m_max|wildfire|
+----------------------------+--------------------------+-------------------+-----------------------+------------------+------------------+------------------+-----------------+---------------------+--------------------------+------------------+-----------------+----------------+-----------------+------------------+--------+
|                     

# Data Cleaning


In [None]:
# Check for null values in each column.
# All per-column null counts are computed in ONE aggregation (a single Spark
# job) instead of running a separate filter+count job for every column.
null_count_row = df.select(
    [F.count(F.when(F.col(name).isNull(), 1)).alias(name) for name in df.columns]
).first()

# The aggregated row has one count per column, in df.columns order.
for col_name, num_nulls in zip(df.columns, null_count_row):
    if num_nulls > 0:
        print(f"Column '{col_name}' has {num_nulls} null values.")

# Drop every row that has a null in any column.
df = df.dropna()

print("na has been removed")

Column 'new_dryness_index' has 1 null values.
na has been removed


In [None]:
# Checking for class imbalance
wildfire_count = df.filter(col("wildfire") == 1).count()
non_wildfire_count = df.filter(col("wildfire") == 0).count()

# calculate the ratio
imbalance_ratio = wildfire_count / non_wildfire_count

print(f"Wildfire count: {wildfire_count}")
print(f"Non-wildfire count: {non_wildfire_count}")
print(f"Imbalance ratio: {imbalance_ratio}")

Wildfire count: 38082
Non-wildfire count: 61595
Imbalance ratio: 0.6182644695186298


The ratio of wildfire to non-wildfire samples is about 0.62 (38,082 vs. 61,595), so the classes are only mildly imbalanced and this is unlikely to have a significant impact on the prediction.

# Data Split

In [None]:
from pyspark.ml.feature import VectorAssembler

# 80/20 train/validation split with a fixed seed so the split is reproducible.
train_df, validation_df = df.randomSplit([0.8, 0.2], seed=123)

# The training and evaluation cells below read `train_data` / `validation_data`
# and a vector column named "features", none of which were created anywhere in
# this notebook (a fresh-kernel run failed with NameError). Build them here.
# NOTE(review): the original feature-assembly step is not in this notebook; this
# assumes every non-label column is a numeric model input and that no extra
# scaling step was applied -- confirm against the run that produced the recorded metrics.
feature_cols = [name for name in df.columns if name != "wildfire"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

train_data = assembler.transform(train_df)
validation_data = assembler.transform(validation_df)

# Training

In [None]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Base estimator for the grid search. `svm` was referenced below but never
# defined in this notebook; the recorded best-parameter output (featuresCol
# "features", labelCol "wildfire", threshold 0.0, no probability column) matches
# a LinearSVC with default settings.
# NOTE(review): confirm no other non-default settings were used originally.
svm = LinearSVC(featuresCol="features", labelCol="wildfire")

# Parameter tuning: 3 x 3 grid over iteration count and regularization strength.
paramGrid = (
    ParamGridBuilder()
    .addGrid(svm.maxIter, [5, 10, 20])
    .addGrid(svm.regParam, [0.01, 0.1, 1.0])
    .build()
)

# Candidates are ranked by area under the ROC curve (the evaluator's default
# metric, spelled out here for clarity).
evaluator = BinaryClassificationEvaluator(
    labelCol="wildfire", rawPredictionCol="rawPrediction", metricName="areaUnderROC"
)

# 3-fold cross-validation; the seed fixes the fold assignment so that the
# selected model is reproducible across kernel restarts.
cv = CrossValidator(
    estimator=svm, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3, seed=123
)

# `train_data` must carry the "features" vector column and the "wildfire" label.
cvModel = cv.fit(train_data)
svm_model = cvModel.bestModel

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Base estimator for the grid search. `lr` was referenced below but never
# defined in this notebook; the recorded best-parameter output (featuresCol
# "features", labelCol "wildfire", family "auto", threshold 0.5) matches a
# LogisticRegression with default settings.
# NOTE(review): confirm no other non-default settings were used originally.
lr = LogisticRegression(featuresCol="features", labelCol="wildfire")

# Parameter tuning: 3 x 3 x 3 grid over iterations, regularization strength and
# the L1/L2 mix (elasticNetParam 0.0 = pure L2, 1.0 = pure L1).
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [10, 20, 50])
    .addGrid(lr.regParam, [0.01, 0.1, 0.5])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# Candidates are ranked by area under the ROC curve (the evaluator's default
# metric, spelled out here for clarity).
evaluator = BinaryClassificationEvaluator(
    labelCol="wildfire", rawPredictionCol="rawPrediction", metricName="areaUnderROC"
)

# 3-fold cross-validation; the seed fixes the fold assignment so that the
# selected model is reproducible across kernel restarts.
cv = CrossValidator(
    estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3, seed=123
)

# `train_data` must carry the "features" vector column and the "wildfire" label.
cvModel = cv.fit(train_data)

lr_model = cvModel.bestModel

In [None]:
# Report the parameters of the best model selected for each estimator.

svm_best_params = svm_model.extractParamMap()
lr_best_params = lr_model.extractParamMap()

# One (heading, parameter map) pair per model, printed with a shared loop.
reports = (
    ("Best parameters for SVM:", svm_best_params),
    ("\nBest parameters for Logistic Regression:", lr_best_params),
)

for heading, best_params in reports:
    print(heading)
    for param, value in best_params.items():
        print(f"{param.name}: {value}")

Best parameters for SVM:
aggregationDepth: 2
featuresCol: features
fitIntercept: True
labelCol: wildfire
maxBlockSizeInMB: 0.0
maxIter: 20
predictionCol: prediction
rawPredictionCol: rawPrediction
regParam: 0.01
standardization: True
threshold: 0.0
tol: 1e-06

Best parameters for Logistic Regression:
aggregationDepth: 2
elasticNetParam: 0.0
family: auto
featuresCol: features
fitIntercept: True
labelCol: wildfire
maxBlockSizeInMB: 0.0
maxIter: 50
predictionCol: prediction
probabilityCol: probability
rawPredictionCol: rawPrediction
regParam: 0.01
standardization: True
threshold: 0.5
tol: 1e-06


# Performance

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Build each evaluator once and reuse it for both models.
auc_evaluator = BinaryClassificationEvaluator(
    labelCol='wildfire', rawPredictionCol='rawPrediction', metricName='areaUnderROC'
)
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol='wildfire', predictionCol='prediction', metricName='accuracy'
)

# Score the validation set with each tuned model.
svm_predictions = svm_model.transform(validation_data)
lr_predictions = lr_model.transform(validation_data)

# SVM metrics
svm_auc = auc_evaluator.evaluate(svm_predictions)
svm_accuracy = accuracy_evaluator.evaluate(svm_predictions)

# Logistic Regression metrics
lr_auc = auc_evaluator.evaluate(lr_predictions)
lr_accuracy = accuracy_evaluator.evaluate(lr_predictions)

# Leave `evaluator` bound to an accuracy evaluator, which is what this cell
# ends with, in case later code reads that name.
evaluator = accuracy_evaluator

# Print the results
for label, auc_value, accuracy_value in (
    ("Tuned SVM", svm_auc, svm_accuracy),
    ("Tuned Logistic Regression", lr_auc, lr_accuracy),
):
    print(f"{label} AUC: {auc_value}")
    print(f"{label} Accuracy: {accuracy_value}")

Tuned SVM AUC: 0.8534608156446623
Tuned SVM Accuracy: 0.785423473234481
Tuned Logistic Regression AUC: 0.8597033542072159
Tuned Logistic Regression Accuracy: 0.777029404372958
