In [1]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.5.1'
spark_version = 'spark-3.5.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Connecting to archive.ubuntu.com (185.125.190.82)] [1 InRelease 14.2 kB/129 kB 11%] [Connected t0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Connected to cloud.r-project.org (108.138.12                                                                                                    Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Ign:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy Release
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launch

In [2]:
# Import packages
from pyspark.sql import SparkSession
import time

# Create a SparkSession
# getOrCreate() hands back an existing session if one is already running.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [3]:
# Download the CSV files from GitHub
!wget -O california_population_by_county.csv "https://raw.githubusercontent.com/Sarah0215/Project4_Group9/main/california_population_by_county.csv"
!wget -O map_data_final.csv "https://raw.githubusercontent.com/Sarah0215/Project4_Group9/main/map_data_final.csv"
!wget -O temperature_map_data.csv "https://raw.githubusercontent.com/Sarah0215/Project4_Group9/main/temperature_map_data.csv"


--2024-07-22 22:08:48--  https://raw.githubusercontent.com/Sarah0215/Project4_Group9/main/california_population_by_county.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2717 (2.7K) [text/plain]
Saving to: ‘california_population_by_county.csv’


2024-07-22 22:08:48 (33.4 MB/s) - ‘california_population_by_county.csv’ saved [2717/2717]

--2024-07-22 22:08:48--  https://raw.githubusercontent.com/Sarah0215/Project4_Group9/main/map_data_final.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 700158 (684K) [text/plain]
Saving to: 

In [4]:
# Load the CSV files into Spark DataFrames
# No schema is supplied or inferred here, so Spark reads every column as a string
# (note the preserved leading zeros in the state/county codes of the preview).
csv_options = dict(sep=',', header=True)
population_data = spark.read.csv('california_population_by_county.csv', **csv_options)
wildfire_data = spark.read.csv('map_data_final.csv', **csv_options)
temperature_data = spark.read.csv('temperature_map_data.csv', **csv_options)

# Show the first few rows of each DataFrame
for spark_frame in (population_data, wildfire_data, temperature_data):
    spark_frame.show(5)


+--------------------+---------------+-----+-------+
|             County0|TotalPopulation|state|county3|
+--------------------+---------------+-----+-------+
|Alameda County, C...|      1663823.0|   06|    001|
|Alpine County, Ca...|         1515.0|   06|    003|
|Amador County, Ca...|        40577.0|   06|    005|
|Butte County, Cal...|       213605.0|   06|    007|
|Calaveras County,...|        45674.0|   06|    009|
+--------------------+---------------+-----+-------+
only showing top 5 rows

+-------------+-----------------+-------------------------+---------------------+----------------------------+---------------+---------------------+--------------------+----------------+-----------------------------+------------------+-----------------+-------------+--------------------+--------------------------+------------------------------+-------------------------+---------+
|incident_name|incident_is_final|incident_date_last_update|incident_date_created|incident_administrative_unit|incid

In [5]:
# Convert Spark DataFrames to Pandas DataFrames
# The three names are rebound to the converted frames, so running this cell a
# second time would otherwise call .toPandas() on pandas objects and raise
# AttributeError. Converting only objects that still expose .toPandas() keeps
# the cell safe to re-run.
def _as_pandas(frame):
    """Return ``frame.toPandas()`` when that method exists, else ``frame`` unchanged."""
    return frame.toPandas() if hasattr(frame, 'toPandas') else frame

population_data = _as_pandas(population_data)
wildfire_data = _as_pandas(wildfire_data)
temperature_data = _as_pandas(temperature_data)

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import joblib

In [7]:
# Clean and preprocess population data
# Strip the ' County, California' text from County0 so the values line up with
# the incident_county column in wildfire_data. The pattern is a plain literal,
# so it is matched as a literal string rather than as a regular expression.
county_suffix = ' County, California'
population_data['County0'] = population_data['County0'].str.replace(
    county_suffix, '', regex=False
)

In [8]:
# Merge wildfire and population data
# Left join on county name: every wildfire row is kept, and rows whose
# incident_county has no match in County0 get missing population columns.
merged_data = wildfire_data.merge(
    population_data,
    how='left',
    left_on='incident_county',
    right_on='County0',
)

In [9]:
# Drop unnecessary columns
# These join keys and incident fields are not used by the rest of the analysis.
columns_to_drop = [
    'state',
    'county3',
    'County0',
    'incident_control',
    'incident_dateonly_extinguished',
    'incident_dateonly_created',
    'incident_type',
]
merged_data = merged_data.drop(columns=columns_to_drop)

In [10]:
# Rename columns for clarity
# TotalPopulation came from the county population table, so name it accordingly.
merged_data = merged_data.rename(columns={'TotalPopulation': 'county_population'})

In [11]:
# Merge temperature data
# Only incident_id (the join key) and mean_temperature are taken from the
# temperature table; the left join keeps every row of merged_data.
temperature_by_incident = temperature_data[['incident_id', 'mean_temperature']]
final_data = merged_data.merge(temperature_by_incident, on='incident_id', how='left')
final_data.head()

Unnamed: 0,incident_name,incident_is_final,incident_date_last_update,incident_date_created,incident_administrative_unit,incident_county,incident_acres_burned,incident_containment,incident_cooperating_agencies,incident_longitude,incident_latitude,incident_id,incident_date_extinguished,is_active,county_population,mean_temperature
0,Bridge Fire,Y,2018-01-09 13:46:00+00:00,2017-10-31 11:22:00+00:00,Shasta-Trinity National Forest,Shasta,37.0,100.0,Shasta-Trinity National Forest,-122.309,40.774,2ca11d45-8139-4c16-8af0-880d99b21e82,2018-01-09 13:46:00+00:00,N,181852.0,9.712425
1,Pala Fire,Y,2020-09-16 14:07:35+00:00,2009-05-24 14:56:00+00:00,CAL FIRE San Diego Unit,San Diego,122.0,100.0,CAL FIRE San Diego Unit,1.0,1.0,8f61f461-552d-4538-b186-35ab030da416,2009-05-25 00:00:00+00:00,N,3289701.0,26.150545
2,River Fire,Y,2022-10-24 11:39:23+00:00,2013-02-24 08:16:00+00:00,CAL FIRE San Bernardino Unit,Inyo,407.0,100.0,"CAL FIRE San Bernardino Unit, Inyo County Sher...",-118.01651,36.602575,094719ba-a47b-4abb-9ec5-a506b2b9fd23,2013-02-28 20:00:00+00:00,N,18829.0,18.313614
3,Fawnskin Fire,Y,2013-04-22 09:00:00+00:00,2013-04-20 17:30:00+00:00,San Bernardino National Forest,San Bernardino,30.0,100.0,San Bernardino National Forest,-116.941311,34.288877,58f89ff8-bd3e-4355-b1c0-8fa05c747d3f,2013-04-22 09:00:00+00:00,N,2180563.0,11.841945
4,Gold Fire,Y,2013-05-01 07:00:00+00:00,2013-04-30 12:59:00+00:00,CAL FIRE Madera-Mariposa-Merced Unit,Madera,274.0,100.0,CAL FIRE Madera-Mariposa-Merced Unit,-119.635004,37.116295,357ffc13-bef9-48eb-810f-c5de851972eb,2013-05-01 07:00:00+00:00,N,157243.0,21.468794


In [12]:
# Convert date columns to datetime
# errors='coerce' turns unparseable values into NaT instead of raising.
# utc=True normalises every timestamp to UTC, so a column containing more than
# one UTC offset still becomes a proper datetime column that supports the
# subtraction and .dt accessors used in the following cells.
for date_column in ('incident_date_created', 'incident_date_extinguished'):
    final_data[date_column] = pd.to_datetime(
        final_data[date_column], errors='coerce', utc=True
    )


In [13]:
# Calculate the containment time in hours
# Elapsed time from creation to extinguishment, converted from seconds to hours.
elapsed = final_data['incident_date_extinguished'] - final_data['incident_date_created']
final_data['containment_time'] = elapsed.dt.total_seconds() / 3600

In [14]:
# Drop rows where the date conversion resulted in NaT
# A missing value in either date also leaves containment_time missing.
required_date_columns = ['incident_date_created', 'incident_date_extinguished', 'containment_time']
final_data = final_data.dropna(subset=required_date_columns)

In [15]:
# Extract additional date features from incident_date_created
# Calendar components of the creation timestamp, one column per component.
created_parts = final_data['incident_date_created'].dt
final_data['day_of_year_created'] = created_parts.dayofyear
final_data['day_of_week_created'] = created_parts.dayofweek
final_data['month_created'] = created_parts.month
final_data['year_created'] = created_parts.year

In [16]:
# Extract additional date features from incident_date_extinguished
# Calendar components of the extinguished timestamp, one column per component.
extinguished_parts = final_data['incident_date_extinguished'].dt
final_data['day_of_year_extinguished'] = extinguished_parts.dayofyear
final_data['day_of_week_extinguished'] = extinguished_parts.dayofweek
final_data['month_extinguished'] = extinguished_parts.month
final_data['year_extinguished'] = extinguished_parts.year

In [17]:
# Drop rows with missing target or feature values
# A row is removed when the target (containment_time) or any of the key feature
# columns is NaN; one dropna over all four columns removes the same rows as
# filtering on the target first and the features second.
required_columns = [
    'containment_time',
    'incident_acres_burned',
    'county_population',
    'mean_temperature',
]
final_data = final_data.dropna(subset=required_columns)

In [18]:
# Classify containment time into bins
# Hours are bucketed as short = [0, 24], medium = (24, 72], long = (72, inf).
# include_lowest=True puts exactly 0 hours in 'short'; values below 0 match no
# bin and come out as NaN.
bins = [0, 24, 72, float('inf')]
labels = ['short', 'medium', 'long']
containment_hours = final_data['containment_time']
final_data['containment_time_class'] = pd.cut(
    containment_hours,
    bins=bins,
    labels=labels,
    include_lowest=True,
)

In [19]:
# Remove any rows where the target is NaN
# Keep only rows whose containment_time fell inside one of the bins.
has_class = final_data['containment_time_class'].notna()
final_data = final_data[has_class]

In [20]:
# Select features and target variable
# NOTE(review): the *_extinguished columns are derived from the same timestamp
# that containment_time is computed from, so together with the *_created columns
# they largely reveal the target and would not be known while a fire is still
# burning. They are left in so the trained model's inputs stay unchanged —
# confirm whether this is intended.
features = [
    'incident_acres_burned', 'county_population', 'mean_temperature',
    'incident_latitude', 'incident_longitude', 'day_of_year_created',
    'day_of_week_created', 'month_created', 'year_created',
    'day_of_year_extinguished', 'day_of_week_extinguished',
    'month_extinguished', 'year_extinguished'
]
target = 'containment_time_class'

# The CSVs were read by Spark without a schema, so the columns that come straight
# from the files hold strings. Cast to float explicitly instead of relying on the
# estimator to coerce object columns; a non-numeric value raises here, at the
# point where the feature matrix is built.
X = final_data[features].astype('float64')
y = final_data[target]

# Encode categorical target
# Codes follow the category order, i.e. the order of the bin labels.
y = y.astype('category').cat.codes

In [21]:
# Ensure correct unique mapping of labels
# unique_classes lists the codes present in y; unique_classes_labels maps each
# code (the position of a label in `labels`) back to its class name.
unique_classes = y.unique()
unique_classes_labels = dict(enumerate(labels))

In [22]:
# Split the data into training and testing sets
# test_size=0.25 is scikit-learn's default hold-out fraction, spelled out here;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [23]:
# Adjust the hyperparameters for tuning
# Settings are collected in one place and then unpacked into the estimator.
rf_params = {
    'n_estimators': 1000,       # Number of trees in the forest
    'max_depth': 50,            # Maximum depth of the tree
    'min_samples_split': 10,    # Minimum number of samples required to split an internal node
    'min_samples_leaf': 5,      # Minimum number of samples required to be at a leaf node
    'max_features': 'log2',     # Number of features to consider when looking for the best split
    'random_state': 42,         # Fixed seed so the fitted forest is repeatable
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

In [24]:
# Make predictions
# Predict the encoded containment-time class for every row of the held-out test set.
y_pred = model.predict(X_test)

In [25]:
# Evaluate the model
# Pin the class codes explicitly so the rows/columns always line up with the
# names in `labels`. Without this, classification_report raises when a class is
# absent from both y_test and y_pred, because the number of observed classes no
# longer matches the number of target names.
class_codes = list(range(len(labels)))
print(classification_report(y_test, y_pred, labels=class_codes, target_names=labels))
# Rows are true classes and columns are predicted classes, in class_codes order.
print(confusion_matrix(y_test, y_pred, labels=class_codes))

              precision    recall  f1-score   support

       short       0.61      0.61      0.61        87
      medium       0.41      0.38      0.39        82
        long       0.87      0.89      0.88       305

    accuracy                           0.75       474
   macro avg       0.63      0.62      0.63       474
weighted avg       0.74      0.75      0.74       474

[[ 53  19  15]
 [ 24  31  27]
 [ 10  25 270]]


In [26]:
# Use cross-validation for a more reliable performance estimate
# Score the model over 5 folds of the training split, then report the mean
# accuracy and its standard deviation across folds.
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
mean_accuracy, accuracy_spread = cv_scores.mean(), cv_scores.std()
print(f'Cross-validated accuracy: {mean_accuracy:.2f} ± {accuracy_spread:.2f}')

Cross-validated accuracy: 0.76 ± 0.02


In [27]:
# Save the model
# Serialise the fitted forest to the working directory using pickle protocol 4.
joblib_file = "spark_cawildfire_random_forest_model.pkl"
joblib.dump(value=model, filename=joblib_file, protocol=4)

['spark_cawildfire_random_forest_model.pkl']

In [28]:
# Save the final_data DataFrame to a CSV file
# index=False leaves the row index out of the file; the file is written to the
# current working directory.
final_data.to_csv('spark_cawildfire_percent75_data.csv', index=False)

In [29]:
# Preview the first rows of the prepared dataset, including the derived columns
final_data.head()

Unnamed: 0,incident_name,incident_is_final,incident_date_last_update,incident_date_created,incident_administrative_unit,incident_county,incident_acres_burned,incident_containment,incident_cooperating_agencies,incident_longitude,...,containment_time,day_of_year_created,day_of_week_created,month_created,year_created,day_of_year_extinguished,day_of_week_extinguished,month_extinguished,year_extinguished,containment_time_class
0,Bridge Fire,Y,2018-01-09 13:46:00+00:00,2017-10-31 11:22:00+00:00,Shasta-Trinity National Forest,Shasta,37.0,100.0,Shasta-Trinity National Forest,-122.309,...,1682.4,304,1,10,2017,9,1,1,2018,long
1,Pala Fire,Y,2020-09-16 14:07:35+00:00,2009-05-24 14:56:00+00:00,CAL FIRE San Diego Unit,San Diego,122.0,100.0,CAL FIRE San Diego Unit,1.0,...,9.066667,144,6,5,2009,145,0,5,2009,short
2,River Fire,Y,2022-10-24 11:39:23+00:00,2013-02-24 08:16:00+00:00,CAL FIRE San Bernardino Unit,Inyo,407.0,100.0,"CAL FIRE San Bernardino Unit, Inyo County Sher...",-118.01651,...,107.733333,55,6,2,2013,59,3,2,2013,long
3,Fawnskin Fire,Y,2013-04-22 09:00:00+00:00,2013-04-20 17:30:00+00:00,San Bernardino National Forest,San Bernardino,30.0,100.0,San Bernardino National Forest,-116.941311,...,39.5,110,5,4,2013,112,0,4,2013,medium
4,Gold Fire,Y,2013-05-01 07:00:00+00:00,2013-04-30 12:59:00+00:00,CAL FIRE Madera-Mariposa-Merced Unit,Madera,274.0,100.0,CAL FIRE Madera-Mariposa-Merced Unit,-119.635004,...,18.016667,120,1,4,2013,121,2,5,2013,short
