Get the closest weather station for each fire station

In [1]:
import numpy as np
import pandas as pd

# Load CSV files (paths are relative to the notebook's working directory)
# fire_stations: find_closest() below reads its fire_id, fire_name, lat and lon columns.
fire_stations = pd.read_csv("all fire stations in area_ fire.csv")
# weather_stations: find_closest() below reads its weather_id, lat and lon columns.
weather_stations = pd.read_csv("ghcnd-stations.csv")

def dist(x, y):
    """Return the straight-line (Euclidean) distance between two 2-D points.

    Only the first two components of ``x`` and ``y`` are used. The result is
    expressed in the same units as the inputs; no geographic conversion is
    applied.
    """
    delta_first = x[0] - y[0]
    delta_second = x[1] - y[1]
    return np.sqrt(delta_first**2 + delta_second**2)

def find_closest(fire_stations: pd.DataFrame, weather_stations: pd.DataFrame) -> pd.DataFrame:
    """
    Find the closest weather station to each fire station.

    Distance is the plain Euclidean distance between (lat, lon) pairs, i.e. it
    is measured in coordinate units, not kilometres. The scan over the weather
    stations is vectorised with NumPy, so the cost per fire station is one
    array operation instead of a Python-level loop over every weather station.

    Parameters:
        fire_stations (pd.DataFrame): DataFrame with columns ['fire_id', 'fire_name', 'lat', 'lon']
        weather_stations (pd.DataFrame): DataFrame with columns ['weather_id', 'lat', 'lon']

    Returns:
        pd.DataFrame: DataFrame with ['fire_id', 'fire_name', 'closest_weather_id', 'distance'],
        one row per fire station, in the input order. When no weather station
        yields a comparable distance (no weather stations at all, or every
        distance is NaN/inf), 'closest_weather_id' is None and 'distance' is inf.
    """
    # Pull the weather-station columns out once; they are reused for every fire station.
    weather_lat = weather_stations["lat"].to_numpy(dtype=float)
    weather_lon = weather_stations["lon"].to_numpy(dtype=float)
    weather_ids = weather_stations["weather_id"].to_numpy()

    results = []

    for fire_id, fire_name, fire_lat, fire_lon in zip(
        fire_stations["fire_id"],
        fire_stations["fire_name"],
        fire_stations["lat"],
        fire_stations["lon"],
    ):
        min_dist = float("inf")
        closest_weather = None

        if weather_ids.size:
            distances = np.sqrt((fire_lat - weather_lat) ** 2 + (fire_lon - weather_lon) ** 2)
            # A NaN distance can never be "the smallest"; map it to inf so that
            # argmin ignores it unless nothing better exists.
            distances = np.where(np.isnan(distances), np.inf, distances)
            # argmin returns the first occurrence of the minimum, so ties are
            # resolved in favour of the earlier weather station.
            best = int(np.argmin(distances))
            if distances[best] < min_dist:
                min_dist = float(distances[best])
                closest_weather = weather_ids[best]

        results.append([fire_id, fire_name, closest_weather, min_dist])

    # Convert results to DataFrame
    closest_df = pd.DataFrame(results, columns=["fire_id", "fire_name", "closest_weather_id", "distance"])
    return closest_df

# Run function
# Match every fire station to its nearest weather station and print the resulting table.
closest_df = find_closest(fire_stations, weather_stations)
print(closest_df)



KeyboardInterrupt: 

In [None]:
# Second name for the fire-station -> weather-station table (same object, not a copy).
fireweather_conv = closest_df

Fetch the weather data from the NOAA server (over HTTPS) for each station ID in the list

In [None]:
# Station IDs to download. Several fire stations can share the same nearest
# weather station, and a fire station may have no match at all, so missing
# values and repeated IDs are removed to request each station file only once.
weatherstn_list = closest_df['closest_weather_id'].dropna().drop_duplicates()


In [None]:
import os
import requests

# Base URL of the per-station archive (files are fetched over HTTPS)
base_url = "https://www.ncei.noaa.gov/pub/data/ghcn/daily/by_station/"

# Directory to save downloaded files
download_dir = "weather_ftpfetched"
os.makedirs(download_dir, exist_ok=True)

# dict.fromkeys() removes repeated station IDs while keeping their order, so
# each file is requested at most once.
for station_id in dict.fromkeys(weatherstn_list):
    file_name = f"{station_id}.csv.gz"  # NOAA files are in .csv.gz format
    file_url = base_url + file_name
    local_file_path = os.path.join(download_dir, file_name)
    # Stream into a temporary name first so an interrupted transfer never
    # leaves a truncated file under the final ".csv.gz" name.
    partial_path = local_file_path + ".part"

    try:
        # The timeout stops one unresponsive request from hanging the cell;
        # the context manager releases the streamed connection in every case.
        with requests.get(file_url, stream=True, timeout=60) as response:
            if response.status_code == 200:
                with open(partial_path, "wb") as file:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        file.write(chunk)
                os.replace(partial_path, local_file_path)
                print(f"Downloaded: {file_name}")
            else:
                print(f"Failed to download: {file_name} (Status Code: {response.status_code})")
    except requests.RequestException as exc:
        # A network failure for one station is reported and the loop moves on
        # to the next station instead of aborting the whole download.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Failed to download: {file_name} ({exc})")



convert csv.gz to csv

In [None]:
import os
import gzip
import pandas as pd

# Define the source folder containing .csv.gz files
data_folder = "weather_ftpfetched"

# Define the destination folder for converted .csv files
converted_folder = "weathercsv_converted"

# Create the converted folder if it doesn't exist
os.makedirs(converted_folder, exist_ok=True)

# Get a list of all .csv.gz files in the data folder
all_files = [f for f in os.listdir(data_folder) if f.endswith('.csv.gz')]

# Process each .csv.gz file
for file in all_files:
    input_path = os.path.join(data_folder, file)  # Full path to input file
    output_filename = file.replace(".csv.gz", ".csv")  # Change file extension
    output_path = os.path.join(converted_folder, output_filename)  # Full path to output file

    # Open the .gz file and read it using pandas
    with gzip.open(input_path, 'rt', encoding='utf-8') as f:  # Read in text mode
        try:
            # Read the CSV file, skipping bad lines.
            # header=None: the per-station files are expected to start directly
            # with data (no header row), so the first line must be read as an
            # observation instead of being consumed as column names.
            # NOTE(review): confirm against the NOAA by_station readme.
            df = pd.read_csv(f, header=None, low_memory=False, on_bad_lines='skip', sep=',')

            # Print row count for debugging
            print(f"✅ Read {file} with {len(df)} rows.")

            # Save the converted .csv file (the positional column labels
            # 0, 1, 2, ... are written as the header line)
            df.to_csv(output_path, index=False)

            print(f"📁 Saved converted file to: {output_path}")

        except Exception as e:
            # Best effort: report the failing file and carry on with the rest
            print(f"❌ Error processing {file}: {e}")


Next step: give every CSV the same four columns so that they can be combined into one

In [None]:
import os
import pandas as pd

# Folder holding the converted per-station CSV files
data_folder = "weathercsv_converted"

# Every file in that folder whose name ends in ".csv"
csv_files = [f for f in os.listdir(data_folder) if f.endswith(".csv")]

# Names given to the first four columns of each file
expected_columns = ['id', 'date', 'obs', 'obs_value']

# Rewrite each file in place so that all of them share the same four columns
for file in csv_files:
    file_path = os.path.join(data_folder, file)

    try:
        df = pd.read_csv(file_path,low_memory=False)

        # Remove the columns whose name starts with 'Unnamed'
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

        # Guard clause: a file with fewer than four columns is left untouched
        if len(df.columns) < 4:
            print(f"Skipping {file}: Not enough columns to modify.")
            continue

        # Keep the first four columns, rename them, and overwrite the file
        df = df.iloc[:, :4]
        df.columns = expected_columns
        df.to_csv(file_path, index=False)
        print(f"Fixed and saved {file}")

    except Exception as e:
        # Report the failing file and continue with the remaining ones
        print(f"Error processing {file}: {e}")

# After this, all CSVs in the folder should have the same structure


Fixed and saved US1CAAM0003.csv
Fixed and saved US1CACL0001.csv
Fixed and saved US1CADN0012.csv
Fixed and saved US1CAFR0033.csv
Fixed and saved US1CAHM0029.csv
Fixed and saved US1CAHM0144.csv
Fixed and saved US1CALK0018.csv
Fixed and saved US1CAMD0033.csv
Fixed and saved US1CAMR0002.csv
Fixed and saved US1CAMR0011.csv
Fixed and saved US1CASC0006.csv
Fixed and saved US1CASD0026.csv
Fixed and saved US1CASK0016.csv
Fixed and saved US1CASL0040.csv
Fixed and saved US1CASU0005.csv
Fixed and saved US1CASZ0043.csv
Fixed and saved US1CAVT0017.csv
Fixed and saved US1CAVT0031.csv
Fixed and saved USC00040134.csv
Fixed and saved USC00040161.csv
Fixed and saved USC00040204.csv
Fixed and saved USC00040332.csv
Fixed and saved USC00040543.csv
Fixed and saved USC00040798.csv
Fixed and saved USC00041018.csv
Fixed and saved USC00041075.csv
Fixed and saved USC00041784.csv
Fixed and saved USC00041799.csv
Fixed and saved USC00041805.csv
Fixed and saved USC00041906.csv
Fixed and saved USC00042027.csv
Fixed an

combine all the weather dataset

In [8]:
import os
import pandas as pd

# Folder containing the per-station CSV files
data_folder = "weathercsv_converted"

# Names of all ".csv" files in that folder
csv_files = [f for f in os.listdir(data_folder) if f.endswith(".csv")]

# Column names applied to the first four columns of every file
expected_columns = ['id', 'date', 'obs', 'obs_value']

# Per-file DataFrames are collected here and concatenated once after the loop
dataframes = []

for file in csv_files:
    file_path = os.path.join(data_folder, file)

    try:
        df = pd.read_csv(file_path, low_memory=False)

        # Remove the columns whose name starts with 'Unnamed'
        df = df.loc[:, ~df.columns.str.contains('^Unnamed', na=False)]

        # Guard clause: files with fewer than four columns are not included
        if len(df.columns) < 4:
            print(f"Skipping {file}: Not enough columns.")
            continue

        # Keep the first four columns under the shared names and collect the frame
        df = df.iloc[:, :4]
        df.columns = expected_columns
        dataframes.append(df)
        print(f"Added {file} to combined dataset.")

    except Exception as e:
        # Report the failing file and continue with the remaining ones
        print(f"Error processing {file}: {e}")

# Build the combined frame; fall back to an empty frame with the expected columns
if not dataframes:
    combined_df = pd.DataFrame(columns=expected_columns)
    print("No valid CSV files found to combine.")
else:
    combined_df = pd.concat(dataframes, ignore_index=True)
    print("All CSV files successfully combined into a single DataFrame.")

# Show the first few rows
print(combined_df.head())

# `combined_df` now holds the full dataset


Added US1CAAM0003.csv to combined dataset.
Added US1CACL0001.csv to combined dataset.
Added US1CADN0012.csv to combined dataset.
Added US1CAFR0033.csv to combined dataset.
Added US1CAHM0029.csv to combined dataset.
Added US1CAHM0144.csv to combined dataset.
Added US1CALK0018.csv to combined dataset.
Added US1CAMD0033.csv to combined dataset.
Added US1CAMR0002.csv to combined dataset.
Added US1CAMR0011.csv to combined dataset.
Added US1CASC0006.csv to combined dataset.
Added US1CASD0026.csv to combined dataset.
Added US1CASK0016.csv to combined dataset.
Added US1CASL0040.csv to combined dataset.
Added US1CASU0005.csv to combined dataset.
Added US1CASZ0043.csv to combined dataset.
Added US1CAVT0017.csv to combined dataset.
Added US1CAVT0031.csv to combined dataset.
Added USC00040134.csv to combined dataset.
Added USC00040161.csv to combined dataset.
Added USC00040204.csv to combined dataset.
Added USC00040332.csv to combined dataset.
Added USC00040543.csv to combined dataset.
Added USC00

pivot weather data

In [14]:
# Reshape long -> wide: one row per (station id, date), one column per
# observation type, cells filled from 'obs_value'.
weather_pivoted = combined_df.pivot(
    index=["id", "date"],
    columns="obs",
    values="obs_value",
)

Load the fire dataset

In [2]:
fire_data = pd.read_csv(r'fire_data.csv')

# Drop columns if they exist (errors='ignore' skips names that are absent)
columns_to_drop = ['CONT_DATE', '_id']
fire_data = fire_data.drop(columns=columns_to_drop, errors='ignore')

# Remove incomplete rows only AFTER the unused columns are gone, so that a
# record is not discarded merely because a column that is dropped anyway
# (e.g. CONT_DATE) was empty.
fire_data = fire_data.dropna()

In [4]:
# Attach the matching weather station id to every fire record
closest_df = pd.read_csv(r'fire to weather stn conversion.csv')
station_by_fire_name = closest_df.set_index("fire_name")["closest_weather_id"]
fire_data["weatherstn"] = fire_data["Fire station name"].map(station_by_fire_name)
fire_data = fire_data.set_index('weatherstn')

# Distinct fire-station names present in the fire data; the count is the cell output
allfirestn = fire_data['Fire station name'].unique().tolist()
fire_data['Fire station name'].nunique()


102

In [5]:
# Author's note: the fire data references more station names (103) than the
# fire-station list contains (83), so the list would have to be redone and all
# the code re-run.
# Good news: only the fire-station data has to change.

# First, find which station names in the fire data are missing from the
# fire-station list, so they can be handled.

missing_stn = list(set(allfirestn) - set(fire_stations["fire_name"]))
print("Missing IDs:", missing_stn)

# Author's finding after checking: the missing stations do not belong to California,
# so they are simply dropped from the fire dataset.


# Keep only the rows whose 'Fire station name' is NOT in missing_stn
df_filtered = fire_data[~fire_data['Fire station name'].isin(missing_stn)]

fire_data=df_filtered

Missing IDs: ['Colorado River District', 'Siskiyou National Forest', 'Mojave - NPS', 'Fort Yuma Agency', 'Carson City District - BLM', 'Golden Gate National Recreation Area - NPS', 'Orange County', 'CA Desert District - BLM', 'Bakersfield District - BLM (retired code)', 'Beale Air Force Base FD', 'Colorado River Agency', 'Lakeview District', 'Central CA District - BLM', 'Fremont National Forest', 'City of Weed Vol. Fire Dept.', 'Sequoia - Kings Canyon NP', 'San Diego CAL FIRE (retired code)', 'San Diego CAL FIRE', 'Northern CA District - BLM']


Reshaping data, filling in missing with NaN

In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
# Inputs expected from earlier cells:
#   weather_pivoted - weather data reshaped to one column per observation type
#   combined_df     - the same weather data left unpivoted (author's note: possibly better for ML?)
#   fire_data       - fire data with the weather station id

# Turn the index of the pivoted table back into ordinary columns.
weather_dataall= weather_pivoted.reset_index()



NameError: name 'weather_pivoted' is not defined

In [7]:
# Try merging with the pivoted weather data (index levels restored to columns).
# Convert both date columns to datetime. The format is '%Y%m%d', the same one
# the long-format merge cell uses for these columns.
# NOTE(review): confirm that both sources really store dates as YYYYMMDD.
# The dtype guards make the cell safe to re-run: converting an already-parsed
# datetime column through str + '%Y%m%d' would coerce every value to NaT.
if not pd.api.types.is_datetime64_any_dtype(weather_dataall['date']):
    weather_dataall['date'] = pd.to_datetime(weather_dataall['date'].astype(str), format='%Y%m%d', errors='coerce')
if not pd.api.types.is_datetime64_any_dtype(fire_data['ALARM_DATE']):
    fire_data['ALARM_DATE'] = pd.to_datetime(fire_data['ALARM_DATE'].astype(str), format='%Y%m%d', errors='coerce')

# Left merge: every weather row is kept; the fire columns are NaN where no fire
# record matches the same station and date.
weather_fire_pivotmerge = pd.merge(weather_dataall, fire_data, left_on=['id', 'date'], right_on=['weatherstn', 'ALARM_DATE'], how='left')
# Add a 'fire_occurred' column: 1 where a fire matched (Shape__Area is not NaN), 0 otherwise
weather_fire_pivotmerge['fire_occurred'] = weather_fire_pivotmerge['Shape__Area'].notna().astype(int)

NameError: name 'weather_dataall' is not defined

In [13]:
# Try merging with the unpivoted ("tall and skinny") weather data
unpivot_weather = pd.read_csv(r'combined_df.csv')
# Convert both date columns to datetime with the correct format.
# unpivot_weather is freshly read above, so its 'date' column is always parsed here.
unpivot_weather['date'] = pd.to_datetime(unpivot_weather['date'].astype(str), format='%Y%m%d', errors='coerce')
# fire_data persists across cells: parse ALARM_DATE only while it is still raw.
# Re-parsing an already-converted datetime column through str + '%Y%m%d' with
# errors='coerce' would silently turn every date into NaT and no fire would match.
if not pd.api.types.is_datetime64_any_dtype(fire_data['ALARM_DATE']):
    fire_data['ALARM_DATE'] = pd.to_datetime(fire_data['ALARM_DATE'].astype(str), format='%Y%m%d', errors='coerce')

# Left merge: every weather row is kept; the fire columns are NaN where no fire
# record matches the same station and date.
unpivotweather_fire_merge = pd.merge(unpivot_weather, fire_data, left_on=['id', 'date'], right_on=['weatherstn', 'ALARM_DATE'], how='left')
# Add a 'fire_occurred' column: 1 where a fire matched (Shape__Area is not NaN), 0 otherwise
unpivotweather_fire_merge['fire_occurred'] = unpivotweather_fire_merge['Shape__Area'].notna().astype(int)

# Restrict the merged data to the study period (both bounds inclusive)
start_date = '1984-01-01'
end_date = '2023-12-31'

unpivotweather_fire_merge = unpivotweather_fire_merge[(unpivotweather_fire_merge['date'] >= start_date) & (unpivotweather_fire_merge['date'] <= end_date)]


In [None]:

# Fill NaN values with 0 for the columns from 'GIS_ACRES' to 'Shape__Length' and update the dataframe
#unpivotweather_fire_merge.loc[:, 'GIS_ACRES':'Shape__Length'] = unpivotweather_fire_merge.loc[:, 'GIS_ACRES':'Shape__Length'].fillna(0)


# Now the NaN values in the selected columns are filled with 0 in the original dataframe


YAYY machine learning time

In [None]:
#regression model: using fire area
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import category_encoders as ce

# 'obs' is the categorical feature and 'Shape__Area' is the target column.
# Target-encode 'obs': each category is replaced by a statistic of the target.
# NOTE(review): the encoder is fitted on the full dataset before the split, so
# the test rows influence the encoding; consider fitting it on the training
# rows only.
encoder = ce.TargetEncoder(cols=['obs'])  # Target encoding the 'obs' column
unpivotweather_fire_merge_encoded = encoder.fit_transform(unpivotweather_fire_merge[['obs']], unpivotweather_fire_merge['Shape__Area'])

# unpivotweather_fire_merge_encoded now contains the target-encoded 'obs' column

## Reminder: when slicing, use [] for the column index and [] for .loc
# Features: encoded 'obs' plus the numeric 'obs_value'; target: 'Shape__Area'
X = unpivotweather_fire_merge_encoded.join(unpivotweather_fire_merge['obs_value'])  # Join 'obs_value' with encoded 'obs'
Y = unpivotweather_fire_merge['Shape__Area']  # Target variable

# Training and testing datasets split
from sklearn.model_selection import train_test_split
# Splitting the dataset into training and testing set (80/20)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
# Initialize the RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42, oob_score=True)

# Train the model on the training split only. Fitting on the full X, Y would
# let the model see the held-out rows and make any score on X_test meaningless.
model.fit(X_train, Y_train)

# The fitted regressor can now be used to make predictions, etc.


  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  return x.astype(dtype, copy=copy, casting=casting)


2nd attempt: I am going to split the data into 2 versions:
- 1 version for RandomForestRegressor (only the fire-area data, taking out the dates without a fire)
- 1 version for RandomForestClassifier (everything, using the binary yes/no fire column)

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Regression version of the data: keep only the rows that matched a fire, i.e.
# rows with a non-missing 'Shape__Area' (dates without a fire are taken out).
regression_ver = unpivotweather_fire_merge.dropna(subset=['Shape__Area'])

In [109]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Ensure 'obs' is treated as categorical: cast it to string so the encoder
# sees labels rather than numbers (this overwrites the column in place)
unpivotweather_fire_merge['obs'] = unpivotweather_fire_merge['obs'].astype(str)

# One-Hot Encoding for 'obs': dense output, one indicator column per category;
# categories unseen at fit time are ignored when transforming
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(unpivotweather_fire_merge[['obs']])  

# Convert the encoded array into a DataFrame, reusing the source index so the
# join below aligns row by row
X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(['obs']), index=unpivotweather_fire_merge.index)

# Merge the encoded features with 'obs_value' to form the feature matrix
X = X_encoded_df.join(unpivotweather_fire_merge[['obs_value']])

# Define the target variable
Y = unpivotweather_fire_merge['Shape__Area']

# Split the dataset into training (80%) and testing (20%); fixed seed for repeatability
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize the RandomForestRegressor with out-of-bag score enabled
model = RandomForestRegressor(n_estimators=50, random_state=42, oob_score=True)


In [110]:

# Train the model using only the training data
model.fit(X_train, Y_train)

# Make predictions for the held-out test rows
Y_pred = model.predict(X_test)

# Evaluate model performance on the test split
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

# Report the test-set metrics together with the model's out-of-bag score
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"OOB Score: {model.oob_score_}")


  return x.astype(dtype, copy=copy, casting=casting)


Mean Squared Error: 297114181565088.9
R-squared: -0.001795301039899888
OOB Score: -0.0018949862561954411


In [None]:
# Check how many distinct 'obs' categories there are and what they are
print(unpivotweather_fire_merge['obs'].nunique())  # Count unique categories
print(unpivotweather_fire_merge['obs'].unique())   # List unique categories
# Convert 'obs' to a pandas categorical (overwrites the column) and confirm the dtype
unpivotweather_fire_merge['obs'] = unpivotweather_fire_merge['obs'].astype('category')
print(unpivotweather_fire_merge['obs'].dtype)  # Should show 'category'
print(unpivotweather_fire_merge[['obs', 'obs_value']].head(10))

# Show all column dtypes; this is printed BEFORE the numeric conversion below
print(unpivotweather_fire_merge.dtypes)
# Force 'obs_value' to numeric; values that cannot be parsed become NaN
unpivotweather_fire_merge['obs_value'] = pd.to_numeric(unpivotweather_fire_merge['obs_value'], errors='coerce')



55
[18, 22, 23, 34, 33, ..., 20, 21, 54, 46, 53]
Length: 55
Categories (55, int64): [0, 1, 2, 3, ..., 51, 52, 53, 54]
category
  obs  obs_value
0  18        102
1  18        546
2  18       1295
3  18          0
4  18          0
5  18          0
6  18          0
7  18         36
8  18         33
9  22          0
id                           object
date                 datetime64[ns]
obs                        category
obs_value                     int64
ALARM_DATE           datetime64[ns]
Fire station name            object
UNIT_ID                      object
GIS_ACRES                   float64
Shape__Area                 float64
Shape__Length               float64
YEAR_                       float64
fire_occurred                 int32
dtype: object
            obs
0  95148.895327
1  95148.895327
2  95148.895327
3  95148.895327
4  95148.895327


In [96]:
# Predict the target values of the TRAINING set (X_train), not the test set.
# Note that this overwrites any earlier Y_pred.
Y_pred = model.predict(X_train)
# The bare expression below is not the last statement of the cell, so it is not displayed
Y_pred

from sklearn.metrics import mean_squared_error, r2_score

# Calculate the Mean Squared Error (MSE) and R-squared (R²) score against the training targets
mse = mean_squared_error(Y_train, Y_pred)
r2 = r2_score(Y_train, Y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 279526900178269.53
R-squared: 0.0018132221127208359


In [None]:
#Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Set up hyperparameter grid: 3 x 3 x 3 x 3 = 81 parameter combinations
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize RandomForestRegressor (fixed seed for repeatability)
rf = RandomForestRegressor(random_state=42)

# Perform grid search with 5-fold cross-validation on the training split;
# with 81 combinations that is 405 cross-validation fits, so this cell can take long.
# n_jobs=-1 asks for all available processors.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, Y_train)

# Print best parameters
print("Best parameters found: ", grid_search.best_params_)
