In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import DistanceMetric  
from datetime import datetime, timedelta

In [2]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [3]:
# Silence pandas' SettingWithCopyWarning for the whole notebook
# (the option's default value is 'warn').
pd.set_option("mode.chained_assignment", None)

## Step 1: Create a dataset of unique fire - nearest weather station pairs

### Preparing a dataframe containing unique weather station coordinates

In [4]:
# Load the file holding the coordinates of all the weather stations found
# across the 15 "climate daily" csv files
ctc1 = pd.read_csv(filepath_or_buffer="climatetotalcoord1.csv")

In [5]:
# Round the numeric columns to 4 decimal places
ctc1 = ctc1.round(4)

In [6]:
# Build a combined "coordinates" column of the form "<latitude>,<longitude>".
# Both columns are cast to strings first so they can be concatenated, with ","
# as the separator. Beware of NaNs: astype(str) turns them into the text "nan".
ctc1["coordinates"] = ctc1["latitude"].astype(str).str.cat(
    ctc1["longitude"].astype(str), sep=","
)

In [7]:
# Collect the distinct values of the coordinates column (order of first appearance)
ctc2 = pd.unique(ctc1["coordinates"])

In [8]:
# Wrap the unique coordinate strings in a one-column dataframe
ctc3 = pd.DataFrame(data=ctc2, columns=["climcoordinates"])

In [9]:
# Split the "climcoordinates" column on the comma delimiter into two separate
# columns: climlatitude and climlongitude
ctc3[["climlatitude", "climlongitude"]] = ctc3["climcoordinates"].str.split(pat=",", expand=True)

In [10]:
# The split produces strings; cast climlatitude and climlongitude back to float
ctc3 = ctc3.astype({"climlatitude": float, "climlongitude": float})

### Preparing a wildfire dataset

In [11]:
# Load the wildfire csv file
wild2 = pd.read_csv(filepath_or_buffer="canadawildfiresupdated1_2011to2021.csv")

In [12]:
# Build a combined "coordinates" column of the form "<latitude>,<longitude>"
# by casting both columns to strings and joining them with a comma.
wild2["coordinates"] = wild2["latitude"].astype(str).str.cat(
    wild2["longitude"].astype(str), sep=","
)

In [13]:
# Round the numeric columns to 4 decimal places (non-numeric columns are left unchanged)
wild2 = wild2.round(4)

In [14]:
# Keep only the rows whose source agency is Quebec ("QC")
mask = wild2["src_agency"].eq("QC")
fire2_qc = wild2.loc[mask]

### Calculating and adding a column of haversine distance

In [15]:
# Cross join: pair every Quebec fire row with every unique weather station row
cj5 = pd.merge(fire2_qc, ctc3, how="cross")

In [16]:
# The haversine formula works on radians, so convert the fire coordinates and
# the weather (climate) station coordinates, one column at a time.
cj5["firelat_radians"] = np.radians(cj5["latitude"])
cj5["firelong_radians"] = np.radians(cj5["longitude"])
cj5["climlat_radians"] = np.radians(cj5["climlatitude"])
cj5["climlong_radians"] = np.radians(cj5["climlongitude"])

In [17]:
#Defining the haversine formula
def haversine_distance(lon1, lat1, lon2, lat2, radius=6367):
    """Return the great-circle distance between two points (haversine formula).

    All four coordinates must be given in radians. Scalars, NumPy arrays and
    pandas Series are accepted and are combined element-wise.

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point(s), in radians.
    lon2, lat2 : longitude and latitude of the second point(s), in radians.
    radius : radius of the sphere; the result is expressed in the same unit.
        The default 6367 is a crude Earth radius in kilometres; use 3958
        for a result in miles.

    Returns
    -------
    The distance(s) along the surface of the sphere, in the unit of ``radius``.
    """
    newlat = lat2 - lat1
    newlon = lon2 - lon1

    haver_formula = np.sin(newlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(newlon/2.0)**2
    # Floating-point rounding can push this term marginally outside [0, 1]
    # (e.g. for antipodal points), which would make arcsin return NaN.
    # NaN inputs still propagate through minimum/maximum unchanged.
    haver_formula = np.minimum(np.maximum(haver_formula, 0.0), 1.0)

    # Central angle between the two points, in radians
    dist = 2 * np.arcsin(np.sqrt(haver_formula))
    return radius * dist

In [18]:
# Apply the haversine formula to every fire / weather station pair
cj5["distance_km"] = haversine_distance(
    lon1=cj5["firelong_radians"],
    lat1=cj5["firelat_radians"],
    lon2=cj5["climlong_radians"],
    lat2=cj5["climlat_radians"],
)

### Creating the dataset of unique fire - nearest weather station pairs

In [19]:
# Group the cross-joined rows by fire identity (fid) and take, for each fire,
# the smallest distance to any weather station
cj6qc = cj5.groupby("fid")["distance_km"].agg("min")

In [20]:
# cj6qc is a Series at this point; turn it into a one-column dataframe
cj6qc = pd.DataFrame(data=cj6qc, columns=["distance_km"])

In [21]:
# Copy the fid index into a regular "fid" column, then replace the index
# with a plain 0..n-1 range
cj6qc = cj6qc.assign(fid=cj6qc.index).reset_index(drop=True)

In [22]:
# Keep, for every fire, the cross-join row whose distance equals that fire's
# minimum distance by merging cj5 and cj6qc on both "fid" and "distance_km".
# The merge alone can return several rows for one fid: two stations at exactly
# the same distance both match, and pandas matches NaN keys to each other, so a
# fire whose minimum distance is NaN would keep one row per station. Keeping
# only the first match guarantees a single row per fid.
cj7qc = (
    pd.merge(cj5, cj6qc, on=["fid", "distance_km"])
    .drop_duplicates(subset="fid")
    .reset_index(drop=True)
)

In [None]:
# Preview the first two rows of the result
cj7qc.head(n=2)