# Processing CONUS data from all years

In [2]:
import pandas as pd
import numpy as np
import rasterio

## 1. Read in data from all years

First, we store the land cover value of every raster pixel in a dataframe: each row is one pixel, and each column holds that pixel's land cover value for one year.

In [3]:
# Years of the rasters to load; each one becomes a column of the dataframe
years = ["2001", "2004", "2006", "2008", "2011", "2013", "2016", "2019", "2021"]

# One row per pixel, one column per year (columns are added as the rasters are read)
conus_data = pd.DataFrame()

for year in years:
    raster_path = f"./data/CONUS{year}_ClipAOI.tif"

    # Band 1 of the raster is read fully into memory as a 2d array
    with rasterio.open(raster_path, "r") as ds:
        data = ds.read(1)

    # Flatten the 2d array (row-major) so it fits in a single dataframe column
    conus_data[year] = data.ravel()

Next, we store the grid position of each pixel in the dataframe: its row index (as `X_index`) and its column index (as `Y_index`).

In [4]:
# Build 2d arrays holding the row index and the column index of every pixel.
# The raster dimensions are taken from the dataset metadata, so the band does not
# have to be read a second time (the "2019" column is already filled by the loop
# over `years`).
with rasterio.open("./data/CONUS2019_ClipAOI.tif", "r") as ds:
    height = ds.height
    width = ds.width

    # The flattened index grids must line up one-to-one with the dataframe rows
    assert height * width == len(conus_data), (
        f"raster has {height * width} pixels but conus_data has {len(conus_data)} rows"
    )

    # Create a 2d array for X and Y indices.
    # With "xy" indexing the first output varies along axis 1 (column index) and
    # the second along axis 0 (row index); both have shape (height, width).
    cols, rows = np.meshgrid(np.arange(width), np.arange(height), indexing="xy")

    # The following code gets the x and y coordinates of each pixel instead of the x y indices, which may be useful afterwards
    # xs, ys = rasterio.transform.xy(ds.transform, rows, cols)
    # xcoords = np.array(xs)
    # ycoords = np.array(ys)

rows, cols

(array([[    0,     0,     0, ...,     0,     0,     0],
        [    1,     1,     1, ...,     1,     1,     1],
        [    2,     2,     2, ...,     2,     2,     2],
        ...,
        [17972, 17972, 17972, ..., 17972, 17972, 17972],
        [17973, 17973, 17973, ..., 17973, 17973, 17973],
        [17974, 17974, 17974, ..., 17974, 17974, 17974]]),
 array([[    0,     1,     2, ..., 35162, 35163, 35164],
        [    0,     1,     2, ..., 35162, 35163, 35164],
        [    0,     1,     2, ..., 35162, 35163, 35164],
        ...,
        [    0,     1,     2, ..., 35162, 35163, 35164],
        [    0,     1,     2, ..., 35162, 35163, 35164],
        [    0,     1,     2, ..., 35162, 35163, 35164]]))

In [5]:
# Flatten the 2d index grids (row-major, same order as the land cover columns)
# and attach them to the dataframe.
# NOTE: "X_index" holds the ROW index and "Y_index" holds the COLUMN index,
# which is the reverse of the usual x = column / y = row convention.
index_grids = {"X_index": rows, "Y_index": cols}
for column_name, grid in index_grids.items():
    conus_data[column_name] = grid.ravel()

conus_data.head()

Unnamed: 0,2001,2004,2006,2008,2011,2013,2016,2019,2021,X_index,Y_index
0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,3
4,0,0,0,0,0,0,0,0,0,0,4


In [6]:
# Test if indices match the pixels correctly.
# The rasters were flattened row-major, so the position of a pixel in the
# flattened array (the dataframe index at this point) must equal
# row * width + column, i.e. X_index * width + Y_index.
pixel_sample = conus_data.sample(5, random_state=42)  # fixed seed: same rows on every run

expected_position = pixel_sample["X_index"] * width + pixel_sample["Y_index"]
assert np.array_equal(pixel_sample.index.to_numpy(), expected_position.to_numpy()), (
    "X_index / Y_index do not match the flattened pixel positions"
)

pixel_sample

Unnamed: 0,2001,2004,2006,2008,2011,2013,2016,2019,2021,X_index,Y_index
136966663,0,0,0,0,0,0,0,0,0,3894,34153
363606592,0,0,0,0,0,0,0,0,0,10340,492
456396691,41,41,41,41,41,41,41,41,41,12978,25321
542312662,0,0,0,0,0,0,0,0,0,15421,33197
92856814,41,41,41,41,41,41,43,41,41,2640,21214


Then, we drop the pixels with no data, i.e. pixels whose land cover value is 0 in any of the years.

In [7]:
# Boolean mask that is True only for pixels with a non-zero land cover value in
# EVERY year; a pixel that is 0 in any single year is dropped.
has_data_all_years = (conus_data[years] != 0).all(axis=1)

conus_data = conus_data[has_data_all_years]
del has_data_all_years  # the mask has one entry per original pixel; do not keep it around

conus_data.head()

Unnamed: 0,2001,2004,2006,2008,2011,2013,2016,2019,2021,X_index,Y_index
14834446,41,41,41,41,41,41,41,41,41,421,29981
14834447,41,41,41,41,41,41,41,41,41,421,29982
14834448,41,41,41,41,41,41,41,41,41,421,29983
14834449,41,41,41,41,41,41,41,41,41,421,29984
14834450,41,41,41,41,41,41,41,41,41,421,29985


In [8]:
# (number of rows, number of columns) of the dataframe after the no-data pixels were dropped
conus_data.shape

(411365881, 11)

### Next steps

1. Feature engineering
    - Distance away from developed land
    - Distance away from water
    - Etc...