# Processing CONUS data from all years

In [1]:
import pandas as pd
import numpy as np
import rasterio

## 1. Read in data from all years

First, we will store the land cover value for each pixel in the raster in a dataframe, with each row being a pixel and the pixel's land cover value in different years in columns

In [2]:
# Dataframe to hold the data from all years
conus_data = pd.DataFrame()

# Years of the data
years = ["2001", "2004", "2006", "2008", "2011", "2013", "2016", "2019"]

# Flatten the raster data for each year into a 1darray and store it as a column in the dataframe
for year in years:
    with rasterio.open(f"./data/CONUS{year}_ClipAOI.tif", "r") as ds:
        data = ds.read(1)
        conus_data[year] = data.ravel()

Next, we need to store the x and y coordinates of each pixel into the dataframe

In [8]:
with rasterio.open("./data/CONUS2019_ClipAOI.tif", "r") as ds:
    data = ds.read(1)
    conus_data["2019"] = data.ravel()

    height = data.shape[0]
    width = data.shape[1]

    # Create a 2d array for X and Y indices
    cols, rows = np.meshgrid(np.arange(width), np.arange(height))

    # The following code gets the x and y coordinates of each pixel instead of the x y indices, which may be useful afterwards
    # xs, ys = rasterio.transform.xy(ds.transform, rows, cols)
    # xcoords = np.array(xs)
    # ycoords = np.array(ys)

rows, cols

(array([[    0,     0,     0, ...,     0,     0,     0],
        [    1,     1,     1, ...,     1,     1,     1],
        [    2,     2,     2, ...,     2,     2,     2],
        ...,
        [17972, 17972, 17972, ..., 17972, 17972, 17972],
        [17973, 17973, 17973, ..., 17973, 17973, 17973],
        [17974, 17974, 17974, ..., 17974, 17974, 17974]]),
 array([[    0,     1,     2, ..., 35162, 35163, 35164],
        [    0,     1,     2, ..., 35162, 35163, 35164],
        [    0,     1,     2, ..., 35162, 35163, 35164],
        ...,
        [    0,     1,     2, ..., 35162, 35163, 35164],
        [    0,     1,     2, ..., 35162, 35163, 35164],
        [    0,     1,     2, ..., 35162, 35163, 35164]]))

In [6]:
# Flatten the indices 2d arrays and add them as columns in the dataframe
conus_data["X_index"] = rows.ravel()
conus_data["Y_index"] = cols.ravel()



conus_data.head()

Unnamed: 0,2001,2004,2006,2008,2011,2013,2016,2019,X_index,Y_index
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,3
4,0,0,0,0,0,0,0,0,0,4


In [10]:
# Test if indices match the pixels correctly
conus_data.sample(5)

Unnamed: 0,2001,2004,2006,2008,2011,2013,2016,2019,X_index,Y_index
158198349,43,41,43,41,43,41,41,41,4498,26179
583688635,0,0,0,0,0,0,0,0,16598,19965
103173727,0,0,0,0,0,0,0,0,2933,34782
252533028,41,81,41,81,41,81,41,81,7181,13163
370713741,82,82,82,82,82,82,82,82,10542,4311


### Next steps

1. Drop rows with value 0, having problems because dataset is too large
2. Feature engineering