# 10-process-data
> Importing, cleaning, testing, and saving data

This series of notebooks reflects operations to scrape, prepare, validate, and save the data.

#### Helpful packages and preliminaries

In [3]:
#data access and processing
import numpy as np
from osgeo import gdal
import matplotlib.pyplot as plt


In [4]:
raster1 = "/Users/namjukim/Desktop/DSI/VDS Team Data Share/CONUS2001_ClipAOI.tif"
raster2 = "/Users/namjukim/Desktop/DSI/VDS Team Data Share/CONUS2004_ClipAOI.tif"

In [5]:
ds1 = gdal.Open(raster1)
ds2 = gdal.Open(raster2)

In [6]:

print("Projection: ", ds1.GetProjection())  # get projection
print("Columns:", ds1.RasterXSize)  # number of columns
print("Rows:", ds1.RasterYSize)  # number of rows
print("Band count:", ds1.RasterCount)  # number of bands

Projection:  PROJCS["Albers_Conical_Equal_Area",GEOGCS["NAD83",DATUM["North_American_Datum_1983",SPHEROID["GRS 1980",6378137,298.257222101004,AUTHORITY["EPSG","7019"]],AUTHORITY["EPSG","6269"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4269"]],PROJECTION["Albers_Conic_Equal_Area"],PARAMETER["latitude_of_center",23],PARAMETER["longitude_of_center",-96],PARAMETER["standard_parallel_1",29.5],PARAMETER["standard_parallel_2",45.5],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH]]
Columns: 35165
Rows: 17975
Band count: 1


In [11]:
# Read Raster Data

data_array1 = ds1.GetRasterBand(1).ReadAsArray()
print(data_array1.shape)

data_array2 = ds2.GetRasterBand(1).ReadAsArray()
print(data_array2.shape)

diff = data_array2 - data_array1 
print(diff.shape)


(17975, 35165)
(17975, 35165)
(17975, 35165)


In [12]:
#Have to Subset

# Our data is too big to visualize all at once.
# Create a subset by changing the indices below.

n1 = 8000
n2 = 11000

subset_2001 = data_array1[n1:n2,n1:n2]
subset_2004 = data_array2[n1:n2,n1:n2]
subset_diff = diff[n1:n2,n1:n2]
subset_2001.shape # 3000 x 3000 seems to work. You can try bigger.

(3000, 3000)

In [9]:
plt.figure(figsize = (10,10))
#plt.imshow(data_array1)
#plt.imshow(data_array2)
plt.imshow(diff)
plt.colorbar

<function matplotlib.pyplot.colorbar(mappable=None, cax=None, ax=None, **kwargs)>

In [13]:
#visualizing the data using pandas

import pandas as pd


In [14]:
data_array1_t = pd.DataFrame(data_array1)
data_array2_t = pd.DataFrame(data_array2)
data_diff_t = pd.DataFrame(diff)

In [15]:
data_array1_t.iloc[8000:8010,8000:8010]

Unnamed: 0,8000,8001,8002,8003,8004,8005,8006,8007,8008,8009
8000,82,21,82,82,82,82,82,82,82,82
8001,82,21,82,82,82,21,21,82,82,82
8002,82,21,21,21,21,82,82,82,82,82
8003,82,21,82,21,82,82,82,82,82,82
8004,21,21,82,82,82,82,82,82,82,82
8005,21,82,82,82,82,82,82,82,82,82
8006,82,82,82,41,41,41,41,82,82,82
8007,41,41,41,41,41,41,41,41,90,90
8008,41,41,41,41,41,41,41,41,90,41
8009,41,41,41,41,41,81,81,41,41,90
