# Data Processing

## Extract Data from NSI Dataset
NSI has data for the whole state of New Jersey, but we only want Atlantic County.

In [5]:
import sqlite3
import pandas as pd


# Connect to the NSI SQLite3 database
conn = sqlite3.connect('../nsi_2022_34.gpkg/nsi_2022_34.gpkg')
try:
    # Create a cursor object to interact with the database
    cursor = conn.cursor()

    # Fetch every row inside a bounding box around Atlantic County (with some excess)
    cursor.execute("SELECT * FROM nsi WHERE x > -76 AND x < -70 AND y > 28 AND y < 40")
    rows = cursor.fetchall()

    # Read the column names while the connection is still open
    column_names = [desc[0] for desc in cursor.description]
finally:
    # Always release the database file handle, even if the query fails
    conn.close()

# Convert to a Pandas DataFrame
nsi_data = pd.DataFrame(rows, columns=column_names)

# Rename x -> Longitude and y -> Latitude. The new columns are appended at the
# end of the frame (Latitude first, then Longitude) and the originals dropped.
nsi_data["Latitude"] = nsi_data.y
nsi_data["Longitude"] = nsi_data.x
nsi_data.drop(columns=["x", "y"], inplace=True)

In [7]:
import geopandas

# Read the New Jersey county boundaries and keep only the Atlantic County row(s).
geodf = geopandas.read_file("County_Boundaries_of_NJ.geojson")
is_atlantic = geodf["COUNTY"] == "ATLANTIC"
atlantic_county = geodf[is_atlantic]

# Turn the NSI table into a point layer built from its Longitude/Latitude columns.
nsi_points = geopandas.points_from_xy(nsi_data["Longitude"], nsi_data["Latitude"])
nsi_gp = geopandas.GeoDataFrame(nsi_data, geometry=nsi_points, crs="EPSG:4326")

# Spatial join: one output row per NSI point contained in the county polygon,
# then restrict the result to the columns that came from the NSI table.
atlantic_county_nsi = geopandas.sjoin(
    left_df=atlantic_county,
    right_df=nsi_gp,
    predicate="contains",
)
to_file = atlantic_county_nsi[nsi_data.columns]

# Write the selection out as CSV.
df = pd.DataFrame(to_file)
df.to_csv("nsi_table.csv")

## Getting MOD-IV Parcel Centroids

### Removing parcels outside Atlantic County
The file is too big to work with as JSON, so we edit it as a text file to get rid of unwanted rows

In [None]:
import geopandas
import pandas as pd

In [None]:
# Keep only the lines that mention Atlantic County. The file object is iterated
# lazily, one line at a time, so the statewide file is never loaded into memory
# in full; only the matching lines are retained.
with open("parcels and mod4.geojson", "r") as f:
    atlantic_county_rows = [line for line in f if '"COUNTY": "ATLANTIC"' in line]

# NOTE(review): only lines matching the filter are copied, so any lines that
# precede the first feature in the source file (e.g. a collection header) are
# not written to the output -- confirm the result is readable as intended.
with open("parcels and mod4 atlantic county.geojson", "w") as f:
    f.writelines(atlantic_county_rows)
    # Footer: an empty object followed by the closing "]" and "}". Presumably
    # the "{}" absorbs the trailing comma of the last feature line -- confirm.
    f.write("{}\n]\n}")

In [None]:
parcels_mod4 = geopandas.read_file("parcels and mod4 atlantic county.geojson")

# Centroid of every parcel geometry, computed with the vectorised accessors.
# The centroid's x is stored as Longitude and its y as Latitude.
centroids = parcels_mod4["geometry"].centroid
lons = centroids.x
lats = centroids.y
parcels_mod4["Latitude"] = lats
parcels_mod4["Longitude"] = lons

# Export every column except the geometry itself. The columns are taken from
# parcels_mod4: the centroid series holds only geometries and has no columns.
df = pd.DataFrame(parcels_mod4.drop(columns=["geometry"]))
df.to_csv("parcels and mod4 atlantic county.csv")