In [None]:
!pipenv install pandas
!pipenv install json
!pipenv install Pyarrow 
!pipenv install matplotlib
!pipenv install python-dotenv

In [None]:
import requests
import pandas as pd
import os
import pandas as pd
import json as json
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import datetime
from dotenv import load_dotenv

In [None]:
# Read every *.json file in data_dir (except taxids.json) and stack the parsed
# documents into one DataFrame, which is then saved as staff.pkl.
# NOTE(review): presumably each file holds a single JSON object, giving one row
# per file -- confirm against the data.
data_dir = 'data'

# os.listdir returns entries in arbitrary order; sort so the row order of df is
# reproducible from run to run.
json_files = sorted(
    name for name in os.listdir(data_dir)
    if name.endswith('.json') and name != 'taxids.json'
)

data = []
for name in json_files:
    # Explicit UTF-8: the default text encoding is platform dependent.
    with open(os.path.join(data_dir, name), 'r', encoding='utf-8') as f:
        data.append(json.load(f))

df = pd.DataFrame(data)
# Write the pickle inside data_dir rather than a separately hard-coded folder.
df.to_pickle(os.path.join(data_dir, 'staff.pkl'))
del data  # the parsed list is no longer needed once df exists

In [None]:
# Count rows per Command and print only the commands that start with '077'
# or contain 'TRAF' / 'TRANSP'.
counts = df.groupby("Command").size()
wanted_markers = ('TRAF', 'TRANSP')
for index, value in counts.items():
    matches = index.startswith('077') or any(marker in index for marker in wanted_markers)
    if matches:
        print(f"Index : {index}, Value : {value}")


In [None]:
# Load the records for one precinct into vdf, preferring a local pickle cache
# that is less than one week old over a fresh download, then derive the
# idate / Year columns and the column subset ssvdf used by the cells below.
pct = '077'
pklFile = f'data/{pct}.pkl'
CACHE_MAX_AGE_SECONDS = 7 * 24 * 60 * 60  # one week


def _fetch_violations(precinct, app_token, limit=5000, timeout=60):
    """Download all rows for one precinct from dataset nc67-uf89.

    Pages through the endpoint `limit` rows at a time, ordered by
    summons_number, until a page shorter than `limit` is returned.

    Args:
        precinct: Value sent as the `precinct` query filter.
        app_token: Application token, sent in the X-App-Token header so it
            does not appear in the request URL.
        limit: Page size requested per call.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list with the parsed JSON rows of every page, in request order.

    Raises:
        RuntimeError: If any page responds with a status other than 200.
    """
    url = "https://data.cityofnewyork.us/resource/nc67-uf89.json"
    headers = {"X-App-Token": app_token}
    records = []
    offset = 0
    while True:
        params = {
            "precinct": precinct,
            "$limit": limit,
            "$offset": offset,
            "$order": "summons_number",
        }
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        if response.status_code != 200:
            raise RuntimeError(f"Failed to fetch data. HTTP Status Code: {response.status_code}")
        page = response.json()  # parse the body once per page
        records.extend(page)
        if len(page) < limit:
            # A short page means there is nothing left to request.
            return records
        offset += limit


cache_is_fresh = (
    os.path.exists(pklFile)
    and datetime.now().timestamp() - os.stat(pklFile).st_mtime < CACHE_MAX_AGE_SECONDS
)

if cache_is_fresh:
    # Unpickling can execute code; only load cache files this notebook wrote.
    vdf = pd.read_pickle(pklFile)
else:
    load_dotenv('env.local')
    app_token = os.environ['APP_TOKEN']
    # A stale cache file is left in place until the download succeeds;
    # to_pickle below overwrites it. If the fetch raises, the cell stops here
    # instead of continuing with an undefined (or leftover) vdf.
    records = _fetch_violations(pct, app_token)
    if not records:
        # Fail before caching: an empty frame has no issue_date column.
        raise RuntimeError(f"No records returned for precinct {pct}")
    vdf = pd.DataFrame(records)
    vdf.to_pickle(pklFile)
    del records

# Unparseable issue_date values become NaT; the nullable Int64 dtype lets Year
# stay an integer column while holding those missing values.
vdf['idate'] = pd.to_datetime(vdf['issue_date'], errors='coerce')
vdf['Year'] = vdf['idate'].dt.year.astype('Int64')
# Same columns as the raw data of interest, with issue_date replaced by the
# parsed idate and the derived Year.
ssvdf = vdf[["license_type","summons_number","idate", "Year","violation_time","violation","precinct","county","issuing_agency","violation_status"]]
ssvdf.head()

In [None]:
# Keep rows with Year 2015-2023 issued by the Sanitation or Police departments,
# leaving out street-cleaning parking violations.
in_year_range = (ssvdf["Year"] > 2014) & (ssvdf["Year"] < 2024)
not_street_cleaning = ssvdf["violation"] != "NO PARKING-STREET CLEANING"
agency_of_interest = ssvdf["issuing_agency"].isin(['DEPARTMENT OF SANITATION', 'POLICE DEPARTMENT'])
ssvdf = ssvdf[in_year_range & not_street_cleaning & agency_of_interest]

# Number of rows per (Year, issuing_agency); one row per year, one column per
# agency after unstacking.
counts = ssvdf.groupby(by=["Year", "issuing_agency"]).size().sort_values()
counts_by_year = counts.unstack()

# Stacked bar chart, legend placed outside the axes on the right, saved as PNG.
chart_title = f"Violation Counts by Year and Issuing Agency (ex street cleaning) - pct {pct}"
plot = counts_by_year.plot(kind='bar', figsize=(8, 4), stacked=True, title=chart_title)
plot.annotate("Covid Years", xy=(0.6, -0.2), xycoords='axes fraction', fontsize=11, color='red')
plot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.savefig(f'data/{pct}.png', bbox_inches='tight')

# Show the same counts as a table, with missing combinations shown as 0.
counts_by_year.fillna(0).astype(int)

In [None]:
# The 20 most frequent values of the violation column in ssvdf.
ssvdf["violation"].value_counts().head(20)