In [None]:
# Put the parent of the current working directory on the import path so the
# project-local modules that live there can be imported by later cells.
import os
import sys
cwd = os.getcwd()
parentdir = os.path.abspath(os.path.join(cwd, os.pardir))
# Only append when missing, so re-running this cell does not keep adding
# duplicate entries to sys.path.
if parentdir not in sys.path:
    sys.path.append(parentdir)

In [None]:
from sqlalchemy import create_engine
from database import get_connection_string
import numpy as np
import pandas as pd

In [None]:
# Connect to the database described by the 'mssql_db' entry of the config file.
# The path is built with os.path.join instead of a literal containing
# backslashes: "\d" and "\c" are invalid string escape sequences, and a
# hard-coded backslash path only resolves on Windows.
config_path = os.path.join(os.pardir, 'data', 'config.yml')
db_connection_string = get_connection_string(config_path, 'mssql_db')
mssql_engine = create_engine(db_connection_string)

In [None]:
# Look up the highest run id stored in the urbansim output table.
run_id_sql = '''
SELECT max(run_id)
  FROM [urbansim].[urbansim].[urbansim_lite_output]
'''
run_id_df = pd.read_sql(run_id_sql, mssql_engine)
# The query yields one row with one column. Read that cell explicitly:
# int() on the whole 2-D ``.values`` array depends on NumPy's deprecated
# conversion of a non-0-d array to a scalar.
max_run_id = run_id_df.iloc[0, 0]
if pd.isna(max_run_id):
    # max() over an empty table is NULL; fail with a clear message instead of
    # an opaque int() conversion error.
    raise ValueError("no run_id found in urbansim_lite_output")
run_id = int(max_run_id)

print("\n   Max run id : {:,}".format(run_id))

In [None]:
# Yearly unit change per site for capacity_type 'sch' rows of the selected
# run; the run id is substituted into the %s placeholder of the template.
sched_dev_template = ''' SELECT site_id, year_simulation,
      sum([unit_change]) as unit_change
  FROM [urbansim].[urbansim].[urbansim_lite_output] o
  JOIN [urbansim].[scheduled_development_parcel] p on p.parcel_id = o.parcel_id
  where run_id =  %s  and capacity_type='sch' --and year_simulation < 2025
  GROUP by site_id,year_simulation
  ORDER BY site_id,year_simulation'''
sched_dev_sql = sched_dev_template % run_id
s = pd.read_sql(sql=sched_dev_sql, con=mssql_engine)

In [None]:
# Preview the first rows of the per-site, per-year unit changes.
s.head()

In [None]:
# Reshape to one row per site with one column per simulation year.
# (The column-axis name 'year_simulation' is kept; it is not cleared here.)
df4 = s.pivot_table(
    index=['site_id'],
    columns='year_simulation',
    values='unit_change',
).reset_index()

In [None]:
# df4.to_csv('test20.csv')

# get sched dev parcels

In [None]:
# Load every scheduled-development parcel with its site id and capacity_3,
# then report the summed capacity_3.
sched_dev_sql = '''
SELECT [site_id]
      ,[parcel_id]
      ,[capacity_3]
  FROM [urbansim].[urbansim].[scheduled_development_parcel]'''
sched_df = pd.read_sql(sql=sched_dev_sql, con=mssql_engine)
sched_dev_units = sched_df['capacity_3'].sum()
print("\n Sched Dev Units : {}".format(sched_dev_units))

#### get output of simulation from database

In [None]:
# run_id = 263

In [None]:
# Look up the highest run id stored in the urbansim output table.
run_id_sql = '''
SELECT max(run_id)
  FROM [urbansim].[urbansim].[urbansim_lite_output]
'''
run_id_df = pd.read_sql(run_id_sql, mssql_engine)
# Read the single result cell explicitly; int() on the 2-D ``.values`` array
# depends on NumPy's deprecated array-to-scalar conversion.
max_run_id = run_id_df.iloc[0, 0]
if pd.isna(max_run_id):
    # max() over an empty table is NULL; fail with a clear message.
    raise ValueError("no run_id found in urbansim_lite_output")
run_id = int(max_run_id)

print("\n   Max run id : {:,}".format(run_id))

## Urbansim Lite Output

##### get unit change by site id by year simulation

##### from [urbansim].[urbansim].[urbansim_lite_output]

In [None]:
# Units added per site, jurisdiction, CPA and simulation year for the
# capacity_type 'sch' rows of the selected run (run id fills the %s slot).
uo_template = '''
SELECT site_id,jur_id,cpa_id,sum(unit_change) as units_added_by_yr,year_simulation
  FROM [urbansim].[urbansim].[urbansim_lite_output] o
  JOIN [isam].[xpef04].[parcel2015_mgra_jur_cpa] cp
    ON cp.parcel_id = o.parcel_id
  JOIN [urbansim].[scheduled_development_parcel] p
    ON p.parcel_id = o.parcel_id
 WHERE run_id = %s and capacity_type = 'sch' and i=1
 GROUP BY site_id,jur_id,cpa_id,year_simulation
 ORDER BY site_id,year_simulation'''
uo_sql = uo_template % run_id
uo = pd.read_sql(sql=uo_sql, con=mssql_engine)

#### change datatype for jurisdiction, cpa, site_id from float to int

In [None]:
# Cast the three id columns of the output frame to int.
for id_col in ('jur_id', 'cpa_id', 'site_id'):
    uo[id_col] = uo[id_col].astype(int)

## Sched development parcel (capacities)

#### get capacity for all sites

##### [urbansim].[urbansim].[scheduled_development_parcel]

In [None]:
# Total capacity_3 per site, restricted to parcels with capacity_3 > 0.
sites_sql = '''
SELECT site_id,
       sum(capacity_3) as capacity3
FROM [urbansim].[urbansim].[scheduled_development_parcel]
WHERE capacity_3 > 0
GROUP BY site_id'''
sp = pd.read_sql(sql=sites_sql, con=mssql_engine)

#### change datatype for capacity and site_id from float to int

In [None]:
# Cast the capacity and site id columns to int.
for int_col in ('capacity3', 'site_id'):
    sp[int_col] = sp[int_col].astype(int)

## Difference between capacity and urbansim output

#### total number of unique sites by jurisdiction from output

In [None]:
# Count the distinct (jurisdiction, site) pairs in the output and sum its units.
n_output_sites = len(uo.drop_duplicates(['jur_id','site_id']))
print("\nOutput: Total sched dev sites from urbansim output: {:,}".format(n_output_sites))
total_units_sched_dev = uo['units_added_by_yr'].sum()
print("\nOutput: Total units sched dev from urbansim output: {:,}".format(total_units_sched_dev))

In [None]:
# Number of sites and total capacity3 in the per-site capacity frame.
n_capacity_sites = len(sp)
print("\nCapacity: Total sched dev sites from sched dev parcel: {:,}".format(n_capacity_sites))
total_cap3_sp = sp['capacity3'].sum()
print("\nCapacity: Total units in sched dev parcel: {:,}".format(total_cap3_sp))

In [None]:
# Output minus capacity, for both the site count and the unit total.
site_count_diff = len(uo.drop_duplicates(['jur_id','site_id'])) - len(sp)
unit_diff = total_units_sched_dev - total_cap3_sp
print("\nDifference: Sched dev sites: {:,}".format(site_count_diff))
print("\nDifference: Sched dev units: {:,}".format(unit_diff))

## Fix issue with difference in sched dev sites

#### Assign site 15002 entirely to San Marcos - it was recorded under both unincorporated and San Marcos, which caused it to be split into two sites.

In [None]:
# Show the output rows of site 15002, ordered by jurisdiction id.
uo[uo.site_id == 15002].sort_values('jur_id')

In [None]:
# Re-code every row of site 15002 to jurisdiction 15 and CPA 0.
is_site_15002 = uo.site_id == 15002
uo.loc[is_site_15002, 'jur_id'] = 15
uo.loc[is_site_15002, 'cpa_id'] = 0

In [None]:
# Recompute the site-count difference; the unit totals are reused as computed
# earlier in the notebook.
site_count_diff = len(uo.drop_duplicates(['jur_id','site_id'])) - len(sp)
unit_diff = total_units_sched_dev - total_cap3_sp
print("\nDifference: Sched dev sites: {:,}".format(site_count_diff))
print("\nDifference: Sched dev units: {:,}".format(unit_diff))

## Sum output units added by year and jurisdiction/cpa

In [None]:
# Units per site / jurisdiction / CPA / simulation year, as a flat frame with
# the summed units in a 'units_by_site' column.
df = (
    uo.groupby(["site_id","jur_id","cpa_id","year_simulation"])
      .units_added_by_yr.sum()
      .reset_index(name='units_by_site')
)

In [None]:
# Preview the per-year units frame.
df.head()

#### calculate total urbansim output units for each site

In [None]:
# Units per site / jurisdiction / CPA over all years, labelled "Total" in the
# year_simulation column.
df2 = (
    uo.groupby(["site_id","jur_id","cpa_id"])
      .units_added_by_yr.sum()
      .reset_index(name='units_by_site')
      .assign(year_simulation="Total")
)

In [None]:
# Preview the per-year units frame (for comparison with the totals frame).
df.head()

In [None]:
# Preview the per-site totals frame.
df2.head()

#### add totals "row" to dataframe with output units by year

In [None]:
# Stack the per-year rows and the "Total" rows into one long frame.
yearly_and_total = [df, df2]
df3 = pd.concat(objs=yearly_and_total, sort=True)

In [None]:
# Spot-check the combined frame: show all rows for site 165.
df3.loc[df3.site_id==165]

## Get capacity 1 and capacity 2 by site (joined further below)

In [None]:
# Summed capacity_1 and capacity_2 per site, for parcels that have a site id.
cap1_sql = '''
SELECT sum([capacity_1]) as capacity_1, sum(capacity_2) as capacity_2
      ,[site_id]
  FROM [urbansim].[urbansim].[parcel]
  where site_id IS NOT NULL
  GROUP BY site_id'''
cap1 = pd.read_sql(sql=cap1_sql, con=mssql_engine)

In [None]:
# Preview the combined per-year + "Total" frame.
df3.head()

In [None]:
# Row count of the combined frame.
len(df3)

#### pivot urbansim output so year is column and each site id is one row

In [None]:
# Make every year label a string (the column also holds the "Total" label),
# then spread the labels into columns: one row per site / jurisdiction / CPA.
df3['year_simulation'] = df3['year_simulation'].astype(str)
site_keys = ['site_id','jur_id','cpa_id']
df4 = (
    df3.pivot_table(index=site_keys, columns='year_simulation', values='units_by_site')
       .reset_index()
       .rename_axis(None, axis=1)
       .fillna(0)
)
# Labels with no rows for a site were filled with 0 above; now cast all
# columns to int.
df4[df4.columns] = df4[df4.columns].astype(int)
df4.head()

#### merge capacity with urbansim output units added by year

In [None]:
# Preview the per-site capacity frame before merging.
sp.head()

In [None]:
# Attach capacity3 to the pivoted output; only site ids present in both
# frames are kept (default inner join).
df5 = df4.merge(sp, on='site_id')

In [None]:
# Row count after merging output with capacity.
len(df5)

## merge output and capacity with capacity 1

In [None]:
# Preview the merged output + capacity frame.
df5.head()

In [None]:
# Preview the per-site capacity_1 / capacity_2 frame.
cap1.head()

In [None]:
# Add capacity_1 / capacity_2; only site ids present in both frames are kept
# (default inner join).
df6 = df5.merge(cap1, on='site_id')

In [None]:
# Row count after adding capacity_1 / capacity_2.
len(df6)

## check urbansim output Total units and capacity units match

In [None]:
# Total capacity3 across the merged sites.
capacity_units = df5['capacity3'].sum()
print("\nCapacity: Sched dev units: {:,}".format(capacity_units))

In [None]:
# Total of the "Total" column across the merged sites.
output_units = df5['Total'].sum()
print("\nUrbansim output: Sched dev units: {:,}".format(output_units))

In [None]:
# df5.head()

## Get sitename from scheduled_development_site

#### get site info

In [None]:
# Load the reference attributes (name, dates, city) of every site.
site_sql = '''
SELECT [siteid] as site_id
      ,[sitename]
      ,[startdate]
      ,[compdate]
      ,[city]
  FROM [urbansim].[ref].[scheduled_development_site]'''
site_df = pd.read_sql(site_sql, mssql_engine)
ids = site_df["site_id"]
# Collect every row whose site_id occurs more than once and print it
# explicitly: a bare expression is only rendered when it is the last
# statement of a cell, so in the middle of the cell the table would be
# silently discarded and the header below would be followed by nothing.
duplicated_sites = site_df[site_df.site_id.isin(ids[site_df.site_id.duplicated()])]
print("\n Duplicated sites : ")
print(duplicated_sites)
# Keep the last record per site_id, then attach the site info to each
# scheduled-development parcel.
site_df_unique = site_df.drop_duplicates(subset='site_id', keep="last")
sched_site_df = pd.merge(sched_df,site_df_unique,on='site_id')

In [None]:
# Full reference record of every site that has at least one positive unit
# count (sfu, mfu or mhu).
ref_sql = '''
SELECT [siteid] as site_id
      ,[sitename]
      ,[sfu]
      ,[mfu]
      ,[mhu]
      ,[source]
      ,[infodate]
      ,[startdate]
      ,[compdate]
        ,[created_us]
      ,[created_da]
      ,[last_edite]
      ,[last_edi_1]
      ,[devtypeid]
      ,[city]
      ,[old_siteid]
      ,[check_]
      ,[status]
  FROM [urbansim].[ref].[scheduled_development_site]
  WHERE (sfu>0 or mfu>0 or mhu>0)
  '''
ref = pd.read_sql(sql=ref_sql, con=mssql_engine)

In [None]:
# Spot-check the reference record(s) for site 19002.
ref.loc[ref.site_id == 19002]

In [None]:
# Show every reference row whose site_id occurs more than once.
ids = ref["site_id"]
print("\n Duplicated sites : ")
repeated_ids = ids[ref.site_id.duplicated()]
has_repeated_id = ref.site_id.isin(repeated_ids)
ref[has_repeated_id]


In [None]:
# Keep one reference row per site_id: the last one encountered.
ref = ref.drop_duplicates(subset=['site_id'], keep='last')
#sched_site_df = pd.merge(sched_df,site_df_unique,on='site_id')

#### merge site ref info with units added

In [None]:
# Outer-join the reference records with output + capacity so sites present
# on only one side are kept.
sites = ref.merge(df5, on='site_id', how='outer')

In [None]:
# Row count of the outer-joined frame.
n_sites = len(sites)
print("\nTotal sched dev sites from ref sched dev: {:,}".format(n_sites))

## check differences between ref.sched dev site and capacity

In [None]:
# Split the joined sites by whether capacity3 is missing or positive, then
# print the counts and totals next to the values expected from sched dev.
no_cap3 = sites[sites.capacity3.isna()]
cap3 = sites[sites.capacity3 > 0]
capacity_report = [
    ("\nTotal sites from ref sched dev with no capacity3: {:,}", len(no_cap3)),
    ("\nTotal sites from ref sched dev with capacity3: {:,}", len(cap3)),
    ("\n       Expected sites from sched dev : {:,}", len(sp)),
    ("\nTotal units in dataframe: {:,}", sites.capacity3.sum()),
    ("\n       Expected units from sched dev: {:,}", total_cap3_sp),
]
for label, value in capacity_report:
    print(label.format(value))

#### consolidate duplicate site into one site for 15035

## Merge sitenames (ref) with capacity and output of urbansim

In [None]:
# Rebuild the joined frame from df6, which also carries capacity_1 and
# capacity_2; the outer join keeps sites present on only one side.
sites = ref.merge(df6, on='site_id', how='outer')

## re-check differences between ref.sched dev site and capacity

In [None]:
# Row count of the de-duplicated reference table.
n_ref_sites = len(ref)
print("\nTotal sched dev sites from ref sched dev: {:,}".format(n_ref_sites))

In [None]:
# Repeat the capacity check on the rebuilt frame: split by missing/positive
# capacity3 and print the counts and totals next to the expected values.
no_cap3 = sites[sites.capacity3.isna()]
cap3 = sites[sites.capacity3 > 0]
capacity_report = [
    ("\nTotal sites from ref sched dev with no capacity3: {:,}", len(no_cap3)),
    ("\nTotal sites from ref sched dev with capacity3: {:,}", len(cap3)),
    ("\n       Expected sites from sched dev : {:,}", len(sp)),
    ("\nTotal units in dataframe: {:,}", sites.capacity3.sum()),
    ("\n       Expected units from sched dev: {:,}", total_cap3_sp),
]
for label, value in capacity_report:
    print(label.format(value))

## Add cpa names to dataframe

In [None]:
# TODO (from the original note): update to jcpa view.
# Two zone-name lookups from the geography_zone reference table, one for
# geography_type_id 20 (loaded as cocpa) and one for 15 (loaded as cicpa).
cocpa_names_sql = '''
    SELECT zone as cocpa_id, name as cocpa
    FROM data_cafe.ref.geography_zone WHERE geography_type_id = 20'''
cicpa_names_sql = '''
    SELECT zone as cicpa_id, name as cicpa
    FROM data_cafe.ref.geography_zone WHERE geography_type_id = 15'''
cocpa_names = pd.read_sql(sql=cocpa_names_sql, con=mssql_engine)
cicpa_names = pd.read_sql(sql=cicpa_names_sql, con=mssql_engine)

In [None]:
# Left-join the cocpa names onto the sites by CPA id; unmatched sites are kept.
sites = sites.merge(cocpa_names, left_on='cpa_id', right_on='cocpa_id', how='left')

In [None]:
# Left-join the cicpa names onto the sites by CPA id; unmatched sites are kept.
sites = sites.merge(cicpa_names, left_on='cpa_id', right_on='cicpa_id', how='left')

In [None]:
# Pick the first available name per site: cocpa, then cicpa, then city.
sites['cityorcpa'] = (
    sites.cocpa
         .combine_first(sites.cicpa)
         .combine_first(sites.city)
)

In [None]:
# Remove the lookup columns now that 'cityorcpa' holds the resolved name.
cpa_helper_cols = ['cocpa_id','cocpa', 'cicpa_id', 'cicpa']
sites.drop(columns=cpa_helper_cols, inplace=True)

In [None]:
#sites.columns[2:6]

#### change datatype from float to int for years and units

In [None]:
# Replace every missing value in the frame with 0.
sites.fillna(0,inplace=True)
#yr_columns = sites.columns[9:47]
#mfu_columns = sites.columns[2:6]
#sites[yr_columns] = sites[yr_columns].astype(int)
#sites[mfu_columns] = sites[mfu_columns].astype(int)

# The blanket fill also wrote 0 into the two date columns; turn those
# entries back into NaN.
for date_col in ('startdate', 'compdate'):
    sites.loc[sites[date_col] == 0, date_col] = np.nan

## write to csv

In [None]:
# Show the run id that will be embedded in the output file names.
run_id

In [None]:
# Name of the per-run CSV for the combined site table.
fileout = 'sched_dev_sites_run_{}.csv'.format(run_id)
print(fileout)

In [None]:
# Write the combined site table to the CSV file named by `fileout`.
sites.to_csv(fileout)


In [None]:
# Number of distinct simulation years with output rows for each site.
# df3 also contains the per-site "Total" summary rows, which are not years,
# so they are excluded before counting. nunique() is used so a site that
# appears under several jurisdiction/CPA rows does not count a year twice.
year_rows = df3.loc[df3.year_simulation != "Total"]
yrs_per_site = pd.DataFrame({'count_yrs': year_rows.groupby('site_id').year_simulation.nunique()}).reset_index()

In [None]:
# Name of the per-run CSV for the years-per-site table.
csv_name = 'years_per_site_run_' + str(run_id) + '.csv'

In [None]:
# Write the years-per-site table to the CSV file named by `csv_name`.
yrs_per_site.to_csv(csv_name)