#### DS 41-42 QC Review
#### **Project ID: 2022-01** 
**Author:** Purva Singh <br>
**Variables**: Housing Units, No. of households, and Household Size <br>
**Link to the test plan:** https://sandag.sharepoint.com/:o:/g/qaqc/EsHlSrI2leVNpv27KGCpj88B68SeC-GlbOAIQRFve8-SqQ?e=cipqrc 

In [1]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc
import sys

In [7]:
# DS35
path = r'T:\socioec\Current_Projects\XPEF31\abm_csv'
ds35_files = glob.glob(path + "/mgra13_based_input*")
list_ds35 = []
for filename in ds35_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    df["year"]=filename[-11:-7]
    list_ds35.append(df)




In [9]:
ds35 = pd.concat(list_ds35, axis=0, ignore_index=True)
#for col in ds35.columns:
 #   print(col)
ds35= ds35[["mgra","year", "hs", "hs_sf", "hs_mf", "hs_mh", "hh", "hh_sf", "hh_mf", "hh_mh", "gq_civ", "gq_mil", "hhs", "pop", "hhp"]]
print(ds35.shape)
ds35.columns += '_ds35'
ds35.head()


(299026, 15)


Unnamed: 0,mgra_ds35,year_ds35,hs_ds35,hs_sf_ds35,hs_mf_ds35,hs_mh_ds35,hh_ds35,hh_sf_ds35,hh_mf_ds35,hh_mh_ds35,gq_civ_ds35,gq_mil_ds35,hhs_ds35,pop_ds35,hhp_ds35
0,1,2016,19,19,0,0,18,18,0,0,0,0,2.278,41,41
1,2,2016,35,35,0,0,34,34,0,0,0,0,2.382,81,81
2,3,2016,52,52,0,0,52,52,0,0,0,0,2.135,111,111
3,4,2016,30,30,0,0,30,30,0,0,0,0,2.433,73,73
4,5,2016,28,28,0,0,28,28,0,0,0,0,2.25,63,63


In [10]:
# DS41
path = r'T:\socioec\Current_Projects\XPEF35\abm_csv\New_mgra_based_input'
ds41_files = glob.glob(path + "/*.csv")
list_ds41 = []
for filename in ds41_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    df["year"]=filename[-11:-7]
    list_ds41.append(df)
    
#from functools import reduce
#ds_41= reduce(lambda x, y: pd.merge(x, y, on = 'mgra'), list_ds41)

ds41 = pd.concat(list_ds41, axis=0, ignore_index=True)
#for col in ds41.columns:
 #   print(col)
ds41= ds41[["mgra", "year", "hs", "hs_sf", "hs_mf", "hs_mh", "hh", "hh_sf", "hh_mf", "hh_mh", "gq_civ", "gq_mil", "hhs", "pop", "hhp"]]
print(ds41.shape)
ds41.columns += '_ds41'
ds41.head()



(299026, 15)


Unnamed: 0,mgra_ds41,year_ds41,hs_ds41,hs_sf_ds41,hs_mf_ds41,hs_mh_ds41,hh_ds41,hh_sf_ds41,hh_mf_ds41,hh_mh_ds41,gq_civ_ds41,gq_mil_ds41,hhs_ds41,pop_ds41,hhp_ds41
0,1,2016,19,19,0,0,18,18,0,0,0,0,2.278,41,41
1,2,2016,35,35,0,0,34,34,0,0,0,0,2.382,81,81
2,3,2016,52,52,0,0,52,52,0,0,0,0,2.135,111,111
3,4,2016,30,30,0,0,30,30,0,0,0,0,2.433,73,73
4,5,2016,28,28,0,0,28,28,0,0,0,0,2.25,63,63


In [11]:
ds_final= pd.merge(ds41, ds35,  how='inner', left_on=['mgra_ds41','year_ds41'], right_on = ['mgra_ds35','year_ds35'])
ds_final= ds_final.apply(pd.to_numeric) 

ds_final.dtypes

mgra_ds41        int64
year_ds41        int64
hs_ds41          int64
hs_sf_ds41       int64
hs_mf_ds41       int64
hs_mh_ds41       int64
hh_ds41          int64
hh_sf_ds41       int64
hh_mf_ds41       int64
hh_mh_ds41       int64
gq_civ_ds41      int64
gq_mil_ds41      int64
hhs_ds41       float64
pop_ds41         int64
hhp_ds41         int64
mgra_ds35        int64
year_ds35        int64
hs_ds35          int64
hs_sf_ds35       int64
hs_mf_ds35       int64
hs_mh_ds35       int64
hh_ds35          int64
hh_sf_ds35       int64
hh_mf_ds35       int64
hh_mh_ds35       int64
gq_civ_ds35      int64
gq_mil_ds35      int64
hhs_ds35       float64
pop_ds35         int64
hhp_ds35         int64
dtype: object

In [12]:
# Merging with dim_mgra table 

# loading the dim_mgra_denormalized table from demographic warehouse 
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')

dim_mgra ="SELECT mgra, jurisdiction, jurisdiction_id, cpa, cpa_id "\
                  "FROM [demographic_warehouse].[dim].[mgra_denormalize]"\
                   "WHERE series= 14 " 
             
dim_mgra = pd.read_sql_query(dim_mgra, conn)    

dim_mgra

Unnamed: 0,mgra,jurisdiction,jurisdiction_id,cpa,cpa_id
0,1,San Diego,14,Uptown,1442
1,1,San Diego,14,Mission Valley,1419
2,2,San Diego,14,Uptown,1442
3,2,San Diego,14,Mission Valley,1419
4,3,San Diego,14,Uptown,1442
...,...,...,...,...,...
27311,22998,Carlsbad,1,*Not in a CPA*,0
27312,22999,Carlsbad,1,*Not in a CPA*,0
27313,23000,Carlsbad,1,*Not in a CPA*,0
27314,23001,Carlsbad,1,*Not in a CPA*,0


In [14]:
#dim_mgra= dim_mgra.drop_duplicates()
#len((dim_mgra['mgra']))

In [13]:
# Merging the ds file with mgra 

ds_dim_mgra=pd.merge(ds_final, dim_mgra,  how='left', left_on=['mgra_ds41'], right_on = ['mgra'])
ds_dim_mgra

Unnamed: 0,mgra_ds41,year_ds41,hs_ds41,hs_sf_ds41,hs_mf_ds41,hs_mh_ds41,hh_ds41,hh_sf_ds41,hh_mf_ds41,hh_mh_ds41,...,gq_civ_ds35,gq_mil_ds35,hhs_ds35,pop_ds35,hhp_ds35,mgra,jurisdiction,jurisdiction_id,cpa,cpa_id
0,1,2016,19,19,0,0,18,18,0,0,...,0,0,2.278,41,41,1,San Diego,14,Uptown,1442
1,1,2016,19,19,0,0,18,18,0,0,...,0,0,2.278,41,41,1,San Diego,14,Mission Valley,1419
2,2,2016,35,35,0,0,34,34,0,0,...,0,0,2.382,81,81,2,San Diego,14,Uptown,1442
3,2,2016,35,35,0,0,34,34,0,0,...,0,0,2.382,81,81,2,San Diego,14,Mission Valley,1419
4,3,2016,52,52,0,0,52,52,0,0,...,0,0,2.135,111,111,3,San Diego,14,Uptown,1442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355103,22998,2050,92,92,0,0,87,87,0,0,...,0,0,2.437,212,212,22998,Carlsbad,1,*Not in a CPA*,0
355104,22999,2050,0,0,0,0,0,0,0,0,...,0,0,0.000,0,0,22999,Carlsbad,1,*Not in a CPA*,0
355105,23000,2050,131,131,0,0,126,126,0,0,...,0,0,2.429,306,306,23000,Carlsbad,1,*Not in a CPA*,0
355106,23001,2050,85,85,0,0,81,81,0,0,...,0,0,2.500,205,205,23001,Carlsbad,1,*Not in a CPA*,0


In [14]:
# Grouping by Region and year
region= ds_final.groupby(['year_ds41'],as_index=False).sum()

# Grouping by Jurisdiction and year 
juris= ds_dim_mgra. groupby(['jurisdiction_id', 'jurisdiction', 'year_ds41' ],as_index=False).sum()

# Grouping by CPA and year
cpa= ds_dim_mgra. groupby(['cpa','year_ds41', 'jurisdiction'],as_index=False).sum()

juris.dtypes

#df = df.apply(pd.to_numeric) 

jurisdiction_id      int64
jurisdiction        object
year_ds41            int64
mgra_ds41            int64
hs_ds41              int64
hs_sf_ds41           int64
hs_mf_ds41           int64
hs_mh_ds41           int64
hh_ds41              int64
hh_sf_ds41           int64
hh_mf_ds41           int64
hh_mh_ds41           int64
gq_civ_ds41          int64
gq_mil_ds41          int64
hhs_ds41           float64
pop_ds41             int64
hhp_ds41             int64
mgra_ds35            int64
year_ds35            int64
hs_ds35              int64
hs_sf_ds35           int64
hs_mf_ds35           int64
hs_mh_ds35           int64
hh_ds35              int64
hh_sf_ds35           int64
hh_mf_ds35           int64
hh_mh_ds35           int64
gq_civ_ds35          int64
gq_mil_ds35          int64
hhs_ds35           float64
pop_ds35             int64
hhp_ds35             int64
mgra                 int64
cpa_id               int64
dtype: object

In [15]:
#Saving output
writer = pd.ExcelWriter('C:/Users/psi/Desktop/Housing_35_41.xlsx')   
region.to_excel(writer, sheet_name = 'Region', index = False)
juris.to_excel(writer, sheet_name = 'Jurisdiction', index = False)
cpa.to_excel(writer, sheet_name = 'CPA', index = False)
writer.save()
writer.close() 

ds_final.to_csv("C:/Users/psi/Desktop/Housing_35_41_mgra.csv", index=False)

  warn("Calling close() on already closed file.")


In [18]:
# DS38
path = r'T:\socioec\Current_Projects\XPEF33\abm_csv\new_parking'
ds38_files = glob.glob(path + "/mgra13_based_input*")
list_ds38 = []
for filename in ds38_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    df["year"]=filename[-14:-10]
    list_ds38.append(df)



In [19]:
ds38 = pd.concat(list_ds38, axis=0, ignore_index=True)
#for col in ds35.columns:
 #   print(col)
ds38= ds38[["mgra","year", "hs", "hs_sf", "hs_mf", "hs_mh", "hh", "hh_sf", "hh_mf", "hh_mh", "gq_civ", "gq_mil", "hhs", "pop", "hhp"]]
#print(ds38.shape)
ds38.columns += '_ds38'
ds38.head()

Unnamed: 0,mgra_ds38,year_ds38,hs_ds38,hs_sf_ds38,hs_mf_ds38,hs_mh_ds38,hh_ds38,hh_sf_ds38,hh_mf_ds38,hh_mh_ds38,gq_civ_ds38,gq_mil_ds38,hhs_ds38,pop_ds38,hhp_ds38
0,1,2016,19,19,0,0,18,18,0,0,0,0,2.278,41,41
1,2,2016,35,35,0,0,34,34,0,0,0,0,2.382,81,81
2,3,2016,52,52,0,0,52,52,0,0,0,0,2.135,111,111
3,4,2016,30,30,0,0,30,30,0,0,0,0,2.433,73,73
4,5,2016,28,28,0,0,28,28,0,0,0,0,2.25,63,63


In [29]:
# DS42
path = r'T:\socioec\Current_Projects\XPEF36\abm_csv\new_parking\New_mgra_based_input'
 
ds42_files = glob.glob(path + "/mgra13_based_input*")
list_ds42 = []
for filename in ds42_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    df["year"]=filename[-14:-10]
    list_ds42.append(df)

In [30]:
ds42 = pd.concat(list_ds42, axis=0, ignore_index=True)
#for col in ds35.columns:
 #   print(col)
ds42= ds42[["mgra","year", "hs", "hs_sf", "hs_mf", "hs_mh", "hh", "hh_sf", "hh_mf", "hh_mh", "gq_civ", "gq_mil", "hhs", "pop", "hhp"]]
print(ds42.shape)
ds42.columns += '_ds42'
ds42.head()

(299026, 15)


Unnamed: 0,mgra_ds42,year_ds42,hs_ds42,hs_sf_ds42,hs_mf_ds42,hs_mh_ds42,hh_ds42,hh_sf_ds42,hh_mf_ds42,hh_mh_ds42,gq_civ_ds42,gq_mil_ds42,hhs_ds42,pop_ds42,hhp_ds42
0,1,2016,19,19,0,0,18,18,0,0,0,0,2.278,41,41
1,2,2016,35,35,0,0,34,34,0,0,0,0,2.382,81,81
2,3,2016,52,52,0,0,52,52,0,0,0,0,2.135,111,111
3,4,2016,30,30,0,0,30,30,0,0,0,0,2.433,73,73
4,5,2016,28,28,0,0,28,28,0,0,0,0,2.25,63,63


In [31]:
ds42_final= pd.merge(ds42, ds38,  how='inner', left_on=['mgra_ds42','year_ds42'], right_on = ['mgra_ds38','year_ds38'])
ds42_final= ds42_final.apply(pd.to_numeric) 
ds42_final

Unnamed: 0,mgra_ds42,year_ds42,hs_ds42,hs_sf_ds42,hs_mf_ds42,hs_mh_ds42,hh_ds42,hh_sf_ds42,hh_mf_ds42,hh_mh_ds42,...,hs_mh_ds38,hh_ds38,hh_sf_ds38,hh_mf_ds38,hh_mh_ds38,gq_civ_ds38,gq_mil_ds38,hhs_ds38,pop_ds38,hhp_ds38
0,1,2016,19,19,0,0,18,18,0,0,...,0,18,18,0,0,0,0,2.278,41,41
1,2,2016,35,35,0,0,34,34,0,0,...,0,34,34,0,0,0,0,2.382,81,81
2,3,2016,52,52,0,0,52,52,0,0,...,0,52,52,0,0,0,0,2.135,111,111
3,4,2016,30,30,0,0,30,30,0,0,...,0,30,30,0,0,0,0,2.433,73,73
4,5,2016,28,28,0,0,28,28,0,0,...,0,28,28,0,0,0,0,2.250,63,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299021,22998,2050,90,90,0,0,87,87,0,0,...,0,87,87,0,0,0,0,2.414,210,210
299022,22999,2050,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.000,0,0
299023,23000,2050,131,131,0,0,126,126,0,0,...,0,126,126,0,0,0,0,2.452,309,309
299024,23001,2050,83,83,0,0,81,81,0,0,...,0,81,81,0,0,0,0,2.778,225,225


In [33]:
# Merging the ds42 file with mgra

ds42_dim_mgra=pd.merge(ds42_final, dim_mgra,  how='left', left_on=['mgra_ds42'], right_on = ['mgra'])

#ds42_dim_mgra.head()

In [34]:
region42= ds42_final.groupby(['year_ds42'],as_index=False).sum()


# Grouping by Jurisdiction and year 
juris42= ds42_dim_mgra. groupby(['jurisdiction_id', 'jurisdiction', 'year_ds42' ],as_index=False).sum()

# Grouping by CPA and year
cpa42= ds42_dim_mgra. groupby(['cpa','year_ds42', 'jurisdiction'],as_index=False).sum()

#juris.dtypes

juris42.head()

Unnamed: 0,jurisdiction_id,jurisdiction,year_ds42,mgra_ds42,hs_ds42,hs_sf_ds42,hs_mf_ds42,hs_mh_ds42,hh_ds42,hh_sf_ds42,...,hh_sf_ds38,hh_mf_ds38,hh_mh_ds38,gq_civ_ds38,gq_mil_ds38,hhs_ds38,pop_ds38,hhp_ds38,mgra,cpa_id
0,1,Carlsbad,2016,13591993,50670,34815,14215,1640,47670,32817,...,32817,13351,1502,915,0,1505.738,125491,124576,13591993,0
1,1,Carlsbad,2018,13591993,51060,35398,14020,1642,48297,33554,...,33551,13240,1504,915,0,1527.815,127537,126622,13591993,0
2,1,Carlsbad,2020,13591993,51832,35979,14211,1642,49006,34081,...,34077,13422,1504,915,0,1513.872,128615,127700,13591993,0
3,1,Carlsbad,2023,13591993,52393,36265,14486,1642,49500,34409,...,34410,13585,1504,915,0,1510.472,128733,127818,13591993,0
4,1,Carlsbad,2025,13591993,52393,36265,14486,1642,49576,34442,...,34438,13634,1504,915,0,1503.177,128640,127725,13591993,0


In [35]:
# comparing ds42 with 41 

ds41_42_final= pd.merge(ds42, ds41,  how='inner', left_on=['mgra_ds42','year_ds42'], right_on = ['mgra_ds41','year_ds41'])
ds41_42_final= ds41_42_final.apply(pd.to_numeric) 
ds41_42_final


Unnamed: 0,mgra_ds42,year_ds42,hs_ds42,hs_sf_ds42,hs_mf_ds42,hs_mh_ds42,hh_ds42,hh_sf_ds42,hh_mf_ds42,hh_mh_ds42,...,hs_mh_ds41,hh_ds41,hh_sf_ds41,hh_mf_ds41,hh_mh_ds41,gq_civ_ds41,gq_mil_ds41,hhs_ds41,pop_ds41,hhp_ds41
0,1,2016,19,19,0,0,18,18,0,0,...,0,18,18,0,0,0,0,2.278,41,41
1,2,2016,35,35,0,0,34,34,0,0,...,0,34,34,0,0,0,0,2.382,81,81
2,3,2016,52,52,0,0,52,52,0,0,...,0,52,52,0,0,0,0,2.135,111,111
3,4,2016,30,30,0,0,30,30,0,0,...,0,30,30,0,0,0,0,2.433,73,73
4,5,2016,28,28,0,0,28,28,0,0,...,0,28,28,0,0,0,0,2.250,63,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299021,22998,2050,90,90,0,0,87,87,0,0,...,0,87,87,0,0,0,0,2.506,218,218
299022,22999,2050,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.000,0,0
299023,23000,2050,131,131,0,0,126,126,0,0,...,0,126,126,0,0,0,0,2.476,312,312
299024,23001,2050,83,83,0,0,81,81,0,0,...,0,81,81,0,0,0,0,2.407,195,195


In [36]:
writer42 = pd.ExcelWriter('C:/Users/psi/Desktop/Housing_DS42.xlsx')   
region42.to_excel(writer42, sheet_name = 'Region', index = False)
juris42.to_excel(writer42, sheet_name = 'Jurisdiction', index = False)
cpa42.to_excel(writer42, sheet_name = 'CPA', index = False)
writer42.save()
writer42.close() 

  warn("Calling close() on already closed file.")


In [38]:
ds41_42_final.to_csv("C:/Users/psi/Desktop/Housing_41_42_mgra.csv", index=False)
ds42_final.to_csv("C:/Users/psi/Desktop/Housing_38_42_mgra.csv", index=False)