In [1]:
import geopandas as gpd
import shapely

import pandas as pd
import numpy as np
from datetime import datetime, timedelta



In [2]:
def fix_missing_bnl_value ( pts, bnl ) :
    """Back-fill missing BNL ``total_installed_price`` values from PTS costs.

    A hand-curated list of (effective_date, cost) pairs identifies PTS records
    whose cost should be copied onto the BNL row with the same effective date
    when that BNL row has no installed price.

    Parameters
    ----------
    pts : DataFrame with ``effective_date`` and ``cost`` columns.
    bnl : DataFrame with ``effective_date`` and ``total_installed_price``
        columns. It is modified in place.

    Returns
    -------
    The same ``bnl`` object, after every update has been applied.
    """

    ## fixed the hard way: hand-picked (effective_date, cost) pairs
    updates = [
        {"effective_date":"2018-12-10","cost":22043.0},
        {"effective_date":"2018-12-17","cost":44951.0},
        {"effective_date":"2019-01-04","cost":33150},
        {"effective_date":"2019-01-15","cost":27007},
        {"effective_date":"2019-01-23","cost":27665},
        {"effective_date":"2019-03-15","cost":27977.25},
        {"effective_date":"2019-03-22","cost":31480},
        {"effective_date":"2019-03-26","cost":30732.80},
        {"effective_date":"2019-04-04","cost":22238},
        {"effective_date":"2019-10-07","cost":41898}
    ]

    price_col = 'total_installed_price'

    for update in updates:

        pts_mask = (pts.effective_date == update['effective_date']) & (pts['cost'] == update['cost'])
        if not pts_mask.any():
            # No PTS record backs this update, so there is nothing to copy.
            continue

        # A price is missing when it still carries the -1 sentinel or has
        # already been converted to NaN by upstream cleaning.
        missing_price = (bnl[price_col] == -1) | bnl[price_col].isna()
        bnl_mask = (bnl.effective_date == update['effective_date']) & missing_price

        # Every PTS row selected above has cost == update['cost'], so assigning
        # the scalar is equivalent to copying the PTS values and cannot fail on
        # a length mismatch between the two masks.
        bnl.loc[bnl_mask, price_col] = float(update['cost'])

    # Return only after ALL updates have been processed.
    return bnl
    

In [10]:
def get_sources_from_extracts(data_dir = '/data/energy/REC/MA/Arlington/'):
    """Load the RPS, PTS and BNL extracts plus the permit GeoJSON.

    ``data_dir`` is prefixed verbatim to each file name, so it must end with a
    path separator.

    Returns the tuple ``(rps, pts, bnl, solar_systems)`` where:
      * every ``effective_date`` column has been parsed with ``pd.to_datetime``;
      * ``solar_systems.tokenId`` is a string (keeps the precision of large ids);
      * in ``bnl`` the strings '-1' / '-1.0' become '' and numeric -1 becomes NaN.

    NOTE(review): raises AttributeError when the GeoJSON has no
    ``permit_effective_date`` column.
    """

    def _read_tsv(file_name):
        return pd.read_csv(data_dir + file_name, sep='\t')

    rps = _read_tsv('rps.tsv')
    pts = _read_tsv('pts.tsv')
    bnl = _read_tsv('bnl.tsv')
    # Permit records are the reference set; tokenId is their key.
    solar_systems = gpd.read_file(data_dir + 'solar_systems.geojson')

    # int -> str so that large token ids are not mangled downstream
    solar_systems.tokenId = solar_systems.tokenId.astype(str)
    solar_systems.permit_effective_date = pd.to_datetime(solar_systems.permit_effective_date)

    for extract in (rps, pts, bnl):
        extract.effective_date = pd.to_datetime(extract.effective_date)

    # Blank out the "-1" placeholders: text first, then numeric.
    for column in bnl.columns:
        text_placeholder = (bnl[column] == '-1') | (bnl[column] == '-1.0')
        bnl.loc[text_placeholder, column] = ''
        numeric_placeholder = (bnl[column] == -1) | (bnl[column] == -1.0)
        bnl.loc[numeric_placeholder, column] = np.nan

    return rps, pts, bnl, solar_systems

In [11]:
def run_rps_pts_comparison_tests(rps_pts):
    """Print markdown tables of rows where RPS and PTS disagree.

    Only rows whose ``source`` is 'both' are reported. Three checks are run:
    effective dates more than 60 days apart, capacities differing by at least
    0.0051 kW, and costs differing by more than 1000. Nothing is returned.
    """

    in_both = rps_pts.source == 'both'

    def _report(heading, exceeds, columns):
        # heading first, then the offending rows as a markdown table
        print(heading)
        print(rps_pts[in_both & exceeds][columns].to_markdown())

    date_gap = (rps_pts.effective_date_rps - rps_pts.effective_date_pts).apply(np.abs)
    _report('\nDifferences in Effective Date (>60 days)\n',
            date_gap > timedelta(days=60),
            ['rps_index','pts_index','effective_date_rps','effective_date_pts'])

    kw_gap = (rps_pts.kW_rps - rps_pts.kW_pts).apply(np.abs)
    _report('\nDifferences in Capacity (>0.0051kW)\n',
            kw_gap >= 0.0051,
            ['rps_index','pts_index','kW_rps','kW_pts'])

    cost_gap = (rps_pts.cost_rps - rps_pts.cost_pts).apply(np.abs)
    _report('\nDifferences in Cost (>$1000)\n',
            cost_gap > 1000,
            ['rps_index','pts_index','cost_rps','cost_pts'])


def merge_rps_pts(rps, pts, data_dir = '/data/energy/REC/MA/Arlington/'):
    """Combine the RPS and PTS extracts into one frame using a cross-reference file.

    Reads ``rps_pts_xref.tsv`` from ``data_dir`` (which must end with a path
    separator, since the file name is simply appended), attaches it to both
    inputs by row index, outer-merges them, and then coalesces the paired
    ``*_rps`` / ``*_pts`` columns into single columns, preferring the RPS value
    and falling back to PTS where RPS is null.

    The result carries a ``source`` column with the values 'rps', 'pts' or
    'both'. Diagnostics are printed along the way, including the output of
    ``run_rps_pts_comparison_tests``.

    Returns the merged DataFrame.
    """
    
    # presumably the xref file has 'rps' and 'pts' columns holding row indices
    # into the two extracts -- TODO confirm against the file
    rps_pts_xref = pd . read_csv ( data_dir + 'rps_pts_xref.tsv' , sep = '\t' )
    print('RPS dups:',rps_pts_xref.rps.duplicated().any(),'\nPTS dups:',rps_pts_xref.pts.duplicated().any())


    # Attach the xref columns to each extract, matching each extract's row index
    # against the corresponding xref column.
    rps = rps.merge(rps_pts_xref,right_on='rps',left_index=True,how='left').reset_index(drop=True)
    pts = pts.merge(rps_pts_xref,right_on='pts',left_index=True,how='left').reset_index(drop=True)

    ## NOTE(review): original author flagged this merge with "???"; it joins the
    ## rps frame's 'pts' column against the pts frame's (reset) row index.
    rps_pts = rps.merge(pts,left_on='pts',right_index=True,how='outer',indicator=True).reset_index(drop=True)

    # Turn pandas' default _x/_y suffixes into _rps/_pts, then give the two
    # index-like columns explicit names.
    rps_pts.columns=rps_pts.columns \
                    .str.replace('_x$','_rps',regex=True) \
                    .str.replace('_y$','_pts',regex=True) \
                    .str.replace('^rps_rps$','rps_index',regex=True) \
                    .str.replace('^pts$','pts_index',regex=True)

    # 'source' starts as the merge indicator ('left_only' / 'right_only' / 'both')
    rps_pts['source'] = rps_pts._merge.astype(str)
    #rps_pts.columns = rps_pts.columns.str.replace('_merge','source')

    mask = rps_pts.source=='left_only'
    rps_pts.loc[mask,'source'] = 'rps'

    mask = rps_pts.source=='right_only'
    rps_pts.loc[mask,'source'] = 'pts'

    # Remaining xref columns and the raw indicator are no longer needed.
    rps_pts.drop(['rps_pts','pts_pts','pts_rps','_merge'],axis=1,inplace=True)
    
    mask = rps_pts.source == 'both'

    # Coalesce descriptive columns: RPS value wins, PTS fills the gaps.
    for scalar in ['installer', 'owner', 'type', 'city', 'zip', 'utility']:

        rps_pts[scalar]=rps_pts[scalar+'_rps']  #default to rps

        # Count disagreements among rows present in both sources
        # (note: NaN != NaN, so rows null on both sides are counted too).
        test = rps_pts[scalar+'_rps']!= rps_pts[scalar+'_pts']
        print(scalar,'#diffs=',len(rps_pts[mask&test]))
        more = pd.isnull(rps_pts[scalar])
        rps_pts.loc[more, scalar] = rps_pts.loc[more,scalar+'_pts']
        rps_pts.drop([scalar+'_rps',scalar+'_pts'],axis=1,inplace=True)



    ## fix zipcode, missing leading '0' for Arlington and others but not all
    # Only values whose string form is exactly 4 characters get the '0' prefix.
    mask = rps_pts.zip.astype(str).apply(len) ==4
    rps_pts.loc[mask,'zip'] = '0' + rps_pts['zip'].astype(str)


    rps_pts.name=rps_pts.name.astype(str)
    
    # Prints the date / capacity / cost discrepancy tables; needs the
    # *_rps / *_pts columns that are dropped further down.
    run_rps_pts_comparison_tests(rps_pts)    

    # NOTE(review): empty_columns is never used below; the loop that follows
    # detects all-null columns dynamically instead.
    empty_columns = ['location','location_tranche', 'off_taker', 'off_taker_tranche', 'tracking','tracking_tranche', 'pollinator', 'pollinator_tranche']
    cols = rps_pts.columns
    for col in cols:
        mask = ~pd.isnull(rps_pts[col])
        if len(rps_pts[mask])==0:
            print('dropping',col)
            rps_pts.drop(col,axis=1,inplace=True)

    # Columns removed unconditionally (raises KeyError if any is absent).
    droppers = ['project','perWatt_rps','perWatt_pts','install_yr','distributer']
    for col in droppers:
        rps_pts.drop(col,axis=1,inplace=True)
    # no-op expression (leftover from interactive inspection)
    rps_pts.columns

    # Coalesce capacity and cost the same way: RPS first, PTS as fallback.
    rps_pts['kW'] = rps_pts['kW_rps']
    mask = pd.isnull(rps_pts['kW'])
    rps_pts.loc[mask,'kW'] = rps_pts.loc[mask,'kW_pts']

    rps_pts['cost'] = rps_pts['cost_rps']
    mask = pd.isnull(rps_pts['cost'])
    rps_pts.loc[mask,'cost'] = rps_pts.loc[mask,'cost_pts']

    rps_pts.drop(['kW_rps','kW_pts','cost_rps','cost_pts'],axis=1,inplace=True)

    # no-op expression (result discarded): rows sharing a pts_index
    rps_pts[rps_pts.duplicated('pts_index',keep=False)]
    print('null rps_ids',len(rps_pts[pd.isnull(rps_pts.rps_id)]))

    
    return rps_pts


In [17]:
# Ad-hoc inspection: re-read the permit GeoJSON and list its columns.
# NOTE(review): this cell's execution count (17) is higher than that of the
# pipeline cell that follows it (12), i.e. the cells were run out of order.
data_dir = '/data/energy/REC/MA/Arlington/'
solar_systems   =  gpd . read_file ( data_dir + 'solar_systems.geojson' )
solar_systems.columns#iloc[0]['tokenId']

Index(['tokenId', 'image', 'name', 'description', 'attributes', 'entity',
       'financials', 'dates', 'location', 'amps', 'watts', 'joules', 'ohm',
       'xref', 'governance', 'geometry2', 'geometry'],
      dtype='object')

In [12]:
# Load every source, patch the BNL prices, and build the merged RPS/PTS table.
rps, pts, bnl, solar_systems = get_sources_from_extracts()
bnl = fix_missing_bnl_value ( pts, bnl )
bnl['bnl_index'] = bnl.index          # keep the original BNL row number as an explicit key
rps_pts = merge_rps_pts(rps, pts, data_dir = '/data/energy/REC/MA/Arlington/')

# Record counts: overall, then restricted to records dated on or before the cutoff.
summary_template = "RPS: {rps}\nPTS: {pts}\nBNL: {bnl}\nPermits: {ss}"
print ( summary_template.format(rps=len(rps), pts=len(pts), bnl=len(bnl), ss=len(solar_systems)) )

print ( "\nAs of 12/31/2020\n")
cutoff = '2020-12-31'
print ( summary_template.format(
    rps=len(rps[rps.effective_date <= cutoff]),
    pts=len(pts[pts.effective_date <= cutoff]),
    bnl=len(bnl[bnl.effective_date <= cutoff]),
    ss=len(solar_systems[solar_systems.permit_effective_date <= cutoff]),
) )

AttributeError: 'GeoDataFrame' object has no attribute 'permit_effective_date'

In [None]:
## First round: match on the SMART project id (rps_id on the RPS/PTS side,
## system_ID_1 on the BNL side); about 170 matches.
mask = rps_pts.rps_id.notnull()   # &(rps_pts.source=='both')
foo = rps_pts[mask].merge(
    bnl,
    how='left',
    left_on=['rps_id'],
    right_on='system_ID_1',
    indicator=True,
)

# rows that found a BNL partner
combo = foo[foo._merge == 'both']

In [None]:
# Sanity check: True if any rps_index appears more than once among the first-round matches.
combo.duplicated('rps_index').any()

In [None]:
## Second round: retry the rows that found no BNL partner, this time matching
## on (PTS effective date, cost) == (BNL effective date, total installed price).
unmatched_first_round = foo[foo._merge == 'left_only']
unmatched_first_round.columns = unmatched_first_round.columns.str.replace('_x$', '', regex=True)
rps_mismatch = unmatched_first_round[rps_pts.columns]

foo = rps_mismatch.merge(
    bnl,
    how='left',
    left_on=['effective_date_pts', 'cost'],
    right_on=['effective_date', 'total_installed_price'],
    indicator=True,
)

## The date/cost key is not unique; these hand-picked (pts_index, bnl_index)
## pairings are the unwanted duplicates and are removed.
duplicate_pairs = [
    (741, 426),
    (744, 423),
    (738, 432),
    (739, 430),
    (436, 667),
    (437, 666),
    (322, 821),
    (321, 784),
]

mask = pd.Series(False, index=foo.index)
for pts_row, bnl_row in duplicate_pairs:
    mask = mask | ((foo.pts_index == pts_row) & (foo.bnl_index == bnl_row))

foo = foo[~mask]

In [None]:
# Add the second-round (date + cost) matches to the first-round matches.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and returns the same row-wise concatenation.
combo = pd.concat([combo, foo[foo._merge=='both']])
combo = combo.drop('_merge', axis=1)

# RPS/PTS rows still without a BNL partner, restored to the rps_pts column layout.
# .copy() so that relabelling the columns does not operate on a slice of foo.
rps_mismatch = foo[foo._merge=='left_only'].copy()
rps_mismatch.columns = rps_mismatch.columns.str.replace('_x$','',regex=True)
rps_mismatch = rps_mismatch[rps_pts.columns]

# NOTE(review): foo comes from a how='left' merge, which never produces
# 'right_only' rows, so bnl_mismatch is always empty here. The BNL rows that
# were never matched are counted separately via bnl_index later on.
bnl_mismatch = foo[foo._merge=='right_only'].copy()
bnl_mismatch.columns = bnl_mismatch.columns.str.replace('_y$','',regex=True)
bnl_mismatch = bnl_mismatch[bnl.columns]
bnl_mismatch

In [None]:
# Unmatched RPS/PTS rows that matter: status is 'Approved' or missing, and at
# least one of the two effective dates is on or before 2020-12-31.
mask = ((rps_mismatch.status == 'Approved') | (pd.isnull(rps_mismatch.status))) &\
    ((rps_mismatch.effective_date_rps<='2020-12-31')|(rps_mismatch.effective_date_pts<='2020-12-31'))
print('combo',len(combo),'combo mismatched',len(rps_mismatch[mask]))

print("BNL:{bnl},COMBO:{combo}".format(bnl=len(bnl),combo=len(combo)))

## missing: 40 > 31, so there must be some duplicates in combo??
# no-op expression (result discarded)
list(combo.columns)
# BNL rows whose bnl_index never appears in combo
mask = bnl.bnl_index.isin(list(combo.bnl_index))
print('bnl mismatched',len(bnl[~mask]))

In [None]:
# Collapse city_x / city_y into a single 'city' column, but only when the two
# sides agree on every row; otherwise just report how many rows disagree.
mask = combo.city_x != combo.city_y
city_disagreements = len(combo[mask])
if city_disagreements != 0:
    print('mistmatch on city', city_disagreements)
else:
    combo['city'] = combo['city_x']
    combo.drop(['city_x','city_y'],axis=1,inplace=True)

In [None]:
## 15 instances where BNL and PTS disagree on zipcode; the original note says
## "use PTS and fix zip code".
# NOTE(review): the value actually kept in 'zipcode' is derived from
# 'zip_code', not from 'zip' -- confirm which source each column comes from.
combo['zipcode'] = combo['zip_code'].astype(str)

# Restore the leading '0' on values whose string form is exactly 4 characters.
mask = combo['zipcode'].apply(len) ==4
combo.loc[mask,'zipcode'] = ('0' + combo.loc[mask,'zipcode'].astype(str))

# Show the rows where the two raw columns differ.
# NOTE(review): 'zip' was turned into '0'-prefixed strings upstream while
# 'zip_code' is compared as-is; if their dtypes differ every row will be
# reported -- confirm.
mask = combo['zip']!=combo.zip_code
print(combo[mask][['zip','zip_code','zipcode']].to_markdown())

combo.drop(['zip','zip_code'],axis=1,inplace=True)

In [None]:
# Maps each top-level metadata section to the source columns that are copied
# into it when a record is converted to nested metadata. Section order and
# column order are significant and must be preserved.
metadata_keys = dict(
    attributes=[
        'expansion_system', 'multiple_phase_system', 'new_construction', 'tracking',
        'ground_mounted', 'third_party_owned', 'self_installed',
    ],
    entity=[
        'applicant', 'ownership_type', 'status', 'capacity_block', 'installer',
        'owner', 'type', 'sector', 'subsector', 'name', 'program',
        'customer_segment', 'installer_name', 'aggregation', 'low_income',
    ],
    financials=[
        'ownership_type', 'applicant', 'program', 'contractor', 'installer',
        'srec', 'srec_factor', 'cost', 'total_installed_price', 'rebate_or_grant',
        'grant', 'permits', 'issued', 'descriptions', 'value', 'fee',
    ],
    dates=[
        'effective_date', 'effective_date_rps', 'effective_date_pts',
        'effective_date_permit', 'installation_date', 'expiration_date',
        'operation_date', 'sq_date', 'qualification_date', 'dateOfBatteryInstall',
    ],
    location=[
        'coord', 'street_number', 'street_name', 'unit',
        'city', 'county', 'state', 'zipcode',
    ],
    amps=[
        'distributor', 'utility', 'utility_service_territory',
        'meter_mfgr', 'meter_type', 'interconnection',
    ],
    watts=[
        'kW', 'system_size_DC', 'est_annual_kWh', 'module_mfgr', 'inverter_mfgr',
        'size', 'kW_ac', 'additional_modules', 'additional_inverters',
        'DC_optimizer', 'inverter_loading_ratio',
    ],
    joules=[
        'storage', 'storage_tranche', 'storage_kVa', 'storage_duration',
        'battery_manufacturer', 'battery_model',
        'battery_rated_capacity_kW', 'battery_rated_capacity_kWh',
    ],
    ohm=[
        'parcel_size', 'style', 'year_built', 'stories',
        'units', 'rooms', 'gross_area', 'living_area',
    ],
    xref=['pts_index', 'rps_index', 'bnl_index'],
    governance=[
        'parcel', 'land_use', 'land_use_code', 'zoning', 'location', 'map',
        'plan', 'cama', 'deed_book', 'deed_page', 'rps_id', 'nepool_id',
        'data_provider_1', 'data_provider_2', 'system_ID_1', 'system_ID_2',
    ],
)
#list(combo.columns)


In [None]:
# Cast the three cross-reference index columns to int
# (astype(int) raises if any of them still contains NaN).
for col in ['pts_index','rps_index','bnl_index']:
    combo[col] = combo[col].astype(int)

In [None]:
# Ad-hoc inspection; only the last expression is displayed by the notebook.
# Cost of the RPS record(s) whose name contains 'FranSullivan' (result discarded):
rps[rps.name.str.contains('FranSullivan')==True][['cost']]
## bnl_index has a dup at #416
# All combo rows that share a pts_index with another row:
combo[combo.duplicated('pts_index',keep=False)]#[identity+dates+financials]

In [None]:
## Single instance where the two differ; drop system_size_DC, but just here?
# Rows where the merged capacity (kW) differs from BNL's system_size_DC
# (rows with NaN on either side also show up, since NaN != NaN).
mask = combo.kW!=combo.system_size_DC
print(combo[mask][['kW','system_size_DC']].to_markdown())

In [None]:
# Persist the three-way RPS/PTS/BNL match as a TSV (row index not written).
combo.reset_index(drop=True).to_csv(data_dir+'rps_pts_bnl.tsv',sep='\t',index=False)

In [None]:
def hand_job():
    """Write sorted copies of the permits and of ``combo`` to TSV for manual matching.

    Reads the notebook-level ``solar_systems`` and ``combo`` frames, prints a
    warning when a tokenId occurs more than once among the permits, and writes
    ``p1.tsv`` (permits) and ``p2.tsv`` (combo). Returns nothing.
    """

    permit_sort_keys = ['effective_date','permit_value','owner','tokenId']
    combo_sort_keys = ['effective_date','cost']

    p1 = solar_systems.sort_values(permit_sort_keys).reset_index()
    p2 = combo.sort_values(combo_sort_keys).reset_index()

    has_duplicate_tokens = p1.duplicated('tokenId').any()
    if has_duplicate_tokens:
        print('Dups in permits!!')

    for frame, target in ((p1, '/data/energy/REC/MA/Arlington/p1.tsv'),
                          (p2, '/data/energy/REC/MA/Arlington/p2.tsv')):
        frame.to_csv(target, sep='\t')


In [None]:
# Load the hand-built PTS <-> permit cross-reference and keep only rows that
# carry a pts_index.
pts_permits_xref = pd.read_csv('/data/energy/REC/MA/Arlington/pts_permits_xref_20211217.tsv',sep='\t')
mask = pts_permits_xref.pts_index.notnull()
with_pts_index = pts_permits_xref[mask]

# Entries whose pts_index text contains '&' are listed and then set aside.
has_ampersand = with_pts_index.pts_index.str.contains('&')
secondaries = with_pts_index[has_ampersand]
print(secondaries.to_markdown())

pts_permits_xref = with_pts_index[~has_ampersand]

In [None]:
# Source permit column -> name used in the merged output. Keeping each pair
# side by side makes it impossible for the old and new names to drift out of
# step. Order is preserved, so the selected columns come out in this order.
permit_column_names = {
    'ADDR_NUM': 'street_number',
    'FULL_STR': 'street_name',
    'LOCATION': 'unit',
    'MAP_PAR_ID': 'parcel',
    'LOC_ID': 'location',
    'MAP_NO': 'map',
    'PLAN_ID': 'plan',
    'CAMA_ID': 'cama',
    'OWNER1': 'property_owner',
    'permits': 'permits',
    'descriptions': 'descriptions',
    'contractor': 'contractor',
    'owner': 'permit_owner',
    'permit_value': 'value',
    'permit_fee': 'fee',
    'issued': 'issued',
    'effective_date': 'effective_date_permit',
    'tokenId': 'tokenId',
    'coord': 'coord',
    'image': 'image',
    'geometry': 'geometry',
    'shared': 'shared_install',
    'LOT_SIZE': 'parcel_size',
    'LS_DATE': 'last_sale_date',
    'LS_PRICE': 'last_sale_price',
    'USE_CODE': 'land_use_code',
    'LS_BOOK': 'deed_book',
    'LS_PAGE': 'deed_page',
    'ZONING': 'zoning',
    'YEAR_BUILT': 'year_built',
    'BLD_AREA': 'gross_area',
    'UNITS': 'units',
    'RES_AREA': 'living_area',
    'STYLE': 'style',
    'STORIES': 'stories',
    'NUM_ROOMS': 'rooms',
}

# Select the source columns, then relabel them positionally.
ss = solar_systems[list(permit_column_names.keys())].copy()

scols = list(permit_column_names.values())
ss.columns = scols

In [None]:
# Align key dtypes with the frames being joined: tokenId as str, pts_index as int.
pts_permits_xref.tokenId = pts_permits_xref.tokenId.astype(str)
pts_permits_xref.pts_index = pts_permits_xref.pts_index.astype(int)
#solar_systems.drop("_merge",axis=1,inplace=True)
#cols = identity+location+entity+attributes+dates+financials+watts+amps+joules


# Permits -> xref (left join on tokenId, with indicator) -> combo (outer join on pts_index).
# The '_merge' column comes from the FIRST merge only: it records whether a
# permit has an xref row, not whether a combo row was found for it.
super_combo = ss \
    .merge(pts_permits_xref,on='tokenId',how='left',indicator=True) \
    .merge(combo,on='pts_index',how='outer')


In [None]:
# Keep the rows whose permit had a cross-reference entry ('_merge' == 'both').
data = super_combo[super_combo._merge=='both'] . reset_index ( drop = True )
# Disabled: blanket replacement of -1 with NaN in every column.
# for col in data.columns:
#     print(col)
#     data.loc[data[col]==-1,col] = np.nan

In [None]:
def _print_value_types(frame, columns):
    """Print, for each listed column, the distinct Python types of its values."""
    for column in columns:
        print(column, frame[column].apply(type).unique())


# before: show what the date columns currently hold
_print_value_types(data, metadata_keys['dates'])

# Render the four effective-date columns as 'YYYY-MM-DD' strings.
effective_date_columns = ['effective_date','effective_date_rps','effective_date_pts','effective_date_permit']
for col in effective_date_columns:
    data[col] = pd.to_datetime(data[col]).dt.strftime('%Y-%m-%d')

# after: confirm the conversion
_print_value_types(data, metadata_keys['dates'])


In [None]:
def convert_to_dict( df, cols):
    """Return ``{column: value}`` for the populated entries of ``df[cols]``.

    Parameters
    ----------
    df : a pandas Series (one record) or any object whose ``df[cols]`` result
        has a ``to_dict()`` method.
    cols : list of labels to extract.

    Returns
    -------
    dict containing only the entries whose value is populated. Dropped values
    are ``None``, NaN / NaT (anything not equal to itself) and the empty string.

    ``None`` is dropped so that this helper agrees with the module/inverter
    extraction loop in this notebook, which also skips ``None``.
    """
    def _is_populated(value):
        # value == value is False only for NaN-like values (NaN, NaT)
        return (value is not None) and (value is not np.nan) and (value != '') and (value == value)

    return {k: v for k, v in df[cols].to_dict().items() if _is_populated(v)}


# One nested metadata record per matched installation.
PV = []

# For each repeated component: (output key, source-column prefix) pairs.
# The source column for slot i is '<prefix>_<i>' with i = 1..3.
component_fields = {
    "modules": [
        ("azimuth", "azimuth"),
        ("tilt", "tilt"),
        ("manufacturer", "module_manufacturer"),
        ("model", "module_model"),
        ("quantity", "module_quantity"),
        ("technology", "technology_module"),
        ("BIPV", "BIPV_module"),
        ("bifacial", "bifacial_module"),
        ("nameplate_capacity", "nameplate_capacity_module"),
        ("efficiency", "efficiency_module"),
    ],
    "inverters": [
        ("manufacturer", "inverter_manufacturer"),
        ("model", "inverter_model"),
        ("quantity", "inverter_quantity"),
        ("micro", "micro_inverter"),
        ("solar_storage_hybrid", "solar_storage_hybrid_inverter"),
        ("built_in_meter", "built_in_meter_inverter"),
        ("output_capacity", "output_capacity_inverter"),
    ],
}

# section of the metadata that receives the module / inverter lists
token_type = 'watts'

for jdx in range(len(data)):

    row = data.iloc[jdx]
    address = row.street_number + ' ' + row.street_name

    metadata = {
        "tokenId": row.tokenId,
        "image": row.image,
        "name": address,
        "description": "A {kW}kW photovoltaic system was installed at {address} on {effective_date} at a cost of {cost} by {installer} through the {program}".format(
            kW=row.kW,
            address=address,
            effective_date=row.effective_date,
            cost=row.cost,
            installer=row.installer,
            program=row.program,
        ),
    }

    # one sub-dict per metadata section, holding only populated values
    for key, section_columns in metadata_keys.items():
        metadata[key] = convert_to_dict(row, section_columns)

    for sub_dicts, field_pairs in component_fields.items():

        metadata[token_type][sub_dicts] = []

        for idx in range(3):
            suffix = '_' + str(idx + 1)
            x_dict = {}
            for out_key, source_prefix in field_pairs:
                x = row[source_prefix + suffix]
                # keep only populated values: not None, not '', not NaN
                if (x != None) and (x != '') and (x == x):
                    x_dict[out_key] = x

            # skip slots with nothing recorded
            if len(x_dict) > 0:
                metadata[token_type][sub_dicts].append(x_dict)

    metadata["geometry"] = row.geometry
    metadata["geometry2"] = shapely.wkt.dumps(row.geometry, rounding_precision=7)

    PV.append(metadata)

In [None]:
# Write the matched installations twice: as GeoJSON and as TSV.
data_dir = '/data/energy/REC/MA/Arlington/'

gpd . GeoDataFrame ( pd . DataFrame . from_dict ( PV ) ) . to_file ( data_dir + 'PV_complete.geojson' )

print('#PVs',len(PV))
pd . DataFrame . from_dict ( PV ) .reset_index(drop=True).to_csv(data_dir+'PV_complete.tsv',sep='\t',index=False)

In [None]:
# no-op expression (result discarded)
pd . DataFrame . from_dict ( PV ) 
# Compare the reported install cost with the value declared on the permit.
shaft = data[['tokenId','street_number','street_name','value','cost','installer']].copy()

# positive diff: reported cost exceeds the permit value
shaft['diff']=shaft.cost-shaft.value

# Sum of the per-installer diff totals, scaled by 30/1000.
# NOTE(review): the meaning of the factor 30 is not documented here -- confirm.
(30*shaft.groupby('installer').agg({'diff':sum}).sort_values('diff')/1000).sum()

In [None]:
# Permits with no cross-reference entry ('_merge' == 'left_only'), reduced to
# the permit columns, ordered by permit date, with the date rendered as 'YYYY-MM-DD'.
unmatched_permits = super_combo[super_combo._merge=='left_only'][ss.columns].sort_values('effective_date_permit').reset_index(drop=True)
unmatched_permits['effective_date_permit'] = pd.to_datetime(unmatched_permits['effective_date_permit']).dt.strftime('%Y-%m-%d')


print('complete',len(PV),'unmatched',len(unmatched_permits))

In [None]:
# Section -> source columns used for permit-only records (those with no
# RPS/PTS/BNL match). Section order and column order are significant.
permits_keys = dict(
    financials=['contractor', 'permits', 'issued', 'descriptions', 'value', 'fee'],
    dates=['effective_date_permit'],
    location=['coord', 'street_number', 'street_name', 'unit'],
    ohm=[
        'parcel_size', 'style', 'year_built', 'stories',
        'units', 'rooms', 'gross_area', 'living_area',
    ],
    governance=[
        'parcel', 'land_use_code', 'zoning', 'location', 'map',
        'plan', 'cama', 'deed_book', 'deed_page',
    ],
)


In [None]:
def _permit_only_metadata(row):
    """Build the nested metadata record for one permit that has no matched install data."""
    address = row.street_number + ' ' + row.street_name

    metadata = {
        "tokenId": row.tokenId,
        "image": row.image,
        "name": address,
        "description": "A photovoltaic system was installed at {address} on {effective_date} at a cost of {cost} by {installer}".format(
            address=address,
            effective_date=row.effective_date_permit,
            cost=row.value,
            installer=row.contractor,
        ),
    }

    # one sub-dict per section, holding only populated values
    for key, section_columns in permits_keys.items():
        metadata[key] = convert_to_dict(row, section_columns)

    metadata["geometry"] = row.geometry
    metadata["geometry2"] = shapely.wkt.dumps(row.geometry, rounding_precision=7)
    return metadata


# One record per unmatched permit, in the order of unmatched_permits.
unmatchedPV = [
    _permit_only_metadata(unmatched_permits.iloc[jdx])
    for jdx in range(len(unmatched_permits))
]

In [None]:
# no-op expression (result discarded)
len(unmatchedPV)
pd . DataFrame . from_dict ( unmatchedPV ) .reset_index(drop=True).to_csv(data_dir+'PV_unmatched.tsv',sep='\t',index=False)

# NOTE(review): this writes to 'solar_systems.geojson' in data_dir -- the same
# path the notebook reads its permit input from -- replacing it with the nested
# metadata layout built above. After this cell runs, the loading code that
# expects the original permit columns can no longer run from a fresh kernel.
# Consider writing to a different file name.
gpd . GeoDataFrame ( pd . DataFrame . from_dict ( PV+unmatchedPV ) ) . to_file ( data_dir + 'solar_systems.geojson' )

# Matched and permit-only records together, as a plain DataFrame.
both = pd . DataFrame . from_dict ( PV+unmatchedPV ) 

In [None]:
def defunct_debug():
    """Defunct inspection helper: select the RPS-only and PTS-only rows of ``rps_pts``.

    Both selections are bare expressions whose results are discarded, so calling
    this has no visible effect.

    NOTE(review): it reads ``rps_pts._merge``, ``rps_rps`` and ``pts``; the
    ``rps_pts`` returned by ``merge_rps_pts`` has had ``_merge`` dropped and the
    other two renamed to ``rps_index`` / ``pts_index``, so this would raise if
    called on that frame.
    """

    ##rps only, check status of Approved only
    mask = (rps_pts._merge=='left_only') & (rps_pts.status=='Approved')
    cols = ['rps_rps','kW_rps','effective_date_rps','cost_rps']
    rps_pts[mask][cols]#.to_csv(data_dir+'rps_only_for_match.tsv',sep='\t',index=False)#[['pts','rps_rps']]

    ##pts only, check status of Approved only
    ##22 not matched
    mask = (rps_pts._merge=='right_only') #& (rps_pts.status=='Approved')
    cols = ['pts','kW_pts','effective_date_pts','cost_pts']
    rps_pts[mask][cols]#.to_csv(data_dir+'pts_only_for_match.tsv',sep='\t',index=False)#[['pts','rps_rps']]
    
    
def defunctioin_name_matching() :
    """Defunct experiment: match ``combo`` rows to permits by owner last name and date.

    Loads a hand-edited name file, derives an upper-cased ``last_name`` column
    on ``combo``, and then looks for permits whose owner contains that name.

    NOTE(review): the loop reads and writes ``new_arl_combo``, which is not
    defined anywhere in the visible notebook, so this function cannot run as
    written.
    """

    # no-op expression (result discarded)
    len(combo)
    nn = pd.read_csv('/data/energy/REC/MA/name_fix.txt',sep='\t')
    # no-op expression (result discarded)
    combo.sort_values('rps_index').name

    # NOTE(review): the file is read a second time and the assignment below is
    # repeated; the duplicates look like leftovers from interactive editing.
    nn = pd.read_csv('/data/energy/REC/MA/name_fix.txt',sep='\t')
    # presumably name_fix.txt has a single column aligned row-for-row with combo -- TODO confirm
    combo['name_fix'] = nn
    combo['name_fix'] = nn


    mask = (pd.isnull(combo.name_fix))
    # no-op expression (result discarded)
    mask.any()
    # NOTE(review): this mask is computed but not applied to the split below
    mask = (~pd.isnull(combo.name_fix))#&(new_arl_combo.name_fix!='nan')
    #names = new_arl_combo[mask].sort_values('effective_date_rps').name.str.replace('Arlington|Residence','').str.split('([A-Z])')
    # Split on capital letters, keeping them (capturing group); the last capital
    # plus the text after it is joined and upper-cased as the last name.
    names = combo.name_fix.str.split('([A-Z])')
    rps_names = (names.str[-2]+names.str[-1]).str.upper()
    combo.loc[:,'last_name'] = rps_names

    ## COMBINE with permits!!!
    for idx in range(len(combo)):
        rps_owner = new_arl_combo.loc[idx,'last_name'] 
        cost      = new_arl_combo.loc[idx,'cost_rps'] 
        date      = new_arl_combo.loc[idx,'effective_date_rps'] 
        # rps_owner == rps_owner is False only for NaN, i.e. skip rows with no name
        if rps_owner == rps_owner:
            # NOTE(review): the two date conditions are OR-ed and each is
            # one-sided, so together they hold for every date; an
            # absolute-difference test was probably intended -- confirm.
            mask = (solar_systems.owner.str.upper().str.contains(rps_owner)) &\
                ((date - pd.to_datetime(solar_systems.effective_date)<timedelta(days=30)) |\
                ((pd.to_datetime(solar_systems.effective_date) - date)<timedelta(days=30)) )
            #&(np.abs(solar_systems['permit_value']-cost)<1000)
            if len(solar_systems[mask])==1:
                new_arl_combo.loc[idx,'tokenId']=int(solar_systems[mask].tokenId)
            elif len(solar_systems[mask])>1:
                # NOTE(review): len(mask) is the number of permits, not the number of matches
                print(idx,len(mask),'dups',rps_owner,cost,date)

##defunct hand merger for rps, pts

def rps_pts_manual_merge():  ##defunct?  one time?
    """Apparently one-off helper that produced the files used to hand-build the RPS/PTS xref.

    Writes a cost-keyed join of the two extracts and, separately, a merge of
    both extracts through ``rps_pts_xref.tsv``, to TSV files for manual review.

    NOTE(review): it reads ``arl_pts`` and ``arl_rps``, which are not defined
    anywhere in the visible notebook, so this function cannot run as written.
    """

    cols = ['kW','effective_date','cost']
    # reset_index() keeps the original row index as the first column
    arl_pts_matcher = arl_pts[cols].copy().reset_index()#.to_csv('/data/energy/REC/MA/arl_pts.tsv',sep='\t')
    arl_rps_matcher = arl_rps[cols].copy().reset_index()#.to_csv('/data/energy/REC/MA/arl_rps.tsv',sep='\t')

    arl_rps_matcher.columns = ['rps','kW_rps','effective_date_rps','cost_rps']
    arl_rps_matcher.effective_date_rps = pd.to_datetime(arl_rps_matcher.effective_date_rps)

    arl_pts_matcher.columns = ['pts','kW_pts','effective_date_pts','cost_pts']
    arl_pts_matcher.effective_date_pts = pd.to_datetime(arl_pts_matcher.effective_date_pts)


    ## also use kW match <=0.005
    # Candidate pairs: inner join on identical cost.
    foo=arl_rps_matcher.merge(arl_pts_matcher,right_on='cost_pts',left_on='cost_rps',indicator=True)
    foo.to_csv('/data/energy/REC/MA/arl_rps_pts_join.tsv',sep='\t')
    #[arl_rps_matcher.duplicated()]

    rps_pts_xref = pd.read_csv('/data/energy/REC/MA/rps_pts_xref.tsv',sep='\t')
    print('RPS dups:',rps_pts_xref.rps.duplicated().any(),'\nPTS dups:',rps_pts_xref.pts.duplicated().any())

    # Give each extract an explicit positional index column to join on.
    foo=arl_rps.reset_index(drop=True)
    foo['rps_index'] = foo.index.astype(int)
    # no-op expression (result discarded)
    foo

    foo2 = arl_pts.reset_index(drop=True)
    foo2['pts_index'] = foo2.index.astype(int)

    # rps -> xref -> pts, outer joins so that unmatched rows on either side survive
    foo = foo.merge(rps_pts_xref,how='outer',left_on='rps_index',right_on='rps')

    foo = foo.merge(foo2,how='outer',left_on='pts',right_on='pts_index')

    foo.to_csv('/data/energy/REC/MA/rps_pts_merged.tsv',sep='\t',index=False)  # exceptions are then fixed by hand
    # no-op expression (result discarded): rows with a repeated, non-null rps_index
    foo[(foo.rps_index.duplicated())&(~pd.isnull(foo.rps_index))]