# Exploration of Employment Inventory Data

In [1]:
"""
explore_raw_empdata.ipynb
Purpose:
    Basic wrangling and exploration of the raw Data Axle employment data

"""

import pandas as pd

# STEP 1: LOAD DATAFRAME AND GIVE IT MORE INTUITIVE FIELD NAMES

cols_raw = ['CONAME', 'LOCNUM', 'SITE', 'STADDR', 'STCITY', 'ZIP',
            'CNTYCD', 'CCNTY', 'NAICS', 'NAICSD', 'LOCEMP', 
            'YREY2K', 'LMDATE', 'LNGNAM', 'latitude', 'longitude', 'geo_level']

cols_rename = ['CompName', 'LocID', 'SITE', 'Address', 'City', 'ZIP', 
        'CountyID', 'CntyCensus', 'NAICS', 'NAICSD', 'EmpCnt_LocID', 
        'YearEstd', 'DateModified', 'LongName', 'latitude', 'longitude', 'geo_level']

col_rename_dict = dict(zip(cols_raw, cols_rename))

def memory_optimization(in_df):
    '''Takes in a pandas dataframe and converts it to have optimized, low-memory
    data types'''
    # common default data types that can be reduced to more memory-efficient type
    dtype_obj = 'object'
    dtypes_int = ['int64', 'uint64']
    dtypes_float = ['float64']
    
    # category dtype is far more efficient way to store strings if not many unique string values.
    dtype_category = 'category'
    downcast_float = 'float'
    downcast_int = 'integer'
    
    for col in in_df.columns:
        start_dtype = in_df[col].dtype
        if start_dtype in dtypes_int:
            in_df[col] = pd.to_numeric(in_df[col], downcast=downcast_int) # sets to biggest size necessary, not biggest size possible
        elif start_dtype in dtypes_float:
            in_df[col] = pd.to_numeric(in_df[col], downcast=downcast_float)
        elif start_dtype == dtype_obj:
            # if number of unique string vals is less than 40% of the total number of vals in column,
            # then recode as category instead of string, which will save significant memory
            if len(in_df[col].unique()) / len(in_df[col]) < 0.4:
                in_df[col] = in_df[col].astype(dtype_category)
            else:
                continue
        else:
            continue
            
raw_csv = r"P:\Employment Inventory\Employment 2020\Data Axle Raw - DO NOT MODIFY\SACOG Jan 2020.csv"
df_raw = pd.read_csv(raw_csv, usecols=cols_raw)

# change datatypes to significantly reduce df memory usage
memory_optimization(df_raw)
df_raw.columns

Index(['CONAME', 'LOCNUM', 'SITE', 'STADDR', 'STCITY', 'ZIP', 'CNTYCD',
       'CCNTY', 'NAICS', 'NAICSD', 'LOCEMP', 'YREY2K', 'LMDATE', 'LNGNAM',
       'latitude', 'longitude', 'geo_level'],
      dtype='object')

In [10]:
df_raw.loc[df_raw['CONAME'] == 'U C Davis Medical Ctr']

Unnamed: 0,CONAME,LOCNUM,SITE,STADDR,STCITY,ZIP,CNTYCD,CCNTY,NAICS,NAICSD,LOCEMP,YREY2K,LMDATE,LNGNAM,latitude,longitude,geo_level
22298,U C Davis Medical Ctr,361358500,,4150 V St # 1200,Sacramento,95817,67,67,62111107,Offices of Physicians (exc Mental Health Specs),3,,201503.0,U C DAVIS MEDICAL CENTER,38.556305,-121.456680,0
22342,U C Davis Medical Ctr,361457369,,4150 V St # 1200,Sacramento,95817,67,67,62111107,Offices of Physicians (exc Mental Health Specs),3,,201701.0,U C DAVIS MEDICAL CENTER,38.556305,-121.456680,0
22404,U C Davis Medical Ctr,361558380,,4150 V St # 1200,Sacramento,95817,67,67,62111107,Offices of Physicians (exc Mental Health Specs),3,,201612.0,U C DAVIS MEDICAL CENTER,38.556305,-121.456680,0
22406,U C Davis Medical Ctr,361568140,,4150 V St # 1200,Sacramento,95817,67,67,62111107,Offices of Physicians (exc Mental Health Specs),3,,201701.0,U C DAVIS MEDICAL CENTER,38.556305,-121.456680,0
22413,U C Davis Medical Ctr,361571862,,4150 V St # 1200,Sacramento,95817,67,67,62111107,Offices of Physicians (exc Mental Health Specs),3,,201701.0,U C DAVIS MEDICAL CENTER,38.556305,-121.456680,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221517,U C Davis Medical Ctr,699233631,,2315 Stockton Blvd,Sacramento,95817,67,67,62111107,Offices of Physicians (exc Mental Health Specs),3,,201701.0,U C DAVIS MEDICAL CENTER,38.554291,-121.455177,P
221526,U C Davis Medical Ctr,699244828,,2315 Stockton Blvd,Sacramento,95817,67,67,62111107,Offices of Physicians (exc Mental Health Specs),3,,201701.0,U C DAVIS MEDICAL CENTER,38.554291,-121.455177,P
221530,U C Davis Medical Ctr,699248449,,2315 Stockton Blvd,Sacramento,95817,67,67,62111107,Offices of Physicians (exc Mental Health Specs),3,,201701.0,U C DAVIS MEDICAL CENTER,38.554291,-121.455177,P
221540,U C Davis Medical Ctr,699257184,,2315 Stockton Blvd,Sacramento,95817,67,67,62111107,Offices of Physicians (exc Mental Health Specs),3,,201710.0,U C DAVIS MEDICAL CENTER,38.554291,-121.455177,P


## Calibrating fuzzywuzzy threshold for matching
Fuzzywuzzy checks how similar 2 strings are to each other: `fuzz.ratio` gives a score from zero to 100. This section aims to give a nice balance threshold to answer the question "how similar should strings be before we consider them the same?" 

In [11]:
from fuzzywuzzy import fuzz

test_string_val = 'U C Davis Medical Ctr'
name_col = 'CONAME'

# get unique name values
df_names = pd.DataFrame(df_raw[name_col].astype('str').drop_duplicates())
df_names['test_str'] = test_string_val

def get_lev(t_str, check_val):
    return fuzz.ratio(t_str, check_val)

df_names['lev_ratio'] = df_names[name_col].apply(lambda x: get_lev(test_string_val, x))

df_names.head()




Unnamed: 0,CONAME,test_str,lev_ratio
0,Sears Hometown Store,U C Davis Medical Ctr,34
1,JC Penney,U C Davis Medical Ctr,20
2,Lane Bryant,U C Davis Medical Ctr,25
3,Amalgamated Transit Union,U C Davis Medical Ctr,22
4,Pacific Coast Producers,U C Davis Medical Ctr,36


In [13]:
# see what different levenshtein scores look like

df_names.shape

df_names.loc[df_names['lev_ratio'] > 75] \
    .sort_values('lev_ratio', ascending=False)


Unnamed: 0,CONAME,test_str,lev_ratio
22298,U C Davis Medical Ctr,U C Davis Medical Ctr,100
25467,UC Davis Medical Ctr,U C Davis Medical Ctr,98
31349,Uc Davis Medical Ctr,U C Davis Medical Ctr,93
35345,U C Davis Medical Ctr-Peds,U C Davis Medical Ctr,89
157295,Davis Medical Ctr,U C Davis Medical Ctr,89
2254,U C Davis Medical Group,U C Davis Medical Ctr,86
30654,Univ of Ca Davis Medical Ctr,U C Davis Medical Ctr,86
30751,UC Davis Medical Group,U C Davis Medical Ctr,84
52154,Davis Uc Medical Ctr,U C Davis Medical Ctr,83
374690,Uc Davis Med Ctr,U C Davis Medical Ctr,81


In [14]:
from fuzzywuzzy import fuzz

fuzz.ratio('AARON D ANDERSON DO', 'AARON R DANIELSON MD')

77

In [45]:
# Do any company names have multiple DISTINCT geo levels, even if they have the same address?

# need df with distinct co name, address, and geo-levels
df_d = df_raw[['CompName', 'Address', 'City', 'geo_level']].drop_duplicates()

df_d_g = df_d.groupby(['CompName', 'Address', 'City'], observed=True).count()

# result: max value = 1, so there are no company locations duplicated due to having separate points for the 
# parcel vs. entry point location
df_d_g.geo_level.max()

1

In [61]:
# get a sample of similar strings to test fuzzywuzzy module

df_k = df_raw.loc[df_raw['CompName'].str.contains('Kaiser', na=False)][['CompName', 'SITE', 'Address', 'City', 'ZIP',  'NAICS', 'NAICSD', 'EmpCnt_LocID',
       'YearEstd', 'DateModified', 'DateVerified','ADDDAT', 'DateChange', 'latitude', 'longitude'
      ,'geo_level']].drop_duplicates()

df_k = df_k.sort_values(by=['CompName', 'Address'])
df_k.shape

df_k.to_csv(r'I:\Projects\Darren\EmpInventory\CSV\kaiser.csv')

# df_k[['CompName', 'SITE', 'Address', 'City', 'ZIP',  'NAICS', 'NAICSD', 'EmpCnt_LocID',
#        'YearEstd', 'DateModified', 'DateVerified','ADDDAT', 'DateChange', 'latitude', 'longitude'
#       ,'geo_level']].head(20)

In [105]:
# distinct lat-longs
# if you added an integer col to master table with "lat long ID" for each row, what would that accomplish?
# 
df_raw[['latitude', 'longitude']].drop_duplicates().shape

dful = df_raw[['latitude', 'longitude']].drop_duplicates()
dful.head()

df_recs = dful.to_records(index=False)
df_recs[4]

ll_id_dict = {}

for i, rec in enumerate(df_recs):
    k = rec[0] * rec[1] # get unique ID by multiplying lat * long? NO! does not create unique values
    if ll_id_dict.get(k):
        continue
    else:
        ll_id_dict[k] = i

    
# next thing to try (3/26/2021): get unique lats and give each their own UID, then do same for longs,
# which will give you new UIDs for each coord dimension, then can just concantenate ints instead of long floats?
# test will always be that number of UIDs must match number of unique lat-long pairs
    
print(len(ll_id_dict.keys()))
print(len(df_recs))

86416
146064
