In [79]:
import pandas as pd
import numpy as np

In [80]:
df = pd.read_csv('HealthViz-county-071017.csv', skiprows=0, header=1, index_col=0)
df.index.name=None
df.rename(columns={'Unnamed: 1': 'County'}, inplace=True)
df_no_null = df.copy()

In [81]:
df_no_null.dropna(axis=0, how='any',inplace=True)
df_no_null.drop(df_no_null.columns[[0]], axis=1,inplace=True) #drop 'County'
#df.drop(df.columns[[0]], axis=1,inplace=True) #drop 'County'

In [82]:
### Direct Distance Computation ###
# Step 1: get log values for $ related variabes, normalize/whiten the dataframe, determine the target place (represented in GEOID, int)
# Step 2: Get the Euclidean distance of values between each place and the target one for each row in dataframe
# Step 3: Sort the dataframe based on the arising order of the distance
# Step 4: Get the list of top N similar places

#log cann ignore null when dealing with data
#modified whiten function can ignore null as well


def get_log(df, l):
    '''
    Get the log values of certain dollar related variables, with the input of a list of variable names
        l: a list of strings (variable name)
    '''
    for i in l:
        data = df[[i]]
        array = np.log(data.values)
        data = pd.DataFrame(array, columns=data.columns, index=data.index)
        df[i] = data[i]
    return df


def whiten(obs):
    '''
    Modified whiten function from https://github.com/scipy/scipy/blob/v0.19.0/scipy/cluster/vq.py#L95-L148
    Able to deal with dataset with null values using np.nanstd
    '''
    std_dev = np.nanstd(obs, axis=0)
    zero_std_mask = std_dev == 0
    if zero_std_mask.any():
        std_dev[zero_std_mask] = 1.0
        warnings.warn("Some columns have standard deviation zero. "
                      "The values of these columns will not change.",
                      RuntimeWarning)
    return obs / std_dev


def normalize_df(df):
    '''
    Normalize/whiten the dataframe
    '''
    df = pd.DataFrame(whiten(df), columns=df.columns, index=df.index)
    return df

# def pre_prpcess_df
# (dealing with null values, )get log values and do the normalization

def get_distance(df, GEOID_target, GEOID):
    '''
    Get the Euclidean distance between two sets of values. 
    Distance has been scaled up.
        GEOID_target, GEOID: index, int
    '''
    a = df.loc[[GEOID_target]].values # df.values
    b = df.loc[[GEOID]].values
    data = np.append(a,b,axis=0)
    mask = np.isfinite(data)
    curr = np.logical_and(mask[0], mask[1])
    u = data[0][curr]
    v = data[1][curr]
    weight = (len(curr)/np.count_nonzero(curr))**(0.5)
    dist = (np.linalg.norm(u - v))*weight
    return dist

def check_target_null(df, GEOID_target):
    '''
    Check whether there is NaN value of the target place. if there is, a warning message should be shown to the users
    And the related column(s) will be dropped as well
        GEOID_target: index, int
    '''
    target = df.loc[[GEOID_target]]
    target_name = target.iloc[0]['County']
    print ('The name of the target place is', target_name)
    l = target.columns[target.isnull().any()].tolist()
    if len(l) != 0:
        print ('[NOTE]: The target place has NaN values for', l, ',which will not be considered in the computation.')
        for i in l:
            df.drop(i, axis=1, inplace=True)
    return df
    

def  get_top_n_similar(data, l, GEOID_target, n): ### 'County' column dropped inside ###
    '''
    Get the N places which are similar to the target place
        data: dataframe
        l: a list of string(s), for get_log function
        GEOID_target: index, int
        n: int
    '''
    data = check_target_null(data, GEOID_target)
    df = data.drop(data.columns[[0]], axis=1).copy() ### drop 'County' column ###
    df = get_log(df, l)
    df = normalize_df(df)
    df['distance'] = df.apply(lambda x: get_distance(df, GEOID_target, x.name), axis=1)
    df = df.sort_values('distance', ascending=True)
    index_list = df.head(n+1).index.tolist()[1:]
    county_list = data.ix[index_list]['County'].values.tolist()
    return county_list

In [83]:
GEOID_target = 1013
n = 10
get_top_n_similar(df, ['Median household income, 2011-2015'], GEOID_target, n)

The name of the target place is Butler County, AL
[NOTE]: The target place has NaN values for ['Teen birth rate, Females, Juveniles (5-17 years) (Births per 1,000 women), 2011-2015'] ,which will not be considered in the computation.


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix


['Choctaw County, AL',
 'Chambers County, AL',
 'Chester County, SC',
 'Covington city, VA',
 'Winston County, MS',
 'Danville city, VA',
 'Lenoir County, NC',
 'Warren County, GA',
 'Halifax County, NC',
 'Gogebic County, MI']