In [13]:
import pandas as pd
import numpy as np

In [14]:
# Read the county table: column names come from the second line of the file
# (header=1) and the first column becomes the row index.
df = pd.read_csv(
    'HealthViz County Dataset 6 21 17.csv',
    skiprows=0,
    header=1,
    index_col=0,
    encoding="ISO-8859-1",
)
df.index.name = None

# Earlier Excel-based loader, left disabled:
#data = pd.read_excel('HealthViz County Dataset 6.19.17.xlsx',skiprows=0, header=1, index_col=0)
#data.index.name=None
#data.drop(data.columns[[0,1]], axis=1, inplace=True)

# Keep only fully populated rows, then remove the first remaining column ('County').
df = df.dropna(axis=0, how='any')
df = df.drop(df.columns[[0]], axis=1)

In [15]:
### Direct Distance Computation ###
# Step 1: Take the log of the dollar-related variables, normalize/whiten the dataframe, and determine the target place (identified by its GEOID, an int)
# Step 2: Get the Euclidean distance of values between each place and the target one for each row in dataframe
# Step 3: Sort the dataframe by distance in ascending order
# Step 4: Get the list of top N similar places

from scipy.cluster.vq import whiten

# Used only for this case, assuming the dataframe passed to the distance/clustering has no null values
def drop_null(df):
    '''
    Remove every row that contains at least one null value.

    Only rows are dropped; the column-wise variant is disabled. The frame
    passed in is modified in place and that same object is returned.
    '''

    #df.dropna(axis=1, how='any', inplace=True)
    df.dropna(how='any', axis='index', inplace=True)
    return df
    
def get_log(df, l):
    '''
    Get the natural-log values of certain dollar related variables.

    The transform is applied to a copy, so the dataframe passed in is left
    untouched and calling this repeatedly on the same input does not
    compound the log.

        df: dataframe holding the columns named in l
        l: a list of strings (variable names); a single string is treated
           as a one-element list

    Returns a new dataframe with the same index and columns as df, where
    each listed column is replaced by its log. Values follow NumPy
    semantics: zeros become -inf and negative values become NaN.
    Raises KeyError if a name in l is not a column of df.
    '''
    # A bare string would otherwise be iterated character by character.
    names = [l] if isinstance(l, str) else list(l)

    logged = df.copy()
    for name in names:
        logged[name] = np.log(logged[name])
    return logged
       
def normalize_df(df):
    '''
    Whiten the dataframe with scipy's whiten and return the result as a
    new dataframe that keeps the original row index and column labels.
    '''
    whitened = whiten(df)
    return pd.DataFrame(whitened, index=df.index, columns=df.columns)

# TODO: def pre_process_df
# deal with null values, take the log values, and do the normalization

def get_distance(df, GEOID_target, GEOID):
    '''
    Euclidean distance between the rows of two places.
        df: dataframe whose index holds the place labels
        GEOID_target, GEOID: index labels of the two rows to compare, int
    Returns the norm of the difference between the two rows' values.
    '''
    target_values = df.loc[[GEOID_target]].values
    other_values = df.loc[[GEOID]].values
    delta = target_values - other_values
    return np.linalg.norm(delta)

def get_top_n_similar(df, l, GEOID_target, n):
    '''
    Get the N places which are most similar to the target place.

    The columns named in l are log-transformed (get_log), the whole frame
    is whitened (normalize_df), and every other place is ranked by its
    Euclidean distance to the target row.

        df: dataframe indexed by place label; not modified by this call
        l: a list of string(s), for get_log function
        GEOID_target: index, int
        n: int, number of places to return

    Returns a list of at most n index labels, closest first. The target
    itself is never part of the result.
    Raises KeyError if GEOID_target is not in df.index.

    NOTE(review): assumes index labels are unique -- confirm for new data.
    '''
    if GEOID_target not in df.index:
        raise KeyError(GEOID_target)

    # Hand get_log a copy: it may write into the frame it receives, and
    # re-running this on the same df must not compound the log transform.
    prepared = normalize_df(get_log(df.copy(), l))

    # One vectorized pass: subtract the target row from every row and take
    # the per-row norm, instead of two .loc lookups per row inside apply.
    target_values = prepared.loc[[GEOID_target]].values
    distances = pd.Series(
        np.linalg.norm(prepared.values - target_values, axis=1),
        index=prepared.index,
    )

    # Remove the target by label rather than assuming it sorts first; a
    # place with identical values also has distance 0 and could otherwise
    # push the target into the result. mergesort keeps tie order stable.
    ranked = distances.drop(GEOID_target).sort_values(ascending=True, kind='mergesort')
    return ranked.head(max(n, 0)).index.tolist()

In [16]:
# Three places most similar to GEOID 1003, with median household income log-transformed
get_top_n_similar(
    df,
    l=['Median household income, 2011-2015'],
    GEOID_target=1003,
    n=3,
)

[40147, 51165, 31185]