# Hopkins Statistic

In [15]:
from sklearn.neighbors import BallTree
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale

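The Hopkins statistic measures the clustering tendency of a data set, i.e. how far its points depart from a uniformly random spread. It belongs to the family of sparse sampling tests. As computed in this notebook, values close to 0.5 indicate no more structure than uniformly random data, while values close to 1 indicate that the data are strongly clustered.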

In [2]:
# Alternative input kept for reference (semicolon-separated CSV):
# dataset = pd.read_csv('./Absenteeism_at_work.csv', sep=';')
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
dataset = pd.read_pickle('./state_clubbed_df.pickle')
# Number of points drawn for the real sample and for the simulated uniform sample.
sampling_size = 350

In [3]:
# Report the dimensions of the raw data before any preprocessing.
rows, columns = dataset.shape
print ("Total rows : {}\t Total columns: {}".format(rows, columns))

Total rows : 527	 Total columns: 13


In [4]:
# Preview the first rows of the raw data.
dataset.head()

Unnamed: 0,state,year,murder,rape,foeticide,kidnapping and abduction,abetment of suicide,exposure and abandonment,procuration of minor girls,selling girls for prostitution,prohibition of child marriage act,other crimes,total
0,andhra pradesh,2001,35.0,84.0,0.0,57.0,7.0,22.0,12.0,0.0,6.0,47.0,270.0
1,arunachal pradesh,2001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,assam,2001,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0
3,bihar,2001,2.0,16.0,0.0,26.0,0.0,1.0,16.0,1.0,2.0,18.0,83.0
4,chhattisgarh,2001,14.0,150.0,5.0,46.0,1.0,15.0,0.0,0.0,0.0,354.0,585.0


In [5]:
# Scaling and preprocessing the dataset
#
# Remove the 'state', 'year' and 'total' columns, then pass the remaining
# columns through sklearn's `scale`.
# The drop returns a new frame instead of mutating `dataset` in place: if
# `scale` fails, `dataset` is still intact and the cell can simply be re-run.
# Wrapping the scaled array in a fresh DataFrame replaces the column names
# with integer labels and resets the index.

features = dataset.drop(columns=['state', 'year', 'total'])
dataset = pd.DataFrame(scale(features))

In [6]:
# Sample n observations from D : P
#
# Draw `sampling_size` distinct rows of the scaled data (pandas samples
# without replacement by default), which is why the request cannot exceed the
# number of available rows.

if sampling_size > dataset.shape[0]:
    # ValueError is still an Exception, so existing handlers keep working.
    raise ValueError(
        'The sample size ({}) is bigger than the number of rows in D ({})'
        .format(sampling_size, dataset.shape[0]))

# Fixed seed so that a fresh "Restart & Run All" draws the same sample.
sample_dataset = dataset.sample(n=sampling_size, random_state=42)
sample_dataset.shape

(350, 10)

In [7]:
# Preview the sampled rows; the index labels show which rows of D were drawn.
sample_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
423,0.720329,-0.287126,-0.288881,0.860949,-0.281905,-0.417052,1.899639,0.089764,-0.32052,-0.357693
520,-0.505852,-0.474831,-0.422895,-0.362013,-0.281905,-0.398377,-0.216594,-0.183679,-0.32052,-0.324346
510,-0.272878,2.734926,-0.422895,0.654999,-0.281905,-0.417052,-0.139839,-0.183679,-0.32052,-0.17012
382,-0.039903,0.504364,0.247176,3.353584,-0.281905,0.834176,-0.183699,-0.001384,-0.442216,0.16751
110,-0.49359,-0.493601,-0.422895,-0.382184,-0.281905,-0.417052,-0.205629,-0.183679,-0.442216,-0.345188


In [8]:
# Get the distance to their nearest neighbors in D : X
#
# Every sampled point is itself a row of D, so the closest match found in the
# tree is at distance 0. Asking for k=2 and keeping column 1 gives the
# distance to the next-closest row instead.
# NOTE(review): a sampled point that has an exact duplicate in D still gets a
# distance of 0 here.

tree = BallTree(dataset, leaf_size=2)
dist, _ = tree.query(sample_dataset, k=2)
sample_knn_dist = dist[:, 1]

In [9]:
# Inspect the nearest-neighbour distances of the sampled points.
# NOTE(review): this displays the whole array; a slice such as
# sample_knn_dist[:10] would keep the output short.
sample_knn_dist

array([7.76514413e-01, 7.87533554e-02, 1.21282462e+00, 5.45582309e-01,
       3.55557130e-02, 2.27328781e-01, 5.62756423e-03, 2.14411993e-02,
       6.67646030e-02, 0.00000000e+00, 1.01688051e+00, 2.93261373e+00,
       2.33893572e-03, 2.12319822e-03, 1.06265084e+00, 6.08961683e-01,
       4.63737317e-02, 3.58311387e+00, 2.15187188e-01, 5.85387663e-01,
       3.74776685e+00, 9.55439198e-03, 3.81439469e+00, 1.60623702e-01,
       1.08019625e+00, 8.83819398e-01, 1.58923651e+00, 8.96879124e-02,
       1.51671996e-01, 5.62756423e-03, 1.76785313e-01, 3.74827298e-02,
       5.45505517e-02, 3.45587921e-02, 0.00000000e+00, 4.15798983e-02,
       5.72216674e-01, 3.67695149e+00, 1.11125774e-02, 0.00000000e+00,
       1.04272414e-01, 2.54348827e+00, 1.43409038e-02, 1.67551443e-02,
       1.95858289e+00, 3.98495585e-01, 6.85298391e-01, 5.59884771e-01,
       1.09576242e-01, 1.39738131e-02, 1.62311660e-02, 7.21742139e-02,
       1.41792324e+00, 6.09851746e-02, 3.12841782e-03, 0.00000000e+00,
      

In [10]:
# Randomly simulate n points with the same variation as in D : Q.
#
# Each simulated point is drawn uniformly inside the axis-aligned bounding box
# of D: coordinate j lies between the minimum and the maximum of column j.

max_data = dataset.max()
min_data = dataset.min()

# One vectorised draw covers every column: the per-column bounds are broadcast
# against the (sampling_size, n_columns) output shape. This works for any
# number of columns (including a single one), does not depend on how the
# columns are labelled, and avoids re-copying the array once per column.
# A dedicated, seeded generator keeps reruns reproducible without touching
# NumPy's global random state.
uniform_rng = np.random.RandomState(42)
uniform_obsv = uniform_rng.uniform(
    low=min_data.values,
    high=max_data.values,
    size=(sampling_size, len(min_data)),
)

# Reuse D's column labels so both frames line up column for column.
uniform_obsv_df = pd.DataFrame(uniform_obsv, columns=dataset.columns)

In [11]:
# Check that the simulated sample has one row per draw and one column per column of D.
uniform_obsv_df.shape

(350, 10)

In [12]:
# Preview the first simulated points.
uniform_obsv_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,6.025616,4.684547,1.082696,5.489314,15.654607,3.141435,0.455805,3.547502,2.309592,4.107076
1,4.451753,6.327531,5.295906,0.549737,2.939737,2.237436,9.896995,6.807954,0.141895,4.173665
2,2.989599,0.298279,4.810684,1.636181,6.908516,2.966611,2.373934,7.395206,5.450528,1.533384
3,4.418234,3.081694,2.844432,4.865856,12.588519,2.141714,9.953272,9.463286,3.880438,2.827955
4,5.120453,4.607725,0.7881,6.10218,15.47815,3.470753,6.883838,6.653271,6.882474,6.740676


In [13]:
# Get the distance to their nearest neighbors in D : Y
#
# The simulated points are generated independently of D's rows, so the single
# closest row (k=1) is the neighbour of interest; no self-match has to be
# skipped as for the real sample.

tree = BallTree(dataset, leaf_size=2)
dist, _ = tree.query(uniform_obsv_df, k=1)
# `dist` is kept as returned by `query`; with k=1 it is expected to be a
# 2-D array with a single column.
uniform_knn_dist = dist

In [14]:
# Calculate the Hopkins Score
#
# H = sum(Y) / (sum(X) + sum(Y)), where X holds the nearest-neighbour
# distances of the real sample and Y those of the uniformly simulated points.

# np.sum reduces over every element, so both totals are plain scalars whether
# the distances are stored as a 1-D vector or as an (n, 1) column. This
# removes the need to index the result with [0].
x = np.sum(sample_knn_dist)
y = np.sum(uniform_knn_dist)

if x + y == 0:
    # ValueError is still an Exception, so existing handlers keep working.
    raise ValueError('The denominator of the hopkins statistics is null')

h_stat = y / (x + y)
print ("The hopkins statistics measure is {}".format(h_stat))

The hopkins statistics measure is 0.9561462031392631
