# 008 Data Preparation for OD Analysis
* Based on the DBSCAN results (density-based spatial clustering of applications with noise)
* This script includes data preparation for an origin-destination analysis to identify mobility patterns of evacuees during and after Hurricane Harvey

In [1]:
import pandas as pd
import numpy as np
from numpy import arange
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Check package versions
import types


def imports():
    """Yield the ``__name__`` of every module object bound in this module's globals."""
    for value in globals().values():
        if not isinstance(value, types.ModuleType):
            continue
        yield value.__name__

import pkg_resources

# Top-level package name of every module imported into this namespace.
# The iteration stays inside the comprehension so that no new global is bound
# while `imports()` is still walking `globals()`.
root_packages = [module_name.partition('.')[0] for module_name in imports()]

# Print "<project> <version>" for each installed distribution that is in use.
for dist in pkg_resources.working_set:
    if dist.project_name.lower() not in root_packages:
        continue
    print (dist.project_name, dist.version)

pandas 1.1.2
numpy 1.19.2
matplotlib 3.2.0


# 1. Load DBSCAN result data

In [3]:
# Load the DBSCAN cluster output from CSV (path is relative to the working directory).
df = pd.read_csv('../../outputs/clusters_top25_users_Harvey_activity.csv')

In [4]:
# Work on a copy of the loaded frame without its first row
df_copy = df.iloc[1:].copy()
print (len(df_copy))

# Every column after the first one is handled as a numeric coordinate column
col_list = list(df_copy.columns)[1:]

# Columns whose name ends in '.1' are the x values (compared against x1..x4
# below, presumably longitude); the remaining ones are the y values.
col_list_x = [name for name in col_list if name.endswith('.1')]
col_list_y = [name for name in col_list if not name.endswith('.1')]


# Houston bounding box
x1, x2 = -96.04, -94.75
y1, y2 = 29.4, 30.2

# Close Texas bounding box
x3, x4 = -100.18, -95.02
y3, y4 = 29.06, 33.85

# Cast all coordinate columns to float in one pass
df_copy = df_copy.astype(dict.fromkeys(col_list, float))

df_copy.head(3)

173683


Unnamed: 0,id,home,home.1,2017-08-01,2017-08-01.1,2017-08-02,2017-08-02.1,2017-08-03,2017-08-03.1,2017-08-04,...,2017-10-23,2017-10-23.1,2017-10-24,2017-10-24.1,2017-10-25,2017-10-25.1,2017-10-26,2017-10-26.1,2017-10-27,2017-10-27.1
1,5F265B5D-F46D-4F1B-9549-F0E48A60E224,32.446467,-91.494068,,,,,,,,...,,,,,,,,,,
2,0B4899A7-3BE8-4AA7-BE67-3FD2054F8B02,29.968607,-95.209699,29.968607,-95.209699,29.968607,-95.209699,29.968607,-95.209699,29.968607,...,29.968607,-95.209699,29.968607,-95.209699,29.968607,-95.209699,,,,
3,8AF54FB4-BB03-4AFD-9997-D20A5C1637F0,36.588276,-94.755402,36.588276,-94.755402,36.588276,-94.755402,36.588276,-94.755402,36.588276,...,36.588276,-94.755402,36.588276,-94.755402,36.588276,-94.755402,36.588276,-94.755402,,


# 2. Create grid cells 
* Houston area: coordinates rounded to 2 decimal places
* Texas area: coordinates rounded to 1 decimal place
* Other states: coordinates rounded to 0 decimal places

In [None]:
# Snap every (x, y) column pair to a grid whose resolution depends on where the
# point falls: 2 decimals inside the Houston box, 1 decimal inside the Texas
# box, 0 decimals everywhere else.
#
# The previous version wrote through chained indexing (df[col][mask] = ...),
# which only works when the intermediate Series is a view of the frame and is a
# silent no-op under pandas Copy-on-Write. Whole-column assignment from the
# masks below does not depend on that.
for x_col, y_col in zip(col_list_x, col_list_y):
    x_vals = df_copy[x_col]
    y_vals = df_copy[y_col]

    # Series.between is inclusive on both ends, matching the original <= / >= tests.
    # Missing coordinates compare False in both masks and therefore fall in 'US'.
    in_texas = x_vals.between(x3, x4) & y_vals.between(y3, y4)
    in_houston = x_vals.between(x1, x2) & y_vals.between(y1, y2)

    # Houston is tested first so it wins where the two boxes overlap.
    # The 'tmp' label column is kept because the original cell left it on df_copy.
    df_copy['tmp'] = np.where(in_houston, 'Houston', np.where(in_texas, 'Texas', 'US'))

    # Rounded values are all derived from the unrounded columns captured above.
    df_copy[x_col] = np.where(in_houston, x_vals.round(2),
                              np.where(in_texas, x_vals.round(1), x_vals.round(0)))
    df_copy[y_col] = np.where(in_houston, y_vals.round(2),
                              np.where(in_texas, y_vals.round(1), y_vals.round(0)))

# 3. Investigating data: pre-hurricane
* 08/01/2017 - 08/20/2017 (everything up to ~08/20)

In [None]:
# Pre-hurricane frame: id plus the first 21 y columns and the first 21 x columns
df_pre = df_copy[['id'] + col_list_y[:21] + col_list_x[:21]]

# Keep only rows whose home point lies inside the Houston bounding box (bounds inclusive)
df_pre = df_pre.loc[df_pre['home.1'].between(x1, x2) & df_pre['home'].between(y1, y2)]

In [None]:
# Hurricane period 1 frame for rows whose home point lies in the Houston box.
# The day slice is 25:31 (six columns), matching the 08/25 - 08/30 period and
# the period-1 cell in section 4; the former 25:32 also pulled in the first
# day of hurricane period 2.
df_h1 = df_copy[['id', 'home'] + col_list_y[25:31] + ['home.1'] + col_list_x[25:31]]
df_h1 = df_h1.loc[df_h1['home.1'].between(x1, x2) & df_h1['home'].between(y1, y2)]

In [None]:
# Distinct ids of each frame, in order of first appearance
id_list_pre = df_pre['id'].drop_duplicates().tolist()
id_list_h1 = df_h1['id'].drop_duplicates().tolist()
print (len(id_list_pre), len(id_list_h1), sep='\n')

In [None]:
# len(col_list_y[1:21])
# print (col_list_y[1:21])
# print (col_list_y[1])
# print (col_list_y[2])
# print (col_list_y[20])
# print (list(range(1,20)))

In [None]:
# Replace missing values with 0 in both frames; the per-user loop that follows
# treats an x value of 0 as "no observation for that day".
df_pre, df_h1 = df_pre.fillna(0), df_h1.fillna(0)

In [None]:
# Last observed location per user over the pre-hurricane day columns
# col_list_x[1:21] / col_list_y[1:21].
#
# Changes from the previous implementation:
#   * a day counts as a move when EITHER coordinate differs; the old
#     `!= x and != y` test ignored moves that kept one coordinate unchanged.
#   * each user's row is fetched once instead of re-filtering the whole frame
#     by id for every single lookup.

# First row per id (what `.values[0]` used to pick), ordered like id_list_pre.
pre_rows = df_pre.drop_duplicates(subset='id').set_index('id').loc[id_list_pre]
pre_x_days = pre_rows[col_list_x[1:21]].to_numpy()
pre_y_days = pre_rows[col_list_y[1:21]].to_numpy()

x_list_pre = []
y_list_pre = []
for day_xs, day_ys in zip(pre_x_days, pre_y_days):
    # Start from the first day of the period (0 when that day is missing).
    x, y = day_xs[0], day_ys[0]
    for next_x, next_y in zip(day_xs[1:], day_ys[1:]):
        if next_x == 0:
            # 0 is the fillna placeholder: no observation on that day.
            continue
        if next_x != x or next_y != y:
            x, y = next_x, next_y
    x_list_pre.append(x)
    y_list_pre.append(y)

data_pre = pd.DataFrame({
    'id': id_list_pre,
    'x_pre': x_list_pre,
    'y_pre': y_list_pre,
})

In [None]:
# Preview the first two rows of the pre-hurricane result
data_pre.head(2)

In [None]:
# data_pre.to_csv('../../outputs/data_prep_network_pre.csv', index=False)

# 4. Hurricane period 1
* 08/25/2017 - 08/30/2017

In [None]:
# Hurricane period 1 frame: id, home point and day columns 25..30
df_h1 = df_copy[['id', 'home'] + col_list_y[25:31] + ['home.1'] + col_list_x[25:31]]

# Keep only rows whose home point lies inside the Houston bounding box (bounds inclusive)
df_h1 = df_h1.loc[df_h1['home.1'].between(x1, x2) & df_h1['home'].between(y1, y2)]

In [None]:
# Preview the first rows of the hurricane period 1 frame
df_h1.head()

In [None]:
id_list_h1 = df_h1['id'].unique().tolist()
print (len(id_list_h1))
# Missing values become 0, which the loop below treats as "no observation".
df_h1 = df_h1.fillna(0)


# Last observed location per user over the period-1 day columns
# col_list_x[25:31] / col_list_y[25:31].
#
# Changes from the previous implementation:
#   * a day counts as a move when EITHER coordinate differs; the old
#     `!= x and != y` test ignored moves that kept one coordinate unchanged.
#   * each user's row is fetched once instead of re-filtering the whole frame
#     by id for every single lookup.

# First row per id (what `.values[0]` used to pick), ordered like id_list_h1.
h1_rows = df_h1.drop_duplicates(subset='id').set_index('id').loc[id_list_h1]
h1_x_days = h1_rows[col_list_x[25:31]].to_numpy()
h1_y_days = h1_rows[col_list_y[25:31]].to_numpy()

x_list_h1 = []
y_list_h1 = []
for day_xs, day_ys in zip(h1_x_days, h1_y_days):
    # Start from the first day of the period (0 when that day is missing).
    x, y = day_xs[0], day_ys[0]
    for next_x, next_y in zip(day_xs[1:], day_ys[1:]):
        if next_x == 0:
            # 0 is the fillna placeholder: no observation on that day.
            continue
        if next_x != x or next_y != y:
            x, y = next_x, next_y
    x_list_h1.append(x)
    y_list_h1.append(y)

data_h1 = pd.DataFrame({
    'id': id_list_h1,
    'x_h1': x_list_h1,
    'y_h1': y_list_h1,
})

# data_h1.to_csv('../../outputs/data_prep_network_h1.csv', index=False)
data_h1.head(2)

# 5. Hurricane period 2
* 08/31/2017 - 09/04/2017

In [None]:
# Show the y column names selected for hurricane period 2
col_list_y[31:36]

In [None]:
# Hurricane period 2 frame: id, home point and day columns 31..35
df_h2 = df_copy[['id', 'home'] + col_list_y[31:36] + ['home.1'] + col_list_x[31:36]]

# Keep only rows whose home point lies inside the Houston bounding box (bounds inclusive)
df_h2 = df_h2.loc[df_h2['home.1'].between(x1, x2) & df_h2['home'].between(y1, y2)]

In [None]:
# df_h2.head(2)

In [None]:
id_list_h2 = df_h2['id'].unique().tolist()
print (len(id_list_h2))
# Missing values become 0, which the loop below treats as "no observation".
df_h2 = df_h2.fillna(0)


# Last observed location per user over the period-2 day columns
# col_list_x[31:36] / col_list_y[31:36].
#
# Changes from the previous implementation:
#   * a day counts as a move when EITHER coordinate differs; the old
#     `!= x and != y` test ignored moves that kept one coordinate unchanged.
#   * each user's row is fetched once instead of re-filtering the whole frame
#     by id for every single lookup.

# First row per id (what `.values[0]` used to pick), ordered like id_list_h2.
h2_rows = df_h2.drop_duplicates(subset='id').set_index('id').loc[id_list_h2]
h2_x_days = h2_rows[col_list_x[31:36]].to_numpy()
h2_y_days = h2_rows[col_list_y[31:36]].to_numpy()

x_list_h2 = []
y_list_h2 = []
for day_xs, day_ys in zip(h2_x_days, h2_y_days):
    # Start from the first day of the period (0 when that day is missing).
    x, y = day_xs[0], day_ys[0]
    for next_x, next_y in zip(day_xs[1:], day_ys[1:]):
        if next_x == 0:
            # 0 is the fillna placeholder: no observation on that day.
            continue
        if next_x != x or next_y != y:
            x, y = next_x, next_y
    x_list_h2.append(x)
    y_list_h2.append(y)

data_h2 = pd.DataFrame({
    'id': id_list_h2,
    'x_h2': x_list_h2,
    'y_h2': y_list_h2,
})

# data_h2.to_csv('../../outputs/data_prep_network_h2.csv', index=False)

data_h2.head(2)

# 6. Post hurricane
* 09/05/2017 - 09/10/2017

In [None]:
# Print the y column names selected for the post-hurricane period
print (col_list_y[36:42])

In [None]:
# Post-hurricane frame: id, home point and day columns 36..41
df_post = df_copy[['id', 'home'] + col_list_y[36:42] + ['home.1'] + col_list_x[36:42]]

# Keep only rows whose home point lies inside the Houston bounding box (bounds inclusive)
df_post = df_post.loc[df_post['home.1'].between(x1, x2) & df_post['home'].between(y1, y2)]

# df_post.head(2)

In [None]:
id_list_post = df_post['id'].unique().tolist()
print (len(id_list_post))
# Missing values become 0, which the loop below treats as "no observation".
df_post = df_post.fillna(0)


# Last observed location per user over the post-hurricane day columns
# col_list_x[36:42] / col_list_y[36:42].
#
# Changes from the previous implementation:
#   * a day counts as a move when EITHER coordinate differs; the old
#     `!= x and != y` test ignored moves that kept one coordinate unchanged.
#   * each user's row is fetched once instead of re-filtering the whole frame
#     by id for every single lookup.

# First row per id (what `.values[0]` used to pick), ordered like id_list_post.
post_rows = df_post.drop_duplicates(subset='id').set_index('id').loc[id_list_post]
post_x_days = post_rows[col_list_x[36:42]].to_numpy()
post_y_days = post_rows[col_list_y[36:42]].to_numpy()

x_list_post = []
y_list_post = []
for day_xs, day_ys in zip(post_x_days, post_y_days):
    # Start from the first day of the period (0 when that day is missing).
    x, y = day_xs[0], day_ys[0]
    for next_x, next_y in zip(day_xs[1:], day_ys[1:]):
        if next_x == 0:
            # 0 is the fillna placeholder: no observation on that day.
            continue
        if next_x != x or next_y != y:
            x, y = next_x, next_y
    x_list_post.append(x)
    y_list_post.append(y)

data_post = pd.DataFrame({
    'id': id_list_post,
    'x_post': x_list_post,
    'y_post': y_list_post,
})

# data_post.to_csv('../../outputs/data_prep_network_post.csv', index=False)

data_post.head(2)