# Imports

In [None]:
# Set up TensorBoard for monitoring by loading the `tensorboard` IPython extension.
# Guide to using TensorBoard inside notebooks:
# https://colab.research.google.com/github/tensorflow/tensorboard/blob/master/docs/tensorboard_in_notebooks.ipynb#scrollTo=hzm9DNVILxJe

%load_ext tensorboard

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
from pprint import pprint
from google.colab import drive


# Attach Google Drive so the shared project folder is reachable from this runtime.
drive.mount('/content/drive')

# Root of the shared project folder and its data sub-folder.
BASE_PATH = '/content/drive/Shareddrives/Geospatial Hackathon 2021/hackathon'
DATA_PATH = f'{BASE_PATH}/data'

# Input locations: image folder, glob matching its .tif files, and the metadata CSV.
IMAGE_PATH = f'{DATA_PATH}/20210417144750'
IMAGE_GLOB_PATH = f'{IMAGE_PATH}/*.tif'
METADATA_PATH = f'{DATA_PATH}/output3.csv'

# Output locations.
TENSORBOARD_PATH = f'{BASE_PATH}/logs'
MODEL_PATH = f'{BASE_PATH}/models'

# Upper bound on the number of training samples (None means use everything available).
TRAINING_DATA_SIZE = 2000

# Size of the images being used.
TARGET_SIZE = (500, 500)

# Number of samples per training batch.
BATCH_SIZE = 10

# Sanity check: show the contents of the project root.
base_dir_entries = os.listdir(BASE_PATH)
pprint(base_dir_entries)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['data',
 'tensorflow',
 'Report draft.gdoc',
 'datamapping.csv',
 'models',
 'logs',
 'report template.gdoc',
 'modelsstd_scaler.bin',
 'std_scaler.bin']


# Data Preprocessing

In [None]:
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# `sklearn.externals.joblib` is deprecated and no longer shipped by current
# scikit-learn releases; joblib is a standalone package that scikit-learn
# itself depends on, so import it directly and only fall back to the vendored
# copy on very old environments.
try:
    from joblib import dump, load
except ImportError:
    from sklearn.externals.joblib import dump, load

# Load the metadata table (METADATA_PATH is DATA_PATH + '/output3.csv') and
# preview it. The original `dataset.head` lacked the call parentheses and
# therefore did nothing.
dataset = pd.read_csv(METADATA_PATH)
print(f'\n----- Raw dataset preview\n{dataset.head()}')

# Features and target are selected by column position in the CSV.
# NOTE(review): the original note listed the columns as
# "image, crash_severity, holiday, light, weather, speed_limit"; that list does
# not obviously line up with positions 0/5/6/7 and 3 -- confirm against the CSV
# header. Judging by the printed output, the selected features are: an image
# key string, a float in [0, 1], a category (F / HR / LR) and an integer.
crash_data = dataset.iloc[:, [0, 5, 6, 7]]
y = dataset.iloc[:, [3]]

# One-hot encode the categorical feature at position 2 of `crash_data`.
# The encoded columns come first in the result, followed by the untouched
# remaining columns (remainder='passthrough').
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [2])], remainder='passthrough')
crash_data = ct.fit_transform(crash_data)
print(f'\n----- Encoded Crash Data\n{crash_data}')

# Build the category -> one-hot vector mapping by pushing every known
# category through the fitted encoder.
encoder = ct.named_transformers_['encoder']
categories = encoder.categories_[-1].reshape(-1, 1)
encoding = encoder.transform(categories).toarray()
mapping = {
    category: one_hot.astype('float32')
    for category, one_hot in zip(categories.ravel(), encoding)
}
print('\n----- Mapping')
pprint(mapping)

# Splitting into train and test set (80/20, fixed seed for reproducibility).
print('\n----- data spliting')
X_train, X_test, y_train, y_test = train_test_split(crash_data, y, test_size=0.2, random_state=1)
print(f'X_train shape = {X_train.shape}')
print(f'y_train shape = {y_train.shape}')
print(f'sample row: {X_train[0]}')
print(f'X_test shape = {X_test.shape}')
print(f'y_test shape = {y_test.shape}')
print(f'sample row: {X_test[0]}')

# Fit the standard scaler on the training split only. Column 3 of the encoded
# matrix holds the image key string, so it is excluded from scaling.
sc = StandardScaler()
print('\n----- Standard scaling')
SCALED_COLUMNS = [0, 1, 2, 4, 5]
# The name `temp` is kept in case later cells reference it.
temp = pd.DataFrame(X_train).iloc[:, SCALED_COLUMNS].values
# Last expression of the cell: fits `sc` and displays the scaled training features.
sc.fit_transform(temp)



----- Encoded Crash Data
[[1.0 0.0 0.0 '-36.853705673215856_174.72299370094953' 1.0 50]
 [1.0 0.0 0.0 '-36.853705673215856_174.72299370094953' 0.0 50]
 [1.0 0.0 0.0 '-36.853705673215856_174.72299370094953' 0.0 50]
 ...
 [0.0 0.0 1.0 '-36.9559938077771_174.8661960210974' 0.66 50]
 [0.0 0.0 1.0 '-36.9559938077771_174.8661960210974' 0.66 60]
 [0.0 0.0 1.0 '-36.9559938077771_174.8661960210974' 0.0 50]]

----- Mapping
{'F': array([1., 0., 0.], dtype=float32),
 'HR': array([0., 1., 0.], dtype=float32),
 'LR': array([0., 0., 1.], dtype=float32)}

----- data spliting
X_train shape = (62608, 6)
y_train shape = (62608, 1)
sample row: [0.0 0.0 1.0 '-36.8741633001281_174.75572565984046' 0.66 50]
X_test shape = (15653, 6)
y_test shape = (15653, 1)
sample row: [1.0 0.0 0.0 '-36.861888723980755_174.7639086495632' 0.0 100]

----- Standard scaling


array([[-1.99468479, -0.20244913,  2.27874538,  0.13034556, -0.51862695],
       [ 0.50133234, -0.20244913, -0.43883797,  0.9714686 , -0.51862695],
       [-1.99468479,  4.93951242, -0.43883797, -1.5024227 , -0.51862695],
       ...,
       [ 0.50133234, -0.20244913, -0.43883797,  0.9714686 , -0.51862695],
       [ 0.50133234, -0.20244913, -0.43883797, -0.68603857, -0.51862695],
       [-1.99468479, -0.20244913,  2.27874538,  0.13034556,  2.01535537]])

# Saving the train/test splits, category mapping, and fitted scaler

In [None]:
# Persist the train/test splits, the category mapping and the fitted scaler.

# The split folders may not exist yet (e.g. on a fresh drive); `to_csv` does
# not create missing directories, so create them up front.
train_dir = DATA_PATH + "/train"
test_dir = DATA_PATH + "/test"
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

pd.DataFrame(X_train).to_csv(train_dir + "/X_train.csv", index=False)
pd.DataFrame(y_train).to_csv(train_dir + "/y_train.csv", index=False)
pd.DataFrame(X_test).to_csv(test_dir + "/X_test.csv", index=False)
pd.DataFrame(y_test).to_csv(test_dir + "/y_test.csv", index=False)

# Fixed: the path separator was missing, so the mapping used to be written
# next to the data folder as "<BASE_PATH>/datamapping.csv" instead of inside it.
# NOTE(review): update any consumer that still reads the old location.
pd.DataFrame(mapping).to_csv(DATA_PATH + "/mapping.csv", index=False)

# Last expression of the cell: joblib's dump returns the list of written file names.
dump(sc, BASE_PATH + '/std_scaler.bin', compress=True)

['/content/drive/Shareddrives/Geospatial Hackathon 2021/hackathon/std_scaler.bin']