# Data Preprocessing

Contents:
- Read input files
- Remove non-relevant columns
- Remove NaNs
- Translate column names to English
- Store data in feather format

## Read input files

In [None]:
# Destination path of the preprocessed dataset (feather file).
output_file = '../data/preprocessed/zhbikes.feather'
# Folder that is scanned for the raw CSV input files.
input_folder = '../data/zhbikes'

In [None]:
import numpy as np
import pandas as pd

from glob import glob
import os

In [None]:
# Collect every CSV file in the input folder. glob returns its matches in
# arbitrary, filesystem-dependent order, so sort them to keep the row order of
# the combined data reproducible across runs and machines.
input_files = sorted(glob('{}/*.csv'.format(input_folder)))

In [None]:
# Columns kept from each raw CSV file, in the order the frames should expose them.
selected_columns = ['fk_zaehler', 'datum', 'velo_in', 'velo_out', 'fuss_in', 'fuss_out', 'objectid']

# Parse only the selected columns (usecols) instead of loading every column and
# discarding the rest afterwards. usecols keeps the order found in the file, so
# the trailing [selected_columns] enforces one consistent column order.
dataframes = [
    pd.read_csv(path, usecols=selected_columns)[selected_columns]
    for path in input_files
]

In [None]:
# Stack the per-file frames row-wise into a single DataFrame; each frame keeps
# its own index labels.
df = pd.concat(objs=dataframes, axis=0)

In [None]:
# Show the (rows, columns) size of the concatenated frame.
df.shape

In [None]:
# Preview the first rows of the concatenated frame.
df.head()

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

## Remove non-relevant columns

In [None]:
# Discard the 'fuss_in' / 'fuss_out' columns in place; only the remaining
# columns are kept for the output.
df.drop(columns=['fuss_in', 'fuss_out'], inplace=True)

## Remove NaNs

In [None]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

## Translate column names to English

In [None]:
# Translate the non-English column names to English. Renaming by name rather
# than assigning a positional list means a change in column order or selection
# upstream cannot silently attach the wrong label to a column. The other
# columns ('velo_in', 'velo_out', 'objectid') keep their names.
df.rename(
    columns={
        'fk_zaehler': 'counting_station',
        'datum': 'datetime',
    },
    inplace=True,
)

## Store data in feather format

In [None]:
# Directory component of the output path (everything before the file name).
output_folder = os.path.split(output_file)[0]

In [None]:
# Make sure the destination folder exists before writing. When the output path
# has no directory component, output_folder is '' and os.makedirs('') raises
# FileNotFoundError, so only create the folder when there is one to create.
if output_folder:
    os.makedirs(output_folder, exist_ok=True)

In [None]:
# Replace the index left over from concatenation and row removal with a fresh
# 0..n-1 index (the old labels are discarded), then write the feather file.
feather_ready = df.reset_index(drop=True)
feather_ready.to_feather(output_file)