# Chicago Data Blend

In [6]:
#Import packages
import os
import pandas as pd 
import numpy as np
import datetime as dt
import csv
import re

# Bike Data

In [7]:
#Rename Chicago Files
# [os.rename(f, f.replace('2', 'Chicago_')) for f in os.listdir('.') if not f.startswith('.')]

**Function for getting consistently named columns between files**

In [8]:
#File reader/Column parser
def read_my_csv(file_handle, column_map):
    """Yield the rows of a CSV file with header names mapped to canonical ones.

    The first item yielded is the header row, in which every column name that
    appears as a synonym in ``column_map`` is replaced by its canonical name;
    names with no mapping are passed through unchanged. Every following item
    is a data row exactly as produced by ``csv.reader``.

    Parameters
    ----------
    file_handle : iterable of str
        An open text file (or any iterable of lines) holding CSV data.
    column_map : dict
        Maps each canonical column name to an iterable of its synonyms.
        If a synonym is listed under several canonical names, the last
        one encountered wins.

    Yields
    ------
    list of str
        The normalised header first, then each remaining row.

    Raises
    ------
    ValueError
        If the input has no header row. Because this is a generator, the
        error surfaces when the first item is requested.
    """
    # reverse the column mapping dict to use for synonym lookup
    synonyms = {
        synonym: canonical
        for canonical, aliases in column_map.items()
        for synonym in aliases
    }

    # build csv reader
    reader = csv.reader(file_handle)

    # get the header; an empty input is reported explicitly, because a bare
    # next() would leak StopIteration out of this generator
    header = next(reader, None)
    if header is None:
        raise ValueError('CSV input is empty: no header row found')

    # yield the header with columns mapped to the desired names
    yield [synonyms.get(name, name) for name in header]

    # yield the remaining rows
    for row in reader:
        yield row

In [None]:
#Create synonyms for read_my_csv: canonical column name -> header spellings
#that should be renamed to it
column_map = {
    'Bike_Num': ('Bike#', 'Bike number', 'Bike #'),
    'Subscription': ('Subscriber Type', 'Subscription Type', 'Subscription type', 'Account type', 'Member Type', 'Member type', 'Membership_Type'),
    'Total_Duration': ('Duration', 'Duration (ms)', 'Total duration (ms)')
#     'Start_Station': ('Start Station', 'Start station', 'Start station number'),
#     'End_Station': ('End Station', 'End station', 'End station number')
}

#Loop for reading files and running parser
filenames = [filename for filename in os.listdir('.') if filename.startswith('Chicago_2')]

dfs = []

for filename in filenames:
    # newline='' is the mode the csv module asks for; the former 'rU' mode is
    # rejected by open() on Python 3.11+
    with open(filename, 'r', newline = '') as handle:
        generator = read_my_csv(handle, column_map)
        columns = next(generator)
        # The frame must be built inside the with-block: the generator reads
        # the file lazily
        dfs.append(pd.DataFrame(generator, columns = columns))
chicago_df = pd.concat(dfs, ignore_index = True)

#Drop unneeded columns (errors='ignore' skips names absent from the data)
to_drop = ['End Station', 'End station', 'End station number', 'Start Station', 'Start station', 'Start station number', 'Total_Duration', 'Bike_Num']
chicago_df = chicago_df.drop(columns = to_drop, errors = 'ignore')
print(chicago_df.shape)

# Create Subscription dummy
subsc_dummy = pd.get_dummies(chicago_df['Subscription'])
chicago_df = pd.concat([chicago_df, subsc_dummy], axis = 1)
chicago_df = chicago_df.drop(['Subscription'], axis = 1)

#Convert Start date and End date to pandas.DateTime (**Long Process**)
chicago_df['Start date'] = pd.to_datetime(chicago_df['Start date'])
chicago_df['End date'] = pd.to_datetime(chicago_df['End date'])

#Calculate Duration in whole minutes. Working on the timedelta directly keeps
#trips longer than 24 hours intact (an hour/minute split would drop the days)
trip_length = chicago_df['End date'] - chicago_df['Start date']
chicago_df['Duration'] = trip_length.dt.total_seconds() // 60

#Set index to Start date for grouping
chicago_time_sort_df = chicago_df.set_index('Start date')
chicago_time_sort_df['Total_Count'] = float(1) #Add counter for total count

#Create stats dict for column stats
f = {'Duration': ['mean', 'min', 'max'], 'Casual': 'sum', 'Member': 'sum', 'Registered': 'sum', 'Subscriber': 'sum', 'Total_Count': 'sum'}
#Which dummy columns exist depends on the labels found in 'Subscription', so
#keep only the entries whose column is actually present
f = {col: how for col, how in f.items() if col in chicago_time_sort_df.columns}

#Aggregate data by day
chicago_by_day = chicago_time_sort_df.groupby(pd.Grouper(freq = 'D')).agg(f) #Groupby day and apply agg(f)
chicago_by_day_df = pd.DataFrame(chicago_by_day) #Create aggregated DF
chicago_by_day_df = chicago_by_day_df.iloc[4:] #Drop first 4 days to match bike data size

print(chicago_by_day_df.shape)
chicago_by_day_df.head()

# Weather Data

In [None]:
#Import weather data
# NOTE(review): the file is named DC_Weather.csv although this notebook
# blends Chicago data - confirm this is the intended source
weather_df = pd.read_csv('DC_Weather.csv')
weather_df = weather_df.fillna('')
print(weather_df.shape)
weather_df.head()

#Drop STATION and STATION_NAME
weather_df = weather_df.drop(['STATION', 'STATION_NAME'], axis = 1)

#Replace 'T' with 0 for precip (assigned back rather than mutated in place
#through attribute access, so the change always reaches the frame)
weather_df['HOURLYPrecip'] = weather_df['HOURLYPrecip'].replace('T', 0)

#Convert DATE to DateTime
weather_df['DATE'] = pd.to_datetime(weather_df['DATE'])

#Index DATE, drop the text columns, then convert the rest to numeric for
#aggregation (values that cannot be parsed become NaN)
# NOTE(review): the column names below are spelled as in the original code,
# presumably matching the CSV header - verify before "correcting" them
weather_time_sort_df = weather_df.set_index('DATE')
weather_time_sort_df = weather_time_sort_df.drop(['REPORTTPYE', 'HOURLYSKYCONDITIONS', 'HOURLYPRSENTWEATHERTYPE'], axis = 1)
weather_time_sort_df = weather_time_sort_df.apply(pd.to_numeric, errors = 'coerce')

#Create stats dict for column stats: every remaining column gets all four
stats = ['sum', 'mean', 'max', 'min']
f = {col: stats for col in weather_time_sort_df.columns}

#Aggregate data by day
weather_by_day = weather_time_sort_df.groupby(pd.Grouper(freq = 'D')).agg(f)
weather_by_day_df = pd.DataFrame(weather_by_day)

print(weather_by_day_df.shape)
weather_by_day_df.head()

# Final Dataset

In [None]:
#Merge Datasets
# Place the daily bike and weather aggregates side by side; the inner join
# keeps only the index labels that occur in both frames
daily_frames = [chicago_by_day_df, weather_by_day_df]
merged = pd.concat(daily_frames, axis = 1, join = 'inner')

#Create .csv file
output_path = 'Chicago_Blended(new).csv'
merged.to_csv(output_path)

print(merged.shape)
merged.head()