## Feature Engineering

We are using a temporary dataframe here until the cleansing process is done. 

See add_sunrise_sunset_data and add_weather_data 


In [1]:
import os
import numpy as np
import pandas as pd
import datetime
import requests
import time

In [3]:
def create_formatted_data():
    formatted_arr = []
    for file in os.listdir('data'):
        if not file.endswith('.ipynb_checkpoints'):
            file_path = 'data/' + file
            date = file.split('.bac')[0][-10:]
            with open(file_path) as fp:
                for line in fp.readlines():
                    instance = [date]
                    whole_time = line[line.find("[")+1:line.find("]")]
                    hour_min_sec = whole_time.split(':')
                    for x in hour_min_sec:
                        instance.append(x)
                    rest = line.split("]:")
                    features = rest[1].split("|")
                    for x in features:
                        instance.append(x)
                    if instance[8] != "GeneralMessage":
                        formatted_arr.append(instance)
    
    df = pd.DataFrame(formatted_arr)
    return df

dataframe = create_formatted_data()
dataframe.columns=['Date', 'Hour', 'Minute', 'Second', 'LogVisibility', 'LogSeverity', 'entryType', 'entrySubType', 'eventType', 'eventType_information1', 'eventType_information2', 'eventType_information3', 'eventType_information4', 'eventType_information5', 'eventType_information6', 'eventType_information7']
dataframe.Second = pd.to_numeric(dataframe.Second)
dataframe.Hour = pd.to_numeric(dataframe.Hour)
dataframe.Minute = pd.to_numeric(dataframe.Minute)

test_df = dataframe[:1000]

In [5]:
def create_weather_dict(df):
    base_http = 'https://api.darksky.net/forecast/a6861bbd8dfb09d9f06714ba8485934f/39.833851,-74.871826,'

    temp_time_dict = {}
    unix_timestamp = []
    for index, row in df.iterrows():
        new_date = row['Date']
        d_arr = new_date.split('-')
        date_obj = datetime.date(int(d_arr[0]), int(d_arr[1]), int(d_arr[2]))
        timestamp_new = int(time.mktime(date_obj.timetuple())) + 4 * 60 * 60
        unix_timestamp.append(timestamp_new)
        if not new_date in temp_time_dict.keys():
            r = requests.get(base_http + str(timestamp_new))
            temp_json = r.json()
            temp_time_dict[new_date] = temp_json
    return temp_time_dict, unix_timestamp

In [6]:
temp_dict, unix_timestamp = create_weather_dict(test_df)

In [7]:
def add_weather_data(df, temperature_dict):
    temperature_arr = []
    for index, row in df.iterrows():
        date = row['Date']
        hour = row['Hour']
        temperature_info = temperature_dict[date]['hourly']['data'][hour]['temperature']
        temperature_arr.append(temperature_info)
    return temperature_arr

In [8]:
temp_ar = add_weather_data(test_df, temp_dict)
test_df['OutsideTemp'] = pd.Series(temp_ar, index=test_df.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [9]:
def add_sunrise_sunset_data(df, temperature_dict, unix_timestamp):
    sun_arr = []
    for index, row in df.iterrows():
        current_timestamp = unix_timestamp[index]
        date = row['Date']
        hour = row['Hour']
        minute = row['Minute']
        sunrise = temperature_dict[date]['daily']['data'][0]['sunriseTime'] #in GMT 
        sunset = temperature_dict[date]['daily']['data'][0]['sunsetTime']
        
        current_timestamp = current_timestamp + (hour * 60 * 60) + (minute * 60)
        
        is_sun_up = 1 if sunrise <= current_timestamp <= sunset else 0
        sun_arr.append(is_sun_up)
        
    return sun_arr

In [10]:
is_sun_up_arr = add_sunrise_sunset_data(test_df, temp_dict, unix_timestamp)
test_df['Sun'] = pd.Series(is_sun_up_arr, index=test_df.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
test_df[['OutsideTemp', 'Sun']]

Unnamed: 0,OutsideTemp,Sun
0,71.96,0
1,71.96,0
2,71.96,0
3,71.96,0
4,71.96,0
5,71.96,0
6,71.96,0
7,71.96,0
8,71.96,0
9,71.96,0
