In [2]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer

from itertools import chain
from IPython.display import clear_output

In [55]:
# Load the weather table and parse its 'Timestamp' strings into datetimes
weather = pd.read_csv('../data/weather.csv').assign(
    Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'])
)


In [62]:
# Attach weather readings to the test rows. how='right' keeps every row of
# `test`; columns present on both sides get pandas' default _x (weather) and
# _y (test) suffixes.
# NOTE: `test` is loaded by a cell that appears further down in this notebook.
test2 = weather[['Timestamp', 'Temperature', 'Distance', 'SiteId']].merge(
    test,
    on=['Timestamp', 'SiteId'],
    how='right',
)

In [63]:
# Discard the right-hand (_y) duplicates and the 'Unnamed: 0' column
# (presumably a leftover CSV index -- TODO confirm), then give the
# left-hand (_x) weather columns their plain names back.
test2 = (
    test2
    .drop(labels=['Temperature_y', 'Distance_y', 'Unnamed: 0'], axis=1)
    .rename(columns={'Temperature_x': 'Temperature', 'Distance_x': 'Distance'})
)

In [64]:
# Preview the first rows of the merged and cleaned test frame
test2.head()

Unnamed: 0,Timestamp,Temperature,Distance,SiteId,obs_id,ForecastId,Value,Minutes,Hour,day,wday,month,Year,Date,off
0,2015-08-29,19.9,24.889929,1,1677832,1,0.0,0,0,29,5,8,2015,2015-08-29,True
1,2015-08-29,16.0,20.952256,1,1677832,1,0.0,0,0,29,5,8,2015,2015-08-29,True
2,2015-08-30,20.6,24.889929,1,5379616,1,0.0,0,0,30,6,8,2015,2015-08-30,True
3,2015-08-30,20.0,20.952256,1,5379616,1,0.0,0,0,30,6,8,2015,2015-08-30,True
4,2015-08-31,22.4,24.889929,1,496261,1,0.0,0,0,31,0,8,2015,2015-08-31,False


In [3]:
# Load the training and testing tables; the first CSV column becomes the index
train = pd.read_csv('../data/train_weather.csv', index_col=0)
test = pd.read_csv('../data/test_weather.csv', index_col=0)

# Replace the string 'Timestamp' column of each table with parsed datetimes
train = train.assign(Timestamp=pd.to_datetime(train['Timestamp']))
test = test.assign(Timestamp=pd.to_datetime(test['Timestamp']))

In [4]:
# Preview the first rows of the training frame
train.head()

Unnamed: 0.1,Unnamed: 0,Timestamp,Temperature,Distance,SiteId,obs_id,ForecastId,Value,Minutes,Hour,day,wday,month,Year,Date,off
0,0,2014-09-03 00:00:00,20.2,24.889929,1,744519,1,909655.5,0,0,3,2,9,2014,2014-09-03,False
1,1,2014-09-03 00:00:00,19.0,20.952256,1,744519,1,909655.5,0,0,3,2,9,2014,2014-09-03,False
2,2,2014-09-04 00:00:00,22.6,24.889929,1,7627564,1,1748273.0,0,0,4,3,9,2014,2014-09-04,False
3,3,2014-09-04 00:00:00,20.0,20.952256,1,7627564,1,1748273.0,0,0,4,3,9,2014,2014-09-04,False
4,4,2014-09-05 00:00:00,24.7,24.889929,1,7034705,1,,0,0,5,4,9,2014,2014-09-05,False


In [108]:
def _closest_station_rows(frame, site_id):
    """Return the rows of ``frame`` for ``site_id``, one per timestamp, from the nearest station."""
    site_rows = frame[frame['SiteId'] == site_id]
    # Ascending sort on Distance puts the nearest station first within each
    # timestamp, so keep='first' retains the closest reading.
    site_rows = site_rows.sort_values(['Timestamp', 'Distance'])
    # .copy() so the column assignments made later never touch (or warn about)
    # the global frame.
    return site_rows.drop_duplicates('Timestamp', keep='first').copy()


def process_site(site_id):
    """Build model-ready features and labels for a single site.

    Reads the module-level ``train`` and ``test`` DataFrames.

    Steps:
      1. Keep one row per timestamp, preferring the closest weather station.
      2. Fill missing ``Value`` entries (train and test) with the training median.
      3. If the site has no training temperature at all, drop ``Temperature``
         from both frames; otherwise fill missing temperatures (train and
         test, including a test set with no temperatures at all) with the
         training median.
      4. Drop identifier/bookkeeping columns and turn ``Timestamp`` into the
         number of whole days since the site's first training timestamp,
         expressed in seconds.

    Parameters
    ----------
    site_id : value compared against the ``SiteId`` column.

    Returns
    -------
    (train_df, train_labels, test_df, test_labels)
        Feature frames and their ``Value`` series.

    Raises
    ------
    ValueError
        If ``train`` contains no rows for ``site_id``.
    """
    train_df = _closest_station_rows(train, site_id)
    test_df = _closest_station_rows(test, site_id)

    if train_df.empty:
        raise ValueError('No training rows found for SiteId %s' % site_id)

    # Median imputation is done with pandas directly:
    # sklearn.preprocessing.Imputer no longer exists in current scikit-learn,
    # and it silently dropped all-NaN columns, which the old code relied on.
    value_median = train_df['Value'].median()
    train_df['Value'] = train_df['Value'].fillna(value_median)
    test_df['Value'] = test_df['Value'].fillna(value_median)

    if train_df['Temperature'].isnull().all():
        # No usable temperature for this site: remove the feature everywhere
        train_df = train_df.drop(labels='Temperature', axis=1)
        test_df = test_df.drop(labels='Temperature', axis=1)
    else:
        # Training statistics only, so nothing leaks from the test set.
        # This also covers a test set whose temperatures are all missing:
        # every entry becomes the training median (the old code used the mean
        # here, contradicting its own comment and the imputation strategy).
        temperature_median = train_df['Temperature'].median()
        train_df['Temperature'] = train_df['Temperature'].fillna(temperature_median)
        test_df['Temperature'] = test_df['Temperature'].fillna(temperature_median)

    # Reference point for the numeric timestamp; de-duplication keeps every
    # distinct timestamp, so this equals the minimum over all site rows.
    min_date = train_df['Timestamp'].min()

    # Extract labels before the 'Value' column is dropped from the features
    train_labels = train_df['Value']
    test_labels = test_df['Value']

    # errors='ignore' tolerates frames that lack one of these columns
    # (e.g. no leftover 'Unnamed: 0' index column).
    unused_columns = ['Unnamed: 0', 'Distance', 'SiteId', 'obs_id', 'ForecastId', 'Value', 'Date']
    train_df = train_df.drop(labels=unused_columns, axis=1, errors='ignore')
    test_df = test_df.drop(labels=unused_columns, axis=1, errors='ignore')

    # Whole days since min_date, in seconds (the time of day is intentionally
    # not included, exactly as before).
    seconds_per_day = 3600 * 24
    train_df['Timestamp'] = (train_df['Timestamp'] - min_date).dt.days * seconds_per_day
    test_df['Timestamp'] = (test_df['Timestamp'] - min_date).dt.days * seconds_per_day

    return train_df, train_labels, test_df, test_labels

Problem buildings to look at: sites 11 and 124.

In [111]:
# Run the preprocessing for site 11, one of the problem buildings noted above,
# and keep the resulting frames/labels around for inspection
train_e, train_labels_e, test_e, test_labels_e = process_site(11)

In [112]:
def predict_linear(verbose=True):
    """Fit one linear regression per site and return all test predictions.

    Iterates over every ``SiteId`` present in the module-level ``test`` frame,
    obtains that site's features/labels from ``process_site``, fits a
    ``LinearRegression`` on the training part and predicts the testing part.

    Parameters
    ----------
    verbose : bool, default True
        When true, print each site id as it is processed (the original
        behaviour). Pass False to keep the cell output short.

    Returns
    -------
    list
        Predictions of all sites concatenated into one flat list, ordered by
        ascending ``SiteId`` and, within a site, in the row order returned by
        ``process_site``.
        NOTE(review): callers that assign this list positionally must make
        sure their target rows follow the same order -- confirm.
    """
    # sorted() pins the site order; iterating a bare set leaves the order of
    # the flattened predictions to an implementation detail.
    site_list = sorted(set(test['SiteId']))
    predictions = []

    for site in site_list:
        if verbose:
            print(site)

        # Test labels are not needed for prediction
        train_x, train_y, test_x, _ = process_site(site)

        # Fresh estimator per site so no fitted state is shared between sites
        model = LinearRegression()
        model.fit(train_x, train_y)
        predictions.append(model.predict(test_x))

    # Flatten the per-site arrays into a single list
    return list(chain.from_iterable(predictions))

In [None]:
# Fit the per-site linear models and collect every test prediction in one flat list
linear_predictions = predict_linear()

1
2
3
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
25
26
27
29
32
33
34
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
57
58
59
60
61
62
63
64
65
66
67
68
69
70
72
73
74
75
76
77
78
83
84
85
86
87
88
89
90
92
93
94
96
98
99
100
101
102
105
106
107
108
109
110
111
112
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
134
135
136
139
140
141
142
143
145
146
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
167
169
170
171
172
173
174
175
176
177
178
180
181
182
183
184
185
186
189
190
191
192
193
194
195
196
197


In [None]:
def make_submission_file(predictions, name,
                         format_path='../data/submission_format.csv',
                         output_dir='../submissions'):
    """Write ``predictions`` into a copy of the submission template.

    Parameters
    ----------
    predictions : iterable
        One predicted value per row of the template, in template row order.
    name : str
        File name of the CSV to create inside ``output_dir``.
    format_path : str, optional
        Path of the submission template CSV.
    output_dir : str, optional
        Directory the submission is written to.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of template rows.
    """
    submit_df = pd.read_csv(format_path)

    predictions = list(predictions)
    if len(predictions) != len(submit_df):
        raise ValueError(
            'Got %d predictions for %d submission rows'
            % (len(predictions), len(submit_df))
        )

    submit_df['Value'] = predictions

    # index=False: the template was read without an index column, so writing
    # the default RangeIndex would add an extra unnamed column to the file.
    submit_df.to_csv('%s/%s' % (output_dir, name), index=False)

In [None]:
# The output file name is a required argument; without it the call raises TypeError
make_submission_file(linear_predictions, 'linear_regression.csv')