## Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.layers import LSTM, Dense
from keras.models import Sequential
from PIL import Image
import matplotlib.pyplot as plt

## Mounting Google drive

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Reading training data

In [3]:
df = pd.read_csv('/content/drive/My Drive/Shell_AI_train/train.csv')
print(df.head())

  DATE (MM/DD)    MST  ...  Moisture  Albedo (CMP11)
0          1/1  00:00  ...       0.0             0.0
1          1/1  00:01  ...       0.0             0.0
2          1/1  00:02  ...       0.0             0.0
3          1/1  00:03  ...       0.0             0.0
4          1/1  00:04  ...       0.0             0.0

[5 rows x 17 columns]


### Checking value counts for each date

In [4]:
print(df['DATE (MM/DD)'].value_counts())

6/29     1440
4/13     1440
7/14     1440
9/29     1440
8/2      1440
         ... 
3/16     1440
2/24     1440
12/28    1440
2/26     1440
12/5     1440
Name: DATE (MM/DD), Length: 366, dtype: int64


### Checking value counts for each time of day

In [5]:
print(df['MST'].value_counts())

10:24    366
23:36    366
06:22    366
18:18    366
01:56    366
        ... 
15:05    366
04:56    366
07:46    366
00:43    366
22:30    366
Name: MST, Length: 1440, dtype: int64


### Code to select row of specific date and time

In [6]:
selected_row = df[df['DATE (MM/DD)'] == '1/1']
selected_row = selected_row[selected_row['MST'] == '07:40']
print(selected_row)

    DATE (MM/DD)    MST  ...  Moisture  Albedo (CMP11)
460          1/1  07:40  ...       0.0          0.1883

[1 rows x 17 columns]


### List all available features

In [7]:
print(df.columns)

Index(['DATE (MM/DD)', 'MST', 'Global CMP22 (vent/cor) [W/m^2]',
       'Direct sNIP [W/m^2]', 'Azimuth Angle [degrees]',
       'Tower Dry Bulb Temp [deg C]', 'Tower Wet Bulb Temp [deg C]',
       'Tower Dew Point Temp [deg C]', 'Tower RH [%]', 'Total Cloud Cover [%]',
       'Peak Wind Speed @ 6ft [m/s]', 'Avg Wind Direction @ 6ft [deg from N]',
       'Station Pressure [mBar]', 'Precipitation (Accumulated) [mm]',
       'Snow Depth [cm]', 'Moisture', 'Albedo (CMP11)'],
      dtype='object')


### Replacing -1 in target value to 0

In [8]:
df['Total Cloud Cover [%]'].replace([-1],[0],inplace=True)

### Check for null values

In [9]:
df.isnull().sum() 

DATE (MM/DD)                             0
MST                                      0
Global CMP22 (vent/cor) [W/m^2]          0
Direct sNIP [W/m^2]                      0
Azimuth Angle [degrees]                  0
Tower Dry Bulb Temp [deg C]              0
Tower Wet Bulb Temp [deg C]              0
Tower Dew Point Temp [deg C]             0
Tower RH [%]                             0
Total Cloud Cover [%]                    0
Peak Wind Speed @ 6ft [m/s]              0
Avg Wind Direction @ 6ft [deg from N]    0
Station Pressure [mBar]                  0
Precipitation (Accumulated) [mm]         0
Snow Depth [cm]                          0
Moisture                                 0
Albedo (CMP11)                           0
dtype: int64

## Training process

### Select features for training

In [128]:
selected_features = ['Global CMP22 (vent/cor) [W/m^2]',
       'Direct sNIP [W/m^2]', 'Azimuth Angle [degrees]',
       'Tower Dry Bulb Temp [deg C]', 'Tower Wet Bulb Temp [deg C]',
       'Tower Dew Point Temp [deg C]', 'Tower RH [%]', 'Total Cloud Cover [%]',
       'Peak Wind Speed @ 6ft [m/s]', 'Avg Wind Direction @ 6ft [deg from N]',
       'Station Pressure [mBar]', 'Precipitation (Accumulated) [mm]',
       'Snow Depth [cm]', 'Moisture', 'Albedo (CMP11)']

### Function to traverse data and get input and output sequences

In [129]:
def obtain_train_data(df, selected_features):
  row_count = 0
  input_data_points = []
  output_data_points = []
  for idx, row in df.iterrows():
    cur_data_point = []
    if row_count % 10 == 0:
      for feature in selected_features:
          cur_data_point.append(float(row[feature]))
      input_data_points.append(cur_data_point)
      output_data_points.append(row['Total Cloud Cover [%]'])
    row_count += 1
    if row_count % 10000 == 0:
      print(row_count)
    if row_count == 500000:
      break
  return np.array(input_data_points), np.array(output_data_points)

input_seq, output_seq = obtain_train_data(df, selected_features)
print(len(input_seq), len(output_seq))

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
50000 50000


### Function to split sequence to predict next steps

In [79]:
def split_sequence(input_sequence, output_sequence, n_steps):
	X, y = list(), list()
	for i in range(len(input_sequence)):
		# find the end of this pattern
		end_ix = i + n_steps + n_steps
		# check if we are beyond the sequence
		if end_ix > len(input_sequence)-1:
			break
		# gather input and output parts of the pattern
		seq_x, seq_y = input_sequence[i:end_ix - n_steps], output_sequence[end_ix -n_steps : end_ix]
		X.append(seq_x)
		y.append(seq_y)
	return np.array(X), np.array(y)

In [130]:
n_steps = 12
n_input_features = len(selected_features)
X, y = split_sequence(input_seq, output_seq, n_steps)

### Defining LSTM architecture

In [131]:
# define model
lstm_model = Sequential()
lstm_model.add(LSTM(50, activation='relu', input_shape=(n_steps, n_input_features)))
lstm_model.add(Dense(n_steps))
lstm_model.compile(optimizer='adam', loss='mse')



In [132]:
print(X.shape)

(49976, 12, 15)


In [133]:
print(y.shape)

(49976, 12)


In [134]:
X = X.reshape((X.shape[0], X.shape[1], n_input_features))

### Training and saving LSTM model

In [141]:
# fit model
lstm_model.fit(X, y, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fa6bceb7090>

In [142]:
lstm_model.save('/content/drive/My Drive/Shell_AI_train/lstm_model_noimage2.h5')

## Testing process

### Loading LSTM model

In [143]:
loaded_lstm_model = keras.models.load_model('/content/drive/My Drive/Shell_AI_train/lstm_model_noimage2.h5')



### Unzip test folder

In [91]:
!unzip -uq "/content/drive/My Drive/test.zip" -d "/content"

### Code to traverse all folders and generate prediction for 30, 60, 90 and 120 minute intervals

In [144]:
path = '/content/test'

def clip_value(val):
  if val > 100:
    return val
  if val < 0:
    return 0
  return val

def obtain_test_data(path, model, selected_features, n_steps):
    files = os.listdir(path)
    count_folder = 0
    final_values = [349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360]
    output_df = {'scenario_set':[], '30_min_horizon':[] , '60_min_horizon':[], '90_min_horizon':[], '120_min_horizon':[]}
    for f in files:
        data_file = 'weather_data.csv'
        cur_df = pd.read_csv(path +'/' + f + '/' + data_file)
        x_input = []
        for val in final_values[-n_steps:]:
            selected_row = cur_df[cur_df['Time [Mins]'] == int(val)]
            cur_data_point = []
            for feature in selected_features:
                  cur_data_point.append(float(selected_row[feature]))
            x_input.append(cur_data_point)
        x_input = np.array(x_input)
        x_input = x_input.reshape((1, n_steps, n_input_features))
        yhat = lstm_model.predict(x_input)[0]
        value_30_min = clip_value(yhat[2])
        value_60_min = clip_value(yhat[5])
        value_90_min = clip_value(yhat[8])
        value_120_min = clip_value(yhat[11])
        #print(f, yhat)
        output_df['scenario_set'].append(int(f))
        output_df['30_min_horizon'].append(value_30_min)
        output_df['60_min_horizon'].append(value_60_min)
        output_df['90_min_horizon'].append(value_90_min)
        output_df['120_min_horizon'].append(value_120_min)
        count_folder += 1
        print(count_folder)
    return output_df

output_df = obtain_test_data(path, lstm_model, selected_features, n_steps)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


### Convert predictions to a dataframe

In [145]:
print(output_df)
output_df = pd.DataFrame(output_df)
output_df = output_df.sort_values('scenario_set')
output_df.head()

{'scenario_set': [175, 239, 24, 230, 294, 244, 275, 144, 160, 185, 47, 209, 50, 242, 60, 93, 268, 117, 202, 171, 119, 5, 18, 282, 212, 83, 121, 221, 15, 196, 169, 125, 255, 205, 184, 135, 21, 227, 29, 143, 263, 231, 251, 133, 261, 94, 42, 219, 37, 109, 52, 151, 53, 155, 206, 159, 204, 293, 56, 103, 4, 291, 77, 289, 105, 147, 182, 168, 36, 34, 63, 3, 87, 139, 145, 224, 67, 234, 200, 256, 97, 295, 163, 149, 100, 253, 173, 108, 31, 292, 161, 203, 250, 55, 25, 156, 68, 271, 16, 12, 129, 279, 247, 80, 216, 131, 199, 220, 248, 82, 72, 102, 88, 277, 146, 81, 112, 61, 118, 89, 48, 13, 27, 187, 46, 71, 177, 189, 104, 54, 99, 197, 284, 198, 136, 191, 17, 158, 32, 150, 264, 130, 257, 141, 193, 188, 154, 20, 243, 115, 162, 123, 233, 223, 40, 297, 152, 259, 39, 285, 281, 73, 232, 148, 213, 267, 11, 298, 283, 44, 280, 62, 70, 14, 110, 91, 201, 1, 266, 116, 207, 238, 272, 286, 7, 179, 124, 26, 84, 8, 41, 192, 30, 43, 111, 9, 211, 229, 22, 226, 172, 245, 106, 2, 134, 190, 75, 254, 222, 69, 258, 218, 2

Unnamed: 0,scenario_set,30_min_horizon,60_min_horizon,90_min_horizon,120_min_horizon
177,1,20.837454,23.241791,20.675722,16.649973
203,2,13.280837,21.326683,13.90289,8.667091
71,3,40.884949,44.126629,35.155983,30.170143
60,4,10.565917,7.182386,5.371658,6.349956
21,5,0.0,0.0,0.0,0.0


### Convert to submission format and download

In [146]:
from google.colab import files
output_df.to_csv('submission_cloudcover.csv', index=False)
files.download('submission_cloudcover.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>