# EnsLSTM
### Importing libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import ReduceLROnPlateau

### Loading the train dataset

In [2]:
water_consumption = pd.read_csv("train.csv")
water_consumption.head(10)

Unnamed: 0,Year,Month,Consumer_type,Consumption,Consumer_number,Installation_zone
0,2013,1,domestic,0,MOGV36480546611521,Installation_zone 1
1,2013,1,industrial,5,BECS02817768252637,Installation_zone 2
2,2013,1,domestic,6,VRFW65577141436242,Installation_zone 2
3,2013,1,domestic,1,QLLI18662653137621,Installation_zone 2
4,2013,1,domestic,13,HYUO61823402850645,Installation_zone 2
5,2013,1,industrial,27,FHMG62751338090488,Installation_zone 2
6,2013,1,industrial,5,APVF78863215212358,Installation_zone 2
7,2013,1,domestic,31,MXWL75757930683403,Installation_zone 2
8,2013,1,industrial,2,NVMY31359391120094,Installation_zone 2
9,2013,1,industrial,0,PZAN37359795617576,Installation_zone 2


In [3]:
train, test = train_test_split(water_consumption, test_size=0.2,
                               stratify=water_consumption['Consumer_type'], random_state=42)

## Model

In [4]:
class EnsLSTM():
  def __init__(self):
    self.networks = {}
    self.group_names = list()

  def fit(self, water_consumption):
    print("Creating temporal matrix ...")

    water_consumption_months = self.agg_months(water_consumption,
     ['Consumer_type', 'Consumer_number', 'Installation_zone', 'Year'])

    water_consumption_years = self.agg_years(water_consumption_months,
     ['Consumer_type', 'Consumer_number', 'Installation_zone'])

    installation_groups = self.create_groups(water_consumption_years)

    print("Training ...")
    for group_name, group in installation_groups:
      group_name_txt = ','.join(group_name)
      self.group_names.append(group_name_txt)
      print(group_name_txt)

      X_data, y_data = self.preprocess_data(group,
                                     target_column='Consumer_type',
                                     feature_column='Consumption')

      unique_labels = group['Consumer_type'].unique()

      index2label = {index: label for index, label in enumerate(unique_labels)}

      net = self.createIndLSTM(X_data.shape, len(group['Consumer_type'].unique()))

      reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

      net.fit(X_data, y_data, epochs=10, batch_size=16, validation_split=0.2, callbacks=[reduce_lr])

      self.networks[group_name_txt] = {}
      self.networks[group_name_txt]['net'] = net
      self.networks[group_name_txt]['labels'] = index2label

  def agg_months(self, water_consumption, group_features):
    all_months = list(range(1, 13))
    water_consumption_months = water_consumption.groupby(group_features).apply(
        lambda group: pd.Series({
            'Consumption': [group.loc[group['Month'] == month, 'Consumption'].values[0] if month in group['Month'].values else 0 for month in all_months]
        })
    ).reset_index()

    return water_consumption_months

  def agg_years(self, water_consumption_months, group_features):
    unique_years = water_consumption_months['Year'].unique()

    water_consumption_years = water_consumption_months.groupby(group_features).apply(
        lambda group: pd.Series({
            'Consumption': [group.loc[group['Year'] == year, 'Consumption'].values[0] if year in group['Year'].values else np.zeros(12) for year in unique_years]
        })
    ).reset_index()

    return water_consumption_years

  def create_groups(self, water_consumption_years):
    inst_consumer_types = water_consumption.groupby(['Consumer_type', 'Installation_zone']).size().reset_index(name='Count')

    inst_consumer_types = pd.get_dummies(inst_consumer_types, columns=['Consumer_type'], prefix='Consumer_type')
    inst_consumer_types = inst_consumer_types.groupby(['Installation_zone']).agg({
        'Count': 'sum',
        'Consumer_type_construction': 'sum',
        'Consumer_type_domestic': 'sum',
        'Consumer_type_industrial': 'sum',
        'Consumer_type_low income families': 'sum',
        'Consumer_type_rural commercial': 'sum',
        'Consumer_type_rural expansion': 'sum',
        'Consumer_type_rural domestic': 'sum'
      }).reset_index()

    inst_consumer_types['Consumer_types'] = inst_consumer_types.iloc[:, 2:].astype(str).apply(lambda row: ','.join(map(str, row)), axis=1)

    inst_consumer_types = inst_consumer_types.groupby(['Consumer_types']).agg({
        'Count': 'sum',
        'Installation_zone': lambda x: ','.join(map(str, x))
      }).reset_index()

    inst_consumer_types['Installation_zone'] = inst_consumer_types['Installation_zone'].str.split(',')

    installation_groups = []
    specific_group = ['Installation_zone 1', 'Installation_zone 2', 'Installation_zone 3', 'Installation_zone 4', 'Installation_zone 29']

    for installations in inst_consumer_types['Installation_zone']:

          if installations[0] not in specific_group:
            df_by_installations = water_consumption_years[water_consumption_years['Installation_zone'].isin(installations)]

            installation_groups.append((installations, df_by_installations))

    df_by_installations = water_consumption_years[water_consumption_years['Installation_zone'].isin(specific_group)]

    installation_groups.append((specific_group, df_by_installations))

    return installation_groups

  def preprocess_data(self, df, target_column=None, feature_column='Consumption', test_size=0.2, random_state=42):
    X = np.array(df[feature_column].tolist())
    # X = np.transpose(X, (0, 2, 1))

    if target_column == None:
      return X

    y = pd.get_dummies(df[target_column]).values

    return X, y

  def createIndLSTM(self, shape, num_labels):
    net = Sequential()
    net.add(LSTM(10, input_shape=(shape[1], shape[2])))
    net.add(Dense(num_labels, activation='softmax'))

    optimizer = Adam(learning_rate=0.001)

    if num_labels > 1:
      net.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    else:
      net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return net

  def eval(self, water_consumption):
    print("Creating temporal matrix ...")

    water_consumption_months = self.agg_months(water_consumption,
     ['Consumer_type', 'Consumer_number', 'Installation_zone', 'Year'])

    water_consumption_years = self.agg_years(water_consumption_months,
     ['Consumer_type', 'Consumer_number', 'Installation_zone'])

    installation_groups = self.divide_groups(water_consumption_years)

    print("Test ...")

    right = 0
    total = 0
    for group_name, group in installation_groups:
      print(group_name)
      X_data, y_data = self.preprocess_data(group,
                                     target_column='Consumer_type',
                                     feature_column='Consumption')


      y_pred_proba = self.networks[group_name]['net'].predict(X_data)

      y_data = np.argmax(y_data, axis=1)
      y_pred = np.argmax(y_pred_proba, axis=1)

      tmp_right = np.sum(y_data == y_pred)
      right += tmp_right
      total += len(y_data)

    return right / total

  def divide_groups(self, water_consumption_years):
    agg_inst = []

    for index, row in water_consumption_years.iterrows():
      for i, sub_array in enumerate(self.group_names):
        if row['Installation_zone'] in sub_array.split(","):
          agg_inst.append(sub_array)
          break

    water_consumption_years['Installation_zones'] = agg_inst
    water_consumption_years = water_consumption_years.drop('Installation_zone', axis=1)
    water_consumption_years = water_consumption_years.groupby('Installation_zones')

    return water_consumption_years

  def predict(self, water_consumption):
    print("Creating temporal matrix ...")
    water_consumption_months = self.agg_months(water_consumption,
     ['Consumer_number', 'Installation_zone', 'Year'])
    water_consumption_years = self.agg_years(water_consumption_months,
     ['Consumer_number', 'Installation_zone'])

    water_consumption_years = water_consumption_years.groupby('Consumer_number')[['Installation_zone', 'Consumption']].last().reset_index()

    installation_groups = self.divide_groups(water_consumption_years)

    print("Predict ...")

    all_y_preds = list()
    all_groups = pd.DataFrame()
    for group_name, group in installation_groups:
      print(group_name)
      X_data = self.preprocess_data(group,
                                    feature_column='Consumption')

      y_pred_proba = self.networks[group_name]['net'].predict(X_data)

      for pred in np.argmax(y_pred_proba, axis=1):
        all_y_preds.append(self.networks[group_name]['labels'][pred])

      all_groups = pd.concat([all_groups, group], ignore_index=True)

    all_y_preds = pd.DataFrame({'Pred': all_y_preds})
    all_groups['Consumer_type'] = all_y_preds

    return all_groups

#### Train

In [5]:
network = EnsLSTM()
network.fit(train)

Creating temporal matrix ...
Training ...
Installation_zone 15,Installation_zone 27,Installation_zone 30
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Installation_zone 17,Installation_zone 21,Installation_zone 32,Installation_zone 44
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Installation_zone 40,Installation_zone 45
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Installation_zone 20,Installation_zone 37,Installation_zone 43
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Installation_zone 26,Installation_zone 31
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Installation_zone 13,Installation_zone 18
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
E

#### Test

In [6]:
acc = network.eval(test)
print("Acc: " + str(acc))

Creating temporal matrix ...
Test ...
Installation_zone 1,Installation_zone 2,Installation_zone 3,Installation_zone 4,Installation_zone 29
Installation_zone 10
Installation_zone 11,Installation_zone 14,Installation_zone 19,Installation_zone 23,Installation_zone 25,Installation_zone 28,Installation_zone 36,Installation_zone 41,Installation_zone 48,Installation_zone 6,Installation_zone 7
Installation_zone 12,Installation_zone 39,Installation_zone 49,Installation_zone 5,Installation_zone 9
Installation_zone 13,Installation_zone 18
Installation_zone 15,Installation_zone 27,Installation_zone 30
Installation_zone 16,Installation_zone 35
Installation_zone 17,Installation_zone 21,Installation_zone 32,Installation_zone 44
Installation_zone 20,Installation_zone 37,Installation_zone 43
Installation_zone 22
Installation_zone 24,Installation_zone 38,Installation_zone 46
Installation_zone 26,Installation_zone 31
Installation_zone 33,Installation_zone 47
Installation_zone 34
Installation_zone 40,Inst

#### Eval

In [7]:
competition = pd.read_csv("competition.csv")
competition.head(10)

Unnamed: 0,Year,Month,Consumption,Consumer_number,Installation_zone
0,2013,1,1,VENX08444954462680,Installation_zone 1
1,2013,1,2,GRXC33020746550125,Installation_zone 1
2,2013,1,1,FCGQ19814303536339,Installation_zone 1
3,2013,1,5,EQKL85694875580467,Installation_zone 3
4,2013,1,14,KCXW91343862250032,Installation_zone 3
5,2013,1,10,NFMC42616650055728,Installation_zone 2
6,2013,1,9,JDVW57666669484928,Installation_zone 1
7,2013,1,2,HGRS67554693069282,Installation_zone 2
8,2013,1,19,WJYX39670413648529,Installation_zone 3
9,2013,1,23,CGDO45046562545022,Installation_zone 1


In [8]:
network = EnsLSTM()
network.fit(water_consumption)

Creating temporal matrix ...
Training ...
Installation_zone 15,Installation_zone 27,Installation_zone 30
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Installation_zone 17,Installation_zone 21,Installation_zone 32,Installation_zone 44
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Installation_zone 40,Installation_zone 45
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Installation_zone 20,Installation_zone 37,Installation_zone 43
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Installation_zone 26,Installation_zone 31
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Installation_zone 13,Installation_zone 18
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
E

In [9]:
competition_results = network.predict(competition)

Creating temporal matrix ...
Predict ...
Installation_zone 1,Installation_zone 2,Installation_zone 3,Installation_zone 4,Installation_zone 29
Installation_zone 10
Installation_zone 11,Installation_zone 14,Installation_zone 19,Installation_zone 23,Installation_zone 25,Installation_zone 28,Installation_zone 36,Installation_zone 41,Installation_zone 48,Installation_zone 6,Installation_zone 7
Installation_zone 12,Installation_zone 39,Installation_zone 49,Installation_zone 5,Installation_zone 9
Installation_zone 13,Installation_zone 18




Installation_zone 15,Installation_zone 27,Installation_zone 30




Installation_zone 16,Installation_zone 35
Installation_zone 17,Installation_zone 21,Installation_zone 32,Installation_zone 44
Installation_zone 20,Installation_zone 37,Installation_zone 43
Installation_zone 22
Installation_zone 24,Installation_zone 38,Installation_zone 46
Installation_zone 26,Installation_zone 31
Installation_zone 33,Installation_zone 47
Installation_zone 34
Installation_zone 40,Installation_zone 45
Installation_zone 42,Installation_zone 8


In [10]:
competition_results

Unnamed: 0,Consumer_number,Consumption,Installation_zones,Consumer_type
0,AATX61161116356557,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","Installation_zone 1,Installation_zone 2,Instal...",domestic
1,ABEJ68950564531553,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, ...","Installation_zone 1,Installation_zone 2,Instal...",domestic
2,ADYG46035417336230,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, ...","Installation_zone 1,Installation_zone 2,Instal...",domestic
3,AESM26658198482962,"[[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, ...","Installation_zone 1,Installation_zone 2,Instal...",domestic
4,AFZX94165604206691,"[[0, 0, 0, 0, 0, 0, 0, 8, 9, 0, 0, 0], [0.0, 0...","Installation_zone 1,Installation_zone 2,Instal...",domestic
...,...,...,...,...
1633,WOXW56589127716354,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","Installation_zone 42,Installation_zone 8",rural domestic
1634,XCFK76326917324814,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","Installation_zone 42,Installation_zone 8",rural domestic
1635,YPRO01545510518187,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","Installation_zone 42,Installation_zone 8",rural domestic
1636,ZGIS63488309737437,"[[2, 0, 0, 1, 0, 0, 0, 3, 1, 4, 1, 0], [1, 2, ...","Installation_zone 42,Installation_zone 8",rural domestic


In [11]:
competition_results[['Consumer_number', 'Consumer_type']].to_csv('competition_results.csv', index=False)