# Lineal Evol - LSTM
### Importing libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import LSTM, Dense

### Loading the train dataset

In [2]:
water_consumption = pd.read_csv("train.csv")
water_consumption.head(10)

Unnamed: 0,Year,Month,Consumer_type,Consumption,Consumer_number,Installation_zone
0,2013,1,domestic,0,MOGV36480546611521,Installation_zone 1
1,2013,1,industrial,5,BECS02817768252637,Installation_zone 2
2,2013,1,domestic,6,VRFW65577141436242,Installation_zone 2
3,2013,1,domestic,1,QLLI18662653137621,Installation_zone 2
4,2013,1,domestic,13,HYUO61823402850645,Installation_zone 2
5,2013,1,industrial,27,FHMG62751338090488,Installation_zone 2
6,2013,1,industrial,5,APVF78863215212358,Installation_zone 2
7,2013,1,domestic,31,MXWL75757930683403,Installation_zone 2
8,2013,1,industrial,2,NVMY31359391120094,Installation_zone 2
9,2013,1,industrial,0,PZAN37359795617576,Installation_zone 2


#### Time Evolution of Consumption
Creation of matrices:

In [3]:
def agg_months(water_consumption, group_features):
  all_months = list(range(1, 13))
  water_consumption_months = water_consumption.groupby(group_features).apply(
      lambda group: pd.Series({
          'Consumption': [group.loc[group['Month'] == month, 'Consumption'].values[0] if month in group['Month'].values else 0 for month in all_months]
      })
  ).reset_index()

  return water_consumption_months

In [4]:
water_consumption_months = agg_months(water_consumption, ['Consumer_type', 'Consumer_number', 'Installation_zone', 'Year'])
water_consumption_months

Unnamed: 0,Consumer_type,Consumer_number,Installation_zone,Year,Consumption
0,construction,AARN83757551337758,Installation_zone 2,2013,"[0, 0, 10, 0, 0, 0, 0, 0, 7, 0, 0, 0]"
1,construction,ACUU45348687147779,Installation_zone 3,2020,"[0, 7, 0, 0, 8, 4, 0, 0, 0, 0, 0, 5]"
2,construction,AGEY60953263888710,Installation_zone 19,2019,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 8]"
3,construction,AGEY60953263888710,Installation_zone 19,2020,"[7, 0, 0, 0, 0, 0, 0, 11, 0, 20, 0, 5]"
4,construction,AKJS11388424975221,Installation_zone 3,2013,"[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...
119417,rural expansion,YOCB56114494094216,Installation_zone 28,2020,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
119418,rural expansion,ZKYR52698984375517,Installation_zone 48,2014,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
119419,rural expansion,ZKYR52698984375517,Installation_zone 48,2016,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
119420,rural expansion,ZKYR52698984375517,Installation_zone 48,2017,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [5]:
def agg_years(water_consumption_months, group_features):
  unique_years = water_consumption_months['Year'].unique()

  water_consumption_years = water_consumption_months.groupby(group_features).apply(
      lambda group: pd.Series({
          'Consumption': [group.loc[group['Year'] == year, 'Consumption'].values[0] if year in group['Year'].values else np.zeros(12) for year in unique_years]
      })
  ).reset_index()

  return water_consumption_years

In [6]:
water_consumption_years = agg_years(water_consumption_months, ['Consumer_type', 'Consumer_number', 'Installation_zone'])
water_consumption_years

Unnamed: 0,Consumer_type,Consumer_number,Installation_zone,Consumption
0,construction,AARN83757551337758,Installation_zone 2,"[[0, 0, 10, 0, 0, 0, 0, 0, 7, 0, 0, 0], [0.0, ..."
1,construction,ACUU45348687147779,Installation_zone 3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,construction,AGEY60953263888710,Installation_zone 19,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,construction,AKJS11388424975221,Installation_zone 3,"[[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, ..."
4,construction,AQGS81063174018273,Installation_zone 2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0.0, 0..."
...,...,...,...,...
27886,rural expansion,YFAO34611929020865,Installation_zone 25,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
27887,rural expansion,YLYR58642775953093,Installation_zone 45,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32], [0, 0,..."
27888,rural expansion,YMSD81498495855641,Installation_zone 10,"[[0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0], [0, 0,..."
27889,rural expansion,YOCB56114494094216,Installation_zone 28,"[[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 3], [1, 0, ..."


Matrix linearisation:



In [7]:
water_consumption_years['Consumption'] = water_consumption_years['Consumption'].apply(lambda x: [elemento for sublista in x for elemento in sublista])
water_consumption_years

Unnamed: 0,Consumer_type,Consumer_number,Installation_zone,Consumption
0,construction,AARN83757551337758,Installation_zone 2,"[0, 0, 10, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0.0, 0.0..."
1,construction,ACUU45348687147779,Installation_zone 3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,construction,AGEY60953263888710,Installation_zone 19,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,construction,AKJS11388424975221,Installation_zone 3,"[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,construction,AQGS81063174018273,Installation_zone 2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0.0,..."
...,...,...,...,...
27886,rural expansion,YFAO34611929020865,Installation_zone 25,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27887,rural expansion,YLYR58642775953093,Installation_zone 45,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0,..."
27888,rural expansion,YMSD81498495855641,Installation_zone 10,"[0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 6,..."
27889,rural expansion,YOCB56114494094216,Installation_zone 28,"[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 3, 1, 0, 0, ..."


Addition of the installation index to the consumption matrix:

In [8]:
water_consumption_years['Installation_number'] = water_consumption_years['Installation_zone'].str.extract('(\d+)').astype(int)
water_consumption_years

Unnamed: 0,Consumer_type,Consumer_number,Installation_zone,Consumption,Installation_number
0,construction,AARN83757551337758,Installation_zone 2,"[0, 0, 10, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0.0, 0.0...",2
1,construction,ACUU45348687147779,Installation_zone 3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3
2,construction,AGEY60953263888710,Installation_zone 19,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",19
3,construction,AKJS11388424975221,Installation_zone 3,"[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3
4,construction,AQGS81063174018273,Installation_zone 2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0.0,...",2
...,...,...,...,...,...
27886,rural expansion,YFAO34611929020865,Installation_zone 25,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",25
27887,rural expansion,YLYR58642775953093,Installation_zone 45,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0,...",45
27888,rural expansion,YMSD81498495855641,Installation_zone 10,"[0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 6,...",10
27889,rural expansion,YOCB56114494094216,Installation_zone 28,"[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 3, 1, 0, 0, ...",28


In [9]:
water_consumption_years['Consumption'] = water_consumption_years.apply(lambda row: [row['Installation_number']] + row['Consumption'], axis=1)
water_consumption_years

Unnamed: 0,Consumer_type,Consumer_number,Installation_zone,Consumption,Installation_number
0,construction,AARN83757551337758,Installation_zone 2,"[2, 0, 0, 10, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0.0, ...",2
1,construction,ACUU45348687147779,Installation_zone 3,"[3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....",3
2,construction,AGEY60953263888710,Installation_zone 19,"[19, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",19
3,construction,AKJS11388424975221,Installation_zone 3,"[3, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3
4,construction,AQGS81063174018273,Installation_zone 2,"[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0...",2
...,...,...,...,...,...
27886,rural expansion,YFAO34611929020865,Installation_zone 25,"[25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",25
27887,rural expansion,YLYR58642775953093,Installation_zone 45,"[45, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0...",45
27888,rural expansion,YMSD81498495855641,Installation_zone 10,"[10, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0...",10
27889,rural expansion,YOCB56114494094216,Installation_zone 28,"[28, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 3, 1, 0,...",28


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    np.array(water_consumption_years['Consumption'].tolist()),
    pd.get_dummies(water_consumption_years['Consumer_type']).values,
    test_size=0.2, random_state=42
)

X_train = np.expand_dims(X_train, axis=1)
X_test = np.expand_dims(X_test, axis=1)

#### Model

In [11]:
unique_labels = water_consumption_years['Consumer_type'].unique()

index2label = {index: label for index, label in enumerate(unique_labels)}

In [12]:
model = Sequential()

model.add(LSTM(10, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(len(water_consumption_years['Consumer_type'].unique()), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
model.fit(X_train, y_train, epochs=10, batch_size=16, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fd5a298bac0>

In [14]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 88.80%


## Competition

In [15]:
competition = pd.read_csv("competition.csv")
competition.head(10)

Unnamed: 0,Year,Month,Consumption,Consumer_number,Installation_zone
0,2013,1,1,VENX08444954462680,Installation_zone 1
1,2013,1,2,GRXC33020746550125,Installation_zone 1
2,2013,1,1,FCGQ19814303536339,Installation_zone 1
3,2013,1,5,EQKL85694875580467,Installation_zone 3
4,2013,1,14,KCXW91343862250032,Installation_zone 3
5,2013,1,10,NFMC42616650055728,Installation_zone 2
6,2013,1,9,JDVW57666669484928,Installation_zone 1
7,2013,1,2,HGRS67554693069282,Installation_zone 2
8,2013,1,19,WJYX39670413648529,Installation_zone 3
9,2013,1,23,CGDO45046562545022,Installation_zone 1


### Predict

In [16]:
competition_months = agg_months(competition, ['Consumer_number', 'Installation_zone', 'Year'])
competition_years = agg_years(competition_months, ['Consumer_number', 'Installation_zone'])

competition_years = competition_years.groupby('Consumer_number')[['Installation_zone', 'Consumption']].last().reset_index()

competition_years['Consumption'] = competition_years['Consumption'].apply(lambda x: [elemento for sublista in x for elemento in sublista])
competition_years['Installation_number'] = competition_years['Installation_zone'].str.extract('(\d+)').astype(int)

competition_years['Consumption'] = competition_years.apply(lambda row: [row['Installation_number']] + row['Consumption'], axis=1)

In [17]:
X_competition = np.expand_dims(np.array(competition_years['Consumption'].tolist()), axis=1)

In [18]:
y_pred_proba = np.argmax(model.predict(X_competition), axis=1)

all_y_preds = list()

for pred in y_pred_proba:
  all_y_preds.append(index2label[pred])

all_y_preds = pd.DataFrame({'Pred': all_y_preds})
competition_years['Consumer_type'] = all_y_preds



In [19]:
competition_years.head(10)

Unnamed: 0,Consumer_number,Installation_zone,Consumption,Installation_number,Consumer_type
0,AACP61877322721317,Installation_zone 32,"[32, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 2, 0, 0,...",32,rural domestic
1,AATX61161116356557,Installation_zone 1,"[1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....",1,domestic
2,ABEJ68950564531553,Installation_zone 3,"[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,domestic
3,ABUM50429026998266,Installation_zone 33,"[33, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",33,rural domestic
4,ADBQ44944453582333,Installation_zone 13,"[13, 0, 0, 0, 3, 0, 4, 0, 7, 0, 2, 0, 2, 0, 0,...",13,rural domestic
5,ADKJ26366564447127,Installation_zone 44,"[44, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",44,rural domestic
6,ADYG46035417336230,Installation_zone 1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,domestic
7,AEBU54480786458823,Installation_zone 44,"[44, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",44,rural domestic
8,AEQZ48426066915128,Installation_zone 26,"[26, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",26,rural domestic
9,AESM26658198482962,Installation_zone 4,"[4, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",4,domestic


In [20]:
competition_years[['Consumer_number', 'Consumer_type']].to_csv('Lineal-LSTM-Outcomes.csv', index=False)