# LSTM
### Importing libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

### Loading the train dataset

In [2]:
water_consumption = pd.read_csv("train.csv")
water_consumption.head(10)

Unnamed: 0,Year,Month,Consumer_type,Consumption,Consumer_number,Installation_zone
0,2013,1,domestic,0,MOGV36480546611521,Installation_zone 1
1,2013,1,industrial,5,BECS02817768252637,Installation_zone 2
2,2013,1,domestic,6,VRFW65577141436242,Installation_zone 2
3,2013,1,domestic,1,QLLI18662653137621,Installation_zone 2
4,2013,1,domestic,13,HYUO61823402850645,Installation_zone 2
5,2013,1,industrial,27,FHMG62751338090488,Installation_zone 2
6,2013,1,industrial,5,APVF78863215212358,Installation_zone 2
7,2013,1,domestic,31,MXWL75757930683403,Installation_zone 2
8,2013,1,industrial,2,NVMY31359391120094,Installation_zone 2
9,2013,1,industrial,0,PZAN37359795617576,Installation_zone 2


## First approach

We generate an array with the values for each consumption for each user throughout all the months of the year, in the case that there is no value we add zero (this is done in order not to lose the index of the months in the array).

In [3]:
all_months = list(range(1, 13))
water_consumption_months = water_consumption.groupby(['Consumer_type', 'Consumer_number', 'Installation_zone', 'Year']).apply(
    lambda group: pd.Series({
        'Consumption': [group.loc[group['Month'] == month, 'Consumption'].values[0] if month in group['Month'].values else 0 for month in all_months]
    })
).reset_index()

water_consumption_months

Unnamed: 0,Consumer_type,Consumer_number,Installation_zone,Year,Consumption
0,construction,AARN83757551337758,Installation_zone 2,2013,"[0, 0, 10, 0, 0, 0, 0, 0, 7, 0, 0, 0]"
1,construction,ACUU45348687147779,Installation_zone 3,2020,"[0, 7, 0, 0, 8, 4, 0, 0, 0, 0, 0, 5]"
2,construction,AGEY60953263888710,Installation_zone 19,2019,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 8]"
3,construction,AGEY60953263888710,Installation_zone 19,2020,"[7, 0, 0, 0, 0, 0, 0, 11, 0, 20, 0, 5]"
4,construction,AKJS11388424975221,Installation_zone 3,2013,"[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...
119417,rural expansion,YOCB56114494094216,Installation_zone 28,2020,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
119418,rural expansion,ZKYR52698984375517,Installation_zone 48,2014,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
119419,rural expansion,ZKYR52698984375517,Installation_zone 48,2016,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
119420,rural expansion,ZKYR52698984375517,Installation_zone 48,2017,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [4]:
unique_years = water_consumption_months['Year'].unique()

water_consumption_years = water_consumption_months.groupby(['Consumer_type', 'Consumer_number', 'Installation_zone']).apply(
    lambda group: pd.Series({
        'Consumption': [group.loc[group['Year'] == year, 'Consumption'].values[0] if year in group['Year'].values else np.zeros(12) for year in unique_years]
    })
).reset_index()

water_consumption_years

Unnamed: 0,Consumer_type,Consumer_number,Installation_zone,Consumption
0,construction,AARN83757551337758,Installation_zone 2,"[[0, 0, 10, 0, 0, 0, 0, 0, 7, 0, 0, 0], [0.0, ..."
1,construction,ACUU45348687147779,Installation_zone 3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,construction,AGEY60953263888710,Installation_zone 19,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,construction,AKJS11388424975221,Installation_zone 3,"[[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, ..."
4,construction,AQGS81063174018273,Installation_zone 2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0.0, 0..."
...,...,...,...,...
27886,rural expansion,YFAO34611929020865,Installation_zone 25,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
27887,rural expansion,YLYR58642775953093,Installation_zone 45,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32], [0, 0,..."
27888,rural expansion,YMSD81498495855641,Installation_zone 10,"[[0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0], [0, 0,..."
27889,rural expansion,YOCB56114494094216,Installation_zone 28,"[[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 3], [1, 0, ..."


The original idea is for each independent user to work with an m x n matrix, where m corresponds to years and n to months.

### Preprocess and split data

In [5]:
def preprocess_data(df, target_column='Consumer_type', feature_column='Consumption', test_size=0.2, random_state=42):
    X = np.array(df[feature_column].tolist())
    X = np.transpose(X, (0, 2, 1))

    y = pd.get_dummies(df[target_column]).values

    return X, y

In [6]:
def split_data(X, y, test_size=0.2, random_state=None):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], X_train.shape[2]))
    X_test = X_test.reshape((X_test.shape[0], X_train.shape[1], X_test.shape[2]))

    return X_train, X_test, y_train, y_test

In [7]:
X, y = preprocess_data(water_consumption_years, 'Consumer_type', 'Consumption')

X_train, X_test, y_train, y_test = split_data(X, y, 0.2, 42)

## LSTM model

In [8]:
model = Sequential()
model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(7, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [9]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7a333886b370>

In [10]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 73.88%
