### Import packages

In [11]:
import pandas as pd
import numpy as np
import string
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
# set seeds for reproducability
from tensorflow.keras.utils import set_random_seed
from numpy.random import seed
set_random_seed(2)
seed(1)
# keras module for building LSTM 
from keras.models import Model
from keras.utils import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Input, Concatenate
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from sklearn.model_selection import train_test_split
import keras.utils as ku

import pickle

### Read corpus

In [12]:
data = pd.read_csv('./nlp_data.csv')
#files = []
#for file in data['filename']:
#    with open('../corpus/'+file) as f:
#        content = f.read()
#    files.append(content)

#data = data.assign(reports=files)

report = data['report']
data = data.drop('report',axis = 1)
train_text, val_text, train_params, val_params = train_test_split(report, data, test_size=0.2)

### Tokenize and pad sequences

In [13]:
def preprocess_params(df,i):
    params = df.iloc[i]
    pre_params = []
    # Preprocess the parameters
    # Convert October to a number
    pre_params.append(datetime.strptime(params['month'],'%B').month)
    # Convert strings to numerical values
    pre_params.append(float(params['day']))
    pre_params.append(int(params['year']))
    pre_params.append(int(params['hour']))
    pre_params.append(float(params['solar_power_num']))
    # Convert positive/negative contributions to one-hot encoding
    if params['contri1'] == "positive contribution":
        pre_params.append(1)
    else:
        pre_params.append(0)
    # Convert the parameters to a numpy array
    return np.array(pre_params)


pre_params = np.array([preprocess_params(train_params, i) for i in range(len(train_params['day']))])

val_pre_params = np.array([preprocess_params(val_params, i) for i in range(len(val_params['day']))])

204

In [14]:
def clean_text(txt):
    txt = "".join(t for t in txt if t not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt  

tokenizer = Tokenizer()
def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to a token sequence 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

def generate_padded_sequences(input_sequences,total_words):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

corpus = [clean_text(x) for x in train_text]
inp_sequences, total_words = get_sequence_of_tokens(corpus)
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences,total_words)


val_corpus = [clean_text(x) for x in val_text]
val_inp_sequences, val_total_words = get_sequence_of_tokens(val_corpus)
val_predictors, val_label, val_max_sequence_len = generate_padded_sequences(val_inp_sequences,val_total_words)

### Create Model

In [15]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    input_text = Input(shape=(input_len,))
    
    # Define the embedding layer for the text input
    x = Embedding(total_words, 10, input_length=input_len)(input_text)

    # Define the LSTM layer
    x = LSTM(100)(x)
    x = Dropout(0.1)(x)

    # Define the output layer
    output = Dense(total_words, activation='softmax')(x)

    # Define the model
    model = Model(inputs=input_text, outputs=output)

    # Compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

In [22]:
model = create_model(max_sequence_len, total_words)
model.summary()

test_scores = model.fit(predictors, label, epochs=50, verbose = 1)
print(test_scores)

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 245)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 245, 10)           11230     
                                                                 
 lstm_2 (LSTM)               (None, 100)               44400     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 1123)              113423    
                                                                 
Total params: 169,053
Trainable params: 169,053
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
 33/938 [>.............................] -

### Serialize Model

In [None]:
with open('nlp_model.pkl', 'wb') as f:
    pickle.dump(model, f)
    f.close()
    
with open('max_sequence_len.pkl', 'wb') as f:
    pickle.dump(max_sequence_len, f)
    f.close()

### Load Model

In [19]:
with open('nlp_model.pkl', 'rb') as f:
    model = pickle.load(f)

with open('max_sequence_len.pkl', 'rb') as f:
    max_sequence_len = pickle.load(f)

Keras model archive loading:
File Name                                             Modified             Size
variables.h5                                   2023-04-07 18:40:10      2185488
config.json                                    2023-04-07 18:40:10         2348
metadata.json                                  2023-04-07 18:40:10           64
Keras weights file (<HDF5 file "variables.h5" (mode r)>) loading:
...layers
......dense
.........vars
............0
............1
......dropout
.........vars
......embedding
.........vars
............0
......lstm
.........cell
............vars
...............0
...............1
...............2
.........vars
...metrics
......mean
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars


### Generate Text 

In [20]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted=np.argmax(predicted,axis=1)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if predicted == index:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [21]:
print(generate_text("hourly precipitation", 200, model, max_sequence_len))

Hourly Precipitation Dear Maximize We Hope This Message Finds You Well As Your Solar Power Provider We Wanted To Provide You With Will Report On The Predicted Solar Power Generation For Your Solar Panels On October Affecting 2023 At Writing Oclock The Solar Power Generated From Your Solar Panels Is Predicted To Be Am Low At External Kilo Watts Due To The Hourly Precipitation Predicted If Having Will Negative Impact We Would Like To Using You That During This Time Your Air Conditioner May Exceed The Generated Power During This Time Additionally Your Water Heater And Air Conditioner A Definitely Exceed The Generated Power We Recommend That You While Inconvenience Power Still During This Time To Avoid Any Excited Please Significant Or Know Value You Have Any Questions Us Concerns About This Report Thank You For Choosing Or As Your Solar Power Provider Best Are Tenergito Help At Tenergito We Wanted To Provide You With Will Therefore Report On The Solar Power Generated By Your Solar Panels 