### Import packages

In [4]:
import pandas as pd
import numpy as np
import string, os
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
# set seeds for reproducability
import tensorflow as tf

from tensorflow.keras.utils import set_random_seed
from numpy.random import seed
set_random_seed(2)
seed(1)
# keras module for building LSTM 
from keras.models import Model
from keras.utils import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Input, Concatenate
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from sklearn.model_selection import train_test_split
import keras.utils as ku

import pickle

### Read corpus

In [5]:
data = pd.read_csv('./nlp_data.csv')
#files = []
#for file in data['filename']:
#    with open('../corpus/'+file) as f:
#        content = f.read()
#    files.append(content)

#data = data.assign(reports=files)

report = data['report']
data = data.drop('report',axis = 1)
colnames = data.columns
train_text, val_text, train_params, val_params = train_test_split(report, data, test_size=0.2)

train_params = train_params.values.tolist()
train_params = pd.DataFrame(train_params,columns=colnames)
val_params = val_params.values.tolist()
val_params = pd.DataFrame(val_params,columns=colnames)

FileNotFoundError: [Errno 2] No such file or directory: '../corpus/report_1_1.txt'

### Tokenize and pad sequences

In [6]:
def preprocess_params(df,i):
    
    print(df)
    
    params = df.iloc[i]
    pre_params = []
    # Preprocess the parameters
    # Convert October to a number
    pre_params.append(datetime.strptime(params['month'],'%B').month)
    # Convert strings to numerical values
    pre_params.append(float(params['day']))
    pre_params.append(int(params['year']))
    pre_params.append(int(params['hour']))
    pre_params.append(float(params['solar_power_num']))
    # Convert positive/negative contributions to one-hot encoding
    if params['contri1'] == "positive contribution":
        pre_params.append(ku.to_categorical(0, num_classes=2))
    else:
        pre_params.append(ku.to_categorical(1, num_classes=2))
    # Convert the parameters to a numpy array
    return pre_params

pre_params = [preprocess_params(train_params, i) for i in range(len(train_params['day']))]
print(pre_params[:10])
val_pre_params = [preprocess_params(val_params, i) for i in range(len(val_params['day']))]
print(val_pre_params[:10])

     Unnamed: 0  day      month  year  hour    client solar_power_cat  \
0            35   12     August  2023    12       Ana       very high   
1            11   17   December  2023     8   Ricardo        very low   
2            29   23   February  2023    10       Ana          medium   
3             0    7    October  2023    15       Ema            high   
4           159   11      March  2023    17    Rafael             low   
..          ...  ...        ...   ...   ...       ...             ...   
199         137    8    January  2023    11       Ana          medium   
200          72    4  September  2023    13  Franciso          medium   
201         140   26   December  2024    19     Diana        very low   
202         235    5  September  2023    18   Ricardo          medium   
203          37    5       June  2024    13     Diana          medium   

     solar_power_num                 feat1                 feat2  \
0              5.667           temperature             

In [7]:
def clean_text(txt):
    txt = "".join(t for t in txt if t not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt  

tokenizer = Tokenizer()
def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to a token sequence 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

def generate_padded_sequences(input_sequences,total_words):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

corpus = [clean_text(x) for x in train_text]
print(corpus[:10])
inp_sequences, total_words = get_sequence_of_tokens(corpus)
print(inp_sequences[:10])
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences,total_words)


val_corpus = [clean_text(x) for x in val_text]
print(val_corpus[:10])
val_inp_sequences, val_total_words = get_sequence_of_tokens(val_corpus)
print(val_inp_sequences[:10])
val_predictors, val_label, val_max_sequence_len = generate_padded_sequences(inp_sequences,val_total_words)

['\n\nsolar power prediction report for ana  august 12nd 2023 1200 pm\n\ndear ana\n\nwe have generated a solar power prediction report for you based on the data collected from your solar panels on august 12nd 2023 at 1200 pm here are the details\n\ngenerated power 5667 kw\n\ntemperature 85f\n\nappliances usable\n\n heater\n water heater\n air conditioner\n\nappliances not usable\n\n dishwasher\n\nthe solar power generated by your solar panels at this time is predicted to be very high which means you can use your heater water heater and air conditioner without any issues however your dishwasher might exceed the generated power so it is recommended to avoid using it during this time\n\nplease note that the predicted values may not be accurate due to various factors such as weather conditions panel efficiency and maintenance however we have tried our best to provide you with the most reliable prediction based on the available data\n\nif you have any further questions or concerns please do

### Create Model

In [8]:
def create_model(max_sequence_len, total_words, num_params):
    input_len = max_sequence_len - 1
    input_text = Input(shape=(input_len,))
    input_params = Input(shape=(num_params,))
    # Define the embedding layer for the text input
    x = Embedding(total_words, 10, input_length=input_len)(input_text)

    # Define the LSTM layer
    x = LSTM(100)(x)
    x = Dropout(0.1)(x)

    # Concatenate the LSTM output with the input parameters
    x = Concatenate()([x, input_params])

    # Define the output layer
    output = Dense(total_words, activation='softmax')(x)

    # Define the model
    model = Model(inputs=[input_text, input_params], outputs=output)

    # Compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

model = create_model(max_sequence_len, total_words,6)
model.summary()

model.fit([predictors, pre_params], label, validation_data=([val_predictors, val_pre_params], val_label), epochs=100, verbose = 1)

2023-04-08 14:56:34.037786: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-08 14:56:34.045116: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-08 14:56:34.045414: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-08 14:56:34.046133: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 245)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 245, 10)      11230       ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    (None, 100)          44400       ['embedding[0][0]']              
                                                                                                  
 dropout (Dropout)              (None, 100)          0           ['lstm[0][0]']                   
                                                                                              

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {'(<class \'list\'> containing values of types {\'(<class \\\'list\\\'> containing values of types {"<class \\\'int\\\'>", "<class \\\'float\\\'>", "<class \\\'numpy.ndarray\\\'>"})\'})', "<class 'numpy.ndarray'>"}), <class 'numpy.ndarray'>

### Serialize Model

In [None]:
with open('nlp_model.pkl', 'wb') as f:
    pickle.dump(model, f)
    f.close()
    
with open('max_sequence_len.pkl', 'wb') as f:
    pickle.dump(max_sequence_len, f)
    f.close()

### Load Model

In [None]:
with open('nlp_model.pkl', 'rb') as f:
    model = pickle.load(f)

with open('max_sequence_len.pkl', 'rb') as f:
    max_sequence_len = pickle.load(f)

### Generate Text 

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted=np.argmax(predicted,axis=1)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if predicted == index:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

def generate_text_params(model, tokenizer, max_sequence_len, seed_text, num_gen_words, params):
    # Preprocess the parameters
    params = preprocess_params(params)
    # Generate words one by one
    generated = ""
    for i in range(num_gen_words):
        # Tokenize the seed text
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # Pad the tokenized sequence to the desired length
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # Convert the parameters to a numpy array
        params_arr = np.array([params])
        # Generate the next word using the model and parameters
        probs = model.predict([token_list, params_arr])[0]
        y_pred = np.random.choice(range(1, total_words), p=probs)
        # Convert the predicted word back to a string
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == y_pred:
                output_word = word
                break
        # Append the predicted word to the generated text
        generated += " " + output_word
        # Update the seed text with the predicted word
        seed_text += " " + output_word
    return generated.strip()

In [None]:
print(generate_text("hourly precipitation", 200, model, max_sequence_len))