### Import packages

In [1]:
import pandas as pd
import numpy as np
import string
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
# set seeds for reproducability
from tensorflow.keras.utils import set_random_seed
from numpy.random import seed
set_random_seed(2)
seed(1)
# keras module for building LSTM 
from keras.models import Model
from keras.utils import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Input, Concatenate
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from sklearn.model_selection import train_test_split
import keras.utils as ku

import pickle

2023-04-11 18:34:44.211423: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-11 18:34:45.292636: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/pardalito/miniconda3/envs/meia/lib/
2023-04-11 18:34:45.292793: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/pardalito/miniconda3/envs/meia/lib/


### Read corpus

In [2]:
data = pd.read_csv('./nlp_data.csv')
#files = []
#for file in data['filename']:
#    with open('../corpus/'+file) as f:
#        content = f.read()
#    files.append(content)

#data = data.assign(reports=files)

report = data['report']
data = data.drop('report',axis = 1)

#### Clean Text

In [3]:
def clean_text(txt):
    txt = "".join(t for t in txt if t not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt  

corpus = [clean_text(x) for x in report]

In [4]:
corpus

['report for solar power predictions for ema  october 7th 2023  1500 hours\n\nbased on our predictions the solar power generated by your solar panels at 1500 hours on october 7th 2023 will be high the predicted value is 10143 kilo watts\n\nthis high power generation can be attributed to the positive contribution of dewpoint therefore you can use your dishwasher and air conditioner without worrying about exceeding the generated power\n\nhowever it is important to note that the appliance water heater will definitely exceed the generated power therefore we recommend minimizing its usage during this time\n\nthank you\n\nbest regards\ntenergito',
 'dear ana\n\nwe hope this message finds you well as one of our valued solar panel clients we want to keep you informed about the predicted solar power that will be generated from your panels\n\naccording to our analysis on june 8th 2023 at 400 pm you can expect a medium amount of solar power to be generated with a predicted value of 28 kilo watts 

### Preprocess Params
Transform the parameters in numbers

In [3]:
def preprocess_params(df,i):
    params = df.iloc[i]
    pre_params = []
    
    pre_params.append(datetime.strptime(params['month'],'%B').month)
    pre_params.append(float(params['day']))
    pre_params.append(int(params['year']))
    pre_params.append(int(params['hour']))
    pre_params.append(float(params['solar_power_num']))
    
    #Need to convert to right enconding
    if params['contri1'] == "positive contribution":
        pre_params.append(1)
    else:
        pre_params.append(0)
    
    return np.array(pre_params)

pre_params = np.array([preprocess_params(data, i) for i in range(len(data['day']))])

#### Tokenize the corpus

In [None]:
tokenizer = Tokenizer()
def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to a token sequence 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        lone = []
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            lone.append(n_gram_sequence)
        input_sequences.append(lone)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)

#### Pad the sequences

In [4]:
def generate_padded_sequences(input_sequences,total_words,params):
    max_sequence_len = max([len(x) for x in input_sequences])
    predictors = []
    labels = []
    for example in inp_sequences:
        input_sequences = np.array(pad_sequences(example, maxlen=max_sequence_len, padding='post', truncating='post'))
        predictor, label = input_sequences[:,:-1],input_sequences[:,-1]
        label = ku.to_categorical(label, num_classes=total_words)
        predictors.append(predictor)
        labels.append(label)
    
    #predictors = np.array(predictors)
    #labels = np.array(labels)
    return predictors, labels, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences,total_words,pre_params)

In [5]:
for i in range(len(predictors)):
    predictors[i]= predictors[i].tolist()
    predictors[i].append(pre_params[i])

### Create Model

In [6]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    input_text = Input(shape=(input_len,))
    
    # Define the embedding layer for the text input
    x = Embedding(total_words, 10, input_length=input_len)(input_text)

    # Define the LSTM layer
    x = LSTM(100)(x)
    x = Dropout(0.1)(x)

    # Define the output layer
    output = Dense(total_words, activation='softmax')(x)

    # Define the model
    model = Model(inputs=input_text, outputs=output)

    # Compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

In [7]:
def create_model_params(max_sequence_len, total_words, num_params):
    input_len = max_sequence_len - 1
    input_text = Input(shape=(input_len,))
    input_params = Input(shape=(num_params,))
    # Define the embedding layer for the text input
    embedding = Embedding(total_words, 10, input_length=input_len)(input_text)

    # Define the LSTM layer
    lstm = LSTM(100)(embedding)
    dropout = Dropout(0.1)(lstm)

    # Concatenate the LSTM output with the input parameters
    concat = Concatenate()([dropout, input_params])

    # Define the output layer
    output = Dense(total_words, activation='softmax')(concat)

    # Define the model
    model = Model(inputs=[input_text, input_params], outputs=output)

    # Compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

In [8]:
model = create_model_params(max_sequence_len, total_words, 6)
model.summary()

test_scores = model.fit([predictors, pre_params], label, epochs=50, verbose = 1)
print(test_scores)

2023-04-11 18:31:17.642401: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-11 18:31:17.677073: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-11 18:31:17.677619: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-11 18:31:17.678602: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 244)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 244, 10)      11230       ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    (None, 100)          44400       ['embedding[0][0]']              
                                                                                                  
 dropout (Dropout)              (None, 100)          0           ['lstm[0][0]']                   
                                                                                              

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {'(<class \'list\'> containing values of types {\'(<class \\\'list\\\'> containing values of types {\\\'(<class \\\\\\\'list\\\\\\\'> containing values of types {"<class \\\\\\\'int\\\\\\\'>"})\\\', "<class \\\'numpy.ndarray\\\'>"})\'})', "<class 'numpy.ndarray'>"}), (<class 'list'> containing values of types {"<class 'numpy.ndarray'>"})

### Serialize Model

In [None]:
with open('nlp_model.pkl', 'wb') as f:
    pickle.dump(model, f)
    f.close()
    
with open('max_sequence_len.pkl', 'wb') as f:
    pickle.dump(max_sequence_len, f)
    f.close()

### Load Model

In [None]:
with open('nlp_model.pkl', 'rb') as f:
    model = pickle.load(f)

with open('max_sequence_len.pkl', 'rb') as f:
    max_sequence_len = pickle.load(f)

### Generate Text 

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted=np.argmax(predicted,axis=1)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if predicted == index:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [None]:
def generate_text_params(model, tokenizer, max_sequence_len, seed_text, num_gen_words, params):
    # Preprocess the parameters
    params = preprocess_params(params)
    # Generate words one by one
    generated = ""
    for i in range(num_gen_words):
        # Tokenize the seed text
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # Pad the tokenized sequence to the desired length
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # Convert the parameters to a numpy array
        params_arr = np.array([params])
        # Generate the next word using the model and parameters
        probs = model.predict([token_list, params_arr])[0]
        y_pred = np.random.choice(range(1, total_words), p=probs)
        # Convert the predicted word back to a string
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == y_pred:
                output_word = word
                break
        # Append the predicted word to the generated text
        generated += " " + output_word
        # Update the seed text with the predicted word
        seed_text += " " + output_word
    return generated.strip()

In [None]:
print(generate_text("hourly precipitation", 200, model, max_sequence_len))