In [8]:
#!pip install Keras
import numpy as np
from numpy import array
from numpy import hstack
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional
from keras import models 
import keras
from scipy.signal import blackman
from ast import literal_eval
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [9]:
import tensorflow as tf
from numpy.random import seed
# Fix the global seeds of NumPy's legacy random module and of TensorFlow.
# Both are set to 1 here, before any model is built.
seed(1)
tf.random.set_seed(1)

In [12]:
class CreateModel():
    
    def original_data_visualization(self,df, word_list, first_period):
        '''
        Plot the original tf-idf series of the given words and save the chart.

        Parameters
        ----------
        df : DataFrame with a 'word' column and a 'tf-idf' column whose cells
            are sequences of values.
        word_list : words to draw; for each word the first matching row is used.
        first_period : slice start applied to every series before plotting.

        Side effects
        ------------
        Saves the figure to 'image_data/Chart original data.png' (the
        'image_data' directory must already exist), then shows and closes it.
        '''
        # Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'
        # and newer releases no longer ship the old names, so pick whichever
        # name this installation actually provides.
        legacy_style = 'seaborn-darkgrid'
        if legacy_style in plt.style.available:
            plt.style.use(legacy_style)
        else:
            plt.style.use('seaborn-v0_8-darkgrid')
        # create a color palette
        palette = plt.get_cmap('Set1')

        for i, word in enumerate(word_list):
            # First row whose 'word' equals the requested word.
            series = df['tf-idf'][df['word'] == word].to_list()[0]
            plt.plot(series[first_period:], marker='o',linestyle='dashed', color=palette(i),linewidth=1, alpha=0.9, label=word)
        # Add legend
        plt.legend(loc=1, ncol=1)

        plt.gcf().set_size_inches(15.5, 10.5, forward=True)
        plt.savefig('image_data/Chart original data.png')
        plt.show()
        plt.close()
    
    def words_highest_text_rank(self,df):
        '''
        Return, as a list, the 'word' values of every row whose 'word_rank'
        is at most 10 (row order is kept).
        '''
        top_ranked = df['word_rank'] <= 10
        return df.loc[top_ranked, 'word'].to_list()
    
    def flat_split_sequences_validation(self,sequences, n_steps):
        '''
        Cut every row of `sequences` into sliding windows and split them in order.

        A window holds `n_steps` consecutive values of a row and its target is
        the value that directly follows the window. For each row the first 60%
        of the windows go to train, the next 20% to validation and whatever is
        left to test.

        Returns X_train, y_train, X_test, y_test, X_valid, y_valid as arrays
        whose first axis runs over the rows of `sequences`.
        '''
        train_windows, train_targets = [], []
        valid_windows, valid_targets = [], []
        test_windows, test_targets = [], []

        for row_idx in range(sequences.shape[0]):
            n_periods = sequences.shape[1]
            # A window may only start where one more value is left after it.
            n_windows = max(0, min(n_periods, n_periods - n_steps))

            windows = [sequences[row_idx][start:start + n_steps] for start in range(n_windows)]
            targets = [sequences[row_idx][start + n_steps] for start in range(n_windows)]

            # Boundaries of the 60% / 20% / remainder split
            train_end = int(len(windows)*0.6)
            valid_end = train_end + int(len(windows)*0.2)

            train_windows.append(windows[:train_end])
            valid_windows.append(windows[train_end:valid_end])
            test_windows.append(windows[valid_end:])

            train_targets.append(targets[:train_end])
            valid_targets.append(targets[train_end:valid_end])
            test_targets.append(targets[valid_end:])

        return (
            array(train_windows), array(train_targets),
            array(test_windows), array(test_targets),
            array(valid_windows), array(valid_targets),
        )
    
    def get_lenght(self, X_train, X_valid, y_test):
        '''
        Print and return the lengths of the first element of X_train, X_valid
        and y_test (in that order); the values are used later for visualization.
        '''
        lengths = (len(X_train[0]), len(X_valid[0]), len(y_test[0]))
        train_len, valid_len, test_len = lengths
        print('y_train lenght :', train_len,', y_valid lenght :', valid_len, ', y_test lenght :', test_len)
        return lengths
    
    def reshape_data(self,X_train, y_train, X_test, y_test, X_valid, y_valid, n_steps):
        '''
        Merge the first two axes of every split so all windows form one batch.

        Window arrays become (rows * windows, n_steps, 1) and target arrays
        become (rows * windows, 1). The six arrays are returned in the same
        order in which they were passed.
        '''
        def _stack_windows(block):
            # one feature per time step
            return block.reshape(block.shape[0]*block.shape[1], n_steps, 1)

        def _stack_targets(block):
            return block.reshape(block.shape[0]*block.shape[1], 1)

        train_X, train_y = _stack_windows(X_train), _stack_targets(y_train)
        valid_X, valid_y = _stack_windows(X_valid), _stack_targets(y_valid)
        test_X, test_y = _stack_windows(X_test), _stack_targets(y_test)

        return train_X, train_y, test_X, test_y, valid_X, valid_y
    
    def get_model_LSTM(self, n_features,n_steps, name_loss):
        '''
        Build and compile a stacked LSTM network.

        Three LSTM layers of 20 units each (activation=None) are followed by a
        Dense layer with `n_features` outputs. The first layer takes input of
        shape (n_steps, n_features) and only the last LSTM layer drops the
        time axis. The model is compiled with the 'adam' optimizer and the
        loss named by `name_loss`, then returned.
        '''
        recurrent_specs = (
            # input_shape=(50, 1) in the notebook's setup
            dict(units=20, activation=None, return_sequences=True, input_shape=(n_steps, n_features)),
            dict(units=20, activation=None, return_sequences=True),
            dict(units=20, activation=None, return_sequences=False),
        )

        network = Sequential()
        for spec in recurrent_specs:
            network.add(LSTM(**spec))
        network.add(Dense(n_features))
        network.compile(optimizer='adam', loss=name_loss)
        return network
    
    def get_model_Bidirectional(self, n_features,n_steps, name_loss):
        '''
        Build and compile a bidirectional LSTM network.

        A single 20-unit LSTM (relu activation, no sequence output) is wrapped
        in a Bidirectional layer that takes input of shape
        (n_steps, n_features), followed by a Dense layer with `n_features`
        outputs. The model is compiled with the 'adam' optimizer and the loss
        named by `name_loss`, then returned.
        '''
        network = Sequential()
        recurrent_core = LSTM(20, return_sequences=False, activation='relu')
        # input_shape=(50, 1) in the notebook's setup
        network.add(Bidirectional(recurrent_core, input_shape=(n_steps, n_features)))
        network.add(Dense(n_features))
        network.compile(optimizer='adam',loss=name_loss)
        return network
    
    def fit_modle(self,model,X_train,y_train,X_valid,y_valid, y_train_lenght, epochs=3):
        '''
        Fit `model` on the training windows and validate on the validation windows.

        Parameters
        ----------
        model : object exposing a Keras-style `fit` method.
        X_train, y_train : training inputs and targets.
        X_valid, y_valid : passed to `fit` as `validation_data`.
        y_train_lenght : used as the batch size.
        epochs : number of training epochs (default 3, the value that was
            previously hard-coded).

        Returns
        -------
        Whatever `model.fit` returns (for Keras, the History object).
        '''
        # Shuffling stays off so the samples keep the order they were built in.
        history = model.fit(
            X_train,
            y_train,
            epochs=epochs,
            batch_size=y_train_lenght,
            shuffle=False,
            validation_data=(X_valid, y_valid),
            verbose=1,
        )
        return history
    
    def save_modle(self,name_modle,name_history,history, model=None):
        '''
        Save a trained model and the loss values recorded while fitting it.

        Parameters
        ----------
        name_modle : path handed to `model.save`.
        name_history : path prefix; the history dict is written to
            name_history + '.npy'.
        history : object with a `history` attribute (the per-epoch values) and,
            when `model` is not given, a `model` attribute, as the object
            returned by Keras `fit` has.
        model : model to save. Defaults to `history.model`.

        Raises
        ------
        ValueError : if no model is passed and `history` does not carry one.
        '''
        # The method used to reach for a notebook-level variable called
        # `model`, which only worked if such a global happened to exist.
        if model is None:
            model = getattr(history, 'model', None)
        if model is None:
            raise ValueError('save_modle needs a model: pass model=... or a history that has a .model')
        model.save(name_modle)
        np.save(name_history+'.npy',history.history)
        
    def load_modle(self,name_modle,name_history):
        '''
        Load a saved model and the loss values that were stored with it.

        Parameters
        ----------
        name_modle : path handed to `models.load_model`.
        name_history : path prefix; the history is read from
            name_history + '.npy'.

        Returns
        -------
        (reconstructed_model, load_history). The model summary is printed as a
        side effect.
        '''
        # allow_pickle must be a real boolean: the previous value, the string
        # 'TRUE', only worked because any non-empty string is truthy.
        # NOTE(review): unpickling can execute arbitrary code, so only load
        # history files that this project wrote itself.
        load_history=np.load(name_history+'.npy',allow_pickle=True).item()
        reconstructed_model = models.load_model(name_modle)
        reconstructed_model.summary()
        return reconstructed_model, load_history
    
    def model_loss(self, load_history,name_loss):
        '''
        Plot the training and validation loss curves.

        `load_history` must provide the keys 'loss' and 'val_loss';
        `name_loss` is only used to label the y axis.
        '''
        for curve_key in ('loss', 'val_loss'):
            plt.plot(load_history[curve_key])

        plt.title('model loss')
        plt.ylabel(name_loss+' loss')
        plt.xlabel('epochs')
        plt.legend(['train', 'val'], loc='upper left')

        plt.show()
        plt.close()
        
    def get_data_predict(self, df00, n_steps):
        '''
        A function that returns the input data for model prediction
        '''
        full_dataset = pd.DataFrame(df00['tf-idf_after_norm'].to_list()).iloc[:,:].values
        # Call to function flat_split_sequences_validation
        full_X_train, full_y_train, full_X_test, full_y_test, full_X_valid, full_y_valid = self.flat_split_sequences_validation(full_dataset, n_steps)
        full_X_train, full_y_train, full_X_test, full_y_test, full_X_valid, full_y_valid= self.reshape_data(full_X_train, full_y_train, full_X_test, full_y_test, full_X_valid, full_y_valid, n_steps)
        return full_X_train, full_y_train, full_X_test, full_y_test, full_X_valid, full_y_valid
    
    def prediction_valid(self,reconstructed_model, full_X_valid):
        '''
        Run the model on the validation inputs (verbose=0) and return its output.
        '''
        return reconstructed_model.predict(full_X_valid, verbose=0)
        
    def prediction_test(self,reconstructed_model,full_X_test):
        '''
        Run the model on the test inputs (verbose=0) and return its output.
        '''
        return reconstructed_model.predict(full_X_test, verbose=0)
        
    def predictive_data_into_dataframe(self,df, prediction_valid,prediction_test, y_valid_lenght, y_test_lenght ):
        '''
        Attach each word's slice of the predictions to a copy of `df`.

        Parameters
        ----------
        df : DataFrame with a 'word' column.
        prediction_valid, prediction_test : sliceable prediction sequences in
            which every word owns a consecutive block.
        y_valid_lenght, y_test_lenght : block size per word in the validation
            and test predictions.

        Returns
        -------
        A copy of `df` with the added columns 'validate_prediction' and
        'test_prediction'; `df` itself is not modified.

        Notes
        -----
        A word's block number is the index label of the first row that holds
        the word, not the row's position. Presumably the labels are the row
        numbers of the frame the predictions were computed from - verify
        against the caller.
        '''
        # One pass to find the first index label of every word; this replaces
        # a full-frame filter per row, which was quadratic in the row count.
        first_label = {}
        for label, word in zip(df.index, df['word']):
            first_label.setdefault(word, label)

        prediction_valid_list = []
        prediction_test_list = []
        for word in df['word']:
            word_index = first_label[word]

            valid_start = y_valid_lenght*word_index
            prediction_valid_list.append(prediction_valid[valid_start:valid_start + y_valid_lenght])

            test_start = y_test_lenght*word_index
            prediction_test_list.append(prediction_test[test_start:test_start + y_test_lenght])

        df_prediction = df.copy()

        df_prediction['validate_prediction'] = prediction_valid_list
        df_prediction['test_prediction'] = prediction_test_list
        return df_prediction
    
    def data_visualization_prediction(self, df, word_list, first_period,y_train_lenght,y_valid_lenght,y_test_lenght, n_steps):
        '''
        Plot, for every word, the normalized tf-idf series together with its
        validation and test predictions.

        Parameters
        ----------
        df : DataFrame with the columns 'word', 'tf-idf_after_norm',
            'validate_prediction' and 'test_prediction'.
        word_list : words to draw; for each word the first matching row is used.
        first_period : slice start applied to the actual series before plotting.
        y_train_lenght, y_valid_lenght, y_test_lenght : number of windows per
            word in each split.
        n_steps : window length used when the data was split.

        All three curves share one x axis of 1-based period numbers, so a
        prediction is drawn at the period of the value it predicts. The figure
        is shown and then closed.
        '''
        # Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'
        # and newer releases no longer ship the old names, so pick whichever
        # name this installation actually provides.
        legacy_style = 'seaborn-darkgrid'
        if legacy_style in plt.style.available:
            plt.style.use(legacy_style)
        else:
            plt.style.use('seaborn-v0_8-darkgrid')
        # create a color palette
        palette, palette2, palette3 = plt.get_cmap('Set1'),plt.get_cmap('tab20'),plt.get_cmap('Dark2')

        # Window k of a series predicts the value at 0-based position
        # n_steps + k; the validation windows follow the training windows and
        # the test windows follow the validation windows. "+ 1" makes the
        # positions 1-based.
        valid_begin = n_steps + y_train_lenght
        test_begin = valid_begin + y_valid_lenght
        x_valid = np.arange(valid_begin, test_begin) + 1
        x_test = np.arange(test_begin, test_begin + y_test_lenght) + 1

        for i, word in enumerate(word_list):
            row_mask = df['word'] == word

            actual = df['tf-idf_after_norm'][row_mask].to_list()[0]
            shown = actual[first_period:]
            # Put the actual values on the same 1-based axis as the
            # predictions. They used to be drawn against an implicit 0-based
            # axis, which shifted them relative to the predicted curves.
            x_actual = np.arange(len(actual) - len(shown), len(actual)) + 1

            plt.plot(x_actual, shown,linestyle='dashed', color=palette(i),linewidth=1, alpha=0.9, label=word)
            plt.plot(x_valid,df['validate_prediction'][row_mask].tolist()[0],linestyle='dashdot',linewidth=1.5, alpha=1,color=palette2(i), label=word)
            plt.plot(x_test,df['test_prediction'][row_mask].tolist()[0],linestyle='solid',linewidth=1, alpha=1,color=palette3(i), label=word)

        plt.ylabel('prediction')
        plt.xlabel('w')

        # The three labels name the line styles (taken from the first word's
        # curves). An earlier per-word legend call was dropped because this
        # call replaced it immediately.
        plt.legend(['tf_idf_normalize','validate_prediction','test_prediction'])
        plt.title('Visualization of words prediction-word name: '+" ".join(word_list))

        plt.gcf().set_size_inches(15.5, 10.5, forward=True)
        plt.show()
        plt.close()
        
    def average_vectors(self,df,number_clusters, name_cluster):
        '''
        Combine the normalized tf-idf vectors of every cluster into one vector.

        For each cluster id in range(number_clusters) the rows whose
        `name_cluster` column equals that id are selected, their
        'tf-idf_after_norm' vectors are summed position by position and every
        sum is divided by 2. The result maps 'cluster <id>' to that list.
        '''
        # NOTE(review): the divisor is the constant 2, not the number of
        # vectors in the cluster, so this is a true mean only for clusters
        # with exactly two members - confirm that this is intended.
        cluster_vectors = {}
        for cluster_id in range(number_clusters):
            # tf-idf vectors of the rows that belong to this cluster
            members = df.loc[df[name_cluster] == cluster_id, 'tf-idf_after_norm']
            rows = pd.DataFrame(members.to_list()).values.tolist()

            halved_sums = []
            for column_values in zip(*rows):
                halved_sums.append(sum(column_values)/2)
            cluster_vectors['cluster '+str(cluster_id)] = halved_sums
        return cluster_vectors