In [None]:
import math
import tensorflow as tf
import keras
import pandas as pd
import numpy as np
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Lambda, Dense, Dropout, Input, Layer, TextVectorization, concatenate
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances

import math

from sklearn.feature_extraction.text import HashingVectorizer

from keras.datasets import mnist
from keras.losses import mse, binary_crossentropy
from keras import backend as K
from keras.regularizers import l2
from keras.callbacks import Callback

from tensorflow.python.ops import nn
import tensorflow as tf

import numpy as np
import argparse
import os
import random
from math import log
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from scipy.spatial import distance
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, precision_score, recall_score


import nltk
from nltk.stem import SnowballStemmer
from keras.utils import plot_model


In [None]:
# `stock_data` is the dataset with one row of details per stock. Main features: stock symbol, business name, business summary, GICS code.
# NOTE(review): no visible cell loads `stock_data`; it has to exist in the kernel before this cell runs — confirm where it is read.
stock_data.shape

In [None]:
# Replace every missing value in every column with the string 'unknown'.
stock_data = stock_data.fillna('unknown')

### Clean business summary

In [None]:
def clean_text(
    string: str,
    punctuations: str = r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
    stop_words: tuple = ('the', 'a', 'and', 'is', 'be', 'will', '&', 'of', 'for', 'are'),
    ) -> str:
    """
    Normalise a piece of free text for vectorisation.

    Steps, in order: lower-case, delete punctuation characters, drop stop
    words, then stem every remaining word with the English Snowball stemmer.

    Parameters
    ----------
    string : str
        Text to clean.
    punctuations : str
        Characters to delete. Each character is removed individually
        (deleted, not replaced by a space).
    stop_words : iterable of str
        Lower-case words to drop after punctuation removal.

    Returns
    -------
    str
        The stemmed words joined by single spaces ('' when nothing is left).
    """

    # Lower-case first so stop-word matching is case-insensitive.
    string = string.lower()

    # `punctuations` must be a plain string of characters: a tuple wrapping
    # that string would make every per-character membership test fail and
    # leave all punctuation in place.
    string = string.translate(str.maketrans('', '', punctuations))

    # Removing stop words
    excluded = frozenset(stop_words)
    kept_words = [word for word in string.split() if word not in excluded]

    # Stemming
    snowball = SnowballStemmer(language='english')
    return ' '.join(snowball.stem(word) for word in kept_words)

In [None]:
# Clean every business summary in place (column name is spelled 'buisnesssummary' in the data).
stock_data['buisnesssummary'] = stock_data['buisnesssummary'].apply(clean_text)

In [None]:
# Strip literal parentheses from the ticker symbols.
# `regex=False` is passed explicitly: the default of `regex` differs between
# pandas versions, and a lone "(" or ")" is not a valid regular expression.
stock_data['symbol'] = (
    stock_data['symbol']
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
)


In [None]:
# Vectorizer used to embed text data: hashes tokens into 5000 buckets,
# with no normalisation (norm=None) and no sign alternation (alternate_sign=False).

vectorizer = HashingVectorizer(n_features=5000,norm=None,alternate_sign=False)
# NOTE(review): `text_data` is not defined in any visible cell, and `vectorized_sentenses`
# is not read by any visible cell — confirm where `text_data` comes from.
vectorized_sentenses = vectorizer.fit_transform(text_data)



In [None]:
# Converts the text columns of every row (business name and summary) into one flat feature vector
def row_to_vector(dataframe, text_columns, id_column, text_vectorizer=None, n_leading_columns=3):
  """Build one flat feature vector per row of `dataframe`.

  Only the first `n_leading_columns` columns are considered. Within them, the
  id column is pulled out as the row identifier, every text column is replaced
  by its vectorised form, and any other column is passed through unchanged.
  Column order is preserved in the resulting vector.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Source rows.
  text_columns : list of str
      Columns whose values are fed to the vectorizer.
  id_column : str
      Column holding the row identifier; never part of the vector.
  text_vectorizer : object, optional
      Anything with `transform(list_of_str)` returning an object with
      `.toarray()`. Defaults to the module-level `vectorizer`.
  n_leading_columns : int, optional
      How many leading columns of `dataframe` take part (default 3).

  Returns
  -------
  pandas.DataFrame
      Columns `[id_column, 'vector']`, one row per input row, indexed 0..n-1.

  Raises
  ------
  ValueError
      If a named column does not exist in `dataframe`.
  IndexError
      If a named column lies outside the first `n_leading_columns` columns.
  """
  active_vectorizer = vectorizer if text_vectorizer is None else text_vectorizer

  column_list = dataframe.columns.to_list()
  id_pos = column_list.index(id_column)
  text_positions = {column_list.index(column) for column in text_columns}

  for pos in sorted(text_positions | {id_pos}):
    if pos >= n_leading_columns:
      raise IndexError(
          "column %r is outside the first %d columns" % (column_list[pos], n_leading_columns)
      )

  row_ids = []
  vectors = []
  leading = dataframe.iloc[:, :n_leading_columns]
  for values in leading.itertuples(index=False, name=None):
    features = []
    for pos, value in enumerate(values):
      if pos == id_pos:
        # Skip the id by position: removing it by value could delete a
        # feature that merely compares equal to the id.
        row_ids.append(value)
      elif pos in text_positions:
        features.extend(active_vectorizer.transform([value]).toarray().flatten().tolist())
      else:
        features.append(value)
    vectors.append(features)

  # Build the frame once; growing it row by row with .loc is quadratic.
  return pd.DataFrame({id_column: row_ids, 'vector': vectors}, columns=[id_column, 'vector'])

In [None]:
# One row per stock: the 'symbol' id plus a 'vector' built from the 'name' and 'buisnesssummary' text.
stock_data_vectors = row_to_vector(stock_data, ['name','buisnesssummary'], 'symbol')

# Variational Autoencoder

In [None]:
# Shape of the first stock's feature vector; its single entry is the model's input width.
input_shape = np.array(stock_data_vectors['vector'][0]).shape
# NOTE(review): the label below mentions one-hot GICS codes, but row_to_vector only reads the
# first three columns of each row, so with an id and two text columns the vector holds the two
# hashed texts only — confirm the wording.
print('vector size(hashing vectorized name + hashing vectorized buisness summary + one hot encoded gics codes):',input_shape[0])

In [None]:
# input_dim = 2333
latent_dim = 50  # width of the z_mean / z_log_variance layers and of the decoder input

epochs = 20  # NOTE(review): the visible vae.fit(...) call passes epochs=15 and never reads this — confirm
decay = 1e-4  # L2 factor given to the kernel/bias regularizers of the hidden Dense layers
bias = True  # passed as use_bias to the hidden Dense layers


input_dim = input_shape[0]  # length of the first stock's feature vector
# input_dim = len(stock_data_vectors['vector'][0])

In [None]:
#Encoder (Functional model)

# `Sampling` is used below but is not defined in any visible cell, so it is
# defined here to let the notebook run from a fresh kernel.
class Sampling(Layer):
    """Draw z from N(z_mean, exp(z_log_var)) with the reparameterisation trick.

    Expects `inputs` to be the pair `[z_mean, z_log_var]` and returns
    `z_mean + exp(0.5 * z_log_var) * epsilon`, with `epsilon` standard normal
    noise of the same shape as `z_mean`.
    """

    def call(self, inputs):
        z_mean, z_log_var = inputs
        epsilon = tf.random.normal(shape=tf.shape(z_mean), dtype=z_mean.dtype)
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon


# `shape` is given as a tuple, matching the decoder's Input; a bare int is not
# accepted as a shape by every Keras version.
encoder_input = Input(shape = (input_dim,), name = 'encoder_input')

# Three L2-regularised ReLU layers narrowing 2048 -> 512 -> 128.
encoder_layer1 = Dense(2048, kernel_regularizer=l2(decay), bias_regularizer=l2(decay), use_bias=bias, activation='relu')(encoder_input)

encoder_layer2 = Dense(512, kernel_regularizer=l2(decay), bias_regularizer=l2(decay), use_bias=bias, activation='relu')(encoder_layer1)

encoder_layer3 = Dense(128, kernel_regularizer=l2(decay), bias_regularizer=l2(decay), use_bias=bias, activation='relu')(encoder_layer2)

# Two linear heads parameterise the latent Gaussian.
z_mean = Dense(latent_dim, name = 'z_mean')(encoder_layer3)
z_log_var = Dense(latent_dim, name = 'z_log_variance')(encoder_layer3)

z = Sampling()([z_mean, z_log_var])

# The encoder exposes all three tensors; z is the last output.
encoder = Model(encoder_input, [z_mean,z_log_var, z], name = 'encoder')
encoder.summary()


In [None]:
# Decoder: widens the latent vector 128 -> 512 -> 2048 with L2-regularised
# ReLU layers, then maps back to input_dim through a sigmoid output layer.

decoder_input = Input(name='decoder_input', shape=(latent_dim,))

decoder_layer1 = Dense(
    128,
    activation='relu',
    use_bias=bias,
    kernel_regularizer=l2(decay),
    bias_regularizer=l2(decay),
)(decoder_input)

decoder_layer2 = Dense(
    512,
    activation='relu',
    use_bias=bias,
    kernel_regularizer=l2(decay),
    bias_regularizer=l2(decay),
)(decoder_layer1)

decoder_layer3 = Dense(
    2048,
    activation='relu',
    use_bias=bias,
    kernel_regularizer=l2(decay),
    bias_regularizer=l2(decay),
)(decoder_layer2)

decoder_output = Dense(input_dim, activation='sigmoid')(decoder_layer3)

decoder = Model(inputs=decoder_input, outputs=decoder_output, name='decoder')
decoder.summary()

In [None]:
# Draw the encoder graph with tensor shapes shown and layer names hidden.
plot_model(encoder, show_layer_names = False, show_shapes = True, dpi = 60)

In [None]:
# Draw the decoder graph with tensor shapes shown and layer names hidden.
plot_model(decoder, show_layer_names = False, show_shapes = True, dpi = 60)

In [None]:
class VAE(keras.Model):
    """Variational autoencoder wrapping a separate encoder and decoder model.

    The encoder must return `(z_mean, z_log_var, z)`; the decoder maps `z`
    back to the input space. Training uses a custom `train_step` that
    minimises reconstruction loss + KL divergence + any regularisation losses
    declared on the layers, and tracks the first two plus their total as
    running means.

    NOTE(review): the reconstruction term is binary cross-entropy, which
    presumes targets in [0, 1] — confirm the training vectors satisfy that.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    def call(self, inputs):
        """Encode `inputs`, then decode the sampled latent vector."""
        _, _, z = self.encoder(inputs)
        return self.decoder(z)

    @property
    def metrics(self):
        """Trackers listed here are reset by Keras at the start of each epoch."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        """Run one optimisation step and return the running loss means."""
        # fit(x, y) delivers a tuple; only the inputs are needed here.
        if isinstance(data, tuple):
            data = data[0]

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)

            # binary_crossentropy already averages over the feature axis and
            # yields one value per sample. Scale it back to a per-sample sum
            # over features, then average over the batch, so this term is on
            # the same per-sample basis as the KL term and does not grow with
            # the batch size.
            per_sample_bce = keras.losses.binary_crossentropy(data, reconstruction)
            n_features = tf.cast(tf.shape(reconstruction)[-1], per_sample_bce.dtype)
            reconstruction_loss = tf.reduce_mean(per_sample_bce * n_features)

            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))

            total_loss = reconstruction_loss + kl_loss
            # Layer regularisers (e.g. kernel_regularizer) only take effect
            # if their losses are added to the objective explicitly.
            if self.losses:
                total_loss = total_loss + tf.add_n(self.losses)

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "total_loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

In [None]:
# Collect every stock's feature vector into a plain list of lists.
train_data = (stock_data_vectors['vector'].values).tolist()

In [None]:
# Array form of the feature vectors; this is what vae.fit and encoder.predict receive.
train_data_arr = np.array(train_data)

In [None]:
# train_data_tensor = tf.convert_to_tensor(train_data_arr, dtype=tf.int64)
# Integer copy of the training array. The `np.int` alias no longer exists in
# current NumPy releases; the builtin `int` is the same dtype specification.
# NOTE(review): no visible cell reads `train_data_tensor` (training uses `train_data_arr`).
train_data_tensor = np.asarray(train_data_arr).astype(int)

In [None]:
# Assemble the VAE from the two sub-models. No loss is passed to compile():
# VAE.train_step computes the losses itself.
vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam())

In [None]:
# Train on the feature matrix alone (no targets), in shuffled batches of 16.
# NOTE(review): the configuration cell sets `epochs = 20`, but this call hard-codes 15 — confirm which is intended.
history = vae.fit(train_data_arr, epochs=15, batch_size=16, shuffle=True)

In [None]:
# Run the trained encoder over all stocks; the encoder model's outputs are [z_mean, z_log_var, z].
encoder_outputs = vae.encoder.predict(train_data_arr)

In [None]:
# Keep the last encoder output (z) as each stock's text embedding, as nested Python lists.
# NOTE(review): z comes from the Sampling layer, presumably a random draw, so embeddings may
# differ between runs; z_mean (encoder_outputs[0]) would be the deterministic alternative — confirm.
encoded_vectors_text = encoder_outputs[-1].tolist()

In [None]:
# Weight calculation for the GICS code columns.
# The weight is the largest absolute coordinate found in any encoded text
# vector, rounded up to an integer. The extrema are taken with min()/max()
# directly; sorting each vector just to read its ends is unnecessary work.

mx = [max(vector) for vector in encoded_vectors_text]
mn = [min(vector) for vector in encoded_vectors_text]
min_value = min(mn)
# Negate the overall minimum so it competes with the maxima as a magnitude.
mx.append(-1 * min_value)

max_value = max(mx)
weight = math.ceil(max_value)

weight

In [None]:
# Final vector for a stock: its encoded text vector followed by every column
# of stock_data from position 3 onwards, each multiplied by `weight`.
row = []
for i in range(stock_data.shape[0]):
    gics = stock_data.iloc[i].to_list()[3:]
    weighted_gics = [value * weight for value in gics]
    row.append(encoded_vectors_text[i] + weighted_gics)


In [None]:
# Length of the first row's 'encoded_vectors' entry; read as a global by reshape_vectors.
# NOTE(review): no visible cell creates an 'encoded_vectors' column on stock_data_vectors
# (row_to_vector produces only the id column and 'vector') — confirm where it is assigned.
size=len(stock_data_vectors.iloc[0].at['encoded_vectors'])

def reshape_vectors(vector):
  """Return `vector` as a NumPy array of shape (1, size), using the module-level `size`."""
  return np.array(vector).reshape(1, size)

In [None]:
# Convert every stored encoded vector into a (1, size) array.
stock_data_vectors['encoded_vectors'] = stock_data_vectors['encoded_vectors'].apply(reshape_vectors)

In [None]:
# Keep only the part of each symbol that precedes the first "." (symbols without a "." are unchanged).
stock_data_vectors['symbol'] = stock_data_vectors['symbol'].apply(lambda x : x.split(".")[0])

In [None]:

# Use the combined per-stock vectors built in `row` as the input to the similarity step.
encoded_vectors = row
# Displayed as the cell output: the number of stock vectors.
len(encoded_vectors)

In [None]:
# Pairwise cosine similarity between all stocks, labelled by symbol on both axes
cos_sim_data = pd.DataFrame(
    data=cosine_similarity(encoded_vectors),
    columns=stock_data_vectors['symbol'],
    index=stock_data_vectors['symbol'],
)

In [None]:
# Generating recommendations using the cosine similarity matrix

def give_recommendations(stock_business_code, k, print_recommendation, similarity=None):
  """Return the `k` stocks most similar to `stock_business_code`.

  Parameters
  ----------
  stock_business_code : str
      Symbol of the stock to find neighbours for; must be a row label of the
      similarity matrix.
  k : int
      Number of recommendations to return (values below 1 give an empty result).
  print_recommendation : bool
      When true, also print the ranked recommendations.
  similarity : pandas.DataFrame, optional
      Square similarity matrix labelled by symbol on both axes. Defaults to
      the module-level `cos_sim_data`.

  Returns
  -------
  dict
      Maps each recommended symbol to its similarity score, most similar first.
  """
  matrix = cos_sim_data if similarity is None else similarity

  # Drop the queried stock by label instead of assuming it sorts first: with
  # ties (or a zero vector) it is not guaranteed to be at position 0. A stable
  # sort keeps tied stocks in matrix order.
  ranked = (
      matrix.loc[stock_business_code]
      .drop(labels=stock_business_code)
      .sort_values(ascending=False, kind='mergesort')
      .head(max(k, 0))
  )
  stock_recomm = ranked.index.tolist()
  index_recomm = ranked.values.tolist()

  result = dict(zip(stock_recomm, index_recomm))

  if print_recommendation:
    print('The prefered stock is : {} \n'.format(stock_business_code))
    for rank, stock in enumerate(stock_recomm, start=1):
      print('The number %i recommended stock is this one: %s \n'%(rank,stock))

  return result