Matrix structure:
```
++++   ++++
+X0+   +X1+
++++   ++++

X0 - books x word emb
X1 - books x cnn emb

E0 - books
E1 - word emb
E2 - cnn emb
```

In [None]:
# TODO: swap the input CNN embeddings for ResNet/GoogleNet/InceptionNet/VGG features
# TODO: try reducing the learning error of the matrix factorization
# TODO: try different regressors
# TODO: try multi-label classification
# TODO: visualize the embeddings

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.functional as F
import pickle

In [None]:
# Locations of the precomputed embedding tensors (one row per book).
dataset_folder = "../dataset/"
# X0 holds the text embeddings and X1 the CNN image embeddings, going by the
# file names; an earlier alternative for X1 was "googleNet_cnn_emb_tensor.pkl".
X0_file = f"{dataset_folder}sentenceBert_emb_tensor.pt"
X1_file = f"{dataset_folder}cnn_encoder_decoder_tensor.pt"

In [None]:
# Read the two input matrices from disk, echoing each shape right after it
# is loaded so a bad file is easy to spot.
X0 = torch.load(X0_file)
print(X0.shape)
X1 = torch.load(X1_file)
print(X1.shape)

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a single 128-unit hidden layer per side.

    The encoder maps ``input_dim -> 128 -> embedding_dim`` and the decoder
    mirrors it back to ``input_dim``.  The bottleneck output of the most
    recent ``forward`` call (before the ReLU that feeds the decoder) is kept
    in ``self.emb`` so callers can read the embedding after a forward pass.
    """

    def __init__(self, input_dim, embedding_dim):
        super().__init__()
        hidden_dim = 128
        # Layers are created in the same order as before so that seeded
        # initialisation yields identical weights.
        self.enc_linear1 = nn.Linear(input_dim, hidden_dim)
        self.enc_linear2 = nn.Linear(hidden_dim, embedding_dim)
        self.dec_linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.dec_linear2 = nn.Linear(hidden_dim, input_dim)
        self.emb = None

    def forward(self, x):
        """Return the reconstruction of ``x``; the code is cached in ``self.emb``."""
        code = self.enc_linear2(torch.relu(self.enc_linear1(x)))
        self.emb = code  # embedding exposed to callers
        decoder_hidden = torch.relu(self.dec_linear1(torch.relu(code)))
        return self.dec_linear2(decoder_hidden)  # reconstruction used for training
    

In [None]:
class matrix_factorization():
    """Joint factorisation of two matrices through per-entity autoencoders.

    Three entities are modelled: ``E0`` indexes the rows shared by both input
    matrices, ``E1`` the columns of ``X0`` and ``E2`` the columns of ``X1``.
    Every entity gets its own ``Autoencoder``.  Training minimises the mean
    reconstruction error of each autoencoder plus the error of approximating
    ``X0`` by ``E0 @ E1.T`` and ``X1`` by ``E0 @ E2.T``, where ``E*`` are the
    autoencoder bottleneck outputs.

    Args:
        matrices: Names of the input matrices (stored, not otherwise used).
        entity_list: Names of the entities (stored, not otherwise used).
        matrix_entity_mapping: Dict with keys ``"E0"``, ``"E1"``, ``"E2"``;
            ``"E0"`` maps to the pair ``(X0, X1)``, ``"E1"`` to ``X0`` and
            ``"E2"`` to ``X1``.  ``X0`` and ``X1`` must have the same number
            of rows because they are concatenated column-wise for ``E0``.
        emb_dim: Size of every entity embedding.
    """

    def __init__(self, matrices, entity_list, matrix_entity_mapping, emb_dim):
        self.matrices = matrices
        self.entity_list = entity_list
        self.matrix_entity_mapping = matrix_entity_mapping # {"E0": (X0, X1), "E1": X0, "E2": X1}
        self.emb_dim = emb_dim
        self.autoencoders = {} # {"E0": E0_autoencoder, "E1": E1_ae, ...}
        self.reconstructed_matrices = {} # {"X0": recon_X0, "X1": recon_X1, ...}
        self.embeddings = {} # {"E0": E0_emb, "E1": E1_emb, ...}
        self.concatenated_matrices = {} # {"E0": [X0 | X1], "E1": X0.T, "E2": X1.T}
        self.optim = None
        self.criterion = nn.MSELoss()
        self.batch_size = 50
        self.convergence_threshold = 1e-4
        self.learning_rate = 0.000001
        self.epoch_count = 250

    def init_autoencoders(self):
        """Build each entity's input matrix, its autoencoder and one shared SGD optimizer.

        Prints the size of every entity input matrix as it is created.

        Raises:
            KeyError: If the mapping lacks one of ``"E0"``, ``"E1"``, ``"E2"``.
        """
        # Use the mapping handed to the constructor; the previous code read a
        # module-level variable of the same name and only worked by accident.
        mapping = self.matrix_entity_mapping
        first, second = mapping["E0"][0], mapping["E0"][1]
        entity_inputs = {
            # Rows of E0 are described by the columns of both matrices.
            "E0": torch.cat((first, second), dim=1),
            # Column entities become rows so that each one is a training sample.
            "E1": torch.transpose(mapping["E1"], 0, 1),
            "E2": torch.transpose(mapping["E2"], 0, 1),
        }

        autoencoders = {}
        for entity, matrix in entity_inputs.items():
            print(matrix.size())
            autoencoders[entity] = Autoencoder(matrix.size(1), self.emb_dim)

        self.concatenated_matrices = entity_inputs
        self.autoencoders = autoencoders
        # A single optimizer updates all autoencoders from the joint loss.
        parameters = [p for aec in autoencoders.values() for p in aec.parameters()]
        self.optim = torch.optim.SGD(parameters, lr=self.learning_rate)

    def _run_entity(self, entity):
        """Feed one entity's matrix through its autoencoder in mini-batches.

        Returns:
            ``(mean batch MSE, embeddings of all rows in order)``; both stay
            attached to the autograd graph so the caller can backpropagate.
        """
        data = self.concatenated_matrices[entity]
        autoencoder = self.autoencoders[entity]
        if data.size(0) == 0:
            raise ValueError(f"Entity {entity!r} has no rows to train on")

        batch_losses = []
        batch_embeddings = []
        # Rows are visited in order (no shuffling), so contiguous slices are
        # enough and the per-batch embeddings can simply be concatenated.
        for start in range(0, data.size(0), self.batch_size):
            minibatch = data[start:start + self.batch_size]
            output = autoencoder(minibatch)
            batch_embeddings.append(autoencoder.emb)
            batch_losses.append(self.criterion(output, minibatch))
        return torch.stack(batch_losses).mean(), torch.cat(batch_embeddings, dim=0)

    def train_autoencoder(self):
        """Train all autoencoders jointly with one optimizer step per epoch.

        Runs for at most ``self.epoch_count`` epochs, printing the loss every
        10th epoch.  After epoch 100 training stops as soon as the loss drops
        by less than ``self.convergence_threshold`` between two epochs.  The
        last reconstructions are stored (detached) in
        ``self.reconstructed_matrices``.

        Raises:
            RuntimeError: If ``init_autoencoders`` has not been called.
            ValueError: If an entity matrix has no rows.
        """
        if not self.autoencoders or self.optim is None:
            raise RuntimeError("call init_autoencoders() before train_autoencoder()")

        # The mapping entries for E1 and E2 are the original X0 and X1.
        target_x0 = self.matrix_entity_mapping["E1"]
        target_x1 = self.matrix_entity_mapping["E2"]

        prev_loss = None
        for epoch in range(self.epoch_count):
            aec_loss = 0
            ent_emb = {}
            for e in self.concatenated_matrices:
                entity_loss, ent_emb[e] = self._run_entity(e)
                aec_loss = aec_loss + entity_loss

            recon_x0 = torch.matmul(ent_emb['E0'], torch.transpose(ent_emb['E1'], 0, 1))
            recon_x1 = torch.matmul(ent_emb['E0'], torch.transpose(ent_emb['E2'], 0, 1))
            recon_loss = self.criterion(recon_x0, target_x0) + self.criterion(recon_x1, target_x1)
            aec_loss = aec_loss + recon_loss

            self.optim.zero_grad()
            aec_loss.backward()
            self.optim.step()

            # Keep plain values around instead of graph-attached tensors so
            # the autograd graph of every epoch can be freed.
            self.reconstructed_matrices['X0'] = recon_x0.detach()
            self.reconstructed_matrices['X1'] = recon_x1.detach()
            loss_value = aec_loss.item()

            if epoch % 10 == 0:
                print(f"Average loss for epoch {epoch} = {loss_value}")
            # NOTE(review): this also triggers when the loss goes *up* (negative
            # difference); kept as-is to preserve the stopping behaviour.
            if (epoch > 100) and (prev_loss is not None) and (prev_loss - loss_value < self.convergence_threshold):
                print('Convergence!')
                break
            prev_loss = loss_value

    def get_embeddings(self):
        """Return ``{entity: embedding matrix}`` computed with the current weights.

        The forward passes run without gradient tracking, so the returned
        tensors are detached and can be converted or saved directly.
        """
        with torch.no_grad():
            for e in self.matrix_entity_mapping:
                autoencoder = self.autoencoders[e]
                autoencoder(self.concatenated_matrices[e])
                self.embeddings[e] = autoencoder.emb.detach()
        return self.embeddings
 

In [None]:
# Describe how the two matrices relate to the three entities, then learn a
# 50-dimensional embedding for every entity.
emb_dim = 50
matrices = ["X0", "X1"]
entity_list = ["E0", "E1", "E2"]
matrix_entity_mapping = {
    "E0": (X0, X1),  # books: rows of both matrices
    "E1": X0,  # word-embedding dimensions: columns of X0
    "E2": X1,  # CNN-embedding dimensions: columns of X1
}

model = matrix_factorization(
    matrices=matrices,
    entity_list=entity_list,
    matrix_entity_mapping=matrix_entity_mapping,
    emb_dim=emb_dim,
)
model.init_autoencoders()
model.train_autoencoder()
embeddings = model.get_embeddings()

In [None]:
# with open('embeddings.pkl', 'wb') as handle:
#     pickle.dump(embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the dict of entity embeddings returned by get_embeddings().
# The file is written by torch.save despite its ".pkl" suffix, so read it
# back with torch.load rather than pickle.load.
torch.save(embeddings, "sentenceBert_cnn_encoder_decoder.pkl")

In [None]:
# Supervised learning

In [None]:
# Load the stored train/test book ids and report the size of each split.
train_ids = torch.load("../dataset/train_book_id.pkl")
test_ids = torch.load("../dataset/test_book_id.pkl")
print(train_ids.shape, test_ids.shape, sep="\n")

In [None]:
# Book metadata table; its "average_rating" column is the regression target
# used further down. The trailing expression previews the first rows.
df = pd.read_csv("../dataset/books_with_genres.csv")
df.head()

In [None]:
# Display-only check: metadata rows whose book_id appears in train_ids.
# The filtered frame is not stored anywhere.
df[df.book_id.isin(train_ids.tolist())]

In [None]:
# Features are the learned book (E0) embeddings as NumPy arrays:
# the first 4000 rows are used for training, the remainder for testing.
X_train = embeddings["E0"][:4000].detach().numpy()
X_test = embeddings["E0"][4000:].detach().numpy()
print(X_train.shape, X_test.shape, sep="\n")

In [None]:
# Targets are the average ratings, split at the same row position (4000)
# as the feature matrix.
y_train = df["average_rating"][:4000]
y_test = df["average_rating"][4000:]
print(y_train.shape, y_test.shape, sep="\n")

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Baseline: ordinary least squares on the book embeddings, scored on the
# held-out rows with mean squared error and R^2.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)
preds = lr.predict(X=X_test)
error, r2 = mean_squared_error(y_test, preds), r2_score(y_test, preds)

In [None]:
# Test-set metrics of the regressor fitted in the preceding cell:
# mean squared error first, then R^2.
print(error)
print(r2)

In [None]:
from sklearn.svm import SVR
# Second regressor: support vector regression with default settings,
# scored the same way (this overwrites preds, error and r2).
svr = SVR()
svr.fit(X=X_train, y=y_train)
preds = svr.predict(X=X_test)
error, r2 = mean_squared_error(y_test, preds), r2_score(y_test, preds)

In [None]:
# Test-set metrics of the regressor fitted in the preceding cell:
# mean squared error first, then R^2.
print(error)
print(r2)

In [None]:
from sklearn.decomposition import PCA
# Project the training embeddings onto their first two principal components
# and attach the ratings (aligned on the row index) for plotting.
pca = PCA(n_components=2, random_state=42)
df_pca = pd.DataFrame(
    data=pca.fit_transform(X_train),
    columns=["comp1", "comp2"],
).assign(y_train=y_train)
print(df_pca.shape)
df_pca.head()

In [None]:
# Bucket each rating by its integer part; the buckets colour the scatter plot.
df_pca["y_cat"] = df_pca["y_train"].apply(int)

In [None]:
# Preview the projected frame, now including the integer rating bucket.
df_pca.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Scatter the two principal components, coloured by integer rating bucket.
sns.scatterplot(x="comp1", y="comp2", data=df_pca, hue="y_cat")