# Recommender System with Deep Learning

## Setup

### Packages & Parameters

In [2]:
import os
import sys
import gc
import psutil
import time
import typing

import urllib.request
import multiprocessing as mp
import concurrent.futures

import gzip
import unicodedata
import math
import re

import pandas as pd
import numpy as np
from matplotlib import pyplot

from flair.models import TextClassifier
from flair.data import Sentence

from sklearn.preprocessing import QuantileTransformer
import scipy.sparse as sp
from scipy.sparse.linalg import svds
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


### Globals

In [3]:
MAIN_DIR = os.path.join("D:" + os.sep, "Code", "PYTHON", "Amazon_Recommender_System")

CODE_DIR = os.path.join(MAIN_DIR, "Code")

ANALYSIS_DIR = os.path.join(MAIN_DIR, "Analysis")
DATA_DIR = os.path.join(MAIN_DIR, "Data")

RAW_DATA_DIR = os.path.join(DATA_DIR, "Raw")
CLEAN_DATA_DIR = os.path.join(DATA_DIR, "Clean")

BOOKS_RAW_DIR = os.path.join(RAW_DATA_DIR, "Books")
BOOKS_CLEAN_DIR = os.path.join(CLEAN_DATA_DIR, "Books")
BOOKS_SENTIMENT_DIR = os.path.join(BOOKS_CLEAN_DIR, "Sentiment")

CHUNK_SIZE = 1000000

NUM_CORES = math.ceil(mp.cpu_count()/2)

### Directory Setup

In [4]:
def create_directory(dir_list: list) -> None:
    for directory in dir_list:
        if not os.path.exists(directory):
            os.makedirs(directory)

def is_directory_empty(dir_path: str) -> bool:
    return len(os.listdir(dir_path)) == 0

In [5]:
create_directory([CODE_DIR, 
                  ANALYSIS_DIR,
                  DATA_DIR, 
                  RAW_DATA_DIR, 
                  CLEAN_DATA_DIR,
                  BOOKS_RAW_DIR,
                  BOOKS_CLEAN_DIR,
                  BOOKS_SENTIMENT_DIR,
                  ])

## Data Processing

### Loading Data In

In [6]:
# More memory efficient version. Takes longer but is far more consistent than the previous version. 
def process_data(url: str, 
                 chunk_size: int, 
                 num_workers: int,
                 output_dir: str) -> None:
    
    with urllib.request.urlopen(url) as response:
        with gzip.open(response, "rt") as gz_file:
            chunk = []
            chunk_count = 0

            with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
                for line in gz_file:
                    chunk.append(line)

                    if len(chunk) == chunk_size:
                        executor.submit(process_chunk, list(chunk), os.path.join(output_dir, f"chunk_{chunk_count}.json"))
                        chunk = []
                        chunk_count += 1

                # Process the remaining lines in the last chunk
                if chunk:
                    executor.submit(process_chunk, list(chunk), os.path.join(output_dir, f"chunk_{chunk_count}.json"))

def process_chunk(chunk: list,
                  filename: str) -> None:
    
    with open(filename, 'w') as file:
        file.writelines(chunk)
    print(f"Processed chunk: {filename}")

In [None]:
'''
----- PROCESS_DATA -----
GAME URL "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Video_Games_5.json.gz"
BOOK URL "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Books_5.json.gz"

EXPERIMENTS FOR 27 MILLION
----- THREAD CSV_FILE -----
1. 100000 chunk -> 22min 26sec

----- THREAD JSON_FILE RUNTIMES-----
1. 100000 chunk -> 20min 55sec
2. 100000 chunk -> 49min 1sec -> No idea why this happened on a fresh start
3. 500000 chunk -> 18min 45sec
4. 1000000 chunk -> 17min 5sec

NEW STABLE IMPLEMENTATION
1. 1000000 chunk -> 22min 22sec
'''
url = "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Books_5.json.gz"
process_data(url, CHUNK_SIZE, NUM_CORES, BOOKS_RAW_DIR)

### Cleaning Data

In [7]:
def extract_value(dictionary: dict):
    return dictionary.get("Format:") if isinstance(dictionary, dict) else None

def filter_comment_length(reviews: pd.DataFrame, 
                          minimum: int) -> pd.DataFrame:
    reviews = reviews.copy()
    reviews["review_len"] = reviews["review_text"].str.split().str.len()
    reviews = reviews.loc[(reviews["review_len"] > minimum)]
    return reviews

def remove_symbols(reviews: pd.Series) -> pd.Series:
    return reviews.apply(lambda x: re.sub(r"\s+|[^a-zA-Z0-9\s]", "", x)) # Cleans up duplicate space and special characters.

def remove_irrelevant_info(reviews: pd.Series) -> pd.Series:
    return reviews.apply(lambda x: re.sub(r"http\S+|www.\S+|#\S+|<.*?>|\(|\)|\d+", "", x)) # Cleans up URL, hashtags, parenthesis, and numbers.

def reduce_characters(reviews: pd.Series) -> pd.Series:
    return reviews.apply(lambda x: re.sub(r"[^\w\s]|(.)\1+", "", x)) # Remove excessive punctuation and repeated characters

def normalize_encoding(reviews: pd.Series) -> pd.Series:
    return reviews.apply(lambda x: unicodedata.normalize("NFKD", x).encode("ASCII", "ignore").decode("utf-8")) # Encodes and Decodes the data so that we have consistency in text
        
def clean_chunk(df: pd.DataFrame) -> pd.DataFrame:

    df = df.rename(columns={
        "overall": "rating",
        "reviewTime": "review_date",
        "reviewerID": "reviewer_id",
        "asin": "product_id",
        "reviewText": "review_text",
    })

    df["vote"] = df["vote"].fillna(0)
    df["vote"] = pd.to_numeric(df["vote"].astype("str").str.replace(",","")).astype("int32")
    df = df[df["vote"] >= 5]
    df = filter_comment_length(df, 30)
    df.drop(["unixReviewTime", "image", "summary", "reviewerName"], axis=1, inplace=True)
    
    df["review_date"] = pd.to_datetime(df["review_date"], format="%m %d, %Y")
    df["style"] = df["style"].apply(extract_value)

    df["review_text"] = df["review_text"].str.lower()
    df["review_text"] = remove_irrelevant_info(df["review_text"])
    df["review_text"] = reduce_characters(df["review_text"])
    df["review_text"] = normalize_encoding(df["review_text"])
    return df

In [None]:
raw_file_names = [f"chunk_{file_num}.json" for file_num in range(len(os.listdir(BOOKS_RAW_DIR)))]

for raw_file_name in raw_file_names:
    raw_df_list = []
    raw_df = pd.read_json(os.path.join(BOOKS_RAW_DIR, raw_file_name), lines=True)
   
    raw_df = clean_chunk(raw_df)
    print(f"{raw_file_name} DIMENSIONS: {raw_df.shape}")
    clean_dir = os.path.join(BOOKS_CLEAN_DIR, raw_file_name)
    raw_df.to_json(clean_dir, orient="records")

    del raw_df
    gc.collect()

## Sentiment Analysis

In [8]:
def batch_text(column: pd.Series, 
               batch_size: int) -> list:
    
    batches = np.array_split(np.array(column.values), int(np.ceil(len(column)/ batch_size)))
    batches = [batch.tolist() for batch in batches]
    return batches

def load_sentiment_model(model: str):

    return TextClassifier.load(model)

def is_float(value):
      
      try:
        float(value)
        return True
      except ValueError:
        return False

def analyze_sentiment(model, 
                      batch: np.ndarray):
    
    score = []
    for sentence in batch:
        sentence = Sentence(sentence)
        model.predict(sentence, verbose=False)
        try:
            process = re.sub(r"\(|\)","",str(sentence.labels[0]))
            number = [float(s) for s in  process.split() if is_float(s) is True]
            if "POSITIVE" in process:
                score.append(number[0])
            elif "NEGATIVE" in process:
                score.append(-number[0])
        except IndexError:
            print(sentence)
            score.append(np.nan)
    return score
   
def process_batches(models: list[str], 
                    batches: list):
                    
    models_loaded = [load_sentiment_model(model) for model in models]
    print("[Starting process...]")
    sentiment_results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers= NUM_CORES) as executor:
        sentiment_tasks = [executor.submit(analyze_sentiment, model, batch) for model, batch in zip(models_loaded, batches)]
        for completed_task in concurrent.futures.as_completed(sentiment_tasks):
            result = completed_task.result()
            sentiment_results.extend(result)
            print("[Finished with a Batch]")
        return sentiment_results

In [None]:
clean_file_names = [f"chunk_{file_num}.json" for file_num in range(len(os.listdir(BOOKS_CLEAN_DIR)))]

for clean_file_name in clean_file_names[4:5]:
    clean_df = pd.read_json(os.path.join(BOOKS_CLEAN_DIR, clean_file_name), orient="columns")
    batch_size = math.ceil(len(clean_df)/NUM_CORES) 
       
    batches = batch_text(clean_df["review_text"], batch_size)
    clean_df["sentiment_score"] = process_batches(["en-sentiment"] * len(batches), batches)
    sentiment_dir = os.path.join(BOOKS_SENTIMENT_DIR, clean_file_name)
    clean_df.to_json(sentiment_dir, orient="records")

## Neural Network Preprocessing

In [9]:
merged_df = pd.DataFrame()
for filename in os.listdir(BOOKS_SENTIMENT_DIR):
    sentiment_df = pd.read_json(os.path.join(BOOKS_SENTIMENT_DIR, filename), orient="columns")
    sentiment_df.drop(columns=["review_text"], inplace=True)
    merged_df = pd.concat([merged_df, sentiment_df])
    
merged_df.reset_index(drop=True, inplace=True)
merged_df.head()

Unnamed: 0,rating,verified,review_date,reviewer_id,product_id,style,vote,review_len,sentiment_score
0,4,False,1087776000000,A2NJO6YE954DBH,1712799,Hardcover,6,298,0.9999
1,5,False,982972800000,A1K1JW1C5CUSUZ,1712799,Hardcover,23,431,0.9937
2,3,False,1089244800000,A1JS302JFHH9DJ,2006448,Hardcover,9,273,0.9999
3,5,True,1079049600000,A26QTCZG2XR3JH,2006448,Hardcover,15,277,-0.6053
4,5,False,1071360000000,A36X9BU9JB8KCE,2006448,Hardcover,5,495,0.9994


In [10]:
ranges = [(-1.0000, -0.5000), (-0.5000, 0.0000), (0.0000, 0.5000), (0.5000, 1.0000)] # Define the ranges

counts = {f"{r[0]} to {r[1]}": 0 for r in ranges} # Initialize a dictionary to store the counts

# Count the values within each range
for value in merged_df["sentiment_score"]:
    for r in ranges:
        if r[0] <= value < r[1]:
            counts[f"{r[0]} to {r[1]}"] += 1

for r, count in counts.items():
    print(f"Range {r}: {count}")

Range -1.0 to -0.5: 198373
Range -0.5 to 0.0: 4
Range 0.0 to 0.5: 0
Range 0.5 to 1.0: 192154


In [14]:
def quantile_transformation(data: np.ndarray, 
                            type: typing.Literal["uniform","normal"]) -> np.ndarray:
    
    qt = QuantileTransformer(output_distribution=type)
    return  qt.fit_transform(data)

merged_df.dropna(inplace=True)

def create_dataset(df:pd.DataFrame, ratings_column: str):
    unique_users = df["reviewer_id"].unique()
    user_to_index = {old: new for new, old in enumerate(unique_users)}
    new_users = df["reviewer_id"].map(user_to_index)

    unique_items = df["product_id"].unique()
    item_to_index = {old: new for new, old in enumerate(unique_items)}
    new_items = df["product_id"].map(item_to_index)

    n_users = unique_users.shape[0]
    n_items = unique_items.shape[0]
    
    X = pd.DataFrame({"user_id": new_users, "item_id": new_items})
    y = df[ratings_column].astype(np.float32)
    return (n_users, n_items), (X, y)

In [15]:
sentiment_score = np.array(merged_df["sentiment_score"]).reshape(-1,1) # Needs to be 2D to use with QuantileTransformer
merged_df["sentiment_uq"] = quantile_transformation(sentiment_score, "uniform") # Reshape into uniform distribution -> Everything will be equally weighted
merged_df["sentiment_nq"] = quantile_transformation(sentiment_score, "normal") # reshape into normal distribution -> will create bias towards average values -0.5 to 0.5

(n_users, n_items), (X, y) = create_dataset(merged_df, "sentiment_score")
print(f"Embeddings: {n_users} users, {n_items} items")
print(f"Dataset shape: {X.shape}")
print(f"Target shape: {y.shape}")

Embeddings: 165643 users, 50565 items
Dataset shape: (389012, 2)
Target shape: (389012,)


In [None]:
# def svds_dot(U: np.ndarray, 
#              Sigma: np.ndarray, 
#              VT: np.ndarray, 
#              num_rows: int, 
#              num_cols: int, 
#              embedding_length: int, 
#              chunk_size: int) -> np.ndarray:

#     pred_matrix = sp.lil_matrix((num_rows, num_cols)) # This reduces the memory needed to create a matrix with large dimensions

#     for i in range(0, num_rows, chunk_size):
#         for j in range(0, num_cols, chunk_size):
#             for k in range(0, embedding_length, chunk_size):
#                 u_chunk = U[i:i+chunk_size, k:k+chunk_size]
#                 sigma_chunk = Sigma[k:k+chunk_size]
#                 vt_chunk = VT[k:k+chunk_size, j:j+chunk_size]

#                 pred_matrix[i:i+chunk_size, j:j+chunk_size] += np.einsum("ik,kk,kj->ij", u_chunk, np.diag(sigma_chunk), vt_chunk)
#     return pred_matrix

In [None]:
# sparse_matrix = sp.csr_matrix((merged_df["rating"], 
#                                (merged_df["reviewer_index"], merged_df["product_index"])), 
#                                shape=(len(merged_df["reviewer_index"]), len(merged_df["product_index"])), 
#                                dtype=np.float64)

# embedding_length = 64
# U, Sigma, VT = svds(sparse_matrix, k = embedding_length)

# num_rows, num_cols = sparse_matrix.shape
# chunk_size = 10000

In [None]:
# pred_matrix = svds_dot(U, Sigma, VT, num_rows, num_cols, embedding_length, chunk_size)

In [None]:
# scaler = MinMaxScaler()
# row_indices, column_indices = sparse_matrix.nonzero()
# pred_matrix = scaler.fit_transform(pred_matrix)
# pred_matrix = pd.DataFrame(pred_matrix, index = row_indices, columns = column_indices)

In [None]:
class MatrixFactorization(nn.Module):
        def __init__(self,
                     n_users,
                     n_items,
                     n_factors=40,
                     dropout_p=0.02,
                     sparse=False):
                
                super(MatrixFactorization, self).__init__()
                self.n_users = n_users
                self.n_items = n_items
                self.n_factors = n_factors

                self.user_biases = nn.Embedding(n_users, 1, sparse=sparse)
                self.item_biases = nn.Embedding(n_items, 1, sparse=sparse)
                self.user_embeddings = nn.Embedding(n_users, n_factors, sparse=sparse)
                self.item_embeddings = nn.Embedding(n_items, n_factors, sparse=sparse)

                self.dropout_p = dropout_p
                self.dropout = nn.Dropout(p=self.dropout_p)

                self.sparse = sparse

        def forward(self, users, items):
                user_embedding = self.user_embeddings(users)
                item_embedding = self.item_embeddings(items)
                predictions = self.user_biases(users)
                predictions += self.item_biases(items)
                predictions += (self.dropout(user_embedding) * self.dropout(item_embedding)).sum(dim=1, keepdim=True)
                return predictions
        
        def __call__(self, *args):
                return self.forward(*args)
        
        def predict(self, users, items):
                return self.forward(users, items)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 22)
merged_df 

model = MatrixFactorization(n_users, n_items, n_factors = 64)
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=e26)

In [None]:
# Feed-Forward Network as defined in Zhou Xu 2016
class DenseNet(nn.Module):
    def __init__(self,
                 n_users,
                 n_items,
                 n_factors=64,
                 hidden_dim,
                 dropout_p=20,
                 sparse=False,
                 output_dim):
        
        super(DenseNet, self).__init__()

        self.n_users = n_users
        self.n_items = n_items
        self.n_factors = n_factors

        self.user_biases = nn.Embedding(n_users, 1, sparse=sparse)
        self.item_biases = nn.Embedding(n_items, 1, sparse=sparse)
        self.user_embeddings = nn.Embedding(n_users, n_factors, sparse=sparse)
        self.item_embeddings = nn.Embedding(n_items, n_factors, sparse=sparse)

        self.dropout_p = dropout_p
        self.dropout = nn.Dropout(p=self.dropout_p)
        
        self.sparse=sparse

        # Define the layers
        self.linear1 = nn.Linear(n_factors*2, hidden_dim)
        self.dropout1 = nn.Dropout(0.2)
        self.bn1 = nn.BatchNorm1d(hidden_dim)

        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.dropout2 = nn.Dropout(0.2)
        self.bn2 = nn.BatchNorm1d(hidden_dim)

        self.linear3 = nn.Linear(hidden_dim, hidden_dim)
        self.dropout3 = nn.Dropout(0.2)
        self.bn3 = nn.BatchNorm1d(hidden_dim)

        self.linear4 = nn.Linear(hidden_dim, hidden_dim)
        self.dropout4 = nn.Dropout(0.2)
        self.bn4 = nn.BatchNorm1d(hidden_dim)

        self.linear5 = nn.Linear(hidden_dim, hidden_dim)
        self.dropout5 = nn.Dropout(0.2)
        self.bn5 = nn.BatchNorm1d(hidden_dim)

        self.linear6 = nn.Linear(hidden_dim, output_dim)

    def forward(self, users, items):
        users_embedding = self.user_embeddings(users)
        items_embedding = self.item_embeddings(items)
	
        x = torch.cat([users_embedding, items_embedding], 1) # concatenate user and item embeddings to form input
       
        h1 = self.bn1(self.dropout1(F.relu(self.linear1(x))))  # Layer 1: ReLU(W(1)x + b1)

        h2 = self.bn2(self.dropout2(torch.tanh(self.linear2(h1)))) # Layer 2: tanh(W(2)h(1) + b2)

        h3 = self.bn3(self.dropout3(F.relu(self.linear3(h2)))) # Layer 3: ReLU(W(3)h(2) + b3)

        h4 = self.bn4(self.dropout4(torch.sigmoid(self.linear4(h3)))) # Layer 4: Sigmoid(W(4)h(3) + b4)

        h5 = self.bn5(self.dropout5(F.relu(self.linear5(h4)))) # Layer 5: ReLU(W(5)h(4) + b5)

        output = F.softmax(self.linear6(h5), dim=1) # Output layer: softmax(Uh(5) + b6)

        return output

In [None]:
# LSTM as defined in Zhou Xu 2016
class LSTM_Rating(nn.Module):
    def __init__(self,
                 embedding_dim, 
                 hidden_dim, 
                 n_users, 
                 n_items, 
                 n_factors = 64, 
                 n_output, 
                 sparse):
        
        super(LSTM_Rating, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.n_users = n_users
        self.n_items = n_items
        self.n_factors = n_factors

        self.user_biases = nn.Embedding(n_users, 1, sparse=sparse)
        self.item_biases = nn.Embedding(n_items, 1, sparse=sparse)
        self.user_embeddings = nn.Embedding(n_users, n_factors, sparse=sparse)
        self.item_embeddings = nn.Embedding(n_items, n_factors, sparse=sparse)

        self.sparse = sparse
        # Input gate
        self.Wu = nn.Linear(hidden_dim, hidden_dim)
        self.Iu = nn.Linear(input_size, hidden_dim)

        # Forget gate
        self.Wf = nn.Linear(hidden_dim, hidden_dim)
        self.If = nn.Linear(input_size, hidden_dim)

        # Output gate
        self.Wo = nn.Linear(hidden_dim, hidden_dim)
        self.Io = nn.Linear(input_size, hidden_dim)

        # New memory cell
        self.Wc = nn.Linear(hidden_dim, hidden_dim)
        self.Ic = nn.Linear(input_size, hidden_dim)

    def forward(self, xt, ht_1, ct_1):
    
        gu = torch.sigmoid(self.Wu(ht_1) + self.Iu(xt)) # Input gate

        gf = torch.sigmoid(self.Wf(ht_1) + self.If(xt)) # Forget gate

        go = torch.sigmoid(self.Wo(ht_1) + self.Io(xt)) # Output gate

        gc = torch.tanh(self.Wc(ht_1) + self.Ic(xt)) # New memory cell

        ct = gf * ct_1 + gu * gc  # Final memory cell

        ht = torch.tanh(go * ct) # Final hidden state

        return ht, ct

In [None]:
    U, S, Vh = torch.linalg.svd(A, full_matrices=False)
    x = torch.dist(A, U @ torch.diag(S) @ Vh)