# Recommender System with Deep Learning

## Setup

### Packages & Parameters

In [8]:
import os
import gc
import typing

import urllib.request
import multiprocessing as mp
import concurrent.futures

import gzip
import unicodedata
import math
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from flair.models import TextClassifier
from flair.data import Sentence

from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
from scipy.sparse.linalg import svds
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf


### Globals

In [9]:
MAIN_DIR = os.path.join("D:" + os.sep, "Code", "PYTHON", "Amazon_Recommender_System")

CODE_DIR = os.path.join(MAIN_DIR, "Code")

ANALYSIS_DIR = os.path.join(MAIN_DIR, "Analysis")
DATA_DIR = os.path.join(MAIN_DIR, "Data")

RAW_DATA_DIR = os.path.join(DATA_DIR, "Raw")
CLEAN_DATA_DIR = os.path.join(DATA_DIR, "Clean")

BOOKS_RAW_DIR = os.path.join(RAW_DATA_DIR, "Books")
BOOKS_CLEAN_DIR = os.path.join(CLEAN_DATA_DIR, "Books")
BOOKS_SENTIMENT_DIR = os.path.join(BOOKS_CLEAN_DIR, "Sentiment")

CHUNK_SIZE = 1000000

NUM_CORES = math.ceil(mp.cpu_count()/2)

RANDOM_STATE = 1

### Setup

In [10]:
def create_directory(dir_list: list) -> None:
    for directory in dir_list:
        if not os.path.exists(directory):
            os.makedirs(directory)

def is_directory_empty(dir_path: str) -> bool:
    return len(os.listdir(dir_path)) == 0

def set_random_seed(state=1):
    gens = (np.random.seed, tf.random.set_seed)
    for set_state in gens:
        set_state(state)

In [11]:
create_directory([CODE_DIR, 
                  ANALYSIS_DIR,
                  DATA_DIR, 
                  RAW_DATA_DIR, 
                  CLEAN_DATA_DIR,
                  BOOKS_RAW_DIR,
                  BOOKS_CLEAN_DIR,
                  BOOKS_SENTIMENT_DIR,
                  ])

set_random_seed(RANDOM_STATE)

## Data Processing

### Loading Data In

In [12]:
# More memory efficient version. Takes longer but is far more consistent than the previous version. 
def process_data(url: str, 
                 chunk_size: int, 
                 num_workers: int,
                 output_dir: str) -> None:
    
    with urllib.request.urlopen(url) as response:
        with gzip.open(response, "rt") as gz_file:
            chunk = []
            chunk_count = 0

            with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
                for line in gz_file:
                    chunk.append(line)

                    if len(chunk) == chunk_size:
                        executor.submit(process_chunk, list(chunk), os.path.join(output_dir, f"chunk_{chunk_count}.json"))
                        chunk = []
                        chunk_count += 1

                # Process the remaining lines in the last chunk
                if chunk:
                    executor.submit(process_chunk, list(chunk), os.path.join(output_dir, f"chunk_{chunk_count}.json"))

def process_chunk(chunk: list,
                  filename: str) -> None:
    
    with open(filename, 'w') as file:
        file.writelines(chunk)
    print(f"Processed chunk: {filename}")

In [6]:
'''
----- PROCESS_DATA -----
GAME URL "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Video_Games_5.json.gz"
BOOK URL "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Books_5.json.gz"

EXPERIMENTS FOR 27 MILLION
----- THREAD CSV_FILE -----
1. 100000 chunk -> 22min 26sec

----- THREAD JSON_FILE RUNTIMES-----
1. 100000 chunk -> 20min 55sec
2. 100000 chunk -> 49min 1sec -> No idea why this happened on a fresh start
3. 500000 chunk -> 18min 45sec
4. 1000000 chunk -> 17min 5sec

NEW STABLE IMPLEMENTATION
1. 1000000 chunk -> 22min 22sec
'''
url = "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Books_5.json.gz"
process_data(url, CHUNK_SIZE, NUM_CORES, BOOKS_RAW_DIR)

KeyboardInterrupt: 

### Cleaning Data

In [13]:
def extract_value(dictionary: dict):
    return dictionary.get("Format:") if isinstance(dictionary, dict) else None

def filter_comment_length(reviews: pd.DataFrame, 
                          minimum: int) -> pd.DataFrame:
    reviews = reviews.copy()
    reviews["review_len"] = reviews["review_text"].str.split().str.len()
    reviews = reviews.loc[(reviews["review_len"] > minimum)]
    return reviews

def remove_symbols(reviews: pd.Series) -> pd.Series:
    return reviews.apply(lambda x: re.sub(r"\s+|[^a-zA-Z0-9\s]", "", x)) # Cleans up duplicate space and special characters.

def remove_irrelevant_info(reviews: pd.Series) -> pd.Series:
    return reviews.apply(lambda x: re.sub(r"http\S+|www.\S+|#\S+|<.*?>|\(|\)|\d+", "", x)) # Cleans up URL, hashtags, parenthesis, and numbers.

def reduce_characters(reviews: pd.Series) -> pd.Series:
    return reviews.apply(lambda x: re.sub(r"[^\w\s]|(.)\1+", "", x)) # Remove excessive punctuation and repeated characters

def normalize_encoding(reviews: pd.Series) -> pd.Series:
    return reviews.apply(lambda x: unicodedata.normalize("NFKD", x).encode("ASCII", "ignore").decode("utf-8")) # Encodes and Decodes the data so that we have consistency in text
        
def clean_chunk(df: pd.DataFrame) -> pd.DataFrame:

    df = df.rename(columns={
        "overall": "rating",
        "reviewTime": "review_date",
        "reviewerID": "reviewer_id",
        "asin": "product_id",
        "reviewText": "review_text",
    })

    df["vote"] = df["vote"].fillna(0)
    df["vote"] = pd.to_numeric(df["vote"].astype("str").str.replace(",","")).astype("int32")
    df = df[df["vote"] >= 5]
    df = filter_comment_length(df, 30)
    df.drop(["unixReviewTime", "image", "summary", "reviewerName"], axis=1, inplace=True)
    
    df["review_date"] = pd.to_datetime(df["review_date"], format="%m %d, %Y")
    df["style"] = df["style"].apply(extract_value)

    df["review_text"] = df["review_text"].str.lower()
    df["review_text"] = remove_irrelevant_info(df["review_text"])
    df["review_text"] = reduce_characters(df["review_text"])
    df["review_text"] = normalize_encoding(df["review_text"])
    return df

In [None]:
raw_file_names = [f"chunk_{file_num}.json" for file_num in range(len(os.listdir(BOOKS_RAW_DIR)))]

for raw_file_name in raw_file_names:
    raw_df_list = []
    raw_df = pd.read_json(os.path.join(BOOKS_RAW_DIR, raw_file_name), lines=True)
   
    raw_df = clean_chunk(raw_df)
    print(f"{raw_file_name} DIMENSIONS: {raw_df.shape}")
    clean_dir = os.path.join(BOOKS_CLEAN_DIR, raw_file_name)
    raw_df.to_json(clean_dir, orient="records")

    del raw_df
    gc.collect()

## Sentiment Analysis

In [14]:
def batch_text(column: pd.Series, 
               batch_size: int) -> list:
    
    batches = np.array_split(np.array(column.values), int(np.ceil(len(column)/ batch_size)))
    batches = [batch.tolist() for batch in batches]
    return batches

def load_sentiment_model(model: str):

    return TextClassifier.load(model)

def is_float(value):
      
      try:
        float(value)
        return True
      except ValueError:
        return False

def analyze_sentiment(model, 
                      batch: np.ndarray):
    
    score = []
    for sentence in batch:
        sentence = Sentence(sentence)
        model.predict(sentence, verbose=False)
        try:
            process = re.sub(r"\(|\)","",str(sentence.labels[0]))
            number = [float(s) for s in  process.split() if is_float(s) is True]
            if "POSITIVE" in process:
                score.append(number[0])
            elif "NEGATIVE" in process:
                score.append(-number[0])
        except IndexError:
            print(sentence)
            score.append(np.nan)
    return score
   
def process_batches(models: list[str], 
                    batches: list):
                    
    models_loaded = [load_sentiment_model(model) for model in models]
    print("[Starting process...]")
    sentiment_results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers= NUM_CORES) as executor:
        sentiment_tasks = [executor.submit(analyze_sentiment, model, batch) for model, batch in zip(models_loaded, batches)]
        for completed_task in concurrent.futures.as_completed(sentiment_tasks):
            result = completed_task.result()
            sentiment_results.extend(result)
            print("[Finished with a Batch]")
        return sentiment_results

In [None]:
clean_file_names = [f"chunk_{file_num}.json" for file_num in range(len(os.listdir(BOOKS_CLEAN_DIR)))]

for clean_file_name in clean_file_names[4:5]:
    clean_df = pd.read_json(os.path.join(BOOKS_CLEAN_DIR, clean_file_name), orient="columns")
    batch_size = math.ceil(len(clean_df)/NUM_CORES) 
       
    batches = batch_text(clean_df["review_text"], batch_size)
    clean_df["sentiment_score"] = process_batches(["en-sentiment"] * len(batches), batches)
    sentiment_dir = os.path.join(BOOKS_SENTIMENT_DIR, clean_file_name)
    clean_df.to_json(sentiment_dir, orient="records")

## Neural Network Preprocessing

In [15]:
merged_df = pd.DataFrame()
for filename in os.listdir(BOOKS_SENTIMENT_DIR):
    sentiment_df = pd.read_json(os.path.join(BOOKS_SENTIMENT_DIR, filename), orient="columns")
    sentiment_df.drop(columns=["review_text"], inplace=True)
    merged_df = pd.concat([merged_df, sentiment_df])
    
merged_df.reset_index(drop=True, inplace=True)
merged_df.head()

Unnamed: 0,rating,verified,review_date,reviewer_id,product_id,style,vote,review_len,sentiment_score
0,4,False,1087776000000,A2NJO6YE954DBH,1712799,Hardcover,6,298,0.9999
1,5,False,982972800000,A1K1JW1C5CUSUZ,1712799,Hardcover,23,431,0.9937
2,3,False,1089244800000,A1JS302JFHH9DJ,2006448,Hardcover,9,273,0.9999
3,5,True,1079049600000,A26QTCZG2XR3JH,2006448,Hardcover,15,277,-0.6053
4,5,False,1071360000000,A36X9BU9JB8KCE,2006448,Hardcover,5,495,0.9994


In [16]:
ranges = [(-1.0000, -0.5000), (-0.5000, 0.0000), (0.0000, 0.5000), (0.5000, 1.0000)] # Define the ranges

counts = {f"{r[0]} to {r[1]}": 0 for r in ranges} # Initialize a dictionary to store the counts

# Count the values within each range
for value in merged_df["sentiment_score"]:
    for r in ranges:
        if r[0] <= value < r[1]:
            counts[f"{r[0]} to {r[1]}"] += 1

for r, count in counts.items():
    print(f"Range {r}: {count}")

Range -1.0 to -0.5: 198373
Range -0.5 to 0.0: 4
Range 0.0 to 0.5: 0
Range 0.5 to 1.0: 192154


In [17]:
def quantile_transformation(data: np.ndarray, 
                            type: typing.Literal["uniform","normal"]) -> np.ndarray:
    
    qt = QuantileTransformer(output_distribution=type)
    return  qt.fit_transform(data)

In [18]:
merged_df.dropna(inplace=True)
sentiment_score = np.array(merged_df["sentiment_score"]).reshape(-1,1) # Needs to be 2D to use with QuantileTransformer
merged_df["sentiment_uq"] = quantile_transformation(sentiment_score, "uniform") # Reshape into uniform distribution -> Everything will be equally weighted
merged_df["sentiment_nq"] = quantile_transformation(sentiment_score, "normal") # reshape into normal distribution -> will create bias towards average values -0.5 to 0.5

merged_df["reviewer_index"] = merged_df["reviewer_id"].astype("category").cat.codes
merged_df["product_index"] = merged_df["product_id"].astype("category").cat.codes

In [19]:
def svds_dot(U: np.ndarray, 
             Sigma: np.ndarray, 
             VT: np.ndarray, 
             num_rows: int, 
             num_cols: int, 
             embedding_length: int, 
             chunk_size: int) -> np.ndarray:

    pred_matrix = sp.lil_matrix((num_rows, num_cols)) # This reduces the memory needed to create a matrix with large dimensions

    for i in range(0, num_rows, chunk_size):
        for j in range(0, num_cols, chunk_size):
            for k in range(0, embedding_length, chunk_size):
                u_chunk = U[i:i+chunk_size, k:k+chunk_size]
                sigma_chunk = Sigma[k:k+chunk_size]
                vt_chunk = VT[k:k+chunk_size, j:j+chunk_size]
=
                pred_matrix[i:i+chunk_size, j:j+chunk_size] += np.einsum("ik,kk,kj->ij", u_chunk, np.diag(sigma_chunk), vt_chunk)
    return pred_matrix

In [20]:
sparse_matrix = sp.csr_matrix((merged_df["rating"], 
                               (merged_df["reviewer_index"], merged_df["product_index"])), 
                               shape=(len(merged_df["reviewer_index"]), len(merged_df["product_index"])), 
                               dtype=np.float64)

embedding_length = 64
U, Sigma, VT = svds(sparse_matrix, k = embedding_length)

num_rows, num_cols = sparse_matrix.shape
chunk_size = 10000

In [21]:
pred_matrix = svds_dot(U, Sigma, VT, num_rows, num_cols, embedding_length, chunk_size)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

pred_matrix = scaler.fit_transform(pred_matrix)
pred_matrix = pd.DataFrame(pred_matrix, index = sparse_matrix.index, columns = sparse_matrix.columns)

In [None]:
def find_pivot_value(x):
    val = pred_matrix.loc[x[0],x[1]]
    return val

predicted_df = merged_df.copy()
predicted_df["Pred_Rating"] = predicted_df[['User-ID','Book-ID']].apply(find_pivot_value, axis = 1)

predicted_df.head(10)