# Recommender System with Deep Learning

## Setup

### Packages & Parameters

In [1]:
import os
import sys
import gc

import urllib.request
import multiprocessing as mp
import concurrent.futures

import gzip
import unicodedata
import math
import re

import pandas as pd
import numpy as np

from flair.models import TextClassifier
from flair.data import Sentence



### Globals

In [76]:
# Root of the project; every other location below is derived from it.
MAIN_DIR = os.path.join("D:" + os.sep, "Code", "PYTHON", "Amazon_Recommender_System")

# Top-level folders directly under the project root.
CODE_DIR, ANALYSIS_DIR, DATA_DIR = (
    os.path.join(MAIN_DIR, folder) for folder in ("Code", "Analysis", "Data")
)

# Raw and cleaned data sit side by side under the data folder.
RAW_DATA_DIR, CLEAN_DATA_DIR = (
    os.path.join(DATA_DIR, stage) for stage in ("Raw", "Clean")
)

# Book-specific folders; sentiment output is nested inside the cleaned books folder.
BOOKS_RAW_DIR = os.path.join(RAW_DATA_DIR, "Books")
BOOKS_CLEAN_DIR = os.path.join(CLEAN_DATA_DIR, "Books")
BOOKS_SENTIMENT_DIR = os.path.join(BOOKS_CLEAN_DIR, "Sentiment")

### Directory Setup

In [77]:
def create_directory(dir_list: list) -> None:
    """Create every directory in ``dir_list``, including missing parents.

    Directories that already exist are left untouched, so the function can be
    called repeatedly.

    Args:
        dir_list: Paths of the directories to create.

    Raises:
        FileExistsError: If a path already exists but is not a directory.
    """
    for directory in dir_list:
        # exist_ok=True replaces the separate "exists?" check, which could
        # race with another process creating the same directory.
        os.makedirs(directory, exist_ok=True)

def is_directory_empty(dir_path: str) -> bool:
    """Return ``True`` when ``dir_path`` contains no entries at all."""
    entries = os.listdir(dir_path)
    return not entries

In [78]:
# Create every project directory defined in the globals section.
create_directory(
    [
        CODE_DIR,
        ANALYSIS_DIR,
        DATA_DIR,
        RAW_DATA_DIR,
        CLEAN_DATA_DIR,
        BOOKS_RAW_DIR,
        BOOKS_CLEAN_DIR,
        BOOKS_SENTIMENT_DIR,
    ]
)

## Data Processing

### Loading Data In

In [74]:
# More memory efficient version. Takes longer but is far more consistent than the previous version.
def process_data(url: str, chunk_size: int, num_workers: int, output_dir: str) -> None:
    """Stream a gzipped, line-oriented file from ``url`` into chunk files.

    The response is decompressed while it is being read and split into groups
    of ``chunk_size`` lines. Each group is handed to :func:`process_chunk` on
    a thread pool and written to ``output_dir`` as ``chunk_<n>.json``; the
    last file holds whatever lines remain.

    Args:
        url: Location of the gzip-compressed text file.
        chunk_size: Number of lines per output file.
        num_workers: Number of threads used to write chunk files.
        output_dir: Existing directory that receives the chunk files.

    Raises:
        Exception: The first error raised while writing a chunk is re-raised
            after all submitted writes have finished.
    """
    pending = []

    with urllib.request.urlopen(url) as response:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale encoding, which differs between operating systems.
        with gzip.open(response, "rt", encoding="utf-8") as gz_file:
            with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
                chunk = []
                chunk_count = 0

                for line in gz_file:
                    chunk.append(line)

                    if len(chunk) == chunk_size:
                        target = os.path.join(output_dir, f"chunk_{chunk_count}.json")
                        pending.append(executor.submit(process_chunk, chunk, target))
                        # Rebind instead of clearing: the worker now owns the
                        # old list, so no defensive copy is needed.
                        chunk = []
                        chunk_count += 1

                # Process the remaining lines in the last chunk
                if chunk:
                    target = os.path.join(output_dir, f"chunk_{chunk_count}.json")
                    pending.append(executor.submit(process_chunk, chunk, target))

    # A future swallows its exception until result() is called; surface any
    # failed write instead of finishing as if everything had been saved.
    for future in pending:
        future.result()

def process_chunk(chunk: list, filename: str) -> None:
    """Write the lines in ``chunk`` to ``filename`` as UTF-8 and report it.

    Args:
        chunk: Lines to write; each is written as-is, without added newlines.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, "w", encoding="utf-8") as file:
        file.writelines(chunk)
    print(f"Processed chunk: {filename}")

In [75]:
'''
----- PROCESS_DATA -----
GAME URL "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Video_Games_5.json.gz"
BOOK URL "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Books_5.json.gz"

EXPERIMENTS FOR 27 MILLION
----- THREAD CSV_FILE -----
1. 100000 chunk -> 22min 26sec

----- THREAD JSON_FILE RUNTIMES-----
1. 100000 chunk -> 20min 55sec
2. 100000 chunk -> 49min 1sec -> No idea why this happened on a fresh start
3. 500000 chunk -> 18min 45sec
4. 1000000 chunk -> 17min 5sec

NEW STABLE IMPLEMENTATION
1. 1000000 chunk -> 22min 22sec
'''
# Source archive that gets downloaded and split into chunks.
url = "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Books_5.json.gz"

# Directories used by the download, cleaning and sentiment steps.
raw_data_directory = BOOKS_RAW_DIR
clean_data_directory = BOOKS_CLEAN_DIR
sentiment_data_directory = BOOKS_SENTIMENT_DIR

chunk_size = 1000000
# Use half of the reported CPUs, but never fewer than one: a value of 0 is
# rejected by ThreadPoolExecutor and would also cause a division by zero
# where the batch size is derived from num_cores.
num_cores = max(1, mp.cpu_count() // 2)

In [None]:
# Download the archive and split it into chunk files in the raw data directory.
process_data(
    url=url,
    chunk_size=chunk_size,
    num_workers=num_cores,
    output_dir=raw_data_directory,
)

### Cleaning Data

In [67]:
def extract_value(dictionary: dict):
    """Return the value stored under ``"Format:"``.

    Anything that is not a ``dict`` yields ``None``, as does a dict without
    that key.
    """
    if not isinstance(dictionary, dict):
        return None
    return dictionary.get("Format:")

def filter_comment_length(reviews: pd.DataFrame, minimum: int) -> pd.DataFrame:
    """Keep only reviews whose text has more than ``minimum`` words.

    A ``review_len`` column holding the whitespace-separated word count of
    ``review_text`` is added to the returned frame; the input frame is not
    modified.

    Args:
        reviews: Frame with a ``review_text`` column.
        minimum: Word count a review must exceed to be kept.

    Returns:
        A new frame with the ``review_len`` column and the short rows removed.
    """
    word_counts = reviews["review_text"].str.split().str.len()
    annotated = reviews.assign(review_len=word_counts)
    long_enough = annotated["review_len"] > minimum
    return annotated.loc[long_enough]

# Matches URLs, "www." addresses, hashtags, HTML tags, parentheses and digit runs.
# The dot after "www" is escaped so that ordinary text such as "awwww!!" is
# no longer mistaken for a web address.
_IRRELEVANT_INFO_PATTERN = re.compile(r"http\S+|www\.\S+|#\S+|<.*?>|\(|\)|\d+")

def remove_irrelevant_info(reviews: pd.Series) -> pd.Series:
    """Strip URLs, hashtags, HTML tags, parentheses and numbers from each review.

    Args:
        reviews: Series of review strings.

    Returns:
        A new series with every match of the pattern removed. Surrounding
        whitespace is left as it is, so removed tokens can leave double spaces.
    """
    return reviews.apply(lambda text: _IRRELEVANT_INFO_PATTERN.sub("", text))

# First alternative: any punctuation character. Second alternative: a word
# character repeated three or more times in a row (group 1 is that character).
_REDUCE_PATTERN = re.compile(r"[^\w\s]|(\w)\1{2,}")

def _shorten_match(match):
    """Drop punctuation; shrink an elongated character run to two characters."""
    repeated = match.group(1)
    return repeated * 2 if repeated else ""

def reduce_characters(reviews: pd.Series) -> pd.Series:
    """Remove punctuation and tame elongated character runs in each review.

    Previously every run of two or more identical characters was deleted
    outright, which mangled ordinary words ("good" -> "gd") and glued words
    together wherever two spaces met. Only runs of three or more identical
    word characters are now treated as elongation and shortened to two
    ("soooo" -> "soo"); whitespace is left alone.

    Args:
        reviews: Series of review strings.

    Returns:
        A new series with the cleaned strings.
    """
    return reviews.apply(lambda text: _REDUCE_PATTERN.sub(_shorten_match, text))

def normalize_encoding(reviews: pd.Series) -> pd.Series:
    """Reduce each review to plain ASCII text.

    Every string is NFKD-normalised, which splits accented characters into a
    base character plus combining marks, and then everything that cannot be
    represented in ASCII is dropped.
    """
    def to_ascii(text):
        decomposed = unicodedata.normalize("NFKD", text)
        ascii_bytes = decomposed.encode("ASCII", "ignore")
        return ascii_bytes.decode("utf-8")

    return reviews.apply(to_ascii)
        
def clean_chunk(df: pd.DataFrame) -> pd.DataFrame:
    """Clean one raw chunk of reviews and return the filtered result.

    Steps, in order: rename columns to snake_case, convert ``vote`` to an
    integer, keep reviews with at least 5 votes and more than 20 words, drop
    unused columns, parse the review date, reduce ``style`` to its
    ``"Format:"`` entry, and normalise the review text.

    Args:
        df: Raw reviews as loaded from a chunk file.

    Returns:
        A new frame containing only the retained, cleaned reviews.
    """
    column_names = {
        "overall": "rating",
        "reviewTime": "review_date",
        "reviewerID": "reviewer_id",
        "asin": "product_id",
        "reviewText": "review_text",
    }
    unused_columns = ["unixReviewTime", "image", "summary", "reviewerName"]
    text_cleaners = (remove_irrelevant_info, reduce_characters, normalize_encoding)

    cleaned = df.rename(columns=column_names)

    # Missing votes count as 0; commas are stripped before the numeric
    # conversion (presumably thousands separators such as "1,234").
    vote_text = cleaned["vote"].fillna(0).astype("str").str.replace(",", "")
    cleaned["vote"] = pd.to_numeric(vote_text).astype("int32")

    cleaned = cleaned[cleaned["vote"] >= 5]
    cleaned = filter_comment_length(cleaned, 20)
    cleaned = cleaned.drop(columns=unused_columns)

    cleaned["review_date"] = pd.to_datetime(cleaned["review_date"], format="%m %d, %Y")
    cleaned["style"] = cleaned["style"].apply(extract_value)

    # Lower-case first, then run the text cleaners in their fixed order.
    review_text = cleaned["review_text"].str.lower()
    for cleaner in text_cleaners:
        review_text = cleaner(review_text)
    cleaned["review_text"] = review_text
    return cleaned

In [68]:
# List the chunk files that actually exist, in numeric order, instead of
# assuming the directory holds exactly chunk_0..chunk_{n-1} and nothing else.
raw_file_names = sorted(
    (name for name in os.listdir(raw_data_directory) if re.fullmatch(r"chunk_\d+\.json", name)),
    key=lambda name: int(re.search(r"\d+", name).group()),
)

for raw_file_name in raw_file_names:
    raw_df = pd.read_json(os.path.join(raw_data_directory, raw_file_name), lines=True)

    raw_df = clean_chunk(raw_df)
    print(f"{raw_file_name} DIMENSIONS: {raw_df.shape}")
    # Despite its name this is the path of the output file; it keeps the
    # chunk's file name inside the clean data directory.
    clean_dir = os.path.join(clean_data_directory, raw_file_name)
    raw_df.to_json(clean_dir, orient="records")

    # Free the chunk before the next one is loaded to limit peak memory use.
    del raw_df
    gc.collect()

chunk_0.json DIMENSIONS: (103953, 9)
chunk_1.json DIMENSIONS: (87089, 9)
chunk_2.json DIMENSIONS: (111328, 9)
chunk_3.json DIMENSIONS: (98099, 9)
chunk_4.json DIMENSIONS: (75862, 9)
chunk_5.json DIMENSIONS: (100224, 9)
chunk_6.json DIMENSIONS: (80360, 9)
chunk_7.json DIMENSIONS: (111614, 9)
chunk_8.json DIMENSIONS: (115463, 9)
chunk_9.json DIMENSIONS: (98534, 9)
chunk_10.json DIMENSIONS: (147581, 9)
chunk_11.json DIMENSIONS: (92743, 9)
chunk_12.json DIMENSIONS: (74607, 9)
chunk_13.json DIMENSIONS: (87355, 9)
chunk_14.json DIMENSIONS: (48748, 9)
chunk_15.json DIMENSIONS: (51401, 9)
chunk_16.json DIMENSIONS: (42985, 9)
chunk_17.json DIMENSIONS: (44389, 9)
chunk_18.json DIMENSIONS: (124999, 9)
chunk_19.json DIMENSIONS: (89712, 9)
chunk_20.json DIMENSIONS: (91282, 9)
chunk_21.json DIMENSIONS: (38537, 9)
chunk_22.json DIMENSIONS: (46373, 9)
chunk_23.json DIMENSIONS: (55919, 9)
chunk_24.json DIMENSIONS: (65645, 9)
chunk_25.json DIMENSIONS: (29545, 9)
chunk_26.json DIMENSIONS: (43437, 9)
chun

## Sentiment Analysis

In [80]:
def batch_text(column: pd.Series, batch_size: int) -> list:
    """Split ``column`` into roughly equal batches of at most ``batch_size`` items.

    The number of batches is ``ceil(len(column) / batch_size)`` and the items
    are spread as evenly as possible over them, so batches may be smaller
    than ``batch_size``.

    Args:
        column: Values to split.
        batch_size: Upper bound for the number of items per batch.

    Returns:
        A list of lists holding the values in their original order; an empty
        list when ``column`` is empty.
    """
    # np.array_split rejects a request for zero sections, so handle the
    # empty column before computing the batch count.
    if len(column) == 0:
        return []

    num_batches = int(np.ceil(len(column) / batch_size))
    return [batch.tolist() for batch in np.array_split(column.to_numpy(), num_batches)]

def load_sentiment_model(model: str):
    """Load and return the text classifier identified by ``model``.

    Args:
        model: Identifier passed straight to ``TextClassifier.load``.
    """
    classifier = TextClassifier.load(model)
    return classifier

def is_float(value):
    """Return ``True`` if ``float(value)`` succeeds, otherwise ``False``.

    Values of an unsupported type (such as ``None`` or a list) are reported
    as ``False`` instead of raising ``TypeError``.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True
      
def analyze_sentiment(model, batch: np.ndarray):
    """Score every text in ``batch`` with ``model`` and return signed scores.

    The first label predicted for a text determines its score: the label's
    confidence for ``POSITIVE``, the negated confidence for ``NEGATIVE``.
    Texts without a label, or with any other label, get ``NaN`` so that the
    result always has exactly one entry per input and stays aligned with the
    rows it was computed from.

    Args:
        model: Classifier exposing ``predict(sentence, verbose=...)``.
        batch: Iterable of review strings.

    Returns:
        A list of floats, one per element of ``batch``.
    """
    scores = []
    for text in batch:
        sentence = Sentence(text)
        model.predict(sentence, verbose=True)

        signed_score = float("nan")
        if sentence.labels:
            # Read the label's value and score directly instead of parsing
            # its string form, where tokens such as "nan" or "inf" inside the
            # text could be mistaken for the score.
            label = sentence.labels[0]
            if label.value == "POSITIVE":
                signed_score = label.score
            elif label.value == "NEGATIVE":
                signed_score = -label.score
        scores.append(signed_score)
    return scores
   
def process_batches(models: list, batches: list):
    """Run sentiment analysis on ``batches`` in parallel, one model per batch.

    Args:
        models: Model identifiers; entry ``i`` is loaded and used for batch ``i``.
        batches: Batches of review strings.

    Returns:
        The scores of all batches concatenated in batch order.

    Raises:
        ValueError: If ``models`` and ``batches`` differ in length. Pairing
            them with ``zip`` would otherwise silently drop the surplus and
            return fewer scores than there are reviews.
    """
    if len(models) != len(batches):
        raise ValueError(
            f"Expected one model per batch, got {len(models)} models and {len(batches)} batches."
        )

    models_loaded = [load_sentiment_model(model) for model in models]
    sentiment_results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        sentiment_tasks = [
            executor.submit(analyze_sentiment, model, batch)
            for model, batch in zip(models_loaded, batches)
        ]
        # result() blocks until the task is done and re-raises its exception,
        # so iterating in submission order keeps the scores in batch order.
        for task in sentiment_tasks:
            sentiment_results.extend(task.result())
    return sentiment_results

In [None]:
# List the cleaned chunk files that actually exist, in numeric order. The
# name filter skips everything else in the directory, such as sub-directories.
clean_file_names = sorted(
    (name for name in os.listdir(clean_data_directory) if re.fullmatch(r"chunk_\d+\.json", name)),
    key=lambda name: int(re.search(r"\d+", name).group()),
)

# NOTE: the [0:1] slice limits the run to the first chunk only.
for clean_file_name in clean_file_names[0:1]:
    # Cleaned chunks are written with orient="records", so read them back the same way.
    clean_df = pd.read_json(os.path.join(clean_data_directory, clean_file_name), orient="records")
    # One batch per worker: spread the rows evenly over num_cores batches.
    batch_size = math.ceil(len(clean_df)/num_cores)
    batches = batch_text(clean_df["review_text"], batch_size)
    clean_df["sentiment_score"] = process_batches(["en-sentiment"] * len(batches), batches)

    # Despite its name this is the path of the output file inside the sentiment directory.
    sentiment_dir = os.path.join(sentiment_data_directory, clean_file_name)
    clean_df.to_json(sentiment_dir, orient="records")