# Recommender System with Deep Learning

## Setup

### Packages & Parameters

In [1]:
import os, gzip, sys, gc
import re

import urllib.request
import multiprocessing as mp
import concurrent.futures

import pandas as pd
import numpy as np

import unicodedata

### Globals

In [2]:
MAIN_DIR = os.path.join("D:" + os.sep, "Code", "PYTHON", "Amazon_Recommender_System")

CODE_DIR = os.path.join(MAIN_DIR, "Code")

ANALYSIS_DIR = os.path.join(MAIN_DIR, "Analysis")
DATA_DIR = os.path.join(MAIN_DIR, "Data")

RAW_DATA_DIR = os.path.join(DATA_DIR, "Raw")
CLEAN_DATA_DIR = os.path.join(DATA_DIR, "Clean")

VIDEO_GAME_RAW_DIR = os.path.join(RAW_DATA_DIR, "Video_Game")
VIDEO_GAME_CLEAN_DIR = os.path.join(CLEAN_DATA_DIR, "Video_Game")
BOOKS_RAW_DIR = os.path.join(RAW_DATA_DIR, "Books")
BOOKS_CLEAN_DIR = os.path.join(CLEAN_DATA_DIR, "Books")

### Directory Setup

In [3]:
def create_directory(dir_list: list) -> None:
    for directory in dir_list:
        if not os.path.exists(directory):
            os.makedirs(directory)

def is_directory_empty(dir_path: str) -> bool:
    return len(os.listdir(dir_path)) == 0

In [4]:
create_directory([CODE_DIR, 
                  ANALYSIS_DIR,
                  DATA_DIR, 
                  RAW_DATA_DIR, 
                  CLEAN_DATA_DIR,
                  VIDEO_GAME_RAW_DIR,
                  VIDEO_GAME_CLEAN_DIR,
                  BOOKS_RAW_DIR,
                  BOOKS_CLEAN_DIR])

## Data Processing

### Loading Data In

In [5]:
# More memory efficient version. Takes longer but is far more consistent than the previous version. 
def process_data(url: str, chunk_size: int, num_workers: int, output_dir: str) -> None:
    with urllib.request.urlopen(url) as response:
        with gzip.open(response, "rt") as gz_file:
            chunk = []
            chunk_count = 0

            with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
                for line in gz_file:
                    chunk.append(line)

                    if len(chunk) == chunk_size:
                        executor.submit(process_chunk, list(chunk), os.path.join(output_dir, f"chunk_{chunk_count}.json"))
                        chunk = []
                        chunk_count += 1

                # Process the remaining lines in the last chunk
                if chunk:
                    executor.submit(process_chunk, list(chunk), os.path.join(output_dir, f"chunk_{chunk_count}.json"))

def process_chunk(chunk: list, filename: str) -> None:
    with open(filename, 'w') as file:
        file.writelines(chunk)
    print(f"Processed chunk: {filename}")

In [6]:
'''
----- PROCESS_DATA -----
GAME URL "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Video_Games_5.json.gz"
BOOK URL "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Books_5.json.gz"

EXPERIMENTS FOR 27 MILLION
----- THREAD CSV_FILE -----
1. 100000 chunk -> 22min 26sec

----- THREAD JSON_FILE RUNTIMES-----
1. 100000 chunk -> 20min 55sec
2. 100000 chunk -> 49min 1sec -> No idea why this happened on a fresh start
3. 500000 chunk -> 18min 45sec
4. 1000000 chunk -> 17min 5sec

NEW STABLE IMPLEMENTATION
1. 1000000 chunk -> 22min 22sec
'''
url = "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Books_5.json.gz"

raw_data_directory = BOOKS_RAW_DIR
clean_data_directory = BOOKS_CLEAN_DIR
chunk_size = 1000000
num_cores = int(mp.cpu_count()/2)

In [None]:
process_data(url, chunk_size, num_cores, raw_data_directory)

### Cleaning Data

In [7]:
def extract_value(dictionary: dict):
    return dictionary.get("Format:") if isinstance(dictionary, dict) else None

def filter_comment_length(reviews: pd.DataFrame, minimum: int) -> pd.DataFrame:
    reviews = reviews.copy()
    reviews["review_len"] = reviews["review_text"].str.split().str.len()
    reviews = reviews.loc[(reviews["review_len"] > minimum)]
    return reviews

def remove_irrelevant_info(reviews: pd.Series) -> pd.Series:
    return reviews.apply(lambda x: re.sub(r"http\S+|www.\S+|#\S+|<.*?>", "", x)) # Cleans up URL, hashtags and HTML

def reduce_characters(reviews: pd.Series) -> pd.Series:
    return reviews.apply(lambda x: re.sub(r"[^\w\s]|(.)\1+", "", x)) # Remove excessive punctuation and repeated characters

def normalize_encoding(reviews: pd.Series) -> pd.Series:
    return reviews.apply(lambda x: unicodedata.normalize("NFKD", x).encode("ASCII", "ignore").decode("utf-8")) # Encodes and Decodes the data so that we have consistency in text
        
def clean_chunk(df: pd.DataFrame) -> pd.DataFrame:

    df = df.rename(columns={
        "overall": "rating",
        "reviewTime": "review_date",
        "reviewerID": "reviewer_id",
        "asin": "product_id",
        "reviewText": "review_text",
    })

    df["vote"] = df["vote"].fillna(0)
    df["vote"] = pd.to_numeric(df["vote"].astype("str").str.replace(",","")).astype("int32")
    df = df[df["vote"] >= 5]
    df = filter_comment_length(df, 20)
    df = df.drop(["unixReviewTime", "image", "summary", "reviewerName"], axis=1)
    
    df["review_date"] = pd.to_datetime(df["review_date"], format="%m %d, %Y")
    df["style"] = df["style"].apply(extract_value)

    df["review_text"] = df["review_text"].str.lower()
    df["review_text"] = remove_irrelevant_info(df["review_text"])
    df["review_text"] = reduce_characters(df["review_text"])
    df["review_text"] = normalize_encoding(df["review_text"])
    return df

In [23]:
raw_file_names = [f"chunk_{file_num}.json" for file_num in range(len(os.listdir(raw_data_directory)))]

for raw_file_name in raw_file_names:
    raw_df_list = []
    raw_df = pd.read_json(os.path.join(raw_data_directory, raw_file_name), lines=True)

    raw_df = clean_chunk(raw_df)
    print(f"{raw_file_name} DIMENSIONS: {raw_df.shape}")
    clean_url = os.path.join(clean_data_directory, raw_file_name)
    raw_df.to_json(clean_url, orient="records")

    del raw_df
    gc.collect()

chunk_0.json DIMENSIONS: (103953, 9)
chunk_1.json DIMENSIONS: (87089, 9)
chunk_2.json DIMENSIONS: (111328, 9)
chunk_3.json DIMENSIONS: (98099, 9)
chunk_4.json DIMENSIONS: (75862, 9)
chunk_5.json DIMENSIONS: (100224, 9)
chunk_6.json DIMENSIONS: (80360, 9)
chunk_7.json DIMENSIONS: (111614, 9)
chunk_8.json DIMENSIONS: (115463, 9)
chunk_9.json DIMENSIONS: (98534, 9)
chunk_10.json DIMENSIONS: (147581, 9)
chunk_11.json DIMENSIONS: (92743, 9)
chunk_12.json DIMENSIONS: (74607, 9)
chunk_13.json DIMENSIONS: (87355, 9)
chunk_14.json DIMENSIONS: (48748, 9)
chunk_15.json DIMENSIONS: (51401, 9)
chunk_16.json DIMENSIONS: (42985, 9)
chunk_17.json DIMENSIONS: (44389, 9)
chunk_18.json DIMENSIONS: (124999, 9)
chunk_19.json DIMENSIONS: (89712, 9)
chunk_20.json DIMENSIONS: (91282, 9)
chunk_21.json DIMENSIONS: (38537, 9)
chunk_22.json DIMENSIONS: (46373, 9)
chunk_23.json DIMENSIONS: (55919, 9)
chunk_24.json DIMENSIONS: (65645, 9)
chunk_25.json DIMENSIONS: (29545, 9)
chunk_26.json DIMENSIONS: (43437, 9)
chun

## Sentiment Analysis

In [15]:
from flair.models import TextClassifier
from flair.data import Sentence

def batch_text(column: pd.Series, batch_size: int) -> list:
    batches = np.array_split(column.values, int(np.ceil(len(column)/ batch_size)))
    return batches

def load_sentiment_model(model: str):
    return TextClassifier.load(model)

def analyze_sentiment(model, batch: np.ndarray, mini_batch_size: int):
    processed_batch = [Sentence(sentence) for sentence in batch]
    model.predict(processed_batch, mini_batch_size=mini_batch_size, verbose=False)
    print(processed_batch)
    return processed_batch
    # processed_batch = processed_batch.replace("("," ").replace(")"," ")
    # number = [float(s) for s in score.split() if isfloat(s) is True]
    
    # if "POSITIVE" in score:
    #     return number[0]
    # elif "NEGATIVE" in score:
    #     return -number[0]

def process_batches(models: list, batches: np.ndarray, mini_batch_size: int, num_cores: int):
    models_loaded = [load_sentiment_model(model) for model in models]

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_cores) as executor:
        sentiment_tasks = [executor.submit(analyze_sentiment, model, batch, mini_batch_size) for model, batch in zip(models_loaded, batches)]
        concurrent.futures.wait(sentiment_tasks)
        sentiment_results = [task.result() for task in sentiment_tasks]
        return sentiment_results

clean_file_names = [f"chunk_{file_num}.json" for file_num in range(len(os.listdir(raw_data_directory)))]

for clean_file_name in clean_file_names[0:1]:
    clean_df = pd.read_json(os.path.join(clean_data_directory, clean_file_name), orient="columns")[0:1000]
    batch_size = len(clean_df)/8
    mini_batch_size = 100
    batches = batch_text(clean_df["review_text"], batch_size)

    sentiment_results = process_batches(["en-sentiment"] * len(batches), batches, mini_batch_size, num_cores)
    sentiment_column = pd.concat(sentiment_results)
    clean_df["sentiment_score"] = sentiment_column

125.0


Batch inference:   0%|          | 0/2 [00:00<?, ?it/s]
[A
[A

[A[A

[A[A


[A[A[A


[A[A[A



[A[A[A[A



[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A





Batch inference:  50%|█████     | 1/2 [01:45<01:45, 105.84s/it]





[A[A[A[A[A[A



Batch inference: 100%|██████████| 2/2 [02:01<00:00, 60.52s/it] 


[Sentence[291]: "great day for up is a unique dr seu bk and you can te this just by lking at the coverthat is because while the bk is wrien by dr seu it features the joy drawings of the english artist quentin blakeuntil this point every time i have read a bk wrien by dr seu it was also iustrated by dr seu and when somebody else did the drawings dr seu used the name theo lesieg which is geisel backwardsso the fact that this is a real dr seu bk drawn by somebody else is prey special this bright and early bk provides rhymed text and iustrations introducing the many meanings of the word up as seu and blake show begiing readers that this is a great day for upyou get the point half way through the bk but lile kids should be able to hand on longer especiay when they are reading the bk for themselvesbesides by the end of great day for up we get to the point where everyone on earth is up with one very important and rather ironic exception as with a of the bright and early bks for begiing begier


[A

[A[A





Batch inference: 100%|██████████| 2/2 [02:00<00:00, 60.30s/it]




Batch inference: 100%|██████████| 2/2 [02:03<00:00, 61.64s/it]









[A[A[A[A[A


[A[A[A



Batch inference: 100%|██████████| 2/2 [02:04<00:00, 62.20s/it]





Batch inference: 100%|██████████| 2/2 [02:05<00:00, 62.95s/it]


Batch inference: 100%|██████████| 2/2 [02:05<00:00, 62.91s/it]









Batch inference: 100%|██████████| 2/2 [02:05<00:00, 62.80s/it]







Batch inference: 100%|██████████| 2/2 [02:06<00:00, 63.34s/it]




ValueError: All objects passed were None

In [None]:
# Retrieve the results as they become available
for future in concurrent.futures.as_completed(future_results):
    result = future.result()
    print(result.to_original_text())
    for label in result.labels:
        print(f"- {label.value} ({label.score})")
    print()

In [None]:
'''
Function takes a dataset in and them filters the users and products that are above a threshold
Returns a cleaned dataframe of User & Product IDs with their Ratings
'''
def user_product_threshold_reviews(df, user_threshold, product_threshold):
    product_count = df.groupby(by=["asin"])["overall"].count().reset_index().rename(columns={"overall":"product_count"})
    product_count = product_count.query("product_count >= @product_threshold")
    
    user_count = df.groupby(by=["reviewerID"])["overall"].count().reset_index().rename(columns={"overall":"user_count"})
    user_count = user_count.query("user_count >= @user_threshold")
    
    combined_df = pd.merge(product_count, sa_df, left_on="asin", right_on="asin", how="left").merge(user_count, left_on="reviewerID", right_on="reviewerID", how="inner")
    
    combined_gb = combined_df.groupby(by = ["reviewerID","asin", "reviewText"],as_index=False).mean()

    combined_final = combined_gb.rename(columns = {"reviewerID":"User_ID", "asin": "Product_ID", "overall":"Rating"})
    # scaler = MinMaxScaler()
    # combined_final['Rating'] = combined_final['Rating'].values.astype(float)
    # rating_scaled = pd.DataFrame(scaler.fit_transform(combined_final['Rating'].values.reshape(-1,1)))
    # combined_final['Rating'] = rating_scaled

    return combined_final

sentiment_df = user_product_threshold_reviews(filter_df, 100, 100)
print("Number of Unique Product", sentiment_df["Product_ID"].nunique())
print("Number of Unique Users", sentiment_df["User_ID"].nunique())

In [None]:
%pip install flair
from flair.models import TextClassifier
from flair.data import Sentence
import re
def flair_sentiment(df):
    def isfloat(value):
        try:
            float(value)
            return True
        except ValueError:
            return False
        
        sia = TextClassifier.load("en-sentiment")
        
        def flair_prediction(x):
            sentence = Sentence(x)
            sia.predict(sentence)
            score = str(sentence.labels[0])
            score = score.replace('(',' ').replace(')',' ')
            number = [float(s) for s in score.split() if isfloat(s) is True]
            
            if "POSITIVE" in score:
                return number[0]
            elif "NEGATIVE" in score:
                return -number[0]
        
    df["flair_sentiment"] = df["reviewText"].apply(flair_prediction)
    return df

In [None]:
reviews_list = np.array_split(reviews_filtered,100)

flair_list = []
for i in range (0, len(reviews_list)):
    flair_review = flair_sentiment(reviews_list[i])
    flair_list.append(flair_review)
    
reviews_filtered = pd.concat(flair_list)

In [None]:
reviews_filtered = pd.read_csv('reviews_filtered_100_100.csv')