# Recommender System with Deep Learning

## Setup

### Packages & Parameters

In [2]:
import os, gzip, sys, gc
import re

import urllib.request
import multiprocessing as mp
import concurrent.futures

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

### Globals

In [3]:
# Root of the project tree; every other path below is derived from it.
MAIN_DIR = os.path.join("D:" + os.sep, "Code", "PYTHON", "Amazon_Recommender_System")

# First-level folders directly under the project root.
CODE_DIR, ANALYSIS_DIR, DATA_DIR = (
    os.path.join(MAIN_DIR, folder) for folder in ("Code", "Analysis", "Data")
)

# Data is split into a "Raw" and a "Clean" folder under DATA_DIR.
RAW_DATA_DIR, CLEAN_DATA_DIR = (
    os.path.join(DATA_DIR, stage) for stage in ("Raw", "Clean")
)

# One folder per dataset category inside each of the two data stages.
VIDEO_GAME_RAW_DIR, VIDEO_GAME_CLEAN_DIR = (
    os.path.join(stage_dir, "Video_Game") for stage_dir in (RAW_DATA_DIR, CLEAN_DATA_DIR)
)
BOOKS_RAW_DIR, BOOKS_CLEAN_DIR = (
    os.path.join(stage_dir, "Books") for stage_dir in (RAW_DATA_DIR, CLEAN_DATA_DIR)
)

### Directory Setup

In [4]:
def create_directory(dir_list: list) -> None:
    """Create every directory in ``dir_list``, including missing parents.

    Directories that already exist are left untouched. ``exist_ok=True`` is
    used instead of a separate existence check so that a directory created by
    someone else between the check and the creation cannot cause a failure.

    Args:
        dir_list: Paths of the directories to create.

    Raises:
        FileExistsError: If one of the paths already exists but is not a
            directory.
    """
    for directory in dir_list:
        os.makedirs(directory, exist_ok=True)

def is_directory_empty(dir_path: str) -> bool:
    """Return True when the directory at ``dir_path`` has no entries."""
    entries = os.listdir(dir_path)
    return not entries

In [5]:
# Build the project folder tree in one call.
project_directories = [
    CODE_DIR,
    ANALYSIS_DIR,
    DATA_DIR,
    RAW_DATA_DIR,
    CLEAN_DATA_DIR,
    VIDEO_GAME_RAW_DIR,
    VIDEO_GAME_CLEAN_DIR,
    BOOKS_RAW_DIR,
    BOOKS_CLEAN_DIR,
]
create_directory(project_directories)

## Data Processing

### Loading Data In

In [6]:
# More memory efficient version. Takes longer but is far more consistent than the previous version. 
def _iter_chunks(lines, chunk_size: int):
    """Yield consecutive lists of ``chunk_size`` lines, then any shorter remainder."""
    chunk = []
    for line in lines:
        chunk.append(line)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk


def _reraise_finished(pending: set, return_when: str) -> set:
    """Wait on ``pending`` futures, re-raise any worker error, return those still running."""
    done, not_done = concurrent.futures.wait(pending, return_when=return_when)
    for future in done:
        # result() re-raises whatever the worker raised, so failures are not lost.
        future.result()
    return not_done


def process_data(url: str, chunk_size: int, num_workers: int, output_dir: str) -> None:
    """Stream a gzip-compressed text file from ``url`` and split it into chunk files.

    The response is decompressed and read line by line. Every ``chunk_size``
    lines are handed to ``process_chunk`` on a worker thread, which writes them
    to ``output_dir`` as ``chunk_<n>.json`` with ``n`` counting up from 0. Lines
    left over at the end go into one final, shorter chunk.

    At most ``num_workers`` chunk writes are outstanding at any time; reading
    pauses until one of them finishes, which bounds how many chunks are held
    in memory.

    Args:
        url: Location of the gzip-compressed file to download.
        chunk_size: Number of lines per output file.
        num_workers: Size of the thread pool used for writing chunks.
        output_dir: Existing directory that receives the chunk files.

    Raises:
        Exception: Any error raised while writing a chunk (for example an
            ``OSError`` for a missing ``output_dir``) is re-raised here instead
            of being silently dropped with its future.
    """
    with urllib.request.urlopen(url) as response:
        # Decode explicitly as UTF-8 rather than with the platform default.
        with gzip.open(response, "rt", encoding="utf-8") as gz_file:
            with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
                pending = set()

                for chunk_count, chunk in enumerate(_iter_chunks(gz_file, chunk_size)):
                    if len(pending) >= num_workers:
                        # All workers are busy: wait for one to finish first.
                        pending = _reraise_finished(pending, concurrent.futures.FIRST_COMPLETED)

                    filename = os.path.join(output_dir, f"chunk_{chunk_count}.json")
                    pending.add(executor.submit(process_chunk, chunk, filename))

                # Surface errors from the writes that are still in flight.
                _reraise_finished(pending, concurrent.futures.ALL_COMPLETED)


def process_chunk(chunk: list, filename: str) -> None:
    """Write the lines in ``chunk`` to ``filename`` as UTF-8 and print a progress message.

    Args:
        chunk: Text lines to write; each is written as-is, so it must carry
            its own line ending.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, "w", encoding="utf-8") as file:
        file.writelines(chunk)
    print(f"Processed chunk: {filename}")

In [7]:
'''
----- PROCESS_DATA -----
GAME URL "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Video_Games_5.json.gz"
BOOK URL "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Books_5.json.gz"

EXPERIMENTS FOR 27 MILLION
----- THREAD CSV_FILE -----
1. 100000 chunk -> 22min 26sec

----- THREAD JSON_FILE RUNTIMES-----
1. 100000 chunk -> 20min 55sec
2. 100000 chunk -> 49min 1sec -> No idea why this happened on a fresh start
3. 500000 chunk -> 18min 45sec
4. 1000000 chunk -> 17min 5sec

NEW STABLE IMPLEMENTATION
1. 1000000 chunk -> 22min 22sec
'''
url = "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Books_5.json.gz"
url_data_directory = BOOKS_RAW_DIR
chunk_size = 1000000
# Use half of the reported CPUs, but never fewer than one worker: a value of
# zero (single-CPU machine) is rejected by ThreadPoolExecutor.
num_cores = max(1, mp.cpu_count() // 2)

In [None]:
# Download the configured dataset and split it into chunk files.
process_data(
    url=url,
    chunk_size=chunk_size,
    num_workers=num_cores,
    output_dir=url_data_directory,
)

### Cleaning Data

In [30]:
def extract_value(dictionary: dict):
    """Return the value stored under the "Format:" key, or None.

    None is returned both when ``dictionary`` is not a dict and when the key
    is absent.
    """
    if not isinstance(dictionary, dict):
        return None
    return dictionary.get("Format:")

def filter_comment_length(reviews: pd.DataFrame, minimum: int) -> pd.DataFrame:
    """Keep only reviews whose text has more than ``minimum`` words.

    Words are counted by splitting ``review_text`` on whitespace. The count is
    stored in a ``review_len`` column of the returned frame; the input frame
    is not modified.
    """
    word_counts = reviews["review_text"].str.split().str.len()
    annotated = reviews.assign(review_len=word_counts)
    long_enough = annotated["review_len"] > minimum
    return annotated[long_enough]

def remove_non_text(reviews: pd.Series) -> pd.Series:
    """Strip every character that is not an ASCII letter or whitespace from each review."""
    non_letters = re.compile(r"[^a-zA-Z\s]")

    def strip_non_letters(text):
        return non_letters.sub("", text)

    return reviews.apply(strip_non_letters)

def tokenize_text(reviews: pd.Series) -> pd.Series:
    """Turn each review string into the token list produced by ``nltk.word_tokenize``."""
    def to_tokens(text):
        return nltk.word_tokenize(text)

    return reviews.apply(to_tokens)

# English stop-word list from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words("english"))
def remove_stopwords(reviews: pd.Series) -> pd.Series:
    """Drop every token found in ``stop_words`` from each token list.

    The comparison is an exact match, so token case matters.
    """
    def keep_content_words(tokens):
        return [word for word in tokens if word not in stop_words]

    return reviews.apply(keep_content_words)

# Single shared Porter stemmer used for every review.
stemmer = PorterStemmer()
def stemming(reviews: pd.Series) -> pd.Series:
    """Replace every token in each token list with its Porter stem."""
    def stem_tokens(tokens):
        return list(map(stemmer.stem, tokens))

    return reviews.apply(stem_tokens)
        
def clean_chunk(df: pd.DataFrame, min_votes: int = 5, min_words: int = 100) -> pd.DataFrame:
    """Filter one chunk of reviews and normalise its columns and review text.

    Steps, in order:
      1. Convert ``vote`` to int32 (missing votes count as 0, thousands
         separators are removed) and keep rows with more than ``min_votes``.
      2. Keep rows whose ``review_text`` has more than ``min_words`` words
         (adds a ``review_len`` column).
      3. Drop the ``unixReviewTime``, ``image``, ``summary`` and
         ``reviewerName`` columns when present.
      4. Parse ``review_date`` with the format ``"%m %d, %Y"`` and reduce
         ``style`` to its "Format:" entry.
      5. Lower-case, strip non-letters, tokenize, remove stop words and stem
         ``review_text``, leaving a list of tokens per review.

    The input frame is not modified; a new frame is returned.

    Args:
        df: Reviews with at least the ``vote``, ``review_text``,
            ``review_date`` and ``style`` columns.
        min_votes: Rows need strictly more votes than this to be kept.
        min_words: Rows need strictly more words than this to be kept.

    Returns:
        The filtered and cleaned reviews.
    """
    # Build the numeric vote column separately so the caller's frame is left alone.
    votes = pd.to_numeric(
        df["vote"].fillna(0).astype("str").str.replace(",", "", regex=False)
    ).astype("int32")
    enough_votes = votes > min_votes

    # Copy only the surviving rows, then overwrite their vote values in place.
    df = df.loc[enough_votes].copy()
    df["vote"] = votes.loc[enough_votes]

    df = filter_comment_length(df, min_words)

    # errors="ignore": a chunk may lack one of these columns entirely.
    df = df.drop(
        columns=["unixReviewTime", "image", "summary", "reviewerName"],
        errors="ignore",
    )

    df["review_date"] = pd.to_datetime(df["review_date"], format="%m %d, %Y")
    df["style"] = df["style"].apply(extract_value)

    # Text pipeline; each step consumes the previous step's output.
    text = df["review_text"].str.lower()
    for step in (remove_non_text, tokenize_text, remove_stopwords, stemming):
        text = step(text)
    df["review_text"] = text
    return df

In [28]:
# Clean every raw chunk file and write the result to the matching "Clean" path.
# Only files actually named chunk_<n>.json are picked up, in numeric order, so
# stray files or gaps in the numbering do not produce missing-file errors.
chunk_name_pattern = re.compile(r"chunk_(\d+)\.json")
file_names = sorted(
    (name for name in os.listdir(url_data_directory) if chunk_name_pattern.fullmatch(name)),
    key=lambda name: int(chunk_name_pattern.fullmatch(name).group(1)),
)

# Raw dataset column names mapped to the names used by the cleaning code.
column_names = {
    "overall": "rating",
    "reviewTime": "review_date",
    "reviewerID": "reviewer_id",
    "asin": "product_id",
    "reviewText": "review_text",
}

for file_name in file_names:
    raw_path = os.path.join(url_data_directory, file_name)
    raw_df = pd.read_json(raw_path, lines=True)
    raw_df = raw_df.rename(columns=column_names)

    raw_df = clean_chunk(raw_df)
    print(f"{file_name} DIMENSIONS: {raw_df.shape}")

    # The clean file mirrors the raw path with "Raw" swapped for "Clean".
    clean_url = raw_path.replace("Raw", "Clean")
    raw_df.to_json(clean_url, orient="records")

    # Release the chunk before loading the next one to keep peak memory down.
    del raw_df
    gc.collect()

chunk_23.json DIMENSIONS: (36238, 9)
chunk_24.json DIMENSIONS: (40533, 9)
chunk_25.json DIMENSIONS: (16429, 9)
chunk_26.json DIMENSIONS: (25489, 9)
chunk_27.json DIMENSIONS: (844, 9)


## Sentiment Analysis

In [None]:
# Work on an independent copy holding only the four columns used below.
# NOTE(review): `df` is not assigned anywhere in the visible cells - confirm
# where it is loaded before running this one.
filter_df = df.loc[:, ["reviewerID", "asin", "reviewText", "overall"]].copy()
filter_df.head()

In [None]:
def user_product_threshold_reviews(df, user_threshold, product_threshold):
    """Keep reviews whose product and user both reach a minimum review count.

    Review counts are taken over the whole of ``df``. Reviews are then grouped
    by user, product and review text, and the numeric columns of each group
    are averaged.

    Args:
        df: Reviews with ``reviewerID``, ``asin``, ``reviewText`` and
            ``overall`` columns.
        user_threshold: Minimum number of reviews a user must have written.
        product_threshold: Minimum number of reviews a product must have.

    Returns:
        A DataFrame with ``User_ID``, ``Product_ID``, ``reviewText``,
        ``product_count``, ``Rating`` and ``user_count`` columns. Ratings are
        returned unscaled.
    """
    product_count = df.groupby("asin")["overall"].count().reset_index(name="product_count")
    product_count = product_count[product_count["product_count"] >= product_threshold]

    user_count = df.groupby("reviewerID")["overall"].count().reset_index(name="user_count")
    user_count = user_count[user_count["user_count"] >= user_threshold]

    # Join against the frame that was passed in, not a notebook-level global.
    combined_df = (
        product_count.merge(df, on="asin", how="left")
        .merge(user_count, on="reviewerID", how="inner")
    )

    combined_gb = combined_df.groupby(
        by=["reviewerID", "asin", "reviewText"], as_index=False
    ).mean(numeric_only=True)

    return combined_gb.rename(
        columns={"reviewerID": "User_ID", "asin": "Product_ID", "overall": "Rating"}
    )

# Keep only users and products that each have at least 100 reviews.
sentiment_df = user_product_threshold_reviews(
    filter_df, user_threshold=100, product_threshold=100
)
unique_products = sentiment_df["Product_ID"].nunique()
unique_users = sentiment_df["User_ID"].nunique()
print("Number of Unique Product", unique_products)
print("Number of Unique Users", unique_users)

In [None]:
%pip install flair
from flair.models import TextClassifier
from flair.data import Sentence
import re
# Cached classifier so the model is loaded once, not on every call.
_FLAIR_CLASSIFIER = None


def _get_flair_classifier():
    """Return the "en-sentiment" Flair classifier, loading it on first use."""
    global _FLAIR_CLASSIFIER
    if _FLAIR_CLASSIFIER is None:
        _FLAIR_CLASSIFIER = TextClassifier.load("en-sentiment")
    return _FLAIR_CLASSIFIER


def flair_sentiment(df):
    """Add a signed Flair sentiment score for every review in ``df``.

    Each ``reviewText`` value is classified with the "en-sentiment" model. The
    score is the label's confidence, positive for a POSITIVE label and negated
    for a NEGATIVE one. Rows that cannot be scored (non-string or blank text,
    no label, or any other label) get None.

    Args:
        df: DataFrame with a ``reviewText`` column. It is modified in place by
            adding a ``flair_sentiment`` column.

    Returns:
        The same DataFrame, for convenience.
    """
    sia = _get_flair_classifier()

    def flair_prediction(x):
        if not isinstance(x, str) or not x.strip():
            return None

        sentence = Sentence(x)
        sia.predict(sentence)
        if not sentence.labels:
            return None

        # Read the label fields directly instead of parsing str(label), whose
        # text can also contain the review itself.
        label = sentence.labels[0]
        if label.value == "POSITIVE":
            return label.score
        if label.value == "NEGATIVE":
            return -label.score
        return None

    df["flair_sentiment"] = df["reviewText"].apply(flair_prediction)
    return df

In [None]:
# Score the reviews in 100 batches, then stitch the batches back together.
# NOTE(review): `reviews_filtered` is read here but only assigned by the
# read_csv cell further down - confirm the intended cell order.
reviews_list = np.array_split(reviews_filtered, 100)
flair_list = [flair_sentiment(review_batch) for review_batch in reviews_list]
reviews_filtered = pd.concat(flair_list)

In [None]:
# Load the saved reviews table from the current working directory.
reviews_filtered = pd.read_csv(
    filepath_or_buffer="reviews_filtered_100_100.csv",
)