In [1]:
import gzip
import json
from collections import defaultdict
import math
import numpy as np
import random
from sklearn import linear_model
import tensorflow as tf
import scipy
from transformers import BertTokenizer, BertModel
import torch
from datetime import datetime
from sklearn.metrics import roc_auc_score

2024-12-01 03:11:25.380239: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-01 03:11:25.423468: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-01 03:11:25.423503: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-01 03:11:25.424741: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-01 03:11:25.432828: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Path to the .json.gz file
file_path = 'goodreads_reviews_spoiler.json.gz'
# Upper bound on the number of lines read from the file.
max_lines = 12000

ct = 0      # number of lines read so far (stays 0 for an empty file)
data = []   # successfully parsed reviews, in file order

# Open and read the file line by line; each line is a separate JSON object.
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    for ct, line in enumerate(f, start=1):
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Error decoding line: {e}")
        # The limit check lives outside the try block so that a malformed
        # line landing exactly on the limit cannot skip the break.
        if ct == max_lines:
            break

# Example: Print the first parsed review
if data:
    print(data[0])
else:
    print("No data was parsed.")

{'user_id': '8842281e1d1347389f2ab93d60773d4d', 'timestamp': '2017-08-30', 'review_sentences': [[0, 'This is a special book.'], [0, 'It started slow for about the first third, then in the middle third it started to get interesting, then the last third blew my mind.'], [0, 'This is what I love about good science fiction - it pushes your thinking about where things can go.'], [0, "It is a 2015 Hugo winner, and translated from its original Chinese, which made it interesting in just a different way from most things I've read."], [0, 'For instance the intermixing of Chinese revolutionary history - how they kept accusing people of being "reactionaries", etc.'], [0, 'It is a book about science, and aliens.'], [0, 'The science described in the book is impressive - its a book grounded in physics and pretty accurate as far as I could tell.'], [1, 'Though when it got to folding protons into 8 dimensions I think he was just making stuff up - interesting to think about though.'], [1, 'But what woul

In [3]:
train_size = 10000
test_size = 1000

# Number of sentences drawn from each class (non-spoiler / spoiler) per review.
sentences_per_class = 2

# Training set: only reviews flagged as containing spoilers, each reduced to a
# small sample of its non-spoiler and spoiler sentences.
ratingsTrain = []
for review in data[:train_size]:
    if not review['has_spoiler']:
        continue

    non_spoilers = [sentence for sentence in review['review_sentences'] if sentence[0] == 0]
    spoilers = [sentence for sentence in review['review_sentences'] if sentence[0] == 1]

    # random.choices samples WITH replacement, so a class with a single
    # sentence still contributes `sentences_per_class` (repeated) entries.
    new_sentences = []
    for pool in (non_spoilers, spoilers):
        if pool:
            new_sentences += random.choices(pool, k=sentences_per_class)

    # Work on a shallow copy: assigning the sampled sentences to the original
    # dict would overwrite the record inside `data`, corrupting any statistic
    # later derived from `data` and making this cell non-idempotent.
    train_review = dict(review)
    train_review['review_sentences'] = new_sentences
    ratingsTrain.append(train_review)

# Held-out reviews keep all of their sentences.
ratingsTest = data[train_size:train_size + test_size]
ratingsTrain[0]

{'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'timestamp': '2017-08-30',
 'review_sentences': [[0, 'It is a book about science, and aliens.'],
  [0, 'This is a special book.'],
  [1,
   "It was a smart way to build empathy with them and also understand what they've gone through across so many centuries."],
  [1,
   '"In the long history of scientific progress, how many protons have been smashed apart in accelerators by physicists?']],
 'rating': 5,
 'has_spoiler': True,
 'book_id': '18245960',
 'review_id': 'dfdbb7b0eb5a7e4c26d59a937e2e5feb'}

In [13]:
# Tokenizer and encoder are both loaded from the same pre-trained checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)
# Switch the encoder to evaluation mode before it is used for embeddings.
model.eval()

def get_bert_embedding(sentence):
    """Return the BERT [CLS] embedding of `sentence` as a NumPy array.

    The text is tokenized with the module-level `tokenizer` (truncated to at
    most 128 tokens) and passed through the module-level `model` without
    tracking gradients. The hidden state at position 0 of the last layer is
    taken as the sentence representation and squeezed before conversion.
    """
    encoded = tokenizer(
        sentence,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128,
    )
    with torch.no_grad():
        last_hidden = model(**encoded).last_hidden_state
        # Position 0 along the sequence axis holds the [CLS] token.
        cls_vector = last_hidden[:, 0, :]
    return cls_vector.squeeze().numpy()

# Per-book / per-user lookup tables, all filled in a single pass over `data`.
usersPerItem = defaultdict(set)     # book_id -> user_ids that reviewed it
itemsPerUser = defaultdict(set)     # user_id -> book_ids reviewed
reviewsPerItem = defaultdict(list)  # book_id -> sentence flags (sentence[0])
reviewsPerUser = defaultdict(list)  # user_id -> sentence flags (sentence[0])
ratingsPerUser = defaultdict(list)  # user_id -> ratings given
ratingsPerItem = defaultdict(list)  # book_id -> ratings received
bookCount = defaultdict(int)        # book_id -> number of reviews
userIDs, itemIDs = {}, {}           # id -> dense index, in first-seen order

for d in data:
    user, book = d['user_id'], d['book_id']

    usersPerItem[book].add(user)
    itemsPerUser[user].add(book)
    ratingsPerUser[user].append(d['rating'])
    ratingsPerItem[book].append(d['rating'])

    # Appending inside the loop (rather than extend) means a review without
    # sentences does not create an empty entry in the flag tables.
    for sentence in d['review_sentences']:
        reviewsPerItem[book].append(sentence[0])
        reviewsPerUser[user].append(sentence[0])

    bookCount[book] += 1
    userIDs.setdefault(user, len(userIDs))
    itemIDs.setdefault(book, len(itemIDs))

totalRead = len(data)

# Books ordered from most to least reviewed.
mostPopular = sorted(((bookCount[x], x) for x in bookCount), reverse=True)
PopularityBooks = [entry[-1] for entry in mostPopular]

# Users ordered from most to fewest distinct books.
mostPopularUsers = sorted(
    ([len(value), key] for key, value in itemsPerUser.items()),
    reverse=True,
)
PopularityUsers = [entry[-1] for entry in mostPopularUsers]

# Binary user x item interaction matrix.
nUsers, nItems = len(userIDs), len(itemIDs)
Xui = scipy.sparse.lil_matrix((nUsers, nItems))
for d in data:
    Xui[userIDs[d['user_id']], itemIDs[d['book_id']]] = 1

Xui_csr = Xui.tocsr()

# Column sums: interactions per book; row sums: interactions per user.
book_popularity = Xui_csr.sum(axis=0).A1
books_read_by_user = Xui_csr.sum(axis=1).A1

In [14]:
def _mean_or_zero(values):
    """Arithmetic mean of `values`, or 0 when the list is empty."""
    return sum(values) / len(values) if values else 0


# Dataset-wide averages of the popularity / frequency statistics.
avg_user_pop = sum(book_popularity) / len(book_popularity)
avg_user_freq = sum(books_read_by_user) / len(books_read_by_user)
avg_pop_by_freq = avg_user_pop * avg_user_freq
avg_book_rank = len(PopularityBooks) / 2
avg_user_rank = len(PopularityUsers) / 2

# Mean of the per-book sums of sentence flags.
# (previously hard-coded: avg_ct_book_spoilers = 0.179359243697479)
spoiler_flat = list(map(sum, reviewsPerItem.values()))
avg_ct_book_spoilers = _mean_or_zero(spoiler_flat)

# Mean of the per-user sums of sentence flags.
# (previously hard-coded: avg_ct_user_spoilers = 10.841269841269842)
spoiler_flat_2 = list(map(sum, reviewsPerUser.values()))
avg_ct_user_spoilers = _mean_or_zero(spoiler_flat_2)

# Mean over every individual rating, gathered once via the per-item table
# and once via the per-user table.
flattened_1 = [rating for ratings in ratingsPerItem.values() for rating in ratings]
ovr_avg_rating_item = _mean_or_zero(flattened_1)
flattened_2 = [rating for ratings in ratingsPerUser.values() for rating in ratings]
ovr_avg_rating_user = _mean_or_zero(flattened_2)

In [None]:
# Sets of every user id / book id that has a dense index, for membership tests.
users = set(userIDs)
books = set(itemIDs)

from datetime import datetime

def create_vector(review):
    """Build the numeric feature vector for one sentence-level example.

    Parameters
    ----------
    review : dict
        Must provide the keys 'user_id', 'book_id', 'timestamp', 'rating' and
        'BERT_data'. 'BERT_data' is the already-computed sentence embedding;
        when it is None or empty, a zero vector of length 768 is used instead.

    Returns
    -------
    list
        ``[1, book_pop, pop_by_freq, user_rank, user_rating, temporal_feature,
        ct_book_spoilers, ct_user_spoilers, avg_rating_item, avg_rating_user]``
        followed by the embedding values.

    Notes
    -----
    Reads the module-level lookup tables (`books`, `users`, `itemIDs`,
    `userIDs`, `book_popularity`, `books_read_by_user`, `PopularityUsers`,
    `reviewsPerItem`, `reviewsPerUser`, `ratingsPerItem`, `ratingsPerUser`)
    and the global averages used as fall-backs for unseen ids.
    The temporal feature is measured against ``datetime.now()``, so it differs
    between runs performed on different days.
    """
    u = review['user_id']
    b = review['book_id']
    timestamp = review['timestamp']
    rating = review['rating']
    sentence = review['BERT_data']

    # The embedding is computed by the caller; never re-encode it here.
    if sentence is None or len(sentence) == 0:
        sentence = np.zeros(768)  # Placeholder for a missing BERT embedding
    user_rating = rating if rating is not None else 0

    book_in = b in books
    user_in = u in users

    # Book-side features. An unseen book has popularity 0 and falls back to
    # the global averages. Book rank is not part of the returned vector, so it
    # is not computed.
    if book_in:
        book_pop = book_popularity[itemIDs[b]]
        ct_book_spoilers = sum(reviewsPerItem[b])
        item_ratings = ratingsPerItem[b]
        avg_rating_item = sum(item_ratings) / len(item_ratings)
    else:
        book_pop = 0
        ct_book_spoilers = avg_ct_book_spoilers
        avg_rating_item = ovr_avg_rating_item

    # User-side features. An unseen user is ranked after every known user.
    if user_in:
        user_freq = books_read_by_user[userIDs[u]]
        user_rank = PopularityUsers.index(u)
        ct_user_spoilers = sum(reviewsPerUser[u])
        user_ratings = ratingsPerUser[u]
        avg_rating_user = sum(user_ratings) / len(user_ratings)
    else:
        user_freq = 0
        user_rank = len(PopularityUsers)
        ct_user_spoilers = avg_ct_user_spoilers
        avg_rating_user = ovr_avg_rating_user

    # Interaction term: the product when both sides are known, otherwise
    # whichever side is known (0 when neither is).
    if book_in and user_in:
        pop_by_freq = book_pop * user_freq
    elif book_in:
        pop_by_freq = book_pop
    elif user_in:
        pop_by_freq = user_freq
    else:
        pop_by_freq = 0

    # Age of the review in days; 0 when the timestamp is missing or malformed.
    try:
        review_date = datetime.strptime(timestamp, "%Y-%m-%d")
        temporal_feature = (datetime.now() - review_date).days
    except (ValueError, TypeError):
        temporal_feature = 0

    vals = [
        1,                 # Bias term
        book_pop,          # Book popularity
        pop_by_freq,       # Popularity scaled by user frequency
        user_rank,         # User ranking
        user_rating,       # User's rating
        temporal_feature,  # Temporal feature
        ct_book_spoilers,
        ct_user_spoilers,
        avg_rating_item,
        avg_rating_user,
    ]

    # Append BERT embedding values
    vals.extend(np.asarray(sentence).tolist())

    return vals

#TAKES A LONG TIME ~30-45 min for just 11k entries
def _build_examples(reviews):
    """Expand reviews into one example per sentence.

    For every sentence of every review, a new dict is created holding the
    review's metadata (all keys except 'review_sentences' and 'has_spoiler')
    plus the sentence's BERT embedding under 'BERT_data'.

    Returns a pair ``(examples, labels)`` where ``labels[k]`` is the flag
    (``sentence[0]``) of the sentence behind ``examples[k]``.
    """
    examples, labels = [], []
    for review in reviews:
        metadata = {
            key: value for key, value in review.items()
            if key not in ('review_sentences', 'has_spoiler')
        }
        for sentence in review['review_sentences']:
            # A fresh dict per sentence: sharing one dict across the sentences
            # of a review would leave all of them pointing at the embedding of
            # the last sentence processed.
            example = dict(metadata)
            example['BERT_data'] = get_bert_embedding(sentence[1])
            examples.append(example)
            labels.append(sentence[0])
    return examples, labels


x_train, y_train = _build_examples(ratingsTrain)
# Validate on the held-out reviews, not on the training reviews again.
x_valid, y_valid = _build_examples(ratingsTest)

x_train = [create_vector(i) for i in x_train]
x_valid = [create_vector(i) for i in x_valid]

In [None]:
def get_metric(y_true, pred, scores, scores2):
    """Print and return binary-classification metrics.

    Parameters
    ----------
    y_true : sequence
        Ground-truth labels (0 or 1).
    pred : sequence
        Predicted labels, aligned with `y_true`.
    scores : sequence
        Unused; kept so existing calls keep working.
    scores2 : sequence
        Scores for the positive class, passed to `roc_auc_score`.

    Returns
    -------
    dict
        Keys 'accuracy', 'tp', 'tn', 'fp', 'fn', 'precision', 'recall',
        'f1' (None when precision + recall is 0) and 'auc'.
    """
    tn = tp = fn = fp = 0
    for guess, actual in zip(pred, y_true):
        if guess == actual:
            if guess == 0:
                tn += 1
            else:
                tp += 1
        elif guess == 0:
            fn += 1
        else:
            fp += 1

    if tp + fp == 0:
        precision = 0
        print("precision error, tp + fp = 0")
    else:
        precision = tp / (tp + fp)

    if tp + fn == 0:
        recall = 0
        print("recall error, tp + fn = 0")
    else:
        recall = tp / (tp + fn)

    # Guard against an empty prediction list instead of dividing by zero.
    total = len(pred)
    accuracy = (tp + tn) / total if total else 0
    print("accuracy: ", accuracy)
    print("tp: ", tp, "tn: ", tn, "fp: ", fp, "fn: ", fn)

    # Calculate F1 Score
    if precision + recall == 0:
        f1 = None
        print("F1 error, precision + recall = 0")
    else:
        f1 = 2 * (precision * recall) / (precision + recall)
        print("F1: ", f1)

    # Score against the labels that were passed in, not a notebook global.
    auc = roc_auc_score(y_true, scores2)
    print(f"AUC: {auc}")

    return {
        'accuracy': accuracy,
        'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc,
    }


# Fit the classifier on the training vectors and score the validation set.
# `x_valid` is used throughout: no `x_valid_new` is defined in this notebook.
mod = linear_model.LogisticRegression(C=100, class_weight = 'balanced')
mod.fit(x_train, y_train)
scores = mod.decision_function(x_valid)

# (index, score) pairs ordered from the highest to the lowest score.
sorted_items = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)

# Flag the top ~3% highest-scoring sentences as spoilers
# (one extra sentence when the validation set has an odd size).
n_valid = len(x_valid)
n_flagged = 3 * n_valid // 100 + (n_valid % 2)
pred = np.zeros(n_valid)
for idx, _ in sorted_items[:n_flagged]:
    pred[idx] = 1

# Rank positions (0-based, within the score ordering) of the true spoilers.
spoiler_positions = [
    position for position, (idx, _) in enumerate(sorted_items)
    if y_valid[idx] == 1
]
n_spoilers = len(spoiler_positions)
first_one = spoiler_positions[0] if spoiler_positions else 0
last_one = spoiler_positions[-1] if spoiler_positions else 0
all_pos = sum(spoiler_positions)

get_metric(y_valid, pred, scores, mod.predict_proba(x_valid)[:,1])
print("First spoiler position: ", first_one)
print("Last spoiler position: ", last_one)
# Avoid a division by zero when the validation set has no spoiler sentence.
print("Average spoiler position: ", all_pos / n_spoilers if n_spoilers else float('nan'))