In [1]:
import gzip
import json
from collections import defaultdict
import numpy as np
import random
from sklearn import linear_model
import scipy
from transformers import BertTokenizer, BertModel
import torch
from datetime import datetime
from sklearn.metrics import roc_auc_score, f1_score
import scipy.sparse

# Path to the .json.gz file
file_path = 'goodreads_reviews_spoiler.json.gz'
ct = 0

# Open and read the file line by line
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    data = []
    for line in f:
        try:
            ct += 1
            review = json.loads(line)
            data.append(review)
            if ct == 70000:
                break
        except json.JSONDecodeError as e:
            print(f"Error decoding line: {e}")

# Example: Print the first parsed review
if data:
    print(data[0])
else:
    print("No data was parsed.")


train_size = 68000
test_size = 1200

ones_minus_zeros = 0
ratingsTrain = []
for i in data[:train_size]:
    if i['has_spoiler']:
        copy_to_upload = i
        ones = sum([sent[0] for sent in copy_to_upload['review_sentences']])
        non_spoilers = [sentence for sentence in i['review_sentences'] if sentence[0] == 0]
        spoilers = [sentence for sentence in i['review_sentences'] if sentence[0] == 1]
        selected_non_spoilers = []
        if ones == 1:
            k = min(3, len(non_spoilers))
            selected_non_spoilers = random.choices(non_spoilers, k=k)
            ones_minus_zeros -= (k-1)
        elif ones == 2:
            k = min(2, len(non_spoilers))
            selected_non_spoilers = random.choices(non_spoilers, k=k)
            ones_minus_zeros -= (k-2)
        elif ones == 3:
            k = min(1, len(non_spoilers))
            selected_non_spoilers = random.choices(non_spoilers, k=k)
            ones_minus_zeros += (3 - k)

        else:
            ones_minus_zeros += len(spoilers)
            
        new_sentences = selected_non_spoilers + spoilers
        copy_to_upload['review_sentences'] = new_sentences

        ratingsTrain.append(copy_to_upload)
    elif ones_minus_zeros > 3:
        copy_to_upload = i
        if (ones_minus_zeros // 2) < len(i['review_sentences']):
            selected_non_spoilers = random.choices(i['review_sentences'], k=ones_minus_zeros // 2)
            ones_minus_zeros -= (ones_minus_zeros // 2)
        else:
            selected_non_spoilers = i['review_sentences']
            ones_minus_zeros -= len(i['review_sentences'])
        copy_to_upload['review_sentences'] = selected_non_spoilers
        ratingsTrain.append(copy_to_upload)

ratingsTest = data[train_size:train_size + test_size]
tot_sent = 0
for i in ratingsTrain:
    tot_sent += len(i['review_sentences'])
print("Training Data Size: ", tot_sent)
tot_sent = 0
for i in ratingsTest:
    tot_sent += len(i['review_sentences'])
print("Testing Data Size: ", tot_sent)


usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
reviewsPerItem = defaultdict(list)
reviewsPerUser = defaultdict(list)
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)

for d in data:
    usersPerItem[d['book_id']].add(d['user_id'])
    itemsPerUser[d['user_id']].add(d['book_id'])
    ratingsPerUser[d['user_id']].append(d['rating'])
    ratingsPerItem[d['book_id']].append(d['rating'])
    for i in d['review_sentences']:
        reviewsPerItem[d['book_id']].append(i[0])
        reviewsPerUser[d['user_id']].append(i[0])

bookCount = defaultdict(int)
totalRead = 0

for d in data:
    bookCount[d['book_id']] += 1
    totalRead += 1

mostPopular = [(bookCount[x], x) for x in bookCount]
mostPopular.sort()
mostPopular.reverse()
PopularityBooks = [x[-1] for x in mostPopular]

mostPopularUsers = [[len(value), key] for key, value in itemsPerUser.items()]
mostPopularUsers.sort()
mostPopularUsers.reverse()
PopularityUsers = [x[-1] for x in mostPopularUsers]

userIDs, itemIDs = {}, {}
for d in data:
    if d['user_id'] not in userIDs: userIDs[d['user_id']] = len(userIDs)
    if d['book_id'] not in itemIDs: itemIDs[d['book_id']] = len(itemIDs)

nUsers, nItems = len(userIDs), len(itemIDs)
Xui = scipy.sparse.lil_matrix((nUsers, nItems))
for d in data:
    Xui[userIDs[d['user_id']], itemIDs[d['book_id']]] = 1 

Xui_csr = scipy.sparse.csr_matrix(Xui)

book_popularity = np.sum(Xui_csr, axis=0).A1
books_read_by_user = Xui_csr.sum(axis=1).A1

spoiler_flat = [sum(sublist) for sublist in reviewsPerItem.values()]
avg_ct_book_spoilers = sum(spoiler_flat) / len(spoiler_flat) if spoiler_flat else 0
spoiler_flat_2 = [sum(sublist) for sublist in reviewsPerUser.values()]
avg_ct_user_spoilers = sum(spoiler_flat_2) / len(spoiler_flat_2) if spoiler_flat_2 else 0
flattened_1 = [item for sublist in ratingsPerItem.values() for item in sublist]
ovr_avg_rating_item =  sum(flattened_1) / len(flattened_1) if flattened_1 else 0

users = set(userIDs.keys())
books = set(itemIDs.keys())

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval() 

def get_bert_embeddings(sentences):
    tokens = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**tokens)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
    return cls_embeddings.numpy()

def precompute_embeddings(data):
    all_sentences = [sentence[1] for d in data for sentence in d['review_sentences']]
    batch_size = 32
    all_embeddings = []
    for i in range(0, len(all_sentences), batch_size):
        if i % (1280) == 0: 
            print(i)
        batch = all_sentences[i:i + batch_size]
        batch_embeddings = get_bert_embeddings(batch)
        all_embeddings.append(batch_embeddings)
    return np.vstack(all_embeddings)

# Precompute embeddings for the training and testing sets
#train_embeddings = precompute_embeddings(ratingsTrain + ratingsTest)
#np.save('train_embeddings.npy', train_embeddings)  # Save embeddings for later use
# Load precomputed embeddings
train_embeddings = np.load('train_embeddings.npy')
#test_embeddings = precompute_embeddings(ratingsTest)
#np.save('test_embeddings.npy', test_embeddings)  # Save embeddings for later use
test_embeddings = np.load('test_embeddings.npy')

# Create data vectors
def create_vector(review):
    global users, books, avg_ct_book_spoilers, avg_ct_user_spoilers, ovr_avg_rating_item

    u = review['user_id']
    b = review['book_id']
    sentence = review['BERT_data']
    rating = review['rating']
    length = review['length']

    # Check if already seen book and/or user
    book_in = b in books
    user_in = u in users

    if len(sentence) == 0:
        sentence = np.zeros(768)  
    temporal_feature = 0 
    user_rating = rating if rating is not None else 0 

    # Create vector based on if already seen book and/or user
    if book_in and user_in:
        book_pop = book_popularity[itemIDs[b]]
        user_freq = books_read_by_user[userIDs[u]]
        pop_by_freq = book_pop * user_freq
        book_rank = PopularityBooks.index(b)
        
        ct_book_spoilers = sum(reviewsPerItem[b])
        ct_user_spoilers = sum(reviewsPerUser[u])
        avg_rating_item = ratingsPerItem[b]
        avg_rating_item = sum(avg_rating_item) / len(avg_rating_item)
        
    elif not user_in and not book_in:
        book_pop = len(books)
        user_freq = 0
        pop_by_freq = 0
        book_rank = len(books)
        
        ct_book_spoilers = avg_ct_book_spoilers
        ct_user_spoilers = avg_ct_user_spoilers
        avg_rating_item = ovr_avg_rating_item
        
    elif not user_in:
        book_pop = book_popularity[itemIDs[b]]
        user_freq = 0
        pop_by_freq = book_pop
        book_rank = PopularityBooks.index(b)
        
        ct_book_spoilers = sum(reviewsPerItem[b])
        ct_user_spoilers = avg_ct_user_spoilers
        avg_rating_item = ratingsPerItem[b]
        avg_rating_item = sum(avg_rating_item) / len(avg_rating_item)
        
    else:
        book_pop = 0
        user_freq = books_read_by_user[userIDs[u]]
        pop_by_freq = user_freq
        book_rank = len(PopularityBooks)
        
        ct_book_spoilers = avg_ct_book_spoilers
        ct_user_spoilers = sum(reviewsPerUser[u])
        avg_rating_item = ovr_avg_rating_item


    # Combine features
    vals = [
        1, 
        pop_by_freq, 
        avg_rating_item,
        book_rank,
        length,
        ct_book_spoilers,
        ct_user_spoilers
    ]
    # Add BERT embeddings
    vals.extend(sentence.tolist())
    
    return vals


{'user_id': '8842281e1d1347389f2ab93d60773d4d', 'timestamp': '2017-08-30', 'review_sentences': [[0, 'This is a special book.'], [0, 'It started slow for about the first third, then in the middle third it started to get interesting, then the last third blew my mind.'], [0, 'This is what I love about good science fiction - it pushes your thinking about where things can go.'], [0, "It is a 2015 Hugo winner, and translated from its original Chinese, which made it interesting in just a different way from most things I've read."], [0, 'For instance the intermixing of Chinese revolutionary history - how they kept accusing people of being "reactionaries", etc.'], [0, 'It is a book about science, and aliens.'], [0, 'The science described in the book is impressive - its a book grounded in physics and pretty accurate as far as I could tell.'], [1, 'Though when it got to folding protons into 8 dimensions I think he was just making stuff up - interesting to think about though.'], [1, 'But what woul



In [2]:
# Prepare train, test data to be vectorized by extracting each sentence and its related data
x_train_pre = []
y_train = []
embedding_index = 0
for d in ratingsTrain:
    filtered_data = {key: value for key, value in d.items() if key != 'review_sentences' and key != 'has_spoiler'}
    for i, sentence in enumerate(d['review_sentences']):
        bert_embedding = train_embeddings[embedding_index] 
        filtered_data['BERT_data'] = bert_embedding
        filtered_data['position'] = i
        filtered_data['length'] = len(sentence)
        x_train_pre.append(filtered_data)
        y_train.append(sentence[0])
        embedding_index += 1

embedding_index = 0
x_valid_pre = []
y_valid = []
for d in ratingsTest:
    filtered_data = {key: value for key, value in d.items() if key != 'review_sentences' and key != 'has_spoiler'}
    for i, sentence in enumerate(d['review_sentences']):
        bert_embedding = test_embeddings[embedding_index] 
        filtered_data['BERT_data'] = bert_embedding
        filtered_data['position'] = i
        filtered_data['length'] = len(sentence)
        x_valid_pre.append(filtered_data)
        y_valid.append(sentence[0])
        embedding_index += 1
        
# Get data vectors
x_train = [create_vector(i) for i in x_train_pre]
x_valid = [create_vector(i) for i in x_valid_pre]

x_train = np.array(x_train)
x_valid = np.array(x_valid)

In [12]:
# Evaluation metrics function
def get_metric(y_true, pred, scores):
    corr = 0
    tn, tp, fn, fp = 0,0,0,0
    for guess, actual in zip(pred, y_true):
        if guess == actual:
            corr += 1
            if guess == 0:
                tn += 1
            else:
                tp += 1
        else:
            if guess == 0:
                fn += 1
            else:
                fp += 1
    if tp + fp == 0:
        precision = 0
        print("precision error, tp + fp = 0")
    else:
        precision = tp / (tp + fp)
    if tp + fn == 0:
        recall = 0
        print("recall error, tp + fn = 0")
    else:
        recall = tp / (tp + fn)
    print("accuracy: ", corr / len(pred))
    print("tp: ", tp, "tn: ", tn, "fp: ", fp, "fn: ", fn)
    # Calculate F1 Score
    if precision + recall == 0:
        print("F1 error, precision + recall = 0")
    else:
        print("F1: ", 2 * (precision * recall) / (precision + recall))
    
    auc = roc_auc_score(y_valid, scores)
    print(f"AUC: {auc}")

# Create and test model
mod = linear_model.LogisticRegression(penalty='l2', C=1, class_weight='balanced', solver='saga', max_iter=250)
mod.fit(x_train, y_train)
scores1 = mod.decision_function(x_train)
new_results1 = {index: score for index, score in enumerate(scores1)}     
sorted_items1 = sorted(new_results1.items(), key=lambda x: x[1], reverse=True)
train_pred = np.zeros(len(x_train))
for i in sorted_items1[:int(len(x_train)//2) + 1]:
    train_pred[i[0]] = 1
tn, tp, fn, fp = 0,0,0,0
for guess, actual in zip(train_pred, y_train):
    if guess == actual:
        if guess == 0:
            tn += 1
        else:
            tp += 1
    else:
        if guess == 0:
            fn += 1
        else:
            fp += 1
print("Training Info: TP ", tp, " TN ", tn, " FP ", fp, " FN ", fn)

scores = mod.decision_function(x_valid)
new_results = {index: score for index, score in enumerate(scores)}     
sorted_items = sorted(new_results.items(), key=lambda x: x[1], reverse=True)

pred = np.zeros(len(x_valid))
if len(x_valid) % 2 == 1:
    for i in sorted_items[:int(10 * len(x_valid)//100) + 1]:
        pred[i[0]] = 1
else:
    for i in sorted_items[:int(10 * len(x_valid)//100)]:
        pred[i[0]] = 1

curr_counter = 0
all_pos = 0
first_one = 0
last_one = 0
got_one = False
got_all = 0
while got_one < sum(y_valid) and curr_counter < len(sorted_items):
    if y_valid[sorted_items[curr_counter][0]] == 1:
        got_all += 1
        if got_all == sum(y_valid):
            last_one = curr_counter
        all_pos += curr_counter
        if got_one == False:
            got_one = True 
            first_one = curr_counter
    curr_counter += 1
get_metric(y_valid, pred, mod.predict_proba(x_valid)[:,1])
print("First spoiler position: ", first_one)
print("Last spoiler position: ", last_one)
print("Average spoiler position: ", all_pos / sum(y_valid))
thresholds = np.linspace(0, 1, 100)
best_f1 = 0
best_threshold = 0

for threshold in thresholds:
    pred = (scores >= threshold).astype(int)
    current_f1 = f1_score(y_valid, pred)
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_threshold = threshold

print(f"Best threshold for F1: {best_threshold}, F1: {best_f1}")



Training Info: TP  20870  TN  20868  FP  4888  FN  4889
accuracy:  0.9118937048503611
tp:  334 tn:  13804 fp:  1216 fn:  150
F1:  0.32841691248770893
AUC: 0.8961211497617502
First spoiler position:  75
Last spoiler position:  12104
Average spoiler position:  1805.1797520661157
Best threshold for F1: 0.09090909090909091, F1: 0.33184302036761054
