# Data Preparation
Find the number of suitable books.

## load data

In [2]:
import json
import pandas as pd

# goodreads_book_works.json is JSON Lines: each non-blank line holds one JSON document.
with open('./data/goodreads_book_works.json', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# A DataFrame makes the parsed records easier to explore.
df = pd.DataFrame(data)

# Report the record count, the column names and a preview of the table.
print(
    f"Total records: {len(df)}",
    f"Columns: {df.columns.tolist()}",
    f"\nFirst few rows:",
    sep="\n",
)
print(df.head())

Total records: 1521962
Columns: ['books_count', 'reviews_count', 'original_publication_month', 'default_description_language_code', 'text_reviews_count', 'best_book_id', 'original_publication_year', 'original_title', 'rating_dist', 'default_chaptering_book_id', 'original_publication_day', 'original_language_id', 'ratings_count', 'media_type', 'ratings_sum', 'work_id']

First few rows:
  books_count reviews_count original_publication_month  \
0           1             6                          8   
1          22         10162                              
2           2           268                              
3          38         89252                          7   
4           2            49                              

  default_description_language_code text_reviews_count best_book_id  \
0                                                    1      5333265   
1                                                  741        25717   
2                                                 

## Inspect data

In [17]:
# Peek at the reviews file to learn which fields a review record carries.
print("Checking structure of goodreads_reviews_dedup.json...")
with open('./data/goodreads_reviews_dedup.json', 'r', encoding='utf-8') as f:
    # Read at most three lines and stop at the first one that is not blank.
    for attempt in range(3):
        line = f.readline()
        if not line.strip():
            continue

        review = json.loads(line)
        print(f"\nSample review {attempt + 1}:")
        print(f"Keys: {review.keys()}")
        print(f"Content: {review}")
        break


Checking structure of goodreads_reviews_dedup.json...

Sample review 1:
Keys: dict_keys(['user_id', 'book_id', 'review_id', 'rating', 'review_text', 'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments'])
Content: {'user_id': '8842281e1d1347389f2ab93d60773d4d', 'book_id': '24375664', 'review_id': '5cd416f3efc3f944fce4ce2db2290d5e', 'rating': 5, 'review_text': "Mind blowingly cool. Best science fiction I've read in some time. I just loved all the descriptions of the society of the future - how they lived in trees, the notion of owning property or even getting married was gone. How every surface was a screen. \n The undulations of how society responds to the Trisolaran threat seem surprising to me. Maybe its more the Chinese perspective, but I wouldn't have thought the ETO would exist in book 1, and I wouldn't have thought people would get so over-confident in our primitive fleet's chances given you have to think that with superior science they would have weapons

## review count

In [14]:
# Tally how many reviews each book has in the reviews file (JSON Lines, one review per line).
from collections import Counter

print("Counting actual reviews per book from goodreads_reviews_dedup.json...")

review_counts = Counter()   # book_id (int) -> number of reviews seen
lines_processed = 0         # every line read, blank or not
lines_skipped = 0           # non-blank lines that could not be attributed to a book

with open('./data/goodreads_reviews_dedup.json', 'r', encoding='utf-8') as f:
    for lines_processed, line in enumerate(f, 1):
        if lines_processed % 100000 == 0:
            print(f"Processed {lines_processed} lines...")

        if not line.strip():
            continue

        try:
            review = json.loads(line)
            # A record without a usable book_id is skipped instead of being pooled
            # under a sentinel id, which would otherwise behave like a real book
            # in the threshold filter that consumes review_counts.
            book_id = int(review['book_id'])
        except (ValueError, KeyError, TypeError):
            # json.JSONDecodeError is a ValueError; KeyError/TypeError cover a
            # missing or non-numeric book_id and non-object JSON lines.
            lines_skipped += 1
            continue

        review_counts[book_id] += 1

print(f"Total lines processed: {lines_processed}\n")
if lines_skipped:
    print(f"Skipped {lines_skipped} malformed line(s) (invalid JSON or missing/non-numeric book_id)\n")

# Shortlist the books whose tallied review count exceeds the threshold.
target = 1250
books_above_target = {book_id: count for book_id, count in review_counts.items()
                      if count > target}

print(f"Books with actual review count > {target}: {len(books_above_target)}\n")

print("=" * 100)
print(f"Top 20 books by actual review count (> {target}):\n")

# Most-reviewed books first.
sorted_books = sorted(books_above_target.items(), key=lambda x: x[1], reverse=True)
for idx, (book_id, count) in enumerate(sorted_books[:20], 1):
    print(f"{idx:2d}. Book ID {book_id}: {count} reviews")

print("\n" + "=" * 100)
print(f"\nTotal books with > {target} reviews: {len(books_above_target)}")
if sorted_books:
    # sorted_books is in descending order, so its ends are the max and the min.
    print(f"Review count range: {sorted_books[-1][1]} to {sorted_books[0][1]}")
else:
    # min()/max() raise ValueError on an empty collection; report the empty case instead.
    print("Review count range: n/a (no book exceeds the threshold)")


Counting actual reviews per book from goodreads_reviews_dedup.json...
Processed 100000 lines...
Processed 200000 lines...
Processed 300000 lines...
Processed 400000 lines...
Processed 500000 lines...
Processed 600000 lines...
Processed 700000 lines...
Processed 800000 lines...
Processed 900000 lines...
Processed 1000000 lines...
Processed 1100000 lines...
Processed 1200000 lines...
Processed 1300000 lines...
Processed 1400000 lines...
Processed 1500000 lines...
Processed 1600000 lines...
Processed 1700000 lines...
Processed 1800000 lines...
Processed 1900000 lines...
Processed 2000000 lines...
Processed 2100000 lines...
Processed 2200000 lines...
Processed 2300000 lines...
Processed 2400000 lines...
Processed 2500000 lines...
Processed 2600000 lines...
Processed 2700000 lines...
Processed 2800000 lines...
Processed 2900000 lines...
Processed 3000000 lines...
Processed 3100000 lines...
Processed 3200000 lines...
Processed 3300000 lines...
Processed 3400000 lines...
Processed 3500000 lin

## genre similarity score analysis

In [15]:
# Find 8 books (4 pairs) with similar genres from books with actual review count > 1250
print("Loading genre data for books with actual review count > 1250...\n")

# book_id -> {'genres': ..., 'actual_review_count': ...} for every shortlisted
# book (key of books_above_target) that has a non-empty genre entry.
books_above_1250_with_genres = {}
count = 0                 # lines read from the genre file
genre_lines_skipped = 0   # non-blank lines that could not be parsed

with open('./data/goodreads_book_genres_initial.json', 'r', encoding='utf-8') as f:
    for count, line in enumerate(f, 1):
        if count % 500000 == 0:
            print(f"Loaded {count} genre records...")

        if not line.strip():
            continue

        try:
            item = json.loads(line)
            # No sentinel default: a record without a book_id must not be able
            # to match a placeholder key in books_above_target.
            book_id = int(item['book_id'])
            genres = item.get('genres', {})
        except (ValueError, KeyError, TypeError, AttributeError):
            # json.JSONDecodeError is a ValueError; the rest cover missing or
            # non-numeric ids and lines whose JSON value is not an object.
            genre_lines_skipped += 1
            continue

        # Only include if book has genres and is in our books_above_target set
        if genres and book_id in books_above_target:
            books_above_1250_with_genres[book_id] = {
                'genres': genres,
                'actual_review_count': books_above_target[book_id]
            }

if genre_lines_skipped:
    print(f"Skipped {genre_lines_skipped} malformed genre line(s)\n")
print(f"Loaded genres for {len(books_above_1250_with_genres)} books with > 1250 reviews\n")

# Cosine similarity between two genre-weight mappings
def calculate_cosine_similarity(genres1, genres2):
    """Return the cosine similarity of two ``{genre: weight}`` mappings.

    Each mapping is treated as a sparse vector indexed by genre name, where a
    genre that is absent from a mapping has weight 0.

    Returns 0 when either argument is empty or None, or when either vector has
    zero magnitude; otherwise returns dot(v1, v2) / (|v1| * |v2|).
    """
    import math

    if not (genres1 and genres2):
        return 0

    # A genre missing on either side contributes 0 to the dot product, so only
    # the genres present in both mappings need to be multiplied.
    dot_product = sum(
        weight * genres2[genre]
        for genre, weight in genres1.items()
        if genre in genres2
    )
    norm1 = math.sqrt(sum(weight ** 2 for weight in genres1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in genres2.values()))

    if norm1 == 0 or norm2 == 0:
        return 0

    return dot_product / (norm1 * norm2)

# Find all pairs with high similarity
print("Calculating cosine similarity scores for book pairs...\n")

# Tunable search parameters (previously inline magic numbers; defaults unchanged).
MAX_ANCHOR_BOOKS = 3000      # at most this many books are used as the first book of a pair
NEIGHBOR_WINDOW = 100        # upper bound offset for the second book's list position
SIMILARITY_THRESHOLD = 0.8   # a pair is kept only when its similarity is strictly greater

high_similarity_pairs = []
book_ids_with_genres = list(books_above_1250_with_genres.keys())
n_books = len(book_ids_with_genres)

# Limit comparisons for performance.
# NOTE: because of the window, each book is only compared with (at most) the 99
# books that follow it in list order. Pairs further apart are never scored, so
# the number of pairs found is a lower bound, not the full pairwise count.
for i in range(min(MAX_ANCHOR_BOOKS, n_books)):
    book1_id = book_ids_with_genres[i]
    book1 = books_above_1250_with_genres[book1_id]
    genres1 = book1['genres']

    for j in range(i + 1, min(i + NEIGHBOR_WINDOW, n_books)):
        book2_id = book_ids_with_genres[j]
        book2 = books_above_1250_with_genres[book2_id]
        genres2 = book2['genres']

        similarity = calculate_cosine_similarity(genres1, genres2)

        if similarity > SIMILARITY_THRESHOLD:
            high_similarity_pairs.append({
                'book1_id': book1_id,
                'book1_reviews': book1['actual_review_count'],
                'book1_genres': genres1,
                'book2_id': book2_id,
                'book2_reviews': book2['actual_review_count'],
                'book2_genres': genres2,
                'similarity': similarity
            })

# Sort by similarity (descending); sorted() is stable, so ties keep discovery order.
high_similarity_pairs_sorted = sorted(high_similarity_pairs, key=lambda x: x['similarity'], reverse=True)

print(f"Found {len(high_similarity_pairs_sorted)} pairs with cosine similarity > {SIMILARITY_THRESHOLD}\n")

# Select non-overlapping pairs: walk the pairs from most to least similar and
# keep a pair only if neither of its books has been taken by an earlier pair.
PAIRS_WANTED = 4

selected_pairs_above_1250 = []
used_ids = set()

for pair in high_similarity_pairs_sorted:
    if len(selected_pairs_above_1250) == PAIRS_WANTED:
        break
    pair_ids = {pair['book1_id'], pair['book2_id']}
    if used_ids.isdisjoint(pair_ids):
        selected_pairs_above_1250.append(pair)
        used_ids |= pair_ids

# The selected book IDs are exactly the IDs consumed during selection.
books_1250_above_ids = set(used_ids)

if len(selected_pairs_above_1250) < PAIRS_WANTED:
    print(f"WARNING: only {len(selected_pairs_above_1250)} of {PAIRS_WANTED} "
          f"non-overlapping pairs could be selected\n")

# Headline numbers are derived from the actual selection so the report stays
# truthful when fewer pairs than requested are available.
print("=" * 120)
print(f"Top {len(selected_pairs_above_1250)} genre-similar pairs "
      f"({len(books_1250_above_ids)} books) with actual review count > 1250:\n")

for pair_num, pair in enumerate(selected_pairs_above_1250, 1):
    print(f"Pair {pair_num}: (Cosine Similarity Score: {pair['similarity']:.4f})")
    print(f"  Book 1 ID: {pair['book1_id']}")
    print(f"    Actual Reviews: {pair['book1_reviews']}")
    print(f"    Genres: {pair['book1_genres']}")
    print(f"  Book 2 ID: {pair['book2_id']}")
    print(f"    Actual Reviews: {pair['book2_reviews']}")
    print(f"    Genres: {pair['book2_genres']}")
    print()

print(f"Selected {len(books_1250_above_ids)} book IDs: {sorted(books_1250_above_ids)}")
print(f"Total similar pairs found: {len(high_similarity_pairs_sorted)}")


Loading genre data for books with actual review count > 1250...

Loaded 500000 genre records...
Loaded 500000 genre records...
Loaded 1000000 genre records...
Loaded 1000000 genre records...
Loaded 1500000 genre records...
Loaded 1500000 genre records...
Loaded 2000000 genre records...
Loaded 2000000 genre records...
Loaded genres for 726 books with > 1250 reviews

Calculating cosine similarity scores for book pairs...

Loaded genres for 726 books with > 1250 reviews

Calculating cosine similarity scores for book pairs...

Found 13207 pairs with cosine similarity > 0.8

Top 4 genre-similar pairs (8 books) with actual review count > 1250:

Pair 1: (Cosine Similarity Score: 1.0000)
  Book 1 ID: 22738563
    Actual Reviews: 1619
    Genres: {'non-fiction': 2304}
  Book 2 ID: 12609433
    Actual Reviews: 1285
    Genres: {'non-fiction': 836}

Pair 2: (Cosine Similarity Score: 1.0000)
  Book 1 ID: 13455782
    Actual Reviews: 1268
    Genres: {'young-adult': 4447, 'romance': 2014, 'fantasy,

## Extract reviews
from the chosen books

In [26]:
import json
from collections import Counter

# Extract reviews for the 8 books with actual review count > 1250
print(f"Extracting reviews for {len(books_1250_above_ids)} genre-similar books (actual review count > 1250)...\n")

reviews_1250_above = []   # review dicts kept for the selected books
review_count = 0          # number of kept reviews (mirrors len(reviews_1250_above))
lines_processed = 0       # every line read from the input, blank or not

input_file = './data/goodreads_reviews_dedup.json'
output_file = './data/books_1250_above_reviews.json'

with open(input_file, 'r', encoding='utf-8') as f:
    for lines_processed, line in enumerate(f, 1):
        # Progress print
        if lines_processed % 100000 == 0:
            print(f"Processed {lines_processed:,} lines, found {review_count:,} matches...")

        line = line.strip()
        if not line:
            continue

        try:
            review = json.loads(line)
            # The id conversion lives inside the try so that one record with a
            # non-numeric book_id is skipped instead of aborting the whole scan.
            review_book_id = int(review.get('book_id', -1))
        except (ValueError, TypeError, AttributeError):
            # json.JSONDecodeError is a ValueError; AttributeError covers lines
            # whose JSON value is not an object and so has no .get().
            continue

        if review_book_id not in books_1250_above_ids:
            continue

        # Filter out reviews containing "..." (truncated reviews).
        # NOTE(review): this drops every review with "..." anywhere in its text,
        # not only ones cut off at the end - confirm that this is intended.
        review_text = review.get('review_text', '')
        if review_text and '...' not in review_text:
            reviews_1250_above.append(review)
            review_count += 1

print(f"\nTotal reviews found: {review_count:,}\n")

# ------------------------------------------
# SAVE AS A VALID JSON ARRAY (one review per line)
# ------------------------------------------

with open(output_file, 'w', encoding='utf-8') as f:
    # Serialise lazily; the separator goes between records only, so the array
    # has no trailing comma.
    serialized_reviews = (json.dumps(entry, ensure_ascii=False) for entry in reviews_1250_above)
    f.write('[\n')
    f.write(',\n'.join(serialized_reviews))
    f.write('\n]')

print(f"Successfully saved {review_count:,} reviews to '{output_file}'\n")

# ------------------------------------------
# SUMMARY BY BOOK
# ------------------------------------------

if review_count > 0:
    print("=" * 120)
    print("Summary of reviews by book:\n")

    # Number of kept reviews per book.
    book_review_counts = Counter(
        int(review.get('book_id', -1)) for review in reviews_1250_above
    )

    # book_id -> genres, taken from the first selected pair that mentions the
    # book (book1 is checked before book2 within a pair).
    genres_by_book = {}
    for pair in selected_pairs_above_1250:
        genres_by_book.setdefault(pair['book1_id'], pair['book1_genres'])
        genres_by_book.setdefault(pair['book2_id'], pair['book2_genres'])

    # Display sorted by book_id
    for book_id, count in sorted(book_review_counts.items()):
        if book_id in genres_by_book:
            pair_info = f" | Genres: {genres_by_book[book_id]}"
        else:
            pair_info = ""

        print(f"  Book ID {book_id}: {count} reviews{pair_info}")


Extracting reviews for 8 genre-similar books (actual review count > 1250)...

Processed 100,000 lines, found 71 matches...
Processed 100,000 lines, found 71 matches...
Processed 200,000 lines, found 142 matches...
Processed 200,000 lines, found 142 matches...
Processed 300,000 lines, found 225 matches...
Processed 300,000 lines, found 225 matches...
Processed 400,000 lines, found 309 matches...
Processed 400,000 lines, found 309 matches...
Processed 500,000 lines, found 398 matches...
Processed 500,000 lines, found 398 matches...
Processed 600,000 lines, found 507 matches...
Processed 600,000 lines, found 507 matches...
Processed 700,000 lines, found 602 matches...
Processed 700,000 lines, found 602 matches...
Processed 800,000 lines, found 692 matches...
Processed 800,000 lines, found 692 matches...
Processed 900,000 lines, found 779 matches...
Processed 900,000 lines, found 779 matches...
Processed 1,000,000 lines, found 859 matches...
Processed 1,000,000 lines, found 859 matches...


## JSON to CSV Converter

In [27]:
# Export the extracted reviews (the in-memory list) as a CSV file
print("Converting books_1250_above_reviews.json to CSV format...\n")

# One row per extracted review.
df_reviews = pd.DataFrame(reviews_1250_above)

# Columns wanted in the CSV, in output order.
columns_to_keep = [
    'book_id', 'user_id', 'review_id', 'rating', 'review_text',
    'date_added', 'date_updated', 'n_votes', 'n_comments',
]

# Ignore wanted columns that the data does not actually contain.
present_columns = set(df_reviews.columns)
available_cols = [col for col in columns_to_keep if col in present_columns]
df_reviews_csv = df_reviews[available_cols]

# Write the CSV without the DataFrame index.
csv_output_file = './data/books_1250_above_reviews.csv'
df_reviews_csv.to_csv(csv_output_file, index=False, encoding='utf-8')

print(
    f"Successfully converted to CSV!",
    f"File: {csv_output_file}",
    f"Total rows: {len(df_reviews_csv):,}",
    f"Columns: {', '.join(available_cols)}\n",
    sep="\n",
)

# Show a small sample of what was written.
print("=" * 120)
print("Sample of CSV data (first 5 rows):\n")
print(df_reviews_csv.head().to_string(index=False))

# Compare the on-disk size of the JSON export with the CSV export
import os

bytes_per_mb = 1024 * 1024
json_size = os.path.getsize('./data/books_1250_above_reviews.json') / bytes_per_mb
csv_size = os.path.getsize(csv_output_file) / bytes_per_mb
size_delta = json_size - csv_size  # positive when the CSV is the smaller file

print(f"\n{'=' * 120}")
print(f"File Size Comparison:")
print(f"  JSON: {json_size:.2f} MB")
print(f"  CSV:  {csv_size:.2f} MB")
print(f"  Saved: {size_delta:.2f} MB ({(size_delta / json_size * 100):.1f}%)")


Converting books_1250_above_reviews.json to CSV format...

Successfully converted to CSV!
File: ./data/books_1250_above_reviews.csv
Total rows: 14,974
Columns: book_id, user_id, review_id, rating, review_text, date_added, date_updated, n_votes, n_comments

Sample of CSV data (first 5 rows):

 book_id                          user_id                        review_id  rating                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               review_text                     date_added                   date_updated  n_votes