In [None]:
import os
import json
import pickle
import random
from collections import Counter, defaultdict

# Number of samples to generate for each question template; the generation
# cells below stop collecting once they hold k entries.
k = 10

In [None]:
# Load the processed Goodreads graph. The context manager closes the file
# handle as soon as parsing finishes (the bare open() call never closed it).
with open(os.path.join('/shared/data3/bowenj4/llm-graph-plugin/data/processed_data/goodreads', 'graph.json')) as graph_file:
    graph = json.load(graph_file)
print(graph.keys())

# Accumulator for every generated sample set, keyed by the
# (question_template, answer_template) pair; each value is a list of dicts
# holding the placeholder values for one sample.
all_generated_data = {}

In [None]:
# Show the first four book node ids (iterating a dict yields its keys).
list(graph['book_nodes'])[:4]

In [None]:
# Inspect which feature fields a book node carries, using the hard-coded
# sample id 'book-1333909'.
graph['book_nodes']['book-1333909']['features'].keys()

## 1-hop reasoning (easy)
### Who are the authors of book xxx?

In [None]:
import random

# Templates: the placeholders are filled from the per-sample dicts built below.
question = "Who are the authors of the book {book_title}?"
answer = "{authors}"

# Seed, then shuffle, so the first k ids form a fixed random sample.
random.seed(2023)
book_ids = list(graph['book_nodes'])
random.shuffle(book_ids)

generated_data = []
for sampled_id in book_ids[:k]:
    node = graph['book_nodes'][sampled_id]
    title = node['features']['title']
    # Resolve every linked author id to its display name.
    names = [graph['author_nodes'][aid]['features']['name'] for aid in node['neighbors']['author']]
    generated_data.append({"book_title": title, "authors": ', '.join(names)})

all_generated_data[(question, answer)] = generated_data

### What is the publisher of book xxx?

In [None]:
question = "What is the publisher of the book {book_title}?"
answer = "{publisher}"

random.seed(2024)
book_ids = list(graph['book_nodes'])
random.shuffle(book_ids)

generated_data = []
for candidate_id in book_ids:
    node = graph['book_nodes'][candidate_id]
    title = node['features']['title']
    linked_publishers = node['neighbors']['publisher']
    if not linked_publishers:
        # A book without a publisher edge cannot answer this question.
        continue
    # Only the first linked publisher is used as the answer.
    publisher_node = graph['publisher_nodes'][linked_publishers[0]]
    generated_data.append({"book_title": title, "publisher": publisher_node['features']['name']})
    if len(generated_data) == k:
        break

# Register the samples under their template pair.
all_generated_data[(question, answer)] = generated_data

###  Which shelves do we need to put book xxx on?

In [None]:
question = "Which shelves do we need to put the book {book_title} on?"
answer = "{shelves}"

random.seed(2025)
book_ids = list(graph['book_nodes'])
random.shuffle(book_ids)

# No filtering here: the first k shuffled books are used as-is.
generated_data = []
for sampled_id in book_ids[:k]:
    features = graph['book_nodes'][sampled_id]['features']
    generated_data.append({
        "book_title": features['title'],
        "shelves": ', '.join(features['popular_shelves']),
    })

all_generated_data[(question, answer)] = generated_data


###  What genre does the book xxx belong to?

In [None]:
question = "What genre does the book {book_title} belong to?"
answer = "{genres}"

random.seed(2026)
book_ids = list(graph['book_nodes'])
random.shuffle(book_ids)

generated_data = []
for candidate_id in book_ids:
    features = graph['book_nodes'][candidate_id]['features']
    title, genre_list = features['title'], features['genres']
    if not genre_list:
        # Books with an empty genre list are skipped.
        continue
    generated_data.append({"book_title": title, "genres": ', '.join(genre_list)})
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

### In which series is the book xxx included?

In [None]:
question = "In which series is the book {book_title} included?"
answer = "{series_title}"

random.seed(2027)
book_ids = list(graph['book_nodes'])
random.shuffle(book_ids)

generated_data = []
for candidate_id in book_ids:
    node = graph['book_nodes'][candidate_id]
    title = node['features']['title']
    linked_series = node['neighbors']['series']
    if linked_series:
        # Every linked series is listed; the answer is their comma-joined titles.
        joined_titles = ', '.join(
            graph['series_nodes'][sid]['features']['title'] for sid in linked_series
        )
        generated_data.append({"book_title": title, "series_title": joined_titles})
        if len(generated_data) == k:
            break

all_generated_data[(question, answer)] = generated_data

### What is the publication year of book xxx?

In [None]:
question = "What is the publication year of the book {book_title}?"
answer = "{publication_year}"

random.seed(2028)
book_ids = list(graph['book_nodes'])
random.shuffle(book_ids)

generated_data = []
for candidate_id in book_ids:
    features = graph['book_nodes'][candidate_id]['features']
    title = features['title']
    year = features['publication_year']
    if not year:
        # A falsy year (e.g. empty) means there is nothing to answer with.
        continue
    generated_data.append({"book_title": title, "publication_year": year})
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data


### How many pages does the book xxx have?

In [None]:
question = "How many pages does the book {book_title} have?"
answer = "{num_pages}"

random.seed(2029)
book_ids = list(graph['book_nodes'])
random.shuffle(book_ids)

generated_data = []
for candidate_id in book_ids:
    features = graph['book_nodes'][candidate_id]['features']
    title = features['title']
    page_count = features['num_pages']
    if page_count:
        # Only books with a truthy page count yield a sample.
        generated_data.append({"book_title": title, "num_pages": page_count})
        if len(generated_data) == k:
            break

all_generated_data[(question, answer)] = generated_data

### Is the book xxx an eBook?

In [None]:
question = "Is the book {book_title} an eBook?"
answer = "{is_ebook}"

random.seed(2030)
book_ids = list(graph['book_nodes'])
random.shuffle(book_ids)

generated_data = []
for candidate_id in book_ids:
    features = graph['book_nodes'][candidate_id]['features']
    # The flag is stored exactly as it appears in the graph; nothing is filtered.
    generated_data.append({"book_title": features['title'], "is_ebook": features['is_ebook']})
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

### What language is the book xxx written in?

In [None]:
question = "What language is the book {book_title} written in?"
answer = "{language_code}"

random.seed(2031)
book_ids = list(graph['book_nodes'])
random.shuffle(book_ids)

generated_data = []
for candidate_id in book_ids:
    features = graph['book_nodes'][candidate_id]['features']
    title = features['title']
    code = features['language_code']
    if not code:
        # Skip books whose language code is empty/falsy.
        continue
    generated_data.append({"book_title": title, "language_code": code})
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

## Multi-hop reasoning (medium)
### Find the book written by the same author and published by the same publisher as book xxx

In [None]:
random.seed(2032)

# Preprocessing: index books by author and by publisher for quick lookup.
# Only the first publisher of each book is indexed.
author_to_books = {}
publisher_to_books = {}

for book_id, book in graph['book_nodes'].items():
    for author_id in book['neighbors']['author']:
        author_to_books.setdefault(author_id, []).append(book_id)
    if book['neighbors']['publisher']:
        publisher_id = book['neighbors']['publisher'][0]
        publisher_to_books.setdefault(publisher_id, []).append(book_id)

# Generating the question-answer pairs
question = "Find books written by the same author and published by the same publisher as the book {book_title}?"
answer = "{matching_book_titles}"
generated_data = []

book_ids = list(graph['book_nodes'].keys())
random.shuffle(book_ids)

for book_id in book_ids:
    target_book = graph['book_nodes'][book_id]
    target_book_title = target_book['features']['title']
    target_author_ids = target_book['neighbors']['author']
    target_publisher_ids = target_book['neighbors']['publisher']

    if not target_publisher_ids:
        continue

    # Built once per target book: set membership is O(1), whereas testing
    # against the publisher's list rescanned it for every candidate book.
    same_publisher_books = set(publisher_to_books.get(target_publisher_ids[0], ()))

    # A set removes duplicate titles (e.g. a book reached through two co-authors).
    matching_book_titles = set()
    for author_id in target_author_ids:
        for other_book_id in author_to_books.get(author_id, []):
            other_book_title = graph['book_nodes'][other_book_id]['features']['title']
            # Compared by title, so same-titled entries of the target are excluded too.
            if other_book_title != target_book_title and other_book_id in same_publisher_books:
                matching_book_titles.add(other_book_title)

    # Keep only questions with between 1 and 30 distinct answers.
    if not matching_book_titles or len(matching_book_titles) > 30:
        continue

    # sorted() makes the answer string reproducible: iteration order of a set
    # of strings changes between interpreter runs (hash randomisation), which
    # would defeat the fixed random seed above.
    generated_data.append({"book_title": target_book_title, "matching_book_titles": ', '.join(sorted(matching_book_titles))})

    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data


### Find books by the same author and share similar genre with book xxx

In [None]:
random.seed(2033)

question = "Find books by the same author and share similar genre with book {book_title}?"
answer = "{matching_book_title}"
generated_data = []

book_ids = list(graph['book_nodes'].keys())
random.shuffle(book_ids)
# Titles already used as a question, so the same title is never asked twice.
target_book_set = set()

for book_id in book_ids:
    target_book = graph['book_nodes'][book_id]
    target_book_title = target_book['features']['title']
    target_author_ids = target_book['neighbors']['author']
    target_genres = set(target_book['features']['genres'])

    if target_book_title in target_book_set:
        continue

    # Collect distinct titles of other books by any of the target's authors
    # that share at least one genre with the target.
    matching_book_titles = set()
    for author_id in target_author_ids:
        for author_book_id in graph['author_nodes'][author_id]['neighbors']['book']:
            author_book = graph['book_nodes'][author_book_id]
            author_book_title = author_book['features']['title']
            if author_book_title == target_book_title:
                continue
            if target_genres & set(author_book['features']['genres']):
                matching_book_titles.add(author_book_title)

    # Keep only questions with between 1 and 30 distinct answers.
    if not matching_book_titles or len(matching_book_titles) > 30:
        continue

    # sorted() keeps the answer string stable across runs; plain set iteration
    # order over strings varies with hash randomisation despite the fixed seed.
    generated_data.append({"book_title": target_book_title, "matching_book_title": ', '.join(sorted(matching_book_titles))})
    target_book_set.add(target_book_title)

    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

### Find the earliest book written by the author of the book xxx.

In [None]:
random.seed(2034)

# Preprocessing: map each author to (book_id, publication_year) pairs.
# Books without a (truthy) publication year are left out.
author_to_books_with_year = {}

for book_id, book in graph['book_nodes'].items():
    pub_year = book['features'].get('publication_year')
    if pub_year:
        for author_id in book['neighbors']['author']:
            author_to_books_with_year.setdefault(author_id, []).append((book_id, int(pub_year)))

# Sort each author's books by year; the sort is stable, so books from the
# same year keep their graph order.
for author_id in author_to_books_with_year:
    author_to_books_with_year[author_id].sort(key=lambda x: x[1])

# Generating the question-answer pairs
question = "Find the earliest book written by the author of the book {book_title}?"
answer = "{earliest_book_title}"
generated_data = []

book_ids = list(graph['book_nodes'].keys())
random.shuffle(book_ids)

for book_id in book_ids:
    target_book = graph['book_nodes'][book_id]
    target_book_title = target_book['features']['title']
    author_ids = target_book['neighbors']['author']

    earliest_books = []
    for author_id in author_ids:
        author_books = author_to_books_with_year.get(author_id, [])
        if author_books:
            earliest_book_id, _ = author_books[0]  # Earliest dated book of this author
            earliest_book_title = graph['book_nodes'][earliest_book_id]['features']['title']
            # Skip the target itself, and skip a title already collected:
            # co-authors whose earliest book is the same one would otherwise
            # repeat it in the answer ("Z, Z").
            if earliest_book_title != target_book_title and earliest_book_title not in earliest_books:
                earliest_books.append(earliest_book_title)

    if earliest_books:
        generated_data.append({"book_title": target_book_title, "earliest_book_title": ', '.join(earliest_books)})

    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

### Find the series in which the same author as the book xxx has contributed, but the series is different from the book's series.

In [None]:
random.seed(2035)

# Preprocessing: map each author to the set of series ids their books belong to.
author_to_series = {}

for book_id, book in graph['book_nodes'].items():
    for author_id in book['neighbors']['author']:
        for series_id in book['neighbors']['series']:
            author_to_series.setdefault(author_id, set()).add(series_id)

# Generating the question-answer pairs
question = "Find the series in which the same author as the book {book_title} has contributed, but the series is different from the book's series."
answer = "{series_titles}"
generated_data = []

book_ids = list(graph['book_nodes'].keys())
random.shuffle(book_ids)

for book_id in book_ids:
    target_book = graph['book_nodes'][book_id]
    target_book_title = target_book['features']['title']
    target_author_ids = target_book['neighbors']['author']
    target_series_ids = set(target_book['neighbors']['series'])

    # Titles of every series by the target's authors, minus the target's own series.
    other_series_titles = set()
    for author_id in target_author_ids:
        for series_id in author_to_series.get(author_id, set()) - target_series_ids:
            other_series_titles.add(graph['series_nodes'][series_id]['features']['title'])

    if other_series_titles:
        # sorted() gives a reproducible answer string; joining the set
        # directly yields an order that changes between interpreter runs
        # (string hash randomisation), regardless of the random seed.
        generated_data.append({"book_title": target_book_title, "series_titles": ', '.join(sorted(other_series_titles))})

    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

## Degree-based reasoning (easy)

### How many books has author xxx written?

In [None]:
question = "How many books has author {author_name} written?"
answer = "{count}"

random.seed(2036)
author_ids = list(graph['author_nodes'])
random.shuffle(author_ids)

generated_data = []
for sampled_author in author_ids:
    author_node = graph['author_nodes'][sampled_author]
    # The answer is the author's degree towards book nodes.
    sample = {
        "author_name": author_node['features']['name'],
        "count": len(author_node['neighbors']['book']),
    }
    # The cap is tested before appending, so at most k samples are kept.
    if len(generated_data) == k:
        break
    generated_data.append(sample)

all_generated_data[(question, answer)] = generated_data

### How many similar books does Book xxx have?

In [None]:
question = "How many similar books does the book {book_title} have?"
answer = "{number_of_similar_books}"

random.seed(2037)
book_ids = list(graph['book_nodes'])
random.shuffle(book_ids)

generated_data = []
for candidate_id in book_ids:
    node = graph['book_nodes'][candidate_id]
    title = node['features']['title']
    # .get() tolerates nodes that have no 'similar_books' entry at all.
    neighbours = node['neighbors'].get('similar_books')
    if neighbours:
        generated_data.append({"book_title": title, "number_of_similar_books": len(neighbours)})
        if len(generated_data) == k:
            break

all_generated_data[(question, answer)] = generated_data

### How many books does publisher xxx publish?

In [None]:
question = "How many books does publisher {publisher_name} publish?"
answer = "{count}"

random.seed(2038)
publisher_ids = list(graph['publisher_nodes'])
random.shuffle(publisher_ids)

generated_data = []
for sampled_publisher in publisher_ids:
    publisher_node = graph['publisher_nodes'][sampled_publisher]
    # The answer is the publisher's degree towards book nodes.
    sample = {
        "publisher_name": publisher_node['features']['name'],
        "count": len(publisher_node['neighbors']['book']),
    }
    # Stop before appending once k samples have been collected.
    if len(generated_data) == k:
        break
    generated_data.append(sample)

all_generated_data[(question, answer)] = generated_data

### How many books are part of the series {series_title}?

In [None]:
question = "How many books are part of the series {series_title}?"
answer = "{count}"

random.seed(2039)
series_ids = list(graph['series_nodes'])
random.shuffle(series_ids)

generated_data = []
for sampled_series in series_ids:
    series_node = graph['series_nodes'][sampled_series]
    # The answer is the number of book neighbours of the series node.
    sample = {
        "series_title": series_node['features']['title'],
        "count": len(series_node['neighbors']['book']),
    }
    # Stop before appending once k samples have been collected.
    if len(generated_data) == k:
        break
    generated_data.append(sample)

all_generated_data[(question, answer)] = generated_data

## Complex structure reasoning (medium)

### How many authors have collaborated with the publisher {publisher_name}?

In [None]:
question = "How many authors have collaborated with the publisher {publisher_name}?"
answer = "{count}"

random.seed(2040)
publisher_ids = list(graph['publisher_nodes'])
random.shuffle(publisher_ids)

generated_data = []
for sampled_publisher in publisher_ids:
    publisher_node = graph['publisher_nodes'][sampled_publisher]
    label = publisher_node['features']['name']
    # Two hops: publisher -> its books -> their authors, de-duplicated by id.
    collaborators = {
        author
        for published_id in publisher_node['neighbors']['book']
        for author in graph['book_nodes'][published_id]['neighbors']['author']
    }
    # Stop before appending once k samples have been collected.
    if len(generated_data) == k:
        break
    generated_data.append({"publisher_name": label, "count": len(collaborators)})

all_generated_data[(question, answer)] = generated_data

### Which author has the most published books that have the same genre as the book {book_title}?

In [None]:
random.seed(2041)

# Preprocessing: for each (author, genre) pair, count the author's books in
# that genre. Genres are de-duplicated per book before counting.
author_genre_counts = {}
for book in graph['book_nodes'].values():
    distinct_genres = set(book['features']['genres'])
    for writer in book['neighbors']['author']:
        for genre_name in distinct_genres:
            pair = (writer, genre_name)
            author_genre_counts[pair] = author_genre_counts.get(pair, 0) + 1

# Question/answer templates
question = "Which author has the most published books that have the same genre as the book {book_title}?"
answer = "{author_name}"
generated_data = []

book_ids = list(graph['book_nodes'])
random.shuffle(book_ids)
# How many times each author has already been used as an answer.
exist_answer = defaultdict(int)

for sampled_id in book_ids:
    node = graph['book_nodes'][sampled_id]
    title = node['features']['title']
    wanted_genres = set(node['features']['genres'])

    # Scan the authors in graph order and keep the first one holding the
    # highest single-genre count among the book's genres (strict '>' means
    # earlier authors win ties).
    best_count = 0
    best_author = None
    for candidate in graph['author_nodes']:
        candidate_best = max(
            (author_genre_counts.get((candidate, g), 0) for g in wanted_genres),
            default=0,
        )
        if candidate_best > best_count:
            best_count = candidate_best
            best_author = candidate

    # Each author may serve as the answer at most five times.
    if best_author and exist_answer[best_author] < 5:
        exist_answer[best_author] += 1
        generated_data.append({
            "book_title": title,
            "author_name": graph['author_nodes'][best_author]['features']['name'],
        })

    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

### What is the most common publication format of books by author {author_name}?

In [None]:
random.seed(2042)

# Preprocessing: count how often each (author, format) pair occurs.
# Books with a missing or empty 'format' feature are ignored.
author_to_format_frequency = {}

for book_id, book in graph['book_nodes'].items():
    for author_id in book['neighbors']['author']:
        # Named book_format rather than `format`: a module-level `format`
        # would shadow the builtin for every later cell in the kernel.
        book_format = book['features'].get('format')
        if book_format:
            author_to_format_frequency.setdefault((author_id, book_format), 0)
            author_to_format_frequency[(author_id, book_format)] += 1

# Generating the question-answer pairs
question = "What is the most common publication format of books by author {author_name}?"
answer = "{common_format}"
generated_data = []

author_ids = list(graph['author_nodes'].keys())
random.shuffle(author_ids)

for author_id in author_ids:
    author_name = graph['author_nodes'][author_id]['features']['name']
    # Restrict the global counts to this author's formats.
    formats_frequency = {fmt: freq for (auth_id, fmt), freq in author_to_format_frequency.items() if auth_id == author_id}
    if not formats_frequency:
        # Author has no book with a known format.
        continue
    # max() returns the first format reaching the highest count.
    common_format = max(formats_frequency, key=formats_frequency.get)
    generated_data.append({"author_name": author_name, "common_format": common_format})
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

### What is the most frequent genre in the works of the author {author_name}?

In [None]:
random.seed(2043)

# Preprocessing: tally every (author, genre) occurrence over all books.
author_to_genre_frequency = {}
for book in graph['book_nodes'].values():
    for writer in book['neighbors']['author']:
        for genre_name in book['features']['genres']:
            pair = (writer, genre_name)
            author_to_genre_frequency[pair] = author_to_genre_frequency.get(pair, 0) + 1

question = "What is the most frequent genre in the works of the author {author_name}?"
answer = "{common_genre}"
generated_data = []

author_ids = list(graph['author_nodes'])
random.shuffle(author_ids)

for sampled_author in author_ids:
    display_name = graph['author_nodes'][sampled_author]['features']['name']
    # Slice the global tally down to this author's genres.
    genres_frequency = {
        g: n
        for (owner, g), n in author_to_genre_frequency.items()
        if owner == sampled_author
    }
    if not genres_frequency:
        # Authors without any genre-tagged book are skipped.
        continue
    # First genre reaching the highest count wins ties.
    common_genre = max(genres_frequency.items(), key=lambda item: item[1])[0]
    generated_data.append({"author_name": display_name, "common_genre": common_genre})
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

### Which publisher has released the majority of books in the genre {genre_name}?

In [None]:
random.seed(2044)
# Preprocessing: count the number of books each publisher has released in each genre
publisher_genre_counts = {}

for book_id, book in graph['book_nodes'].items():
    genres = book['features']['genres']
    publisher_ids = book['neighbors']['publisher']
    for publisher_id in publisher_ids:
        for genre in genres:
            publisher_genre_counts.setdefault((publisher_id, genre), 0)
            publisher_genre_counts[(publisher_id, genre)] += 1

# Generating the question-answer pairs
question = "Which publisher has released the majority of books in the genre {genre_name}?"
answer = "{publisher_name}"
generated_data = []

# Every distinct genre that appears on any book.
genres = set()
for book in graph['book_nodes'].values():
    genres.update(book['features']['genres'])

# Iterating the set directly visits genres in an order that changes between
# interpreter runs (string hash randomisation), so which k genres get sampled
# was not reproducible and the seed above was never consumed. Sorting first
# and then shuffling with the seeded RNG gives a fixed random order.
genre_order = sorted(genres)
random.shuffle(genre_order)

for genre in genre_order:
    # First publisher (in graph order) with the highest count for this genre.
    max_books = 0
    top_publisher_id = None
    for publisher_id in graph['publisher_nodes']:
        count = publisher_genre_counts.get((publisher_id, genre), 0)
        if count > max_books:
            max_books = count
            top_publisher_id = publisher_id

    if top_publisher_id:
        top_publisher_name = graph['publisher_nodes'][top_publisher_id]['features']['name']
        generated_data.append({"genre_name": genre, "publisher_name": top_publisher_name})

    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

### What is the most common language among the books written by author {author_name}?

In [None]:
random.seed(2045)

# Preprocessing: tally (author, language_code) occurrences; books with a
# missing or empty language code contribute nothing.
author_language_frequency = {}
for book in graph['book_nodes'].values():
    code = book['features'].get('language_code')
    if not code:
        continue
    for writer in book['neighbors']['author']:
        pair = (writer, code)
        author_language_frequency[pair] = author_language_frequency.get(pair, 0) + 1

# Question/answer templates
question = "What is the most common language among the books written by author {author_name}?"
answer = "{language}"
generated_data = []

author_ids = list(graph['author_nodes'])
random.shuffle(author_ids)

for sampled_author in author_ids:
    display_name = graph['author_nodes'][sampled_author]['features']['name']
    # Slice the global tally down to this author's languages.
    languages_frequency = {
        lang: n
        for (owner, lang), n in author_language_frequency.items()
        if owner == sampled_author
    }
    if languages_frequency:
        # First language reaching the highest count wins ties.
        top_language = max(languages_frequency.items(), key=lambda item: item[1])[0]
        generated_data.append({"author_name": display_name, "language": top_language})
        if len(generated_data) == k:
            break

all_generated_data[(question, answer)] = generated_data

## Inductive reasoning (hard)
### Recommendation - What book should be recommended to the user based on his history: {book_titles}?

In [None]:
def date2num(date):
    """Turn a six-field timestamp string into a sortable integer YYYYMMDDHHMMSS.

    The input must consist of six whitespace-separated fields laid out as
    ``<ignored> <Mon> <day> <HH:MM:SS> <ignored> <year>``, for example
    ``'Sun Jul 30 07:44:10 -0700 2017'``. The first and fifth fields
    (presumably weekday and UTC offset) are discarded, so values are only
    comparable when they share the same offset.

    Args:
        date: Timestamp string in the layout above; the month is a
            three-letter English abbreviation.

    Returns:
        int: year, month, day and time digits concatenated, e.g. 20170730074410.

    Raises:
        ValueError: if the string does not have exactly six fields.
        KeyError: if the month abbreviation is not recognised.
    """
    MONTH2NUM = {'Jan': '01',
                 'Feb': '02',
                 'Mar': '03',
                 'Apr': '04',
                 'May': '05',
                 'Jun': '06',
                 'Jul': '07',
                 'Aug': '08',
                 'Sep': '09',
                 'Oct': '10',
                 'Nov': '11',
                 'Dec': '12'}

    # split() without a separator collapses runs of whitespace, so a
    # space-padded day ('Jul  3') still yields six fields; split(' ') would
    # produce an extra empty field and break the unpacking.
    _, month, day, time, _, year = date.split()
    # zfill keeps single-digit days two characters wide; otherwise the
    # resulting integer would be one digit short and sort incorrectly.
    tmp_str = year + MONTH2NUM[month] + day.zfill(2) + time.replace(':', '')
    return int(tmp_str)

In [None]:
import json
import gzip
from collections import defaultdict
from tqdm import tqdm

# Function to load reviews
def load_reviews(file_path):
    """Group the reviews of a gzipped JSON-lines file by user.

    Args:
        file_path: Path to a gzip archive holding one JSON object per line;
            each object must provide 'user_id', 'date_added' and 'book_id'.

    Returns:
        defaultdict(list): user_id -> list of
        (date_added, date2num(date_added), book_id) tuples in file order.
    """
    user_history = defaultdict(list)
    with gzip.open(file_path, 'rb') as f:
        # Iterate the handle directly so lines are decompressed and parsed
        # one at a time; readlines() kept every line of the archive in memory
        # before any was processed. The progress bar therefore has no total.
        for line in tqdm(f):
            review = json.loads(line.decode('utf-8'))
            date_added = review['date_added']
            user_history[review['user_id']].append((date_added, date2num(date_added), review['book_id']))
    return user_history

# Load and preprocess reviews
user_history = load_reviews('/shared/data3/bowenj4/llm-graph-plugin/data/raw_data/goodreads/goodreads_reviews_dedup.json.gz')

In [None]:
# random.seed(2046)
# from collections import Counter
# # Function to find similar books for a given book
# def find_similar_books(book_id, graph):
#     return graph['book_nodes'][book_id]['neighbors']['similar_books']

# # Generating recommendations based on user history
# def generate_recommendations(user_book_ids, graph, top_n=5):
#     # Count the frequency of each similar book across all books in user history
#     similar_book_freq = Counter()
#     for book_id in user_book_ids:
#         similar_books = find_similar_books(book_id, graph)
#         similar_book_freq.update(similar_books)

#     # Select top N recommendations
#     top_recommendations = [book_id for book_id, _ in similar_book_freq.most_common(top_n)]
#     return top_recommendations

# # Question-answer pair generation
# question = "What book should be recommended to the user based on his history: {book_titles}?"
# answer = "{recommended_books}"
# generated_data = []

# random.shuffle(user_history)

# for use_id in tqdm(user_history):
#     if not user_history[use_id]:
#         continue
    
#     user_book_ids = ['book-' + book_id for _, _, book_id in user_history[use_id][-7:-1]]
#     user_book_titles = [graph['book_nodes'][book_id]['features']['title'] for book_id in user_book_ids]
#     recommended_book_ids = generate_recommendations(user_book_ids, graph, 100)
#     if user_history[use_id][-1] not in recommended_book_ids:
#         continue
    
#     recommended_book_title = 'book-' + graph['book_nodes'][user_history[use_id][-1]]['features']['title']
#     generated_data.append({"book_titles": ', '.join(user_book_titles), "recommended_books": recommended_book_title})
#     print(len(generated_data))
    
#     #recommended_book_titles = [graph['book_nodes'][book_id]['features']['title'] for book_id in recommended_book_ids]
#     #generated_data.append({"book_titles": ', '.join(user_book_titles), "recommended_books": ', '.join(recommended_book_titles)})
#     if len(generated_data) == k:
#         break
# all_generated_data[(question, answer)] = generated_data

In [None]:
random.seed(2046)

# NOTE: the placeholder/key is spelled 'targe_book_title'; the template and
# the sample dict key below must stay identical for formatting to work.
question = "What book should be recommended to the user based on his history: {book_titles}?"
answer = "{targe_book_title}"
generated_data = []

user_ids = list(user_history.keys())
random.shuffle(user_ids)

for user_id in user_ids:
    # Order the user's reviews chronologically by the numeric timestamp.
    # sorted() works on a copy, so user_history is not mutated by this cell.
    tmp_history = sorted(user_history[user_id], key=lambda x: x[1])

    # At least one history item plus one target is required.
    if len(tmp_history) < 2:
        continue

    # Up to five most recent books before the last one form the history;
    # the last reviewed book is the recommendation target.
    history_ids = ['book-' + entry[-1] for entry in tmp_history[-6:-1]]
    target_id = 'book-' + tmp_history[-1][-1]

    # Skip users who reviewed a book that is absent from the graph instead of
    # aborting the whole cell with a KeyError.
    if target_id not in graph['book_nodes'] or any(bid not in graph['book_nodes'] for bid in history_ids):
        continue

    book_titles = [graph['book_nodes'][bid]['features']['title'] for bid in history_ids]
    targe_book_title = graph['book_nodes'][target_id]['features']['title']

    # Store the history as a comma-joined string, like every other sample set
    # in this notebook, so the question template renders plain text rather
    # than a Python list repr.
    generated_data.append({"book_titles": ', '.join(book_titles), "targe_book_title": targe_book_title})

    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

In [None]:
## save
# The context manager flushes and closes the output file even if pickling
# fails; the bare open() call left the handle to the garbage collector.
with open('preprocess_samples.pkl', 'wb') as out_file:
    pickle.dump(all_generated_data, out_file)

print('Saving file of #questions, ', len(all_generated_data))