<a href="https://colab.research.google.com/github/Pearlkakande/machinelearning/blob/main/M8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -r /content/bookrecommendations/requirements-file.txt


Ignoring pickle5: markers 'python_version < "3.8"' don't match your environment
Collecting streamlit>=1.10.0 (from -r /content/bookrecommendations/requirements-file.txt (line 8))
  Downloading streamlit-1.42.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting datasets>=2.0.0 (from -r /content/bookrecommendations/requirements-file.txt (line 9))
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting torch-geometric>=2.0.0 (from -r /content/bookrecommendations/requirements-file.txt (line 12))
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-scatter>=2.0.0 (from -r /content/bookrecommendations/requirements-file.txt (line 13))
  Downloading torch_scatter-2.1.2.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Prepari

training the model

In [4]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn.functional as F
from torch_geometric.data import HeteroData, Data
from torch_geometric.nn import GCNConv, GATConv, SAGEConv
from torch_geometric.utils import to_undirected
from transformers import DistilBertTokenizer, DistilBertModel
import matplotlib.pyplot as plt
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

# Load the dataset
def load_data():
    try:
        from datasets import load_dataset
        dataset = load_dataset("Eitanli/goodreads")["train"]
        df = pd.DataFrame(dataset)
        print(f"Loaded dataset with {len(df)} books")
        return df
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

# Preprocess the data
def preprocess_data(df):
    # Fill NA values
    df['Description'] = df['Description'].fillna('')
    df['Genres'] = df['Genres'].fillna('')

    # Convert Num_Ratings to numeric - handle both string and numeric formats
    if df['Num_Ratings'].dtype == object:  # Only apply string operations if it's an object dtype
        df['Num_Ratings'] = df['Num_Ratings'].apply(lambda x: str(x).replace(',', '')).astype(float)
    else:
        # It's already numeric, no need to convert
        pass

    # Create a combined text field for content-based filtering
    df['combined_features'] = df['Book'] + ' ' + df['Author'] + ' ' + df['Description'] + ' ' + df['Genres']

    return df

# Create feature embeddings using Sentence Transformer
def create_embeddings(df):
    print("Creating embeddings with Sentence Transformer...")
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Create embeddings for descriptions and store in the dataframe
    descriptions = df['Description'].tolist()
    desc_embeddings = model.encode(descriptions, show_progress_bar=True)

    # Create embeddings for book titles and authors
    titles = df['Book'].tolist()
    title_embeddings = model.encode(titles, show_progress_bar=True)

    # Create embeddings for genres
    genres = df['Genres'].tolist()
    genre_embeddings = model.encode(genres, show_progress_bar=True)

    # Save embeddings to disk
    embeddings = {
        'description': desc_embeddings,
        'title': title_embeddings,
        'genre': genre_embeddings
    }

    with open('book_embeddings.pkl', 'wb') as f:
        pickle.dump(embeddings, f)

    return embeddings

# Alternative: TF-IDF + SVD for text feature extraction (faster but less semantic)
def create_tfidf_features(df):
    print("Creating TF-IDF features...")
    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    svd = TruncatedSVD(n_components=100)

    pipeline = Pipeline([
        ('tfidf', tfidf),
        ('svd', svd)
    ])

    features = pipeline.fit_transform(df['combined_features'])

    with open('tfidf_pipeline.pkl', 'wb') as f:
        pickle.dump(pipeline, f)

    return features

# Generate recommendations based on genres and description
def recommend_books(query_genre, query_description, df, embeddings=None, top_n=10):
    """
    Recommend books based on a query genre and description

    Parameters:
    query_genre (str): The genre preferred by the user
    query_description (str): A description of what the user is looking for
    df (DataFrame): The books dataframe
    embeddings (dict): Pre-computed embeddings dictionary
    top_n (int): Number of recommendations to return

    Returns:
    DataFrame: Top N recommended books
    """
    # Load model to encode queries
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Encode the queries
    genre_query_embedding = model.encode([query_genre])[0]
    desc_query_embedding = model.encode([query_description])[0]

    # If no pre-computed embeddings, create them now
    if embeddings is None:
        if os.path.exists('book_embeddings.pkl'):
            with open('book_embeddings.pkl', 'rb') as f:
                embeddings = pickle.load(f)
        else:
            embeddings = create_embeddings(df)

    # Calculate similarities
    genre_similarities = cosine_similarity([genre_query_embedding], embeddings['genre'])[0]
    desc_similarities = cosine_similarity([desc_query_embedding], embeddings['description'])[0]

    # Combine similarities (can adjust weights based on importance)
    combined_similarities = 0.6 * genre_similarities + 0.4 * desc_similarities

    # Get top N recommendations
    top_indices = combined_similarities.argsort()[-top_n:][::-1]

    # Return the recommended books
    recommendations = df.iloc[top_indices].copy()
    recommendations['similarity_score'] = combined_similarities[top_indices]

    return recommendations[['Book', 'Author', 'Genres', 'Avg_Rating', 'similarity_score']]

# Alternative recommendation function using TF-IDF
def recommend_books_tfidf(query_genre, query_description, df, pipeline=None, features=None, top_n=10):
    # Load pipeline if not provided
    if pipeline is None:
        if os.path.exists('tfidf_pipeline.pkl'):
            with open('tfidf_pipeline.pkl', 'rb') as f:
                pipeline = pickle.load(f)
        else:
            features = create_tfidf_features(df)
            with open('tfidf_pipeline.pkl', 'rb') as f:
                pipeline = pickle.load(f)

    # Combine query inputs
    query = f"{query_genre} {query_description}"

    # Transform query using the same pipeline
    query_vec = pipeline.transform([query])

    # Calculate similarity
    if features is None:
        features = pipeline.transform(df['combined_features'])

    similarities = cosine_similarity(query_vec, features)[0]

    # Get top N recommendations
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Return the recommended books
    recommendations = df.iloc[top_indices].copy()
    recommendations['similarity_score'] = similarities[top_indices]

    return recommendations[['Book', 'Author', 'Genres', 'Avg_Rating', 'similarity_score']]

# Build a heterogeneous graph for GNN-based recommendation
def build_heterogeneous_graph(df):
    print("Building heterogeneous book graph...")
    data = HeteroData()

    # Create book nodes with description embeddings as features
    model_st = SentenceTransformer('all-MiniLM-L6-v2')
    df['desc_emb'] = df['Description'].fillna("").apply(lambda x: model_st.encode(x))
    book_emb = np.stack(df['desc_emb'].values)
    data['book'].x = torch.tensor(book_emb, dtype=torch.float)

    # Create mappings for authors and genres
    authors = list(df['Author'].unique())
    unique_genres = set()
    for genre_str in df['Genres'].fillna(""):
        genres = genre_str.split()
        for g in genres:
            if len(g) > 3:  # Filter out short words
                unique_genres.add(g.replace(',', ''))

    genres = list(unique_genres)
    author2id = {a: i for i, a in enumerate(authors)}
    genre2id = {g: i for i, g in enumerate(genres)}

    # Create author nodes
    data['author'].num_nodes = len(authors)
    data['author'].x = F.one_hot(torch.arange(len(authors)), num_classes=len(authors)).float()

    # Create genre nodes
    data['genre'].num_nodes = len(genres)
    data['genre'].x = F.one_hot(torch.arange(len(genres)), num_classes=len(genres)).float()

    # Build edges: book -> author and book -> genre
    book_author_edges = []
    for i, author in enumerate(df['Author']):
        if author in author2id:
            book_author_edges.append((i, author2id[author]))

    book_genre_edges = []
    for i, genre_str in enumerate(df['Genres'].fillna("")):
        genres = genre_str.split()
        for g in genres:
            g = g.replace(',', '')
            if g in genre2id:
                book_genre_edges.append((i, genre2id[g]))

    # Convert to tensors
    if book_author_edges:
        book_author_edges = torch.tensor(book_author_edges, dtype=torch.long).t()
        data['book', 'written_by', 'author'].edge_index = book_author_edges

    if book_genre_edges:
        book_genre_edges = torch.tensor(book_genre_edges, dtype=torch.long).t()
        data['book', 'has_genre', 'genre'].edge_index = book_genre_edges

    # Create book-to-book similarity edges based on description embeddings
    from sklearn.metrics.pairwise import cosine_similarity
    sim_matrix = cosine_similarity(book_emb)

    # Add edges for pairs with similarity > 0.8 (high threshold to keep graph sparse)
    sim_threshold = 0.8
    src, dst = np.where(sim_matrix > sim_threshold)

    # Remove self-loops
    mask = src != dst
    if np.any(mask):
        src_filtered = src[mask]
        dst_filtered = dst[mask]
        # Convert to numpy arrays first to avoid the slow list warning
        src_dst_array = np.vstack((src_filtered, dst_filtered))
        book_book_edges = torch.tensor(src_dst_array, dtype=torch.long)
        data['book', 'similar_to', 'book'].edge_index = book_book_edges

    # Add popularity features - safely handle numeric or already converted columns
    # No need to convert Num_Ratings since it should already be numeric from preprocess_data
    data['book'].ratings_count = torch.tensor(df['Num_Ratings'].fillna(0).values, dtype=torch.float)

    # Convert Avg_Rating to numeric if it's not already
    if df['Avg_Rating'].dtype == object:
        df['Avg_Rating'] = pd.to_numeric(df['Avg_Rating'], errors='coerce')
    data['book'].avg_rating = torch.tensor(df['Avg_Rating'].fillna(0).values, dtype=torch.float)

    # Store ID mappings with the graph
    data.author2id = author2id
    data.genre2id = genre2id
    data.id2author = {v: k for k, v in author2id.items()}
    data.id2genre = {v: k for k, v in genre2id.items()}

    return data, df

# Define GNN model for recommendation
class GNNRecommender(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels=64):
        super(GNNRecommender, self).__init__()
        # Book encoder
        self.book_encoder = torch.nn.Sequential(
            GCNConv(in_channels, hidden_channels),
            torch.nn.ReLU(),
            GCNConv(hidden_channels, hidden_channels)
        )

        # Author encoder
        self.author_encoder = torch.nn.Sequential(
            GCNConv(in_channels, hidden_channels),
            torch.nn.ReLU()
        )

        # Genre encoder
        self.genre_encoder = torch.nn.Sequential(
            GCNConv(in_channels, hidden_channels),
            torch.nn.ReLU()
        )

        # Attention-based aggregation
        self.attn = GATConv(hidden_channels, hidden_channels, heads=2)

    def encode_books(self, data):
        # Extract embeddings for books
        book_x = data['book'].x

        # Check if the required edge_index exists
        if hasattr(data['book', 'similar_to', 'book'], 'edge_index'):
            book_x = self.book_encoder[0](book_x, data['book', 'similar_to', 'book'].edge_index)
            book_x = self.book_encoder[1](book_x)
            book_x = self.book_encoder[2](book_x, data['book', 'similar_to', 'book'].edge_index)
        else:
            # No book-book edges, just use linear layers instead
            print("Warning: No book-book edges found. Using linear transformation instead of GCN.")
            book_x = torch.nn.Linear(book_x.size(1), hidden_channels)(book_x)
            book_x = torch.nn.ReLU()(book_x)
            book_x = torch.nn.Linear(hidden_channels, hidden_channels)(book_x)

        return book_x

    def forward(self, data):
        # Process book, author, and genre nodes
        book_emb = self.encode_books(data)

        # You can add more complex message passing here

        return book_emb

# Train GNN model
def train_gnn_model(data, epochs=5):
    print("Training GNN recommendation model...")

    # Get the dimensions from the data
    if hasattr(data['book'], 'x'):
        in_channels = data['book'].x.size(1)
    else:
        print("Error: 'book' nodes don't have features")
        return None

    # Initialize model with proper dimensions
    model = GNNRecommender(in_channels=in_channels)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Define a simple loss: books with similar features should have similar embeddings
    def loss_function(book_emb, edge_index):
        # Sample positive and negative pairs
        src, dst = edge_index
        pos_scores = (book_emb[src] * book_emb[dst]).sum(dim=1)

        # Create negative samples by shuffling destinations
        neg_dst = dst[torch.randperm(dst.size(0))]
        neg_scores = (book_emb[src] * book_emb[neg_dst]).sum(dim=1)

        # Contrastive loss: maximize positive scores, minimize negative scores
        loss = -torch.log(torch.sigmoid(pos_scores)).mean() - torch.log(1 - torch.sigmoid(neg_scores)).mean()
        return loss

    model.train()

    # Check if we have book-book edges
    if hasattr(data['book', 'similar_to', 'book'], 'edge_index'):
        for epoch in range(epochs):
            optimizer.zero_grad()
            book_emb = model.encode_books(data)
            loss = loss_function(book_emb, data['book', 'similar_to', 'book'].edge_index)
            loss.backward()
            optimizer.step()
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")
    else:
        print("No book-book edges found for training. Using simpler approach.")
        # Simplified training without edges
        for epoch in range(epochs):
            optimizer.zero_grad()
            book_emb = model.encode_books(data)
            # Use MSE loss against original features as a simple alternative
            loss = F.mse_loss(book_emb, torch.nn.Linear(data['book'].x.size(1), book_emb.size(1))(data['book'].x))
            loss.backward()
            optimizer.step()
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

    return model

# Recommend books using GNN embeddings
def recommend_with_gnn(query_genre, query_description, model, data, df, top_n=10):
    # Encode the query with the same model used to create book embeddings
    st_model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = st_model.encode([f"{query_genre} {query_description}"])[0]
    query_tensor = torch.tensor(query_embedding, dtype=torch.float).unsqueeze(0)

    # Get book embeddings from GNN
    model.eval()
    with torch.no_grad():
        book_embeddings = model.encode_books(data).detach().numpy()

    # Calculate similarities
    similarities = cosine_similarity([query_embedding], book_embeddings)[0]

    # Get top N recommendations
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Return the recommended books
    recommendations = df.iloc[top_indices].copy()
    recommendations['similarity_score'] = similarities[top_indices]
    recommendations['method'] = 'Graph Neural Network'

    return recommendations[['Book', 'Author', 'Genres', 'Avg_Rating', 'similarity_score', 'method']]

# Save the model for deployment
def save_recommendation_model(df):
    # Pre-compute all required embeddings and features
    embeddings = create_embeddings(df)
    tfidf_features = create_tfidf_features(df)

    # Build and save the graph
    graph_data, processed_df = build_heterogeneous_graph(df)

    # Train the GNN model
    gnn_model = train_gnn_model(graph_data)

    # Save everything
    processed_df.to_csv('books_data.csv', index=False)

    with open('book_embeddings.pkl', 'wb') as f:
        pickle.dump(embeddings, f)

    with open('tfidf_features.pkl', 'wb') as f:
        pickle.dump(tfidf_features, f)

    with open('book_graph.pkl', 'wb') as f:
        pickle.dump(graph_data, f)

    if gnn_model:
        torch.save(gnn_model.state_dict(), 'gnn_model.pt')

    print("Recommendation models saved successfully!")

# Example of how to use the system
def main():
    # Load and preprocess data
    df = load_data()
    if df is not None:
        df = preprocess_data(df)

        # Save the model for deployment
        save_recommendation_model(df)

        # Example recommendation
        query_genre = "Science Fiction"
        query_description = "Looking for a book about space exploration and alien civilizations"

        print("\n===== Content-Based Recommendations =====")
        # Get recommendations
        recommendations = recommend_books(query_genre, query_description, df)
        print("Recommended Books:")
        print(recommendations)

        print("\n===== TF-IDF Recommendations =====")
        # Alternative recommendations using TF-IDF
        tfidf_recommendations = recommend_books_tfidf(query_genre, query_description, df)
        print("Recommendations using TF-IDF:")
        print(tfidf_recommendations)

        print("\n===== GNN-Based Recommendations =====")
        # Load graph and model for GNN recommendations
        try:
            with open('book_graph.pkl', 'rb') as f:
                graph_data = pickle.load(f)

            # Initialize GNN model with saved weights
            gnn_model = GNNRecommender(in_channels=graph_data['book'].x.size(1))
            gnn_model.load_state_dict(torch.load('gnn_model.pt'))

            # Get recommendations using GNN
            gnn_recommendations = recommend_with_gnn(query_genre, query_description, gnn_model, graph_data, df)
            print("Recommendations using Graph Neural Network:")
            print(gnn_recommendations)
        except Exception as e:
            print(f"Error loading GNN model: {e}")
            print("Run the model training first with save_recommendation_model()")

if __name__ == "__main__":
    main()

Repo card metadata block was not found. Setting CardData to empty.


Loaded dataset with 10000 books
Creating embeddings with Sentence Transformer...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Creating TF-IDF features...
Building heterogeneous book graph...
Training GNN recommendation model...
Epoch 1/5, Loss: 1.3713
Epoch 2/5, Loss: 1.2329
Epoch 3/5, Loss: 1.2941
Epoch 4/5, Loss: 1.2267
Epoch 5/5, Loss: 1.2036
Recommendation models saved successfully!

===== Content-Based Recommendations =====
Recommended Books:
                                                   Book  \
4721                       Veterans of the Psychic Wars   
6641    The Fyfield Plantation (Arcadia’s Children, #2)   
4794  The End of The Computer (Thunder Valley Trilog...   
9892                                           Epiphany   
9134                    Conundrum (Nine Inch Bride, #1)   
6190                             The Proximity of Stars   
9700  John Smith - Last Known Survivor of the Micros...   
6339                  The Starving Heart  (Darkeye, #3)   
6093                   The Genetic Lottery (Egalia, #1)   
9100    Under the Thelián Sky: Beyond the Great Unknown   

                        

  gnn_model.load_state_dict(torch.load('gnn_model.pt'))


Error loading GNN model: Incompatible dimension for X and Y matrices: X.shape[1] == 384 while Y.shape[1] == 64
Run the model training first with save_recommendation_model()


optimize GNN

In [5]:
from torch_geometric.data import HeteroData
import torch
import torch.nn.functional as F
import numpy as np
from sentence_transformers import SentenceTransformer

# Optimized graph building function for Colab
def build_optimized_graph(df, sample_frac=0.2):
    # Use only a subset of the data to save memory
    if sample_frac < 1.0:
        print(f"Using {sample_frac*100}% of the data for graph construction...")
        df_sample = df.sample(frac=sample_frac, random_state=42)
    else:
        df_sample = df

    print(f"Building graph with {len(df_sample)} books...")

    # from torch_geometric.data import HeteroData
    # import torch
    # import torch.nn.functional as F
    # import numpy as np
    # from sentence_transformers import SentenceTransformer

    data = HeteroData()

    # Use a smaller model for faster embedding
    model_st = SentenceTransformer('paraphrase-MiniLM-L3-v2')  # Smaller, faster model

    # Batch process the descriptions to save memory
    batch_size = 100
    n_batches = (len(df_sample) + batch_size - 1) // batch_size

    all_embeddings = []
    for i in range(n_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(df_sample))
        batch_descriptions = df_sample['Description'].fillna("").iloc[start_idx:end_idx].tolist()
        batch_embeddings = model_st.encode(batch_descriptions)
        all_embeddings.append(batch_embeddings)
        print(f"Processed batch {i+1}/{n_batches}")

    # Concatenate all batches
    book_emb = np.vstack(all_embeddings)
    data['book'].x = torch.tensor(book_emb, dtype=torch.float)

    # Create simple one-hot encodings for authors and genres
    authors = list(df_sample['Author'].unique())
    author2id = {a: i for i, a in enumerate(authors)}

    # Simplify genre processing - just use the first word of each genre
    genres = []
    for genre_str in df_sample['Genres'].fillna(""):
        first_genre = genre_str.split()[0] if genre_str.split() else ""
        if first_genre and len(first_genre) > 3:
            genres.append(first_genre.replace(',', ''))

    unique_genres = list(set(genres))
    genre2id = {g: i for i, g in enumerate(unique_genres)}

    # Create smaller node features
    data['author'].num_nodes = len(authors)
    data['author'].x = torch.eye(len(authors), dtype=torch.float)

    data['genre'].num_nodes = len(unique_genres)
    data['genre'].x = torch.eye(len(unique_genres), dtype=torch.float)

    # Build edges with simplified approach
    book_author_edges = []
    for i, author in enumerate(df_sample['Author']):
        if author in author2id:
            book_author_edges.append((i, author2id[author]))

    book_genre_edges = []
    for i, genre_str in enumerate(df_sample['Genres'].fillna("")):
        first_genre = genre_str.split()[0] if genre_str.split() else ""
        if first_genre and first_genre.replace(',', '') in genre2id:
            book_genre_edges.append((i, genre2id[first_genre.replace(',', '')]))

    # Convert to tensors
    if book_author_edges:
        book_author_edges = torch.tensor(book_author_edges, dtype=torch.long).t()
        data['book', 'written_by', 'author'].edge_index = book_author_edges

    if book_genre_edges:
        book_genre_edges = torch.tensor(book_genre_edges, dtype=torch.long).t()
        data['book', 'has_genre', 'genre'].edge_index = book_genre_edges

    # Create book-to-book edges with higher threshold to reduce edges
    from sklearn.metrics.pairwise import cosine_similarity

    # Instead of computing full similarity matrix, use a sampling approach
    sample_size = min(1000, len(book_emb))
    indices = np.random.choice(len(book_emb), sample_size, replace=False)

    # Compute similarity only for the sample
    sample_emb = book_emb[indices]
    sim_matrix = cosine_similarity(sample_emb)

    # Add edges for pairs with high similarity
    sim_threshold = 0.9  # Higher threshold for fewer edges
    src, dst = np.where(sim_matrix > sim_threshold)

    # Remove self-loops
    mask = src != dst
    if np.any(mask):
        # Map back to original indices
        src_filtered = indices[src[mask]]
        dst_filtered = indices[dst[mask]]

        # Convert to numpy arrays first
        src_dst_array = np.vstack((src_filtered, dst_filtered))
        book_book_edges = torch.tensor(src_dst_array, dtype=torch.long)
        data['book', 'similar_to', 'book'].edge_index = book_book_edges

    # Add ratings features
    data['book'].ratings_count = torch.tensor(df_sample['Num_Ratings'].fillna(0).values, dtype=torch.float)
    data['book'].avg_rating = torch.tensor(df_sample['Avg_Rating'].fillna(0).values, dtype=torch.float)

    # Store mappings and the sampled dataframe
    data.author2id = author2id
    data.genre2id = genre2id

    return data, df_sample

# Simplified GNN model
class SimpleGNNRecommender(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels=32):
        super(SimpleGNNRecommender, self).__init__()
        self.conv = torch.nn.Linear(in_channels, hidden_channels)
        self.conv2 = None

        # Initialize the second conv layer only if we have book-book edges
        self.hidden_channels = hidden_channels

    def encode_books(self, data):
        book_x = data['book'].x

        # Simple linear transformation first
        book_x = self.conv(book_x)
        book_x = torch.nn.functional.relu(book_x)

        # If we have book-book edges and the second conv isn't initialized
        if hasattr(data['book', 'similar_to', 'book'], 'edge_index'):
            if self.conv2 is None:
                from torch_geometric.nn import GCNConv
                self.conv2 = GCNConv(self.hidden_channels, self.hidden_channels)

            # Apply graph convolution
            book_x = self.conv2(book_x, data['book', 'similar_to', 'book'].edge_index)

        return book_x

    def forward(self, data):
        return self.encode_books(data)

# Simplified training function
def train_simple_gnn(data, epochs=3):
    print("Training simplified GNN model...")
    model = SimpleGNNRecommender(in_channels=data['book'].x.size(1))
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        book_emb = model.encode_books(data)

        # Simple reconstruction loss
        if hasattr(data['book', 'similar_to', 'book'], 'edge_index'):
            # Use edge indices for a contrastive loss
            edge_index = data['book', 'similar_to', 'book'].edge_index
            src, dst = edge_index[0], edge_index[1]

            # Positive pairs (books that are similar)
            pos_scores = (book_emb[src] * book_emb[dst]).sum(dim=1)

            # Random negative pairs
            neg_src = src
            neg_dst = torch.randperm(data['book'].x.size(0))[:len(src)]
            neg_scores = (book_emb[neg_src] * book_emb[neg_dst]).sum(dim=1)

            # Contrastive loss
            loss = -torch.log(torch.sigmoid(pos_scores) + 1e-6).mean() - torch.log(1 - torch.sigmoid(neg_scores) + 1e-6).mean()
        else:
            # No edges, use a simple autoencoder approach
            reconstruction = torch.nn.Linear(model.hidden_channels, data['book'].x.size(1)).to(book_emb.device)(book_emb)
            loss = torch.nn.MSELoss()(reconstruction, data['book'].x)

        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

    return model

# Recommendation function for GNN
def recommend_with_simple_gnn(query_genre, query_description, model, data, df, top_n=10):
    from sentence_transformers import SentenceTransformer
    from sklearn.metrics.pairwise import cosine_similarity

    # Use the same smaller model as in graph construction
    st_model = SentenceTransformer('paraphrase-MiniLM-L3-v2')
    query_embedding = st_model.encode([f"{query_genre} {query_description}"])[0]

    # Get book embeddings from GNN
    model.eval()
    with torch.no_grad():
        book_embeddings = model.encode_books(data).detach().numpy()

    # Calculate similarities
    similarities = cosine_similarity([query_embedding], book_embeddings)[0]

    # Get top N recommendations
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Return the recommended books - ensure df is the same as was used for the graph
    recommendations = df.iloc[top_indices].copy()
    recommendations['similarity_score'] = similarities[top_indices]
    recommendations['method'] = 'Graph Neural Network'

    return recommendations[['Book', 'Author', 'Genres', 'Avg_Rating', 'similarity_score', 'method']]

Execute The Optimized Graph Construction

In [9]:
import pandas as pd
import torch
import os

# Assuming your data is in a CSV file named 'books.csv'
df = pd.read_csv('/content/bookrecommendations/books_data.csv')  # Load your DataFrame here

# Only use a small sample of the data to save memory
sample_fraction = 0.1  # Use 10% of the data

# Build the graph
graph_data, graph_df = build_optimized_graph(df, sample_frac=sample_fraction)
print(f"Graph built with {graph_data['book'].x.size(0)} book nodes")

# Check if we have book-book edges
if hasattr(graph_data['book', 'similar_to', 'book'], 'edge_index'):
    print(f"Graph has {graph_data['book', 'similar_to', 'book'].edge_index.size(1)} book-book edges")
else:
    print("Graph has no book-book edges")

# Train the model
gnn_model = train_simple_gnn(graph_data, epochs=3)

# Save the model and graph
torch.save(gnn_model.state_dict(), 'simple_gnn_model.pt')
with open('simple_book_graph.pkl', 'wb') as f:
    import pickle
    pickle.dump(graph_data, f)

print("GNN model and graph saved successfully!")

Using 10.0% of the data for graph construction...
Building graph with 1000 books...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Processed batch 1/10
Processed batch 2/10
Processed batch 3/10
Processed batch 4/10
Processed batch 5/10
Processed batch 6/10
Processed batch 7/10
Processed batch 8/10
Processed batch 9/10
Processed batch 10/10
Graph built with 1000 book nodes
Graph has 32 book-book edges
Training simplified GNN model...
Epoch 1/3, Loss: 1.2765
Epoch 2/3, Loss: 0.8641
Epoch 3/3, Loss: 0.4071
GNN model and graph saved successfully!
