In [1]:
from src.cloud_storage.redshift_connection import redshift_connection

connection = redshift_connection()

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# -------------------------
# Sample Feature Definitions
# -------------------------

# Batch size = 2
# 1. One-hot feature: genre (5 categories)
x_genre = torch.tensor([
    [0, 1, 0, 0, 0],  # genre = category 1
    [0, 0, 0, 1, 0],  # genre = category 3
], dtype=torch.float32)  # shape: [2, 5]

# 2. Learnable embedding: item_id (embedding dim = 8)
item_ids = torch.tensor([12, 45])  # example item indices
item_embedding_layer = nn.Embedding(num_embeddings=100, embedding_dim=8)
x_item_id = item_embedding_layer(item_ids)  # shape: [2, 8]

# 3. Pretrained embedding: text features (dim = 16)
x_text_embed = torch.tensor([
    [0.1] * 16,
    [0.2] * 16
], dtype=torch.float32)  # shape: [2, 16]

# -------------------------
# Combine all features
# -------------------------
x_all = torch.cat([x_genre, x_item_id, x_text_embed], dim=1)  # shape: [2, 5+8+16 = 29]

# -------------------------
# Item Tower (MLP)
# -------------------------
# item_tower = nn.Sequential(
#     nn.Linear(x_all.shape[1], 64),
#     nn.ReLU(),
#     nn.Linear(64, 32),  # final item embedding
# )

# item_vector = item_tower(x_all)  # shape: [2, 32]

# print("Final item embedding vector (per item):")
# print(item_vector)


In [3]:
x_all


tensor([[ 0.0000,  1.0000,  0.0000,  0.0000,  0.0000, -1.1790, -0.1624, -0.1445,
          0.3887,  0.3142, -0.5061,  1.8720, -0.5338,  0.1000,  0.1000,  0.1000,
          0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,
          0.1000,  0.1000,  0.1000,  0.1000,  0.1000],
        [ 0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  1.2250, -0.1304, -1.1441,
          1.3484,  0.3573, -0.5984,  1.9021, -0.2330,  0.2000,  0.2000,  0.2000,
          0.2000,  0.2000,  0.2000,  0.2000,  0.2000,  0.2000,  0.2000,  0.2000,
          0.2000,  0.2000,  0.2000,  0.2000,  0.2000]], grad_fn=<CatBackward0>)

In [4]:
import torch
import torch.nn as nn

class SimpleTower(nn.Module):
    def __init__(self, input_dim, output_dim=32, dropout=0.2):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(64, output_dim)  # Output layer (no activation)
        )

    def forward(self, x):
        return self.model(x)


In [1]:
import torch
import torch.nn as nn
from torchviz import make_dot

# Define the shared tower class
class SimpleTower(nn.Module):
    def __init__(self, input_dim, output_dim=32, dropout=0.2):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(64, output_dim)
        )

    def forward(self, x):
        return self.model(x)

# Define full model combining both towers
class TwoTowerModel(nn.Module):
    def __init__(self, user_dim, item_dim, embed_dim=32):
        super().__init__()
        self.user_tower = SimpleTower(user_dim, embed_dim)
        self.item_tower = SimpleTower(item_dim, embed_dim)

    def forward(self, user_x, item_x):
        user_vec = self.user_tower(user_x)
        item_vec = self.item_tower(item_x)
        # Combine via dot product
        score = (user_vec * item_vec).sum(dim=1, keepdim=True)  # shape: [batch_size, 1]
        return score


In [2]:
# Create dummy input
user_input = torch.randn(4, 64)  # batch_size x user_input_dim
item_input = torch.randn(4, 48)  # batch_size x item_input_dim

# Create and run model
model = TwoTowerModel(user_dim=64, item_dim=48)
output = model(user_input, item_input)

# # Visualize with torchviz
# dot = make_dot(output, params=dict(model.named_parameters()))
# dot.format = 'png'
# dot.render("two_tower_graph")  # creates two_tower_graph.png


In [3]:
from torch.utils.tensorboard import SummaryWriter
import torch

# Assuming TwoTowerModel and towers are already defined

# Create model
model = TwoTowerModel(user_dim=64, item_dim=48)

# Dummy inputs (use batch size >1 to avoid BatchNorm errors)
user_input = torch.randn(2, 64)
item_input = torch.randn(2, 48)

# Create a TensorBoard writer
writer = SummaryWriter(log_dir="runs/two_tower_example")

# Log the model graph
writer.add_graph(model, (user_input, item_input))
writer.close()


In [4]:
!tensorboard --logdir=runs


TensorFlow installation not found - running with reduced feature set.
W0805 18:46:34.738121 6145716224 plugin_event_accumulator.py:369] Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
W0805 18:46:34.739761 6145716224 plugin_event_accumulator.py:369] Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.20.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C


In [5]:
model = SimpleTower(input_dim=64)

print(model)


SimpleTower(
  (model): Sequential(
    (0): Linear(in_features=64, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.2, inplace=False)
    (8): Linear(in_features=64, out_features=32, bias=True)
  )
)


In [13]:
from torchinfo import summary

model = SimpleTower(input_dim=64)
summary(model, input_size=(1, 64))  # batch_size x input_dim


Layer (type:depth-idx)                   Output Shape              Param #
SimpleTower                              [1, 32]                   --
├─Sequential: 1-1                        [1, 32]                   --
│    └─Linear: 2-1                       [1, 128]                  8,320
│    └─BatchNorm1d: 2-2                  [1, 128]                  256
│    └─ReLU: 2-3                         [1, 128]                  --
│    └─Dropout: 2-4                      [1, 128]                  --
│    └─Linear: 2-5                       [1, 64]                   8,256
│    └─BatchNorm1d: 2-6                  [1, 64]                   128
│    └─ReLU: 2-7                         [1, 64]                   --
│    └─Dropout: 2-8                      [1, 64]                   --
│    └─Linear: 2-9                       [1, 32]                   2,080
Total params: 19,040
Trainable params: 19,040
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.02
Input size (MB): 0.00
Forward/back

In [None]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()
dummy_input = torch.randn(1, 64)
model = SimpleTower(input_dim=64)
writer.add_graph(model, dummy_input)
writer.close()


TypeError: TwoTowerModel.forward() missing 1 required positional argument: 'item_x'

In [19]:
!tensorboard --logdir=runs



TensorFlow installation not found - running with reduced feature set.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.20.0 at http://localhost:6006/ (Press CTRL+C to quit)


OSError: [Errno 5] Input/output error

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Tower(nn.Module):
    def __init__(self, input_dim, hidden_dims=[128, 64], output_dim=32, dropout=0.2):
        """
        Generic tower for user or item.

        Args:
        - input_dim (int): Input feature dimension
        - hidden_dims (list of int): Hidden layer dimensions
        - output_dim (int): Final output size
        - dropout (float): Dropout rate between layers
        """
        super(Tower, self).__init__()

        layers = []
        current_dim = input_dim

        # Hidden layers
        for h_dim in hidden_dims:
            layers.append(nn.Linear(current_dim, h_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            current_dim = h_dim

        # Output layer
        layers.append(nn.Linear(current_dim, output_dim))

        self.tower = nn.Sequential(*layers)

    def forward(self, x):
        return self.tower(x)


In [6]:
from src.cloud_storage.redshift_connection import redshift_connection

connection = redshift_connection()

In [7]:
item_query = 'sql_files/item_query.sql'
book_df = connection.redshift_query_fetching_as_df(item_query)

2025-08-06 12:12:38,691 - side - DEBUG - Attempting to read SQL file: sql_files/item_query.sql
2025-08-06 12:12:38,696 - side - INFO - Successfully read SQL file: sql_files/item_query.sql
2025-08-06 12:12:41,283 - read_shift - INFO - Connected to Redshift successfully.
  df = pd.read_sql_query(query, conn)
2025-08-06 12:13:54,029 - read_shift - INFO - Query executed successfully, retrieved 12191 rows.
2025-08-06 12:13:54,032 - read_shift - INFO - Connection closed.


In [8]:
book_df.head()

Unnamed: 0,id,book_isbn,book_title,authors,book_series,publication_date,rights,illustrators,interactive,search_keywords,...,clicks_students,quality_clicks,quality_clicks_students,students_completed_book,students_completed_75_per_book,per_75_completed_unique_books,completion_rate,time_spent,total_pages,read_pages
0,210,9781638973744,Forces and Changes in Motion,"Christina,Earley",Physical Science,2022-02-01,World,,False,"Forces, Friction, Magnet, Motion, Physical Sci...",...,505.0,543.0,453.0,360.0,439.0,95.0,78.43,277827.0,13524.0,9573.0
1,212,9781427121141,Forensic Investigations of the Ancient Egyptians,"James,Bow",Forensic Footprints of Ancient Worlds,2018-09-25,World,,False,"Ancient Civilizations, Forensic Science, Egypt...",...,208.0,247.0,189.0,121.0,187.0,96.0,62.37,46343.0,10757.0,7613.0
2,453,9780778789222,Soccer in Action,"Niki,Walker",Sports in Action,1999-10-31,World,,False,,...,273.0,285.0,223.0,102.0,208.0,90.0,44.16,66151.0,11222.0,5775.0
3,4717,9781039625440,Scared (Pè) Bilingual Eng/Cre,"Amy,Culliford",,2022-08-15,World,,False,"Scared, emotion, yell, thunderstorm, cry, hug",...,165.0,140.0,131.0,60.0,117.0,85.0,43.8,6869.0,2561.0,1348.0
4,4808,9781532420764,Cody Eats / Cody Come,Brenda Ponnay,Cody the Dog Bilingual,2020-09-17,World,Brenda Ponnay,,"cody the dog, dog book, dog reader, dog beginn...",...,601.0,703.0,536.0,272.0,446.0,81.0,49.54,112017.0,4650.0,2627.0


In [25]:
book_df['title_plus_author'] = book_df.apply(lambda x:x['book_title'].lower()+' by '+x['authors'].lower(),axis=1)
book_df['long_description'].fillna('unk',inplace=True)
book_df['long_description'] = book_df.apply(lambda x:x['long_description'].lower(),axis=1)
book_df.columns

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  book_df['long_description'].fillna('unk',inplace=True)


Index(['id', 'book_isbn', 'book_title', 'authors', 'book_series',
       'publication_date', 'rights', 'illustrators', 'interactive',
       'search_keywords', 'top_hundred', 'book_type', 'long_description',
       'bestseller', 'editor_recommended', 'animated', 'top_twenty',
       'top_fifty', 'page_count', 'min_grade', 'max_grade',
       'readable_page_count', 'min_reading_age', 'max_reading_age',
       'read_along_audio', 'read_along_with_highlighting', 'orientation',
       'last_reading_page_number', 'book_format', 'language_book',
       'publisher_name', 'fiction_nonfiction', 'reading_skill_name',
       'theme_name', 'category_name', 'book_code', 'grade_name', 'book_code',
       'clicks', 'clicks_students', 'quality_clicks',
       'quality_clicks_students', 'students_completed_book',
       'students_completed_75_per_book', 'per_75_completed_unique_books',
       'completion_rate', 'time_spent', 'total_pages', 'read_pages',
       'title_plus_author'],
      dtype='object'

In [13]:
book_df_final.shape

(12191, 13)

In [11]:
columns = ['book_isbn', 'title_plus_author', 'book_series', 'book_type', 'long_description','min_grade', 'max_grade',
       'readable_page_count','fiction_nonfiction', 'reading_skill_name','theme_name', 'category_name','language_book']

book_df_final = book_df[columns]
book_df_final.head()

Unnamed: 0,book_isbn,title_plus_author,book_series,book_type,long_description,min_grade,max_grade,readable_page_count,fiction_nonfiction,reading_skill_name,theme_name,category_name,language_book
0,9781638973744,"forces and changes in motion by christina,earley",Physical Science,PDF,Do you wonder about the world around us? In th...,3,5,21,Non-Fiction,"Making Inferences, Illustrations or other Visu...","Fun Science, Technology",Science & Nature,English
1,9781427121141,forensic investigations of the ancient egyptia...,Forensic Footprints of Ancient Worlds,PDF,Can modern forensic tools help us uncover new ...,3,5,31,Non-Fiction,"Illustrations or other Visual Elements, Fact a...","History, Technology, Places of Interest","People & Places, Science & Nature",English
2,9780778789222,"soccer in action by niki,walker",Sports in Action,PDF,Goooaaaallllll! Crabtree scores with Soccer in...,3,5,31,Non-Fiction,"Making Inferences, Illustrations or other Visu...","Sports & Games, Fitness, Healthy Habits",Growing Up,English
3,9781039625440,"scared (pè) bilingual eng/cre by amy,culliford",,PDF,"In this book, young readers will learn to reco...",pk,1,13,Non-Fiction,,"Emotions & Feelings, Family & Friends",Growing Up,Haitian French Creole
4,9781532420764,cody eats / cody come by brenda ponnay,Cody the Dog Bilingual,PDF,Bilingual Spanish / English Language Edition C...,pk,1,5,Fiction,,"Funny Stories, Our Friends in Nature","Funny Stories, Animals",Spanish


In [12]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

def encode_column_with_sentence_transformer(df: pd.DataFrame, column: str, model_name: str = 'all-MiniLM-L6-v2') -> np.ndarray:
    """
    Encodes a column of text into embeddings using a sentence-transformer model.

    Args:
        df (pd.DataFrame): Input dataframe.
        column (str): Column name to encode.
        model_name (str): Pretrained sentence-transformers model name.

    Returns:
        np.ndarray: Array of shape (num_rows, embedding_dim)
    """
    model = SentenceTransformer(model_name)
    
    # Fill missing values
    texts = df[column].fillna("unk").astype(str).tolist()
    
    # Encode with model
    embeddings = model.encode(texts, show_progress_bar=True)
    
    return np.array(embeddings)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
emb = encode_column_with_sentence_transformer(book_df_final,'title_plus_author')
# Convert embeddings to DataFrame
emb_df = pd.DataFrame(emb, columns=[f"emb_title_author_{i}" for i in range(emb.shape[1])])

# Combine with book_id
book_embedding_df = pd.concat([book_df_final[["book_isbn"]], emb_df], axis=1)

# Save to file
# book_embedding_df.to_parquet("book_embeddings.parquet", index=False)

  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 381/381 [00:10<00:00, 34.70it/s]


In [None]:
emb_desc = encode_column_with_sentence_transformer(book_df_final,'long_description')
# Convert embeddings to DataFrame
emb_desc_df = pd.DataFrame(emb_desc, columns=[f"emb_desc_{i}" for i in range(emb.shape[1])])

# Combine with book_id
long_description_df = pd.concat([book_df_final[["book_isbn"]], emb_desc_df], axis=1)

# Save to file
# book_embedding_df.to_parquet("book_embeddings.parquet", index=False)


In [29]:
long_description_df.head()

Unnamed: 0,book_isbn,emb_desc_0,emb_desc_1,emb_desc_2,emb_desc_3,emb_desc_4,emb_desc_5,emb_desc_6,emb_desc_7,emb_desc_8,...,emb_desc_374,emb_desc_375,emb_desc_376,emb_desc_377,emb_desc_378,emb_desc_379,emb_desc_380,emb_desc_381,emb_desc_382,emb_desc_383
0,9781638973744,0.013931,-0.008896,0.058679,0.112187,0.024721,0.006103,0.038774,-0.046619,0.032334,...,0.068244,0.094449,-0.024867,0.067868,-0.036735,-0.007186,0.044612,-0.092573,-0.056998,-0.019073
1,9781427121141,-0.093507,0.124757,-0.006754,-0.001654,-0.014084,-0.071832,-0.049502,-0.028421,-0.055279,...,0.043926,-0.049684,0.007033,0.091449,0.039405,-0.071128,0.1831,0.00555,0.047619,-0.049927
2,9780778789222,0.002869,0.049008,0.00829,-0.051905,-0.016251,0.05323,0.039608,0.007936,0.059096,...,0.015007,0.100513,-0.054278,0.026744,0.008615,0.064617,0.070474,-0.075082,-0.00096,0.117753
3,9781039625440,0.088831,0.010486,0.014458,0.084495,0.012768,0.08335,0.057772,0.015923,0.082917,...,0.070714,-0.009329,0.015837,0.071399,-0.027583,0.094227,0.018915,0.040795,-0.014076,0.019917
4,9781532420764,-0.056846,-0.069584,0.07933,0.018258,-0.105333,-0.03521,0.068424,-0.054729,-0.026249,...,0.079174,-0.019516,0.004677,-0.031867,0.024795,0.065822,0.110249,0.105666,0.005809,-0.061654


In [None]:
book_df_final['readable_page_count'] = book_df_final['readable_page_count']/50
book_df_final.head()

Unnamed: 0,book_isbn,title_plus_author,book_series,book_type,long_description,min_grade,max_grade,readable_page_count,fiction_nonfiction,reading_skill_name,theme_name,category_name,language_book
0,9781638973744,"forces and changes in motion by christina,earley",Physical Science,PDF,Do you wonder about the world around us? In th...,3,5,0.42,Non-Fiction,"Making Inferences, Illustrations or other Visu...","Fun Science, Technology",Science & Nature,English
1,9781427121141,forensic investigations of the ancient egyptia...,Forensic Footprints of Ancient Worlds,PDF,Can modern forensic tools help us uncover new ...,3,5,0.62,Non-Fiction,"Illustrations or other Visual Elements, Fact a...","History, Technology, Places of Interest","People & Places, Science & Nature",English
2,9780778789222,"soccer in action by niki,walker",Sports in Action,PDF,Goooaaaallllll! Crabtree scores with Soccer in...,3,5,0.62,Non-Fiction,"Making Inferences, Illustrations or other Visu...","Sports & Games, Fitness, Healthy Habits",Growing Up,English
3,9781039625440,"scared (pè) bilingual eng/cre by amy,culliford",,PDF,"In this book, young readers will learn to reco...",pk,1,0.26,Non-Fiction,,"Emotions & Feelings, Family & Friends",Growing Up,Haitian French Creole
4,9781532420764,cody eats / cody come by brenda ponnay,Cody the Dog Bilingual,PDF,Bilingual Spanish / English Language Edition C...,pk,1,0.1,Fiction,,"Funny Stories, Our Friends in Nature","Funny Stories, Animals",Spanish


In [40]:
book_df_final_v1['fiction_nonfiction'].value_counts()

KeyError: 'fiction_nonfiction'

In [42]:
book_df_final['book_type_binary'] = np.where(book_df.book_type == 'PDF',1,0)
book_df_final['fiction_nonfiction'].fillna('unk',inplace =True)
book_df_final_v1 = pd.get_dummies(book_df_final, columns=['fiction_nonfiction'], prefix='fn')
book_df_final_v1 = pd.get_dummies(book_df_final_v1, columns=['language_book'], prefix='lang')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_df_final['book_type_binary'] = np.where(book_df.book_type == 'PDF',1,0)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  book_df_final['fiction_nonfiction'].fillna('unk',inplace =True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bo

In [43]:
book_df_final_v1.columns

Index(['book_isbn', 'title_plus_author', 'book_series', 'book_type',
       'long_description', 'min_grade', 'max_grade', 'readable_page_count',
       'reading_skill_name', 'theme_name', 'category_name', 'book_type_binary',
       'fiction_nonfiction_binary', 'fn_Fiction', 'fn_Non-Fiction', 'fn_unk',
       'lang_English', 'lang_French', 'lang_Haitian French Creole',
       'lang_Mandarin', 'lang_Portuguese', 'lang_Spanish'],
      dtype='object')

In [None]:
['book_isbn', 'book_series','min_grade', 'max_grade', 'reading_skill_name','theme_name', 'category_name']

In [None]:
import pandas as pd
import torch
import torch.nn as nn

def encode_column_with_vocab(series: pd.Series, col_name: str, existing_vocab=None):
    """
    Encodes a single categorical column for embedding.
    Returns encoded indices, vocab, inverse map.
    """
    series = series.fillna('unk').astype(str)

    if existing_vocab is None:
        unique_vals = sorted(series.unique().tolist())
        if 'unk' not in unique_vals:
            unique_vals.append('unk')
        vocab_map = {val: idx for idx, val in enumerate(unique_vals)}
    else:
        vocab_map = existing_vocab

    inverse_map = {idx: val for val, idx in vocab_map.items()}
    encoded_series = series.map(lambda x: vocab_map.get(x, vocab_map['unk']))
    encoded_series.name = f"{col_name}_idx"
    return encoded_series, vocab_map, inverse_map


def create_embedding_layer(vocab_size, embedding_dim):
    """
    Creates a PyTorch embedding layer with given vocab size and dimension.
    """
    return nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)


def get_embeddings_for_column(encoded_series, embedding_layer):
    """
    Returns embedding vectors for each value in the encoded column.
    """
    input_tensor = torch.tensor(encoded_series.tolist(), dtype=torch.long)
    return embedding_layer(input_tensor)


In [None]:
grade_list = ['pk', 'k', '1', '2', '3', '4', '5', '6', '7', '8']
grade_to_idx = {g: i for i, g in enumerate(grade_list)}

class ConvertRangeEmb():
    def __init__(self,data_list,emb_dim =4):
        self.grade_list = data_list
        self.num_grades = len(self.grade_list)
        self.grade_to_idx = {g: i for i, g in enumerate(grade_list)}
        self.grade_embedding = nn.Embedding(self.num_grades, emb_dim)

    def _get_range(self,min_g, max_g):
        start_idx = self.grade_to_idx[min_g]
        end_idx = self.grade_to_idx[max_g]
        return grade_list[start_idx:end_idx + 1]

    def convertor(self,min_g, max_g,):
        book_grades = self._get_range(min_g, max_g)
        num_grades = self.grade_list
        grade_indices = grade_indices = torch.tensor(
    [self.grade_to_idx[g] for g in book_grades],
    dtype=torch.long  # 🔧 this fixes the error
)

        embedded = self.grade_embedding(grade_indices)
        return embedded.mean(dim=0)  

# Get embeddings
  # shape: [3, 8]
# book_embedding =  

In [87]:
clas = convert_range_emb(grade_list,4)

In [88]:
clas._get_range('pk','2')
clas.convertor('pk','2')

tensor([ 0.1958, -0.8391, -0.4423, -0.9212], grad_fn=<MeanBackward1>)

In [89]:
book_df_final_v1.min_grade.isna().sum()

np.int64(0)

In [148]:
type(emb_df)

pandas.core.frame.DataFrame

In [149]:
# Initialize converter
grade_converter = ConvertRangeEmb(data_list=grade_list, emb_dim=4)


# Apply to each row
book_df_final_v1["grade_emb"] = book_df_final_v1.apply(
    lambda row: grade_converter.convertor(row["min_grade"], row["max_grade"]),
    axis=1
)

# Convert embedding column to separate columns
emb_df = pd.DataFrame(book_df_final_v1["grade_emb"].tolist(), columns=[f"grade_emb_{i}" for i in range(4)])

# Combine with original
df_final = pd.concat([book_df_final_v1.drop("grade_emb", axis=1), emb_df], axis=1)

# Save as Parquet
# df_final.to_parquet("book_with_grade_embeddings.parquet", index=False)



In [150]:
df_final.head()

Unnamed: 0,book_isbn,title_plus_author,book_series,book_type,long_description,min_grade,max_grade,readable_page_count,reading_skill_name,theme_name,...,lang_English,lang_French,lang_Haitian French Creole,lang_Mandarin,lang_Portuguese,lang_Spanish,grade_emb_0,grade_emb_1,grade_emb_2,grade_emb_3
0,9781638973744,"forces and changes in motion by christina,earley",Physical Science,PDF,Do you wonder about the world around us? In th...,3,5,0.42,"Making Inferences, Illustrations or other Visu...","Fun Science, Technology",...,True,False,False,False,False,False,"tensor(0.8714, grad_fn=<UnbindBackward0>)","tensor(-0.1184, grad_fn=<UnbindBackward0>)","tensor(0.0281, grad_fn=<UnbindBackward0>)","tensor(-1.2436, grad_fn=<UnbindBackward0>)"
1,9781427121141,forensic investigations of the ancient egyptia...,Forensic Footprints of Ancient Worlds,PDF,Can modern forensic tools help us uncover new ...,3,5,0.62,"Illustrations or other Visual Elements, Fact a...","History, Technology, Places of Interest",...,True,False,False,False,False,False,"tensor(0.8714, grad_fn=<UnbindBackward0>)","tensor(-0.1184, grad_fn=<UnbindBackward0>)","tensor(0.0281, grad_fn=<UnbindBackward0>)","tensor(-1.2436, grad_fn=<UnbindBackward0>)"
2,9780778789222,"soccer in action by niki,walker",Sports in Action,PDF,Goooaaaallllll! Crabtree scores with Soccer in...,3,5,0.62,"Making Inferences, Illustrations or other Visu...","Sports & Games, Fitness, Healthy Habits",...,True,False,False,False,False,False,"tensor(0.8714, grad_fn=<UnbindBackward0>)","tensor(-0.1184, grad_fn=<UnbindBackward0>)","tensor(0.0281, grad_fn=<UnbindBackward0>)","tensor(-1.2436, grad_fn=<UnbindBackward0>)"
3,9781039625440,"scared (pè) bilingual eng/cre by amy,culliford",,PDF,"In this book, young readers will learn to reco...",pk,1,0.26,,"Emotions & Feelings, Family & Friends",...,False,False,True,False,False,False,"tensor(-0.3938, grad_fn=<UnbindBackward0>)","tensor(0.0132, grad_fn=<UnbindBackward0>)","tensor(-0.3345, grad_fn=<UnbindBackward0>)","tensor(0.4681, grad_fn=<UnbindBackward0>)"
4,9781532420764,cody eats / cody come by brenda ponnay,Cody the Dog Bilingual,PDF,Bilingual Spanish / English Language Edition C...,pk,1,0.1,,"Funny Stories, Our Friends in Nature",...,False,False,False,False,False,True,"tensor(-0.3938, grad_fn=<UnbindBackward0>)","tensor(0.0132, grad_fn=<UnbindBackward0>)","tensor(-0.3345, grad_fn=<UnbindBackward0>)","tensor(0.4681, grad_fn=<UnbindBackward0>)"


In [None]:
['book_isbn', 'book_series', 'reading_skill_name','theme_name', 'category_name']

In [155]:
class ThemeEmbedder(nn.Module):
    def __init__(self, df, col, embedding_dim):
        super().__init__()

        df['themes'] = df[col].fillna('unk').apply(lambda x: [t.strip().lower() for t in x.split(',') if t.strip()])

        all_themes = sorted(set(chain.from_iterable(df['themes'])))
        self.theme_to_idx = {theme: idx for idx, theme in enumerate(all_themes)}
        if 'unk' not in self.theme_to_idx:
            self.theme_to_idx['unk'] = len(self.theme_to_idx)

        df['theme_ids'] = df['themes'].apply(lambda theme_list: [self.theme_to_idx[t] for t in theme_list if t in self.theme_to_idx])

        self.embedding = nn.Embedding(len(self.theme_to_idx), embedding_dim)
        self.theme_ids_batch = df['theme_ids'].tolist()

        self.df = df  # Keep a reference if needed

    def forward(self):
        embeddings = []
        for theme_ids in self.theme_ids_batch:
            if not theme_ids:
                embeddings.append(torch.zeros(self.embedding.embedding_dim, requires_grad=True))  # Keep graph
            else:
                ids_tensor = torch.tensor(theme_ids, dtype=torch.long)
                emb = self.embedding(ids_tensor)
                pooled = emb.mean(dim=0)  # This tensor still has grad_fn
                embeddings.append(pooled)
        return embeddings  # List of tensors (not stacked)


In [156]:
type(df_final)

pandas.core.frame.DataFrame

In [157]:
zx = ThemeEmbedder(pd.DataFrame(df_final['theme_name']),'theme_name',8)

In [None]:
res_tensor = zx()

[tensor([ 1.0544, -0.7764, -0.2172, -0.7911,  0.2605, -1.9044,  0.1099, -0.0812],
        grad_fn=<MeanBackward1>),
 tensor([-0.5150, -1.0864, -0.3459, -0.8359,  0.4598,  0.1705,  0.5889,  0.7324],
        grad_fn=<MeanBackward1>),
 tensor([ 0.2772,  0.1951, -0.0494,  0.7112,  0.4024,  0.9126, -1.2159,  0.0738],
        grad_fn=<MeanBackward1>),
 tensor([ 0.1455, -1.0081,  0.7713,  0.7106,  0.1620, -0.1807,  0.5099,  0.7030],
        grad_fn=<MeanBackward1>),
 tensor([ 0.7255,  0.1336, -0.7817,  0.8739, -0.0754,  1.3320, -0.0857, -0.4478],
        grad_fn=<MeanBackward1>),
 tensor([ 0.0516, -0.1987, -1.0108,  1.1954,  0.9227,  0.0437, -0.3688,  0.2706],
        grad_fn=<MeanBackward1>),
 tensor([-0.4919,  0.6516, -0.9011,  0.1746, -0.7292,  0.5869,  0.3551, -0.5469],
        grad_fn=<MeanBackward1>),
 tensor([ 0.0578,  0.6440, -0.1595,  0.8692,  0.1641,  0.8108,  0.1317,  0.2230],
        grad_fn=<MeanBackward1>),
 tensor([-0.2577, -0.2929, -0.3825, -2.2485,  0.3441,  0.7425,  1.1694, 

In [162]:
# Step 1: Get embeddings
res_tensor = zx()  # or zx.forward()
# res_np = res_tensor.detach().cpu().numpy()  # ensure it's a NumPy array

# Step 2: Convert to DataFrame
embedding_dim = res_np.shape[1]
emb_df = pd.DataFrame(res_tensor, columns=[f'theme_emb_{i}' for i in range(embedding_dim)])

# Step 3: Concatenate
assert isinstance(df_final, pd.DataFrame), "df_final must be a DataFrame"
assert df_final.shape[0] == emb_df.shape[0], "Row count mismatch between df_final and embeddings"

df_final_v1 = pd.concat([df_final.reset_index(drop=True), emb_df], axis=1)

In [163]:
df_final_v1.head()

Unnamed: 0,book_isbn,title_plus_author,book_series,book_type,long_description,min_grade,max_grade,readable_page_count,reading_skill_name,theme_name,...,grade_emb_2,grade_emb_3,theme_emb_0,theme_emb_1,theme_emb_2,theme_emb_3,theme_emb_4,theme_emb_5,theme_emb_6,theme_emb_7
0,9781638973744,"forces and changes in motion by christina,earley",Physical Science,PDF,Do you wonder about the world around us? In th...,3,5,0.42,"Making Inferences, Illustrations or other Visu...","Fun Science, Technology",...,"tensor(0.0281, grad_fn=<UnbindBackward0>)","tensor(-1.2436, grad_fn=<UnbindBackward0>)","tensor(1.0544, grad_fn=<UnbindBackward0>)","tensor(-0.7764, grad_fn=<UnbindBackward0>)","tensor(-0.2172, grad_fn=<UnbindBackward0>)","tensor(-0.7911, grad_fn=<UnbindBackward0>)","tensor(0.2605, grad_fn=<UnbindBackward0>)","tensor(-1.9044, grad_fn=<UnbindBackward0>)","tensor(0.1099, grad_fn=<UnbindBackward0>)","tensor(-0.0812, grad_fn=<UnbindBackward0>)"
1,9781427121141,forensic investigations of the ancient egyptia...,Forensic Footprints of Ancient Worlds,PDF,Can modern forensic tools help us uncover new ...,3,5,0.62,"Illustrations or other Visual Elements, Fact a...","History, Technology, Places of Interest",...,"tensor(0.0281, grad_fn=<UnbindBackward0>)","tensor(-1.2436, grad_fn=<UnbindBackward0>)","tensor(-0.5150, grad_fn=<UnbindBackward0>)","tensor(-1.0864, grad_fn=<UnbindBackward0>)","tensor(-0.3459, grad_fn=<UnbindBackward0>)","tensor(-0.8359, grad_fn=<UnbindBackward0>)","tensor(0.4598, grad_fn=<UnbindBackward0>)","tensor(0.1705, grad_fn=<UnbindBackward0>)","tensor(0.5889, grad_fn=<UnbindBackward0>)","tensor(0.7324, grad_fn=<UnbindBackward0>)"
2,9780778789222,"soccer in action by niki,walker",Sports in Action,PDF,Goooaaaallllll! Crabtree scores with Soccer in...,3,5,0.62,"Making Inferences, Illustrations or other Visu...","Sports & Games, Fitness, Healthy Habits",...,"tensor(0.0281, grad_fn=<UnbindBackward0>)","tensor(-1.2436, grad_fn=<UnbindBackward0>)","tensor(0.2772, grad_fn=<UnbindBackward0>)","tensor(0.1951, grad_fn=<UnbindBackward0>)","tensor(-0.0494, grad_fn=<UnbindBackward0>)","tensor(0.7112, grad_fn=<UnbindBackward0>)","tensor(0.4024, grad_fn=<UnbindBackward0>)","tensor(0.9126, grad_fn=<UnbindBackward0>)","tensor(-1.2159, grad_fn=<UnbindBackward0>)","tensor(0.0738, grad_fn=<UnbindBackward0>)"
3,9781039625440,"scared (pè) bilingual eng/cre by amy,culliford",,PDF,"In this book, young readers will learn to reco...",pk,1,0.26,,"Emotions & Feelings, Family & Friends",...,"tensor(-0.3345, grad_fn=<UnbindBackward0>)","tensor(0.4681, grad_fn=<UnbindBackward0>)","tensor(0.1455, grad_fn=<UnbindBackward0>)","tensor(-1.0081, grad_fn=<UnbindBackward0>)","tensor(0.7713, grad_fn=<UnbindBackward0>)","tensor(0.7106, grad_fn=<UnbindBackward0>)","tensor(0.1620, grad_fn=<UnbindBackward0>)","tensor(-0.1807, grad_fn=<UnbindBackward0>)","tensor(0.5099, grad_fn=<UnbindBackward0>)","tensor(0.7030, grad_fn=<UnbindBackward0>)"
4,9781532420764,cody eats / cody come by brenda ponnay,Cody the Dog Bilingual,PDF,Bilingual Spanish / English Language Edition C...,pk,1,0.1,,"Funny Stories, Our Friends in Nature",...,"tensor(-0.3345, grad_fn=<UnbindBackward0>)","tensor(0.4681, grad_fn=<UnbindBackward0>)","tensor(0.7255, grad_fn=<UnbindBackward0>)","tensor(0.1336, grad_fn=<UnbindBackward0>)","tensor(-0.7817, grad_fn=<UnbindBackward0>)","tensor(0.8739, grad_fn=<UnbindBackward0>)","tensor(-0.0754, grad_fn=<UnbindBackward0>)","tensor(1.3320, grad_fn=<UnbindBackward0>)","tensor(-0.0857, grad_fn=<UnbindBackward0>)","tensor(-0.4478, grad_fn=<UnbindBackward0>)"


In [140]:
len(res.tolist()[0])

8

In [142]:
emb_df

Unnamed: 0,theme_emb_0,theme_emb_1,theme_emb_2,theme_emb_3,theme_emb_4,theme_emb_5,theme_emb_6,theme_emb_7
0,1.478647,-0.232980,0.972367,0.490679,-0.868217,1.352461,1.851667,-0.927268
1,0.338558,0.293770,-0.080786,0.689703,-0.597078,1.113300,0.900955,-0.262956
2,0.937208,0.260429,-0.212252,-1.012132,-0.591905,0.556993,-0.057744,-0.263714
3,1.707266,-1.363771,-0.045674,-0.579151,1.561574,0.197671,-1.631947,-0.674348
4,-0.452168,0.429928,1.205995,-1.176528,-0.571090,-0.011463,0.447645,0.001398
...,...,...,...,...,...,...,...,...
12186,0.490188,-0.334432,0.615725,0.768195,0.462333,0.691781,0.276157,0.036429
12187,-0.893478,0.791431,0.534415,-0.265971,0.030795,0.743669,0.463191,0.911320
12188,0.308778,0.773680,0.946150,-1.138530,-0.755129,0.959309,0.937776,-0.008977
12189,-0.213463,0.370834,-0.079681,0.197614,-0.762243,0.633594,0.255516,-0.205568


In [165]:
res_tensor

[tensor([-1.0109, -0.2602, -0.8932,  0.1411], grad_fn=<MeanBackward1>),
 tensor([-1.2466, -0.1060, -0.7196, -0.0772], grad_fn=<MeanBackward1>),
 tensor([ 0.2050,  0.1505, -0.2406, -0.4005], grad_fn=<MeanBackward1>),
 tensor([ 0.2050,  0.1505, -0.2406, -0.4005], grad_fn=<MeanBackward1>),
 tensor([-0.1641, -1.6832,  0.7132,  0.4041], grad_fn=<MeanBackward1>),
 tensor([-0.5315, -0.2869,  0.0507,  0.4116], grad_fn=<MeanBackward1>),
 tensor([-0.6386,  0.0993, -0.3933, -0.3480], grad_fn=<MeanBackward1>),
 tensor([-0.1666, -0.9041,  0.0545,  0.3547], grad_fn=<MeanBackward1>),
 tensor([-1.0109, -0.2602, -0.8932,  0.1411], grad_fn=<MeanBackward1>),
 tensor([-0.1529, -0.3870, -0.2137,  0.6135], grad_fn=<MeanBackward1>),
 tensor([-0.6386,  0.0993, -0.3933, -0.3480], grad_fn=<MeanBackward1>),
 tensor([ 1.0904,  1.0579, -0.0975,  0.5895], grad_fn=<MeanBackward1>),
 tensor([-0.7746, -1.1094, -0.2718,  0.6255], grad_fn=<MeanBackward1>),
 tensor([-1.4822,  0.0482, -0.5460, -0.2956], grad_fn=<MeanBackw

In [170]:
zx = ThemeEmbedder(pd.DataFrame(df_final['book_isbn']),'book_isbn',16)
res_tensor = zx()  # or zx.forward()
# res_np = res_tensor.detach().cpu().numpy()  # ensure it's a NumPy array

# Step 2: Convert to DataFrame
embedding_dim = res_np.shape[1]
emb_df = pd.DataFrame(res_tensor, columns=[f'book_isbn_emb_{i}' for i in range(16)])

# Step 3: Concatenate
assert isinstance(df_final, pd.DataFrame), "df_final must be a DataFrame"
assert df_final.shape[0] == emb_df.shape[0], "Row count mismatch between df_final and embeddings"

df_final_v1 = pd.concat([df_final.reset_index(drop=True), emb_df], axis=1)

In [171]:
df_final_v1.head()

Unnamed: 0,book_isbn,title_plus_author,book_series,book_type,long_description,min_grade,max_grade,readable_page_count,reading_skill_name,theme_name,...,book_isbn_emb_6,book_isbn_emb_7,book_isbn_emb_8,book_isbn_emb_9,book_isbn_emb_10,book_isbn_emb_11,book_isbn_emb_12,book_isbn_emb_13,book_isbn_emb_14,book_isbn_emb_15
0,9781638973744,"forces and changes in motion by christina,earley",Physical Science,PDF,Do you wonder about the world around us? In th...,3,5,0.42,"Making Inferences, Illustrations or other Visu...","Fun Science, Technology",...,"tensor(0.6242, grad_fn=<UnbindBackward0>)","tensor(-0.9681, grad_fn=<UnbindBackward0>)","tensor(-1.9258, grad_fn=<UnbindBackward0>)","tensor(1.1471, grad_fn=<UnbindBackward0>)","tensor(0.9147, grad_fn=<UnbindBackward0>)","tensor(1.4615, grad_fn=<UnbindBackward0>)","tensor(-0.5936, grad_fn=<UnbindBackward0>)","tensor(0.2231, grad_fn=<UnbindBackward0>)","tensor(-1.2585, grad_fn=<UnbindBackward0>)","tensor(-1.5205, grad_fn=<UnbindBackward0>)"
1,9781427121141,forensic investigations of the ancient egyptia...,Forensic Footprints of Ancient Worlds,PDF,Can modern forensic tools help us uncover new ...,3,5,0.62,"Illustrations or other Visual Elements, Fact a...","History, Technology, Places of Interest",...,"tensor(-0.1980, grad_fn=<UnbindBackward0>)","tensor(-1.0799, grad_fn=<UnbindBackward0>)","tensor(-1.0615, grad_fn=<UnbindBackward0>)","tensor(-0.4259, grad_fn=<UnbindBackward0>)","tensor(-1.3419, grad_fn=<UnbindBackward0>)","tensor(0.3888, grad_fn=<UnbindBackward0>)","tensor(-0.8050, grad_fn=<UnbindBackward0>)","tensor(-0.0461, grad_fn=<UnbindBackward0>)","tensor(0.0926, grad_fn=<UnbindBackward0>)","tensor(-0.9766, grad_fn=<UnbindBackward0>)"
2,9780778789222,"soccer in action by niki,walker",Sports in Action,PDF,Goooaaaallllll! Crabtree scores with Soccer in...,3,5,0.62,"Making Inferences, Illustrations or other Visu...","Sports & Games, Fitness, Healthy Habits",...,"tensor(0.4712, grad_fn=<UnbindBackward0>)","tensor(0.7039, grad_fn=<UnbindBackward0>)","tensor(0.6822, grad_fn=<UnbindBackward0>)","tensor(-0.3006, grad_fn=<UnbindBackward0>)","tensor(-0.0947, grad_fn=<UnbindBackward0>)","tensor(0.8083, grad_fn=<UnbindBackward0>)","tensor(0.0425, grad_fn=<UnbindBackward0>)","tensor(0.8625, grad_fn=<UnbindBackward0>)","tensor(-1.0250, grad_fn=<UnbindBackward0>)","tensor(-0.2498, grad_fn=<UnbindBackward0>)"
3,9781039625440,"scared (pè) bilingual eng/cre by amy,culliford",,PDF,"In this book, young readers will learn to reco...",pk,1,0.26,,"Emotions & Feelings, Family & Friends",...,"tensor(-0.3142, grad_fn=<UnbindBackward0>)","tensor(0.8525, grad_fn=<UnbindBackward0>)","tensor(-0.3890, grad_fn=<UnbindBackward0>)","tensor(1.9732, grad_fn=<UnbindBackward0>)","tensor(1.5231, grad_fn=<UnbindBackward0>)","tensor(0.7937, grad_fn=<UnbindBackward0>)","tensor(2.0855, grad_fn=<UnbindBackward0>)","tensor(0.0427, grad_fn=<UnbindBackward0>)","tensor(0.9218, grad_fn=<UnbindBackward0>)","tensor(0.1840, grad_fn=<UnbindBackward0>)"
4,9781532420764,cody eats / cody come by brenda ponnay,Cody the Dog Bilingual,PDF,Bilingual Spanish / English Language Edition C...,pk,1,0.1,,"Funny Stories, Our Friends in Nature",...,"tensor(0.0668, grad_fn=<UnbindBackward0>)","tensor(0.1685, grad_fn=<UnbindBackward0>)","tensor(-0.1564, grad_fn=<UnbindBackward0>)","tensor(0.5935, grad_fn=<UnbindBackward0>)","tensor(-0.3035, grad_fn=<UnbindBackward0>)","tensor(-0.0429, grad_fn=<UnbindBackward0>)","tensor(0.5444, grad_fn=<UnbindBackward0>)","tensor(-0.7810, grad_fn=<UnbindBackward0>)","tensor(-0.3132, grad_fn=<UnbindBackward0>)","tensor(0.1558, grad_fn=<UnbindBackward0>)"


In [None]:
zx = ThemeEmbedder(pd.DataFrame(df_final['book_isbn']),'book_isbn',16)
res_tensor = zx()  # or zx.forward()
# res_np = res_tensor.detach().cpu().numpy()  # ensure it's a NumPy array

# Step 2: Convert to DataFrame
embedding_dim = res_np.shape[1]
emb_df = pd.DataFrame(res_tensor, columns=[f'book_isbn_emb_{i}' for i in range(16)])

# Step 3: Concatenate
assert isinstance(df_final, pd.DataFrame), "df_final must be a DataFrame"
assert df_final.shape[0] == emb_df.shape[0], "Row count mismatch between df_final and embeddings"

df_final_v1 = pd.concat([df_final.reset_index(drop=True), emb_df], axis=1)

In [None]:
['book_series', 'reading_skill_name']

In [174]:
zx = ThemeEmbedder(pd.DataFrame(df_final['reading_skill_name']),'reading_skill_name',4)
res_tensor = zx()  # or zx.forward()
# res_np = res_tensor.detach().cpu().numpy()  # ensure it's a NumPy array

# Step 2: Convert to DataFrame
embedding_dim = res_np.shape[1]
emb_df = pd.DataFrame(res_tensor, columns=[f'read_emb_{i}' for i in range(4)])

# Step 3: Concatenate
assert isinstance(df_final, pd.DataFrame), "df_final must be a DataFrame"
assert df_final.shape[0] == emb_df.shape[0], "Row count mismatch between df_final and embeddings"

df_final_v1 = pd.concat([df_final.reset_index(drop=True), emb_df], axis=1)

In [175]:
df_final_v1.head()

Unnamed: 0,book_isbn,title_plus_author,book_series,book_type,long_description,min_grade,max_grade,readable_page_count,reading_skill_name,theme_name,...,lang_Portuguese,lang_Spanish,grade_emb_0,grade_emb_1,grade_emb_2,grade_emb_3,read_emb_0,read_emb_1,read_emb_2,read_emb_3
0,9781638973744,"forces and changes in motion by christina,earley",Physical Science,PDF,Do you wonder about the world around us? In th...,3,5,0.42,"Making Inferences, Illustrations or other Visu...","Fun Science, Technology",...,False,False,"tensor(0.8714, grad_fn=<UnbindBackward0>)","tensor(-0.1184, grad_fn=<UnbindBackward0>)","tensor(0.0281, grad_fn=<UnbindBackward0>)","tensor(-1.2436, grad_fn=<UnbindBackward0>)","tensor(-0.6101, grad_fn=<UnbindBackward0>)","tensor(0.1556, grad_fn=<UnbindBackward0>)","tensor(0.4481, grad_fn=<UnbindBackward0>)","tensor(0.0786, grad_fn=<UnbindBackward0>)"
1,9781427121141,forensic investigations of the ancient egyptia...,Forensic Footprints of Ancient Worlds,PDF,Can modern forensic tools help us uncover new ...,3,5,0.62,"Illustrations or other Visual Elements, Fact a...","History, Technology, Places of Interest",...,False,False,"tensor(0.8714, grad_fn=<UnbindBackward0>)","tensor(-0.1184, grad_fn=<UnbindBackward0>)","tensor(0.0281, grad_fn=<UnbindBackward0>)","tensor(-1.2436, grad_fn=<UnbindBackward0>)","tensor(-0.4210, grad_fn=<UnbindBackward0>)","tensor(0.0188, grad_fn=<UnbindBackward0>)","tensor(0.6919, grad_fn=<UnbindBackward0>)","tensor(0.1112, grad_fn=<UnbindBackward0>)"
2,9780778789222,"soccer in action by niki,walker",Sports in Action,PDF,Goooaaaallllll! Crabtree scores with Soccer in...,3,5,0.62,"Making Inferences, Illustrations or other Visu...","Sports & Games, Fitness, Healthy Habits",...,False,False,"tensor(0.8714, grad_fn=<UnbindBackward0>)","tensor(-0.1184, grad_fn=<UnbindBackward0>)","tensor(0.0281, grad_fn=<UnbindBackward0>)","tensor(-1.2436, grad_fn=<UnbindBackward0>)","tensor(-0.7336, grad_fn=<UnbindBackward0>)","tensor(-0.0518, grad_fn=<UnbindBackward0>)","tensor(-0.2328, grad_fn=<UnbindBackward0>)","tensor(0.1401, grad_fn=<UnbindBackward0>)"
3,9781039625440,"scared (pè) bilingual eng/cre by amy,culliford",,PDF,"In this book, young readers will learn to reco...",pk,1,0.26,,"Emotions & Feelings, Family & Friends",...,False,False,"tensor(-0.3938, grad_fn=<UnbindBackward0>)","tensor(0.0132, grad_fn=<UnbindBackward0>)","tensor(-0.3345, grad_fn=<UnbindBackward0>)","tensor(0.4681, grad_fn=<UnbindBackward0>)","tensor(1.1115, grad_fn=<UnbindBackward0>)","tensor(-0.3705, grad_fn=<UnbindBackward0>)","tensor(0.6249, grad_fn=<UnbindBackward0>)","tensor(-1.4469, grad_fn=<UnbindBackward0>)"
4,9781532420764,cody eats / cody come by brenda ponnay,Cody the Dog Bilingual,PDF,Bilingual Spanish / English Language Edition C...,pk,1,0.1,,"Funny Stories, Our Friends in Nature",...,False,True,"tensor(-0.3938, grad_fn=<UnbindBackward0>)","tensor(0.0132, grad_fn=<UnbindBackward0>)","tensor(-0.3345, grad_fn=<UnbindBackward0>)","tensor(0.4681, grad_fn=<UnbindBackward0>)","tensor(1.1115, grad_fn=<UnbindBackward0>)","tensor(-0.3705, grad_fn=<UnbindBackward0>)","tensor(0.6249, grad_fn=<UnbindBackward0>)","tensor(-1.4469, grad_fn=<UnbindBackward0>)"


In [177]:

zx = ThemeEmbedder(pd.DataFrame(df_final['book_series']),'book_series',16)
res_tensor = zx()  # or zx.forward()
# res_np = res_tensor.detach().cpu().numpy()  # ensure it's a NumPy array

# Step 2: Convert to DataFrame
embedding_dim = res_np.shape[1]
emb_df = pd.DataFrame(res_tensor, columns=[f'book_series_emb_{i}' for i in range(16)])

# Step 3: Concatenate
assert isinstance(df_final, pd.DataFrame), "df_final must be a DataFrame"
assert df_final.shape[0] == emb_df.shape[0], "Row count mismatch between df_final and embeddings"

df_final_v1 = pd.concat([df_final.reset_index(drop=True), emb_df], axis=1)

In [2]:
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
import torch
import numpy as np
from src.cloud_storage.redshift_connection import redshift_connection

connection = redshift_connection()

In [3]:
item_query = 'sql_files/item_query.sql'
book_df = connection.redshift_query_fetching_as_df(item_query)
book_df.head()

2025-08-18 11:29:41,482 - side - DEBUG - Attempting to read SQL file: sql_files/item_query.sql
2025-08-18 11:29:41,484 - side - INFO - Successfully read SQL file: sql_files/item_query.sql
2025-08-18 11:29:44,013 - read_shift - INFO - Connected to Redshift successfully.
  df = pd.read_sql_query(query, conn)
2025-08-18 11:31:16,483 - read_shift - INFO - Query executed successfully, retrieved 12189 rows.
2025-08-18 11:31:16,484 - read_shift - INFO - Connection closed.


Unnamed: 0,id,book_isbn,book_title,authors,book_series,publication_date,rights,illustrators,interactive,search_keywords,...,clicks_students,quality_clicks,quality_clicks_students,students_completed_book,students_completed_75_per_book,per_75_completed_unique_books,completion_rate,time_spent,total_pages,read_pages
0,38,9781427121479,Animals of the World,"Toby,Reynolds",Quick-Reference Atlases,2018-08-10,World,,False,"Atlas, Animals, Includes index, Includes bolde...",...,1065.0,1133.0,875.0,349.0,773.0,85.0,38.61,783991.0,53414.0,23522.0
1,66,9781427194763,Bloodsucking Lice and Fleas,"Ellen,Rodger",Creepy Crawlies,2010-07-15,World,,False,,...,176.0,208.0,165.0,78.0,159.0,95.0,46.71,25660.0,7290.0,4910.0
2,107,9781427164292,Colonial Home,"Bobbie,Kalman",Historic Communities,2000-10-31,World,,False,,...,50.0,51.0,38.0,13.0,34.0,87.0,33.33,14578.0,2604.0,733.0
3,125,9781427197559,Drake,"Lynn,Peppas",Superstars!,2011-08-15,World,,False,,...,844.0,1204.0,734.0,320.0,697.0,92.0,42.61,1242214.0,50670.0,24436.0
4,243,9781427166807,Golf in Action,"Hannelore,Sotzek",Sports in Action,2000-10-31,World,,False,,...,24.0,19.0,15.0,6.0,14.0,82.0,35.29,1113.0,928.0,311.0


In [4]:
from src.components.data_transformation import book_data_transformation,user_data_transformation

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
item_df,  book_feature_count, emb_count =  book_data_transformation(book_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  book_df['long_description'].fillna('unk',inplace=True)
  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 381/381 [00:07<00:00, 52.29it/s]
  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 381/381 [00:36<00:00, 10.41it/s]
  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 381/381 [00:32<00:00, 11.79it/s]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inp

Dictionary saved and merged into feature_mappings/theme_to_idx.json
Dictionary saved and merged into feature_mappings/category_to_idx.json
Dictionary saved and merged into feature_mappings/reading_skill_to_idx.json
Dictionary saved and merged into feature_mappings/grades_to_idx.json
Dictionary saved and merged into feature_mappings/book_code_to_idx.json


In [57]:
class BookDataset(Dataset):
    def __init__(self,  book_features_df, book_feature_cols):
        """
        interactions_df: includes user_id, book_code, label, and interaction-level features
        book_features_df: indexed by book_code, contains theme_ids, category_ids, and other features
        book_feature_cols: list of book feature column names
        interaction_feature_cols: list of interaction-level feature column names
        """
        
        self.book_features_df = book_features_df
        self.book_feature_cols = book_feature_cols
       

    def __len__(self):
        return len(self.book_features_df)

    def __getitem__(self, idx):
        row = self.book_features_df.iloc[idx]
        book_code = row["book_code"]

        # --- 1. Get book-level features ---
        # book_info = self.book_features_df.loc[book_code]

        theme_ids = row["theme_ids"]  # already list[int]
        category_ids = row["category_ids"]  # already list[int]
        reading_skill_ids = row["reading_skill_ids"]  # already list[int]
        grades_ids = row['grades_ids']  # already list[int]
        book_code_ids= row['book_code_ids']  # already list[int]


        # 'countries_ids','states_ids','zipcode_ids','teacher_ids','school_ids'
        

        book_features = np.array(row[self.book_feature_cols], dtype=np.float32)

        # --- 2. Get interaction-level features ---
        # user_features = np.array(row[self.interaction_feature_cols], dtype=np.float32)

        # # --- 3. Merge into one "other_features" vector ---
        # other_features = torch.tensor(
        #     np.concatenate([book_features, interaction_features]),
        #     dtype=torch.float32
        # )

        return {
            "book_code": book_code,
            "theme_ids": torch.tensor(theme_ids, dtype=torch.long),
            "category_ids": torch.tensor(category_ids, dtype=torch.long),
            "reading_skill_ids":torch.tensor(reading_skill_ids, dtype=torch.long),
            "grades_ids" : torch.tensor(grades_ids , dtype=torch.long) ,
            "book_code_ids": torch.tensor(book_code_ids , dtype=torch.long),
            "book_features": torch.tensor(book_features, dtype=torch.float32)
           
        }


def collate_bookonly_fn(batch):
    # --------- Helper to pad & mask any list-of-tensors field ----------
    def pad_and_mask(key):
        seqs = [torch.as_tensor(item[key], dtype=torch.long) for item in batch]
        
        padded = pad_sequence(seqs, batch_first=True, padding_value=0)  # [B, max_len]
        mask = (padded != 0).long()  # [B, max_len] boolean mask
        return padded, mask

    # Book-level multi-ID fields
    theme_ids, theme_mask = pad_and_mask("theme_ids")
    category_ids, category_mask = pad_and_mask("category_ids")
    reading_skill_ids, reading_skill_mask = pad_and_mask("reading_skill_ids")
    grades_ids, grades_mask = pad_and_mask("grades_ids")
    book_code_ids, book_code_mask = pad_and_mask("book_code_ids")


    # Scalar / dense features
    book_features = torch.stack([torch.as_tensor(item["book_features"], dtype=torch.float32) for item in batch])

    book_codes = [item["book_code"] for item in batch]
    return {
        # --- Book-level IDs ---
        "book_code": book_codes,
        "theme_ids": theme_ids, "theme_mask": theme_mask,
        "category_ids": category_ids, "category_mask": category_mask,
        "reading_skill_ids": reading_skill_ids, "reading_skill_mask": reading_skill_mask,
        "grades_ids": grades_ids, "grades_mask": grades_mask,
        "book_code_ids": book_code_ids, "book_code_mask": book_code_mask,

        # --- Dense features & labels ---
       
        "book_features": book_features,
       
    }


In [58]:
columns_author_title =[f"emb_title_author_{i}" for i in range(384)]
columns_long_description = [f"emb_desc_{i}" for i in range(384)]
columns_book_series = [f"emb_book_series_{i}" for i in range(384)]
columns_add = ['readable_page_count','book_type_binary', 'fn_Fiction', 'fn_Non-Fiction', 'fn_unk',
       'lang_English', 'lang_French', 'lang_Haitian French Creole',
       'lang_Mandarin', 'lang_Portuguese', 'lang_Spanish']

columns_learn_emb = ['book_code','book_code_ids','grades_ids','reading_skill_ids', 'category_ids','theme_ids']

book_feature_cols = columns_author_title + columns_long_description + columns_book_series + columns_add

In [59]:
from torch.utils.data import DataLoader

dataset = BookDataset( item_df, book_feature_cols)
trainloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_bookonly_fn)

# dataset = BookInteractionDataset( val[:5], item_df, book_feature_cols, interaction_feature_cols)
# valloader = DataLoader(dataset, batch_size=128, shuffle=True, collate_fn=book_collate_fn)

In [61]:
for i in trainloader:
    print(i)
    break

{'book_code': ['9781645271222', '9781641289337'], 'theme_ids': tensor([[98,  0],
        [29, 21]]), 'theme_mask': tensor([[1, 0],
        [1, 1]]), 'category_ids': tensor([[7],
        [1]]), 'category_mask': tensor([[1],
        [1]]), 'reading_skill_ids': tensor([], size=(2, 0), dtype=torch.int64), 'reading_skill_mask': tensor([], size=(2, 0), dtype=torch.int64), 'grades_ids': tensor([[1, 2, 3, 4],
        [9, 8, 0, 0]]), 'grades_mask': tensor([[1, 1, 1, 1],
        [1, 1, 0, 0]]), 'book_code_ids': tensor([[ 557],
        [8789]]), 'book_code_mask': tensor([[1],
        [1]]), 'book_features': tensor([[-0.0324,  0.0229,  0.1106,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0175,  0.0198, -0.0207,  ...,  0.0000,  0.0000,  0.0000]])}


In [50]:
i.keys()

dict_keys(['book_code', 'theme_ids', 'theme_mask', 'category_ids', 'category_mask', 'reading_skill_ids', 'reading_skill_mask', 'grades_ids', 'grades_mask', 'book_code_ids', 'book_code_mask', 'book_features'])

In [None]:
from src.utils.main_utils import save_dict_to_json,  load_json_file
fi = load_json_file('model_parmater.json')

In [52]:
fi

{'book_feature_count': {'themes_count': 108,
  'book_count': 12190,
  'grade_count': 11,
  'reading_skills_count': 13,
  'category_count': 9},
 'emb_count': {'themes_count': 8,
  'book_count': 16,
  'grade_count': 4,
  'reading_skills_count': 4,
  'category_count': 4},
 'user_feature_count': {'themes_count': 108,
  'book_count': 12190,
  'reading_skills_count': 13,
  'category_count': 9,
  'country_count': 160,
  'state_count': 558,
  'zipcode_count': 15985,
  'teacher_count': 112147,
  'school_count': 84655},
 'user_emb_count': {'themes_count': 8,
  'book_count': 16,
  'reading_skills_count': 4,
  'category_count': 4,
  'country_count': 8,
  'state_count': 10,
  'zipcode_count': 14,
  'teacher_count': 16,
  'school_count': 16},
 'book_feature_dim': 1163,
 'user_feature_dim': 20}

In [54]:
from src.components.model import TwoTowerModel
model = TwoTowerModel(fi['book_feature_count'], fi['user_feature_count'],
                 fi['emb_count'], fi['user_emb_count'],
                  book_feature_dim=fi['book_feature_dim'], user_feature_dim=fi['user_feature_dim'])

In [56]:
import torch

# Recreate the model with the same architecture
# model = TwoTowerModel(
#     book_feature_count=...,   # same values used in training
#     user_feature_count=...,
#     emb_count=...,
#     user_emb_count=...,
#     book_feature_dim=...,
#     user_feature_dim=...
# )

# Move to device
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Load checkpoint weights
checkpoint_path = "checkpoints/two_tower_best.pt"
checkpoint = torch.load(checkpoint_path, map_location=device)

model.load_state_dict(checkpoint)
model.eval()   # set to inference mode
print("✅ TwoTowerModel checkpoint loaded successfully")


✅ TwoTowerModel checkpoint loaded successfully


In [62]:
model.get_book_vec( i)

tensor([[ 0.0345, -0.0031, -0.0741,  0.0136,  0.0602,  0.0313, -0.0285,  0.0005,
          0.0346,  0.1360, -0.0284, -0.0429,  0.1038,  0.0773, -0.0959,  0.0612,
         -0.1399, -0.0237,  0.0811,  0.0789, -0.0233, -0.2071,  0.1142, -0.0883,
         -0.0120, -0.0794,  0.0558,  0.0983,  0.0328,  0.1238, -0.0798,  0.0447,
          0.1192,  0.0621,  0.0233, -0.0061, -0.0913,  0.0908, -0.0437,  0.1269,
         -0.0089, -0.0619,  0.1195, -0.0511, -0.0358,  0.0556,  0.0120,  0.1017,
         -0.0559,  0.1378,  0.1520,  0.0994,  0.0370, -0.0460, -0.1077,  0.0527,
          0.0162, -0.1324,  0.0456, -0.1308, -0.0436, -0.0608,  0.0868,  0.0588],
        [-0.0273,  0.0073, -0.1011,  0.0659,  0.0428,  0.0323, -0.0554, -0.0478,
          0.0038,  0.0387, -0.0406, -0.0568, -0.0322,  0.1490, -0.0302,  0.0435,
         -0.1402, -0.0376,  0.1389,  0.0846, -0.0247, -0.1431,  0.0434, -0.0463,
         -0.0586, -0.0922, -0.0261,  0.0035, -0.0694,  0.1137, -0.1025,  0.1026,
          0.1599,  0.2153, 

In [72]:
item_df.shape

(12189, 1169)

In [None]:
from torch.utils.data import DataLoader
from src.utils.main_utils import load_json_file
from src.components.model import TwoTowerModel
from tqdm import tqdm 
import faiss
import pickle

def build_and_save_book_embeddings_faiss(model, item_df, book_feature_cols,
                                         batch_size=512, device="cuda" if torch.cuda.is_available() else "cpu",
                                         index_path="book_index.faiss",
                                         id_map_path="book_ids.pkl"):

    fi = load_json_file('model_parmater.json')

    dataset = BookDataset( item_df, book_feature_cols)
    trainloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_bookonly_fn)


    model = TwoTowerModel(fi['book_feature_count'], fi['user_feature_count'],
                    fi['emb_count'], fi['user_emb_count'],
                    book_feature_dim=fi['book_feature_dim'], user_feature_dim=fi['user_feature_dim'])
    model.eval()
    model = model.to(device)

    all_embeddings = []
    book_ids = []

    with torch.no_grad():
            for batch in tqdm(trainloader, desc="Encoding books"):
                batch = {k: v.to(device) if torch.is_tensor(v) else v for k, v in batch.items()}
                
                # Compute embeddings
                book_vec = model.get_book_vec(batch)

                all_embeddings.append(book_vec.cpu().numpy())
                book_ids.extend(batch["book_code"])  # keep mapping book_id → index

    all_embeddings = np.vstack(all_embeddings).astype("float32")  # [N_books, D]

    dim = all_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)   # Inner Product (dot product)
    index.add(all_embeddings)

    # Save index & mapping
    faiss.write_index(index, index_path)
    with open(id_map_path, "wb") as f:
        pickle.dump(book_ids, f)

    print(f"✅ Saved FAISS index with {len(book_ids)} books at {index_path}")
    return index, book_ids

Encoding books: 100%|██████████| 96/96 [00:07<00:00, 13.39it/s]


In [71]:
all_embeddings.shape

(12189, 64)

In [None]:
# import faiss
# import numpy as np
# from torch.utils.data import DataLoader
# import torch
# from tqdm import tqdm
# import pickle

# def build_and_save_book_embeddings_faiss(model, book_df, book_feature_cols,
#                                          batch_size=512, device="cuda",
#                                          index_path="book_index.faiss",
#                                          id_map_path="book_ids.pkl"):
#     """
#     Precompute book embeddings and save them in a FAISS index for fast retrieval.
#     """
#     # Dataset + DataLoader
#     book_dataset = BookDataset(book_df, book_feature_cols)
#     book_loader = DataLoader(book_dataset,
#                              batch_size=batch_size,
#                              shuffle=False,
#                              collate_fn=collate_bookonly_fn)

#     model.eval()
#     model = model.to(device)

#     all_embeddings = []
#     book_ids = []

#     with torch.no_grad():
#         for batch in tqdm(book_loader, desc="Encoding books"):
#             batch = {k: v.to(device) if torch.is_tensor(v) else v for k, v in batch.items()}
            
#             # Compute embeddings
#             book_vec = model.book_tower(
#                 batch["theme_ids"], batch["theme_mask"],
#                 batch["category_ids"], batch["category_mask"],
#                 batch["reading_skill_ids"], batch["reading_skill_mask"],
#                 batch["grades_ids"], batch["grades_mask"],
#                 batch["book_code_ids"], batch["book_code_mask"],
#                 batch["book_features"]
#             )   # [B, D]

#             all_embeddings.append(book_vec.cpu().numpy())
#             book_ids.extend(batch["book_code"])  # keep mapping book_id → index

#     all_embeddings = np.vstack(all_embeddings).astype("float32")  # [N_books, D]

#     # --- FAISS Index ---
#     dim = all_embeddings.shape[1]
#     index = faiss.IndexFlatIP(dim)   # Inner Product (dot product)
#     index.add(all_embeddings)

#     # Save index & mapping
#     faiss.write_index(index, index_path)
#     with open(id_map_path, "wb") as f:
#         pickle.dump(book_ids, f)

#     print(f"✅ Saved FAISS index with {len(book_ids)} books at {index_path}")
#     return index, book_ids


In [73]:
platfrom_query = 'sql_files/user_book_platform.sql'
user_platform = connection.redshift_query_fetching_as_df(platfrom_query)
location_query = 'sql_files/user_location.sql'
user_loc = connection.redshift_query_fetching_as_df(location_query)
user_query = 'sql_files/user_query.sql'
user_df = connection.redshift_query_fetching_as_df(user_query)

2025-08-18 17:46:22,146 - side - DEBUG - Attempting to read SQL file: sql_files/user_book_platform.sql
2025-08-18 17:46:22,152 - side - INFO - Successfully read SQL file: sql_files/user_book_platform.sql
2025-08-18 17:46:24,706 - read_shift - INFO - Connected to Redshift successfully.
  df = pd.read_sql_query(query, conn)
2025-08-18 17:58:27,200 - read_shift - INFO - Query executed successfully, retrieved 4973201 rows.
2025-08-18 17:58:27,204 - read_shift - INFO - Connection closed.
2025-08-18 17:58:27,205 - side - DEBUG - Attempting to read SQL file: sql_files/user_location.sql
2025-08-18 17:58:27,207 - side - INFO - Successfully read SQL file: sql_files/user_location.sql
2025-08-18 17:58:30,104 - read_shift - INFO - Connected to Redshift successfully.
2025-08-18 18:00:04,246 - read_shift - INFO - Query executed successfully, retrieved 658152 rows.
2025-08-18 18:00:04,248 - read_shift - INFO - Connection closed.
2025-08-18 18:00:04,249 - side - DEBUG - Attempting to read SQL file: sql

In [74]:
interaction_feature_cols = ['cumulative_web_during_school_hour',
 'cumulative_web_after_school_hour',
 'cumulative_apple_during_school_hour',
 'cumulative_apple_after_school_hour',
 'cumulative_android_during_school_hour',
 'cumulative_android_after_school_hour',
 'cumulative_unk_during_school_hour',
 'cumulative_unk_after_school_hour',
 'grade_grade 1',
 'grade_grade 2',
 'grade_grade 3',
 'grade_grade 4',
 'grade_grade 5',
 'grade_kindergarten',
 'class_activation_bucket_AC',
 'class_activation_bucket_AC0',
 'class_activation_bucket_AC1',
 'class_activation_bucket_AC2',
 'class_activation_bucket_AC3',
 'class_activation_bucket_unk',]

In [75]:
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
import torch
import numpy as np

class UserDataset(Dataset):
    def __init__(self, interactions_df, interaction_feature_cols):
        """
        interactions_df: includes user_id, book_code, label, and interaction-level features
        book_features_df: indexed by book_code, contains theme_ids, category_ids, and other features
        book_feature_cols: list of book feature column names
        interaction_feature_cols: list of interaction-level feature column names
        """
        self.interactions_df = interactions_df.reset_index(drop=True)
        # self.book_features_df = book_features_df.set_index("book_code")
        # self.book_feature_cols = book_feature_cols
        self.interaction_feature_cols = interaction_feature_cols

    def __len__(self):
        return len(self.interactions_df)

    def __getitem__(self, idx):
        row = self.interactions_df.iloc[idx]
        # book_code = row["book_code"]

        # --- 1. Get book-level features ---
        # book_info = self.book_features_df.loc[book_code]

        # theme_ids = book_info["theme_ids"]  # already list[int]
        # category_ids = book_info["category_ids"]  # already list[int]
        # reading_skill_ids = book_info["reading_skill_ids"]  # already list[int]
        # grades_ids = book_info['grades_ids']  # already list[int]
        # book_code_ids= book_info['book_code_ids']  # already list[int]

        last_book_ids = row['book_code_ids']
        last_theme_ids = row['theme_ids']
        last_category_ids = row['category_ids']
        last_reading_skills_id = row['reading_skill_ids']

        countries_ids = row['countries_ids']
        states_ids = row['states_ids']
        zipcode_ids = row['zipcode_ids']
        teacher_code_ids = row['teacher_code_ids']
        school_code_ids = row['school_code_ids']

        # 'countries_ids','states_ids','zipcode_ids','teacher_ids','school_ids'
        

        # book_features = np.array(book_info[self.book_feature_cols], dtype=np.float32)

        # --- 2. Get interaction-level features ---
        user_features = np.array(row[self.interaction_feature_cols], dtype=np.float32)

        # # --- 3. Merge into one "other_features" vector ---
        # other_features = torch.tensor(
        #     np.concatenate([book_features, interaction_features]),
        #     dtype=torch.float32
        # )

        return {
            # "book_code": book_code,
            # "theme_ids": torch.tensor(theme_ids, dtype=torch.long),
            # "category_ids": torch.tensor(category_ids, dtype=torch.long),
            # "reading_skill_ids":torch.tensor(reading_skill_ids, dtype=torch.long),
            # "grades_ids" : torch.tensor(grades_ids , dtype=torch.long) ,
            # "book_code_ids": torch.tensor(book_code_ids , dtype=torch.long),

            "last_book_ids" : torch.tensor(last_book_ids , dtype=torch.long),
            "last_theme_ids" : torch.tensor(last_theme_ids , dtype=torch.long),
            "last_category_ids" : torch.tensor(last_category_ids , dtype=torch.long),
            "last_reading_skills_id" : torch.tensor(last_reading_skills_id , dtype=torch.long),

            "countries_ids" : torch.tensor(countries_ids , dtype=torch.long),
            "states_ids" : torch.tensor(states_ids, dtype=torch.long),
            "zipcode_ids" : torch.tensor(zipcode_ids , dtype=torch.long),
            "teacher_code_ids" : torch.tensor(teacher_code_ids , dtype=torch.long),
            "school_code_ids" : torch.tensor(school_code_ids , dtype=torch.long),

            # "book_features": torch.tensor(book_features, dtype=torch.float32),
            "user_features": torch.tensor(user_features, dtype=torch.float32),
            # "label": torch.tensor(row["label"], dtype=torch.float32)
        }


def collate_useronly_fn(batch):
    # --------- Helper to pad & mask any list-of-tensors field ----------
    def pad_and_mask(key):
        seqs = [torch.as_tensor(item[key], dtype=torch.long) for item in batch]
        
        padded = pad_sequence(seqs, batch_first=True, padding_value=0)  # [B, max_len]
        mask = (padded != 0).long()  # [B, max_len] boolean mask
        return padded, mask

    # Book-level multi-ID fields
    # theme_ids, theme_mask = pad_and_mask("theme_ids")
    # category_ids, category_mask = pad_and_mask("category_ids")
    # reading_skill_ids, reading_skill_mask = pad_and_mask("reading_skill_ids")
    # grades_ids, grades_mask = pad_and_mask("grades_ids")
    # book_code_ids, book_code_mask = pad_and_mask("book_code_ids")

    # Interaction-level multi-ID fields
    last_book_ids, last_book_mask = pad_and_mask("last_book_ids")
    last_theme_ids, last_theme_mask = pad_and_mask("last_theme_ids")
    last_category_ids, last_category_mask = pad_and_mask("last_category_ids")
    last_reading_skills_id, last_reading_skills_mask = pad_and_mask("last_reading_skills_id")

    

    countries_ids, countries_mask = pad_and_mask("countries_ids")
    states_ids, states_mask = pad_and_mask("states_ids")
    zipcode_ids, zipcode_mask = pad_and_mask( "zipcode_ids")
    teacher_ids, teacher_mask = pad_and_mask("teacher_code_ids")
    school_ids, school_mask = pad_and_mask("school_code_ids")

    # Scalar / dense features
    # book_features = torch.stack([torch.as_tensor(item["book_features"], dtype=torch.float32) for item in batch])
    user_features = torch.stack([torch.as_tensor(item["user_features"], dtype=torch.float32) for item in batch])
    # labels = torch.stack([torch.as_tensor(item["label"], dtype=torch.float32) for item in batch])

    return {
        # --- Book-level IDs ---
        # "theme_ids": theme_ids, "theme_mask": theme_mask,
        # "category_ids": category_ids, "category_mask": category_mask,
        # "reading_skill_ids": reading_skill_ids, "reading_skill_mask": reading_skill_mask,
        # "grades_ids": grades_ids, "grades_mask": grades_mask,
        # "book_code_ids": book_code_ids, "book_code_mask": book_code_mask,

        # --- Interaction-level IDs ---
        "last_book_ids": last_book_ids, "last_book_mask": last_book_mask,
        "last_theme_ids": last_theme_ids, "last_theme_mask": last_theme_mask,
        "last_category_ids": last_category_ids, "last_category_mask": last_category_mask,
        "last_reading_skills_id": last_reading_skills_id, "last_reading_skills_mask": last_reading_skills_mask,

        "countries_ids": countries_ids, "countries_mask": countries_mask,
        "states_ids": states_ids, "states_mask": states_mask,
        "zipcode_ids": zipcode_ids, "zipcode_mask": zipcode_mask,
        "teacher_ids": teacher_ids, "teacher_mask": teacher_mask,
        "school_ids": school_ids, "school_mask": school_mask,

        # --- Dense features & labels ---
       
        # "book_features": book_features,
        "user_features": user_features,
        # "labels": labels
    }

In [76]:
from src.components.data_transformation import book_data_transformation,user_data_transformation

In [77]:
child_df,user_feature_count, user_emb_count= user_data_transformation(user_df,user_loc,user_platform)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_platform_final['book_code'] = user_platform_final['book_code'].astype('str')


Dictionary saved and merged into feature_mappings/country_to_idx.json
Dictionary saved and merged into feature_mappings/state_to_idx.json
Dictionary saved and merged into feature_mappings/zipcode_to_idx.json
Dictionary saved and merged into feature_mappings/teacher_to_idx.json
Dictionary saved and merged into feature_mappings/school_to_idx.json


In [79]:
from torch.utils.data import DataLoader

dataset = UserDataset( child_df[:1000], interaction_feature_cols)
userloader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=collate_useronly_fn)

# dataset = BookInteractionDataset( val[:5], item_df, book_feature_cols, interaction_feature_cols)
# valloader = DataLoader(dataset, batch_size=128, shuffle=True, collate_fn=book_collate_fn)

In [80]:

for i in userloader:
    book_vec = model.get_user_vec(i)
    break

In [82]:
book_vec.shape

torch.Size([1, 64])