In [1]:
from src.cloud_storage.redshift_connection import redshift_connection

connection = redshift_connection()

#  BOOK DATA PREPARATION

In [2]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

def encode_column_with_sentence_transformer(df: pd.DataFrame, column: str, model_name: str = 'all-MiniLM-L6-v2') -> np.ndarray:
    """
    Encodes a column of text into embeddings using a sentence-transformer model.

    Args:
        df (pd.DataFrame): Input dataframe.
        column (str): Column name to encode.
        model_name (str): Pretrained sentence-transformers model name.

    Returns:
        np.ndarray: Array of shape (num_rows, embedding_dim)
    """
    model = SentenceTransformer(model_name)
    
    # Fill missing values
    texts = df[column].fillna("unk").astype(str).tolist()
    
    # Encode with model
    embeddings = model.encode(texts, show_progress_bar=True)
    
    return np.array(embeddings)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
item_query = 'sql_files/item_query.sql'
book_df = connection.redshift_query_fetching_as_df(item_query)
book_df.head()

2025-08-10 16:54:09,432 - side - DEBUG - Attempting to read SQL file: sql_files/item_query.sql
2025-08-10 16:54:09,434 - side - INFO - Successfully read SQL file: sql_files/item_query.sql
2025-08-10 16:54:12,068 - read_shift - INFO - Connected to Redshift successfully.
  df = pd.read_sql_query(query, conn)
2025-08-10 16:55:38,449 - read_shift - INFO - Query executed successfully, retrieved 12189 rows.
2025-08-10 16:55:38,449 - read_shift - INFO - Connection closed.


Unnamed: 0,id,book_isbn,book_title,authors,book_series,publication_date,rights,illustrators,interactive,search_keywords,...,clicks_students,quality_clicks,quality_clicks_students,students_completed_book,students_completed_75_per_book,per_75_completed_unique_books,completion_rate,time_spent,total_pages,read_pages
0,18,9781427162748,A Slave Family,"Bobbie,Kalman",Colonial People,2002-10-31,World,,False,,...,444.0,415.0,337.0,84.0,305.0,86.0,23.86,123885.0,18432.0,5846.0
1,40,9780778798057,Apache Helicopter Pilots,"Antony,Loveless",The World's Most Dangerous Jobs,2009-08-01,World,,False,,...,120.0,143.0,107.0,62.0,104.0,95.0,56.88,104414.0,6014.0,3703.0
2,97,9781427165275,"China - the land (revised, ed. 3)","Bobbie,Kalman","Lands, Peoples, and Cultures",2008-02-15,World,,False,,...,88.0,85.0,76.0,29.0,68.0,86.0,36.71,3390.0,3503.0,1535.0
3,128,9781427120335,"Dream Jobs in Transportation, Distribution and...","Cynthia,O‘Brien",Cutting-Edge Careers in Technical Education,2018-02-28,World,,False,"Careers and occupations,Transportation,Include...",...,44.0,34.0,24.0,12.0,24.0,88.0,44.44,2639.0,1680.0,773.0
4,199,9781427197719,Fishing in Action,"Hadley,Dyer",Sports in Action,2005-10-31,World,,False,,...,91.0,94.0,75.0,32.0,71.0,93.0,42.11,6364.0,3936.0,2023.0


In [4]:
book_df.book_isbn.value_counts()

book_isbn
9781427162748    1
9781624967931    1
9781534137622    1
9781534138032    1
9781534155442    1
                ..
9781039655720    1
9781532415883    1
9781427118516    1
9781427180964    1
9781627530712    1
Name: count, Length: 12189, dtype: int64

In [5]:
book_df.columns

Index(['id', 'book_isbn', 'book_title', 'authors', 'book_series',
       'publication_date', 'rights', 'illustrators', 'interactive',
       'search_keywords', 'top_hundred', 'book_type', 'long_description',
       'bestseller', 'editor_recommended', 'animated', 'top_twenty',
       'top_fifty', 'page_count', 'min_grade', 'max_grade',
       'readable_page_count', 'min_reading_age', 'max_reading_age',
       'read_along_audio', 'read_along_with_highlighting', 'orientation',
       'last_reading_page_number', 'book_format', 'language_book',
       'publisher_name', 'fiction_nonfiction', 'reading_skill_name',
       'theme_name', 'category_name', 'book_code', 'grade_name', 'book_code',
       'clicks', 'clicks_students', 'quality_clicks',
       'quality_clicks_students', 'students_completed_book',
       'students_completed_75_per_book', 'per_75_completed_unique_books',
       'completion_rate', 'time_spent', 'total_pages', 'read_pages'],
      dtype='object')

In [6]:
book_df[book_df.category_name.isna()][['category_name','book_isbn']]

Unnamed: 0,category_name,book_isbn
45,,9781634402699
179,,9781645805489
512,,9798887358390
712,,9798890422088
1040,,9781634402729
2762,,9781634402712
2844,,9781039836181
3055,,9781532404320
3477,,9781039836198
3781,,9781620024836


In [7]:
book_df[book_df.book_isbn == 9781634402699]

Unnamed: 0,id,book_isbn,book_title,authors,book_series,publication_date,rights,illustrators,interactive,search_keywords,...,clicks_students,quality_clicks,quality_clicks_students,students_completed_book,students_completed_75_per_book,per_75_completed_unique_books,completion_rate,time_spent,total_pages,read_pages


In [8]:
book_df['title_plus_author'] = book_df.apply(lambda x:x['book_title'].lower()+' by '+x['authors'].lower(),axis=1)
book_df['long_description'].fillna('unk',inplace=True)
book_df['long_description'] = book_df.apply(lambda x:x['long_description'].lower(),axis=1)
columns = ['book_isbn', 'title_plus_author', 'book_series', 'book_type', 'long_description','min_grade', 'max_grade',
       'readable_page_count','fiction_nonfiction', 'reading_skill_name','theme_name', 'category_name','language_book']

book_df_final = book_df[columns]
book_df_final.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  book_df['long_description'].fillna('unk',inplace=True)


Unnamed: 0,book_isbn,title_plus_author,book_series,book_type,long_description,min_grade,max_grade,readable_page_count,fiction_nonfiction,reading_skill_name,theme_name,category_name,language_book
0,9781427162748,"a slave family by bobbie,kalman",Colonial People,PDF,a slave family helps bring to life the many st...,3,5,32,Non-Fiction,"Illustrations or other Visual Elements, Fact a...","History of America, Family & Friends, Social T...","People & Places, Growing Up",English
1,9780778798057,"apache helicopter pilots by antony,loveless",The World's Most Dangerous Jobs,PDF,these army pilots fly the most demanding helic...,3,5,31,Non-Fiction,"Point of View, Making Inferences, Main Idea & ...","Healthy Habits, Family & Friends, Occupations",Growing Up,English
2,9781427165275,"china - the land (revised, ed. 3) by bobbie,ka...","Lands, Peoples, and Cultures",PDF,this revised edition takes a new look at this ...,3,5,31,Non-Fiction,"Making Inferences, Illustrations or other Visu...","Places of Interest, The Natural World, History","People & Places, Science & Nature",English
3,9781427120335,"dream jobs in transportation, distribution and...",Cutting-Edge Careers in Technical Education,PDF,qualified workers are required every day in th...,3,5,30,Non-Fiction,"Making Inferences, Illustrations or other Visu...","Occupations, Technology","Growing Up, Science & Nature",English
4,9781427197719,"fishing in action by hadley,dyer",Sports in Action,PDF,fishing in action provides kids with easy-to-f...,3,5,32,Non-Fiction,"Making Inferences, Main Idea & Key Details, Il...","Sports & Games, Fitness, Healthy Habits",Growing Up,English


In [9]:
emb = encode_column_with_sentence_transformer(book_df_final,'title_plus_author')
# Convert embeddings to DataFrame
emb_df = pd.DataFrame(emb, columns=[f"emb_title_author_{i}" for i in range(emb.shape[1])])

# Combine with book_id
book_embedding_author_df = pd.concat([book_df_final, emb_df], axis=1)

# Save to file
# book_embedding_df.to_parquet("book_embeddings.parquet", index=False)

  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 381/381 [00:07<00:00, 49.80it/s]


In [10]:
book_embedding_author_df.shape

(12189, 397)

In [11]:
emb_desc = encode_column_with_sentence_transformer(book_embedding_author_df,'long_description')
# Convert embeddings to DataFrame
emb_desc_df = pd.DataFrame(emb_desc, columns=[f"emb_desc_{i}" for i in range(emb_desc.shape[1])])

# Combine with book_id
long_description_df = pd.concat([book_embedding_author_df, emb_desc_df], axis=1)

  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 381/381 [00:36<00:00, 10.33it/s]


In [12]:
emb_book_series = encode_column_with_sentence_transformer(long_description_df,'long_description')
# Convert embeddings to DataFrame
emb_book_series_df = pd.DataFrame(emb_book_series, columns=[f"emb_book_series_{i}" for i in range(emb_book_series.shape[1])])

# Combine with book_id

book_series_df = pd.concat([long_description_df,emb_book_series_df ], axis=1)

  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 381/381 [00:30<00:00, 12.53it/s]


In [13]:
book_df_final['readable_page_count'] = np.clip(book_df_final['readable_page_count'],0,50)/50
book_series_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_df_final['readable_page_count'] = np.clip(book_df_final['readable_page_count'],0,50)/50


(12189, 1165)

In [14]:
# [ 'book_type','min_grade', 'max_grade','readable_page_count','fiction_nonfiction', 'reading_skill_name','theme_name', 'category_name','language_book']

columns_author_title =[f"emb_title_author_{i}" for i in range(emb.shape[1])]
columns_long_description = [f"emb_desc_{i}" for i in range(emb_desc.shape[1])]
columns_book_series = [f"emb_book_series_{i}" for i in range(emb_book_series.shape[1])]
columns_add = ['readable_page_count','book_type_binary', 'fn_Fiction', 'fn_Non-Fiction', 'fn_unk',
       'lang_English', 'lang_French', 'lang_Haitian French Creole',
       'lang_Mandarin', 'lang_Portuguese', 'lang_Spanish']

columns_learn_emb = [ 'book_isbn','grades','reading_skill_name','theme_name', 'category_name',]

final_columns = columns_author_title + columns_long_description + columns_book_series + columns_add + columns_learn_emb

len(final_columns)

1168

In [15]:
book_series_df['book_type_binary'] = np.where(book_series_df.book_type == 'PDF',1,0)
book_series_df['fiction_nonfiction'].fillna('unk',inplace =True)
book_df_final_v1 = pd.get_dummies(book_series_df, columns=['fiction_nonfiction'], prefix='fn')
book_df_final_v1 = pd.get_dummies(book_df_final_v1, columns=['language_book'], prefix='lang')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  book_series_df['fiction_nonfiction'].fillna('unk',inplace =True)


In [16]:
book_df_final_v1.columns

Index(['book_isbn', 'title_plus_author', 'book_series', 'book_type',
       'long_description', 'min_grade', 'max_grade', 'readable_page_count',
       'reading_skill_name', 'theme_name',
       ...
       'book_type_binary', 'fn_Fiction', 'fn_Non-Fiction', 'fn_unk',
       'lang_English', 'lang_French', 'lang_Haitian French Creole',
       'lang_Mandarin', 'lang_Portuguese', 'lang_Spanish'],
      dtype='object', length=1173)

In [17]:
book_df_final_v1[[ 'min_grade', 'max_grade', 'reading_skill_name','theme_name', 'category_name',]].head()

Unnamed: 0,min_grade,max_grade,reading_skill_name,theme_name,category_name
0,3,5,"Illustrations or other Visual Elements, Fact a...","History of America, Family & Friends, Social T...","People & Places, Growing Up"
1,3,5,"Point of View, Making Inferences, Main Idea & ...","Healthy Habits, Family & Friends, Occupations",Growing Up
2,3,5,"Making Inferences, Illustrations or other Visu...","Places of Interest, The Natural World, History","People & Places, Science & Nature"
3,3,5,"Making Inferences, Illustrations or other Visu...","Occupations, Technology","Growing Up, Science & Nature"
4,3,5,"Making Inferences, Main Idea & Key Details, Il...","Sports & Games, Fitness, Healthy Habits",Growing Up


In [18]:
grade_list = ['pk', 'k', '1', '2', '3', '4', '5', '6', '7', '8']
grade_to_idx = {g: i for i, g in enumerate(grade_list)}
def get_range(min_g, max_g):
    start_idx = grade_to_idx[min_g]
    end_idx = grade_to_idx[max_g]
    return ','.join(grade_list[start_idx:end_idx + 1])
x = get_range(min_g = 'pk', max_g ='3')


In [19]:
book_df_final_v1["grades"] = book_df_final_v1.apply(
    lambda row: get_range(row["min_grade"], row["max_grade"]),
    axis=1
)

In [20]:
book_df_final_v1.head()

Unnamed: 0,book_isbn,title_plus_author,book_series,book_type,long_description,min_grade,max_grade,readable_page_count,reading_skill_name,theme_name,...,fn_Fiction,fn_Non-Fiction,fn_unk,lang_English,lang_French,lang_Haitian French Creole,lang_Mandarin,lang_Portuguese,lang_Spanish,grades
0,9781427162748,"a slave family by bobbie,kalman",Colonial People,PDF,a slave family helps bring to life the many st...,3,5,32,"Illustrations or other Visual Elements, Fact a...","History of America, Family & Friends, Social T...",...,False,True,False,True,False,False,False,False,False,345
1,9780778798057,"apache helicopter pilots by antony,loveless",The World's Most Dangerous Jobs,PDF,these army pilots fly the most demanding helic...,3,5,31,"Point of View, Making Inferences, Main Idea & ...","Healthy Habits, Family & Friends, Occupations",...,False,True,False,True,False,False,False,False,False,345
2,9781427165275,"china - the land (revised, ed. 3) by bobbie,ka...","Lands, Peoples, and Cultures",PDF,this revised edition takes a new look at this ...,3,5,31,"Making Inferences, Illustrations or other Visu...","Places of Interest, The Natural World, History",...,False,True,False,True,False,False,False,False,False,345
3,9781427120335,"dream jobs in transportation, distribution and...",Cutting-Edge Careers in Technical Education,PDF,qualified workers are required every day in th...,3,5,30,"Making Inferences, Illustrations or other Visu...","Occupations, Technology",...,False,True,False,True,False,False,False,False,False,345
4,9781427197719,"fishing in action by hadley,dyer",Sports in Action,PDF,fishing in action provides kids with easy-to-f...,3,5,32,"Making Inferences, Main Idea & Key Details, Il...","Sports & Games, Fitness, Healthy Habits",...,False,True,False,True,False,False,False,False,False,345


In [21]:
book_df_final_v1[[ 'grades','reading_skill_name','theme_name', 'category_name',]].head()

Unnamed: 0,grades,reading_skill_name,theme_name,category_name
0,345,"Illustrations or other Visual Elements, Fact a...","History of America, Family & Friends, Social T...","People & Places, Growing Up"
1,345,"Point of View, Making Inferences, Main Idea & ...","Healthy Habits, Family & Friends, Occupations",Growing Up
2,345,"Making Inferences, Illustrations or other Visu...","Places of Interest, The Natural World, History","People & Places, Science & Nature"
3,345,"Making Inferences, Illustrations or other Visu...","Occupations, Technology","Growing Up, Science & Nature"
4,345,"Making Inferences, Main Idea & Key Details, Il...","Sports & Games, Fitness, Healthy Habits",Growing Up


In [22]:
book_df_data =  book_df_final_v1[final_columns]
print(book_df_data.shape)
book_df_data.columns

(12189, 1168)


Index(['emb_title_author_0', 'emb_title_author_1', 'emb_title_author_2',
       'emb_title_author_3', 'emb_title_author_4', 'emb_title_author_5',
       'emb_title_author_6', 'emb_title_author_7', 'emb_title_author_8',
       'emb_title_author_9',
       ...
       'lang_French', 'lang_Haitian French Creole', 'lang_Mandarin',
       'lang_Portuguese', 'lang_Spanish', 'book_isbn', 'grades',
       'reading_skill_name', 'theme_name', 'category_name'],
      dtype='object', length=1168)

In [23]:
book_df_data.to_csv('final_book_level_data.csv', index=False)

In [24]:
books = pd.read_csv('final_book_level_data.csv')
books.columns

  books = pd.read_csv('final_book_level_data.csv')


Index(['emb_title_author_0', 'emb_title_author_1', 'emb_title_author_2',
       'emb_title_author_3', 'emb_title_author_4', 'emb_title_author_5',
       'emb_title_author_6', 'emb_title_author_7', 'emb_title_author_8',
       'emb_title_author_9',
       ...
       'lang_French', 'lang_Haitian French Creole', 'lang_Mandarin',
       'lang_Portuguese', 'lang_Spanish', 'book_isbn', 'grades',
       'reading_skill_name', 'theme_name', 'category_name'],
      dtype='object', length=1168)

# USER DATA PREPARATION

In [152]:
user_query = 'sql_files/user_query.sql'
user_df = connection.redshift_query_fetching_as_df(user_query)

2025-08-08 17:25:03,990 - side - DEBUG - Attempting to read SQL file: sql_files/user_query.sql
2025-08-08 17:25:03,992 - side - INFO - Successfully read SQL file: sql_files/user_query.sql
2025-08-08 17:25:07,379 - read_shift - INFO - Connected to Redshift successfully.
  df = pd.read_sql_query(query, conn)
2025-08-08 17:30:25,871 - read_shift - INFO - Query executed successfully, retrieved 4953951 rows.
2025-08-08 17:30:25,875 - read_shift - INFO - Connection closed.


In [153]:
location_query = 'sql_files/user_location.sql'
user_loc = connection.redshift_query_fetching_as_df(location_query)

2025-08-08 17:30:26,377 - side - DEBUG - Attempting to read SQL file: sql_files/user_location.sql
2025-08-08 17:30:26,378 - side - INFO - Successfully read SQL file: sql_files/user_location.sql
2025-08-08 17:30:29,369 - read_shift - INFO - Connected to Redshift successfully.
2025-08-08 17:32:39,407 - read_shift - INFO - Query executed successfully, retrieved 665834 rows.
2025-08-08 17:32:39,409 - read_shift - INFO - Connection closed.


In [154]:
platfrom_query = 'sql_files/user_book_platform.sql'
user_platform = connection.redshift_query_fetching_as_df(platfrom_query)

2025-08-08 17:32:40,815 - side - DEBUG - Attempting to read SQL file: sql_files/user_book_platform.sql
2025-08-08 17:32:40,817 - side - INFO - Successfully read SQL file: sql_files/user_book_platform.sql
2025-08-08 17:32:43,913 - read_shift - INFO - Connected to Redshift successfully.
2025-08-08 17:45:09,288 - read_shift - INFO - Query executed successfully, retrieved 4953951 rows.
2025-08-08 17:45:09,292 - read_shift - INFO - Connection closed.


In [155]:
user_loc.fillna('unk',inplace=True)
# user_df.fillna('unk',inplace=True)

  user_loc.fillna('unk',inplace=True)


In [156]:
user_df.to_csv('user_interaction_data.csv')
user_loc.to_csv('user_location_data.csv')
user_platform.to_csv('user_platform_data.csv')

In [14]:
import pandas as pd


In [15]:
user_df= pd.read_csv('user_interaction_data.csv')
user_loc =pd.read_csv('user_location_data.csv')
user_platform = pd.read_csv('user_platform_data.csv')

In [16]:
user_platform.rename(columns ={'isbn':'book_code'},inplace=True)
user_platform.head()

Unnamed: 0.1,Unnamed: 0,user_id,book_code,book_create_dt,cumulative_web_during_school_hour,cumulative_web_after_school_hour,cumulative_apple_during_school_hour,cumulative_apple_after_school_hour,cumulative_android_during_school_hour,cumulative_android_after_school_hour,cumulative_unk_during_school_hour,cumulative_unk_after_school_hour
0,0,0000761b-73e3-4154-8870-ff18bf21f82a,9781427125576,2025-06-04,1,0,0,0,0,0,0,0
1,1,0000761b-73e3-4154-8870-ff18bf21f82a,9781641287319,2025-06-04,2,0,0,0,0,0,0,0
2,2,0000761b-73e3-4154-8870-ff18bf21f82a,9781648340918,2025-06-04,3,0,0,0,0,0,0,0
3,3,0000d775-cdd2-4f5a-b801-a299fa9311ce,9781625815552,2025-05-06,1,0,0,0,0,0,0,0
4,4,000148d3-4afd-41e5-8cba-6e91b97a4211,9781623957964,2024-11-08,2,0,0,0,0,0,0,0


In [17]:
user_platform.columns

Index(['Unnamed: 0', 'user_id', 'book_code', 'book_create_dt',
       'cumulative_web_during_school_hour', 'cumulative_web_after_school_hour',
       'cumulative_apple_during_school_hour',
       'cumulative_apple_after_school_hour',
       'cumulative_android_during_school_hour',
       'cumulative_android_after_school_hour',
       'cumulative_unk_during_school_hour',
       'cumulative_unk_after_school_hour'],
      dtype='object')

In [18]:


df1 = user_platform.loc[:, ~user_platform.columns.duplicated()]
df1.columns

Index(['Unnamed: 0', 'user_id', 'book_code', 'book_create_dt',
       'cumulative_web_during_school_hour', 'cumulative_web_after_school_hour',
       'cumulative_apple_during_school_hour',
       'cumulative_apple_after_school_hour',
       'cumulative_android_during_school_hour',
       'cumulative_android_after_school_hour',
       'cumulative_unk_during_school_hour',
       'cumulative_unk_after_school_hour'],
      dtype='object')

In [19]:
user_platform = df1[['user_id', 'book_code', 'book_create_dt',
       'cumulative_web_during_school_hour',
       'cumulative_web_after_school_hour',
       'cumulative_apple_during_school_hour',
       'cumulative_apple_after_school_hour',
       'cumulative_android_during_school_hour', 'cumulative_android_after_school_hour',
       'cumulative_unk_during_school_hour',
       'cumulative_unk_after_school_hour']]

In [20]:
user_platform.columns

Index(['user_id', 'book_code', 'book_create_dt',
       'cumulative_web_during_school_hour', 'cumulative_web_after_school_hour',
       'cumulative_apple_during_school_hour',
       'cumulative_apple_after_school_hour',
       'cumulative_android_during_school_hour',
       'cumulative_android_after_school_hour',
       'cumulative_unk_during_school_hour',
       'cumulative_unk_after_school_hour'],
      dtype='object')

In [21]:
user_platform.head()

Unnamed: 0,user_id,book_code,book_create_dt,cumulative_web_during_school_hour,cumulative_web_after_school_hour,cumulative_apple_during_school_hour,cumulative_apple_after_school_hour,cumulative_android_during_school_hour,cumulative_android_after_school_hour,cumulative_unk_during_school_hour,cumulative_unk_after_school_hour
0,0000761b-73e3-4154-8870-ff18bf21f82a,9781427125576,2025-06-04,1,0,0,0,0,0,0,0
1,0000761b-73e3-4154-8870-ff18bf21f82a,9781641287319,2025-06-04,2,0,0,0,0,0,0,0
2,0000761b-73e3-4154-8870-ff18bf21f82a,9781648340918,2025-06-04,3,0,0,0,0,0,0,0
3,0000d775-cdd2-4f5a-b801-a299fa9311ce,9781625815552,2025-05-06,1,0,0,0,0,0,0,0
4,000148d3-4afd-41e5-8cba-6e91b97a4211,9781623957964,2024-11-08,2,0,0,0,0,0,0,0


In [22]:
user_platform['total'] = user_platform['cumulative_web_during_school_hour'] + user_platform['cumulative_web_after_school_hour'] + user_platform['cumulative_apple_during_school_hour']+ user_platform['cumulative_apple_after_school_hour']+ user_platform['cumulative_android_during_school_hour']+ user_platform['cumulative_android_after_school_hour']+ user_platform['cumulative_unk_during_school_hour']+ user_platform['cumulative_unk_after_school_hour']

user_platform['cumulative_web_during_school_hour'] = user_platform['cumulative_web_during_school_hour']/user_platform['total']
user_platform['cumulative_web_after_school_hour']  = user_platform['cumulative_web_after_school_hour'] /user_platform['total']
user_platform['cumulative_apple_during_school_hour'] = user_platform['cumulative_apple_during_school_hour']/user_platform['total']
user_platform['cumulative_apple_after_school_hour'] = user_platform['cumulative_apple_after_school_hour']/user_platform['total']
user_platform['cumulative_android_during_school_hour'] = user_platform['cumulative_android_during_school_hour']/user_platform['total']
user_platform['cumulative_android_after_school_hour'] = user_platform['cumulative_android_after_school_hour']/user_platform['total']
user_platform['cumulative_unk_during_school_hour'] = user_platform['cumulative_unk_during_school_hour']/user_platform['total']
user_platform['cumulative_unk_after_school_hour']  = user_platform['cumulative_unk_after_school_hour']/user_platform['total']

In [23]:
user_platform_final.columns

NameError: name 'user_platform_final' is not defined

In [24]:
user_platform_final = user_platform[['user_id', 'book_code',
       'cumulative_web_during_school_hour', 'cumulative_web_after_school_hour',
       'cumulative_apple_during_school_hour',
       'cumulative_apple_after_school_hour',
       'cumulative_android_during_school_hour', 'cumulative_android_after_school_hour',
       'cumulative_unk_during_school_hour', 'cumulative_unk_after_school_hour',
       ]]

In [25]:
user_df.columns

Index(['Unnamed: 0', 'book_code', 'user_id', 'book_create_dt', 'total_pages',
       'max_read_pages', 'latest_to_old_rank', 'theme_name', 'category_name',
       'reading_skill_name', 'language_book', 'book_series'],
      dtype='object')

In [53]:
user_df.dropna(subset=['category_name'],inplace=True)
# user_df[['category_name']].fillna('unk',inplace = True)
user_df['category_name'] = user_df['category_name'].fillna('unk')
user_df['category_name'] = user_df['category_name'].fillna('unk')
user_df['reading_skill_name'] = user_df['reading_skill_name'].fillna('unk')

user_df['total_pages']=user_df['total_pages'].fillna(user_df['total_pages'].median())
user_df['max_read_pages']=user_df['max_read_pages'].fillna(user_df['max_read_pages'].median())


In [123]:
user_df_v1 = user_df[['book_code', 'user_id','category_name','theme_name','reading_skill_name', 'book_create_dt', 'total_pages',
       'max_read_pages']].copy()

user_df_v1['book_create_dt'] = pd.to_datetime(user_df_v1['book_create_dt'])

user_df_v1.head()



Unnamed: 0,book_code,user_id,category_name,theme_name,reading_skill_name,book_create_dt,total_pages,max_read_pages
0,9781645805953,0000a39c-1e05-4a7f-827b-03c7d1d8b373,"Fantasy & Adventure, Growing Up","Adventure Stories, Crafts & Hobbies",unk,2025-05-02 18:52:56.792455,18.0,2.0
1,9781634401821,0000a39c-1e05-4a7f-827b-03c7d1d8b373,Growing Up,Social Topics,unk,2025-04-28 18:57:12.626122,24.0,13.0
2,9781645806226,0000a39c-1e05-4a7f-827b-03c7d1d8b373,Growing Up,Family & Friends,unk,2024-08-30 18:50:06.521011,12.0,12.0
3,9781039624191,0000a39c-1e05-4a7f-827b-03c7d1d8b373,Early Learning,"Alphabet, Early Concepts",unk,2024-08-30 18:46:38.124909,13.0,13.0
4,9781634401623,0000c0ac-8d7d-4cbb-99bf-8cbcd313476e,Science & Nature,"The Natural World, Fun Science",unk,2024-11-23 21:42:40.253250,32.0,21.0


In [28]:
user_loc.columns

Index(['Unnamed: 0', 'user_id', 'teacher_creation_source', 'country', 'state',
       'zipcode', 'klass_grade_name', 'klass_id', 'classroom_type',
       'teacher_id', 'teacher_create_dt', 'teacher_create_school_year',
       'school_id', 'ac3', 'ac2', 'ac', 'ac1', 'ac0',
       'class_activation_bucket'],
      dtype='object')

In [124]:
user_loc_v1 = user_loc[['user_id','country', 'state', 'zipcode','klass_grade_name','teacher_id','school_id','class_activation_bucket']].copy()
user_loc_v1 .head()

Unnamed: 0,user_id,country,state,zipcode,klass_grade_name,teacher_id,school_id,class_activation_bucket
0,91842674-eb4b-434a-9959-b0dd48650503,US,NC,28625,grade 2,796B55E786A54166A26FDA9864997753,A637C5A49D554119956690B2290535EA,AC2
1,9c3aeff0-3743-4f46-a7f9-064c70ae8fd2,US,NC,28625,grade 2,796B55E786A54166A26FDA9864997753,A637C5A49D554119956690B2290535EA,AC2
2,926db636-c1ff-462a-9549-ea876a9dbe81,US,NC,28625,grade 2,796B55E786A54166A26FDA9864997753,A637C5A49D554119956690B2290535EA,AC2
3,e89632be-68bb-4c32-af3d-15db0cd02009,US,NC,28625,grade 2,796B55E786A54166A26FDA9864997753,A637C5A49D554119956690B2290535EA,AC2
4,ca139747-e72e-430f-875d-b2365b122871,US,NC,28625,grade 2,796B55E786A54166A26FDA9864997753,A637C5A49D554119956690B2290535EA,AC2


In [125]:
user_loc_v1.shape

(665834, 8)

In [126]:
user_loc_v1.user_id.nunique()

665834

In [127]:
user_loc[user_loc.user_id == '4c73f748-26a7-4c2c-96a1-b0f20df590ac'] 

Unnamed: 0.1,Unnamed: 0,user_id,teacher_creation_source,country,state,zipcode,klass_grade_name,klass_id,classroom_type,teacher_id,teacher_create_dt,teacher_create_school_year,school_id,ac3,ac2,ac,ac1,ac0,class_activation_bucket
553460,553460,4c73f748-26a7-4c2c-96a1-b0f20df590ac,clever_btn,US,GA,30141,grade 4,FBC916BB88314A76AB4F0ED74C9565ED,clever,F83AF8668EE042E48EBEE3DABF04BBA3,2020-08-06 18:18:08,2021.0,1BFBB77D2C3B478E969AC62A7428D902,2024-10-14,2024-08-21 00:00:00,2024-08-21 16:34:03.279000,2024-08-21 16:34:03.279000,2024-08-21 16:34:03.279000,AC3


In [128]:
user_loc_v1.user_id.value_counts()

user_id
91842674-eb4b-434a-9959-b0dd48650503    1
8a557e29-6cc8-4361-99d7-68012e270a2d    1
812aba04-6ea7-42e6-b859-a2bd8af58e3b    1
6ae7b4c8-44fc-43c8-9c26-98609c47573a    1
0b4b59eb-ccdb-41c3-8cfd-655ff07ef6a8    1
                                       ..
22f97c4a-4e9e-407e-99f1-b49e1c23ce5f    1
9cad7452-041f-455f-8cea-75da558a2bf6    1
46a553b1-e2d0-457d-bc37-9c8103cbad6d    1
82838b59-3997-4885-80b1-957babe7067d    1
203f54b0-8334-493b-b0d9-d5665fc1a282    1
Name: count, Length: 665834, dtype: int64

In [129]:
user_df.shape

(4933837, 12)

In [130]:
user_platform.head()

Unnamed: 0,user_id,book_code,book_create_dt,cumulative_web_during_school_hour,cumulative_web_after_school_hour,cumulative_apple_during_school_hour,cumulative_apple_after_school_hour,cumulative_android_during_school_hour,cumulative_android_after_school_hour,cumulative_unk_during_school_hour,cumulative_unk_after_school_hour,total
0,0000761b-73e3-4154-8870-ff18bf21f82a,9781427125576,2025-06-04,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0000761b-73e3-4154-8870-ff18bf21f82a,9781641287319,2025-06-04,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,0000761b-73e3-4154-8870-ff18bf21f82a,9781648340918,2025-06-04,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
3,0000d775-cdd2-4f5a-b801-a299fa9311ce,9781625815552,2025-05-06,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,000148d3-4afd-41e5-8cba-6e91b97a4211,9781623957964,2024-11-08,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [131]:
user_platform.shape

(4953951, 12)

In [132]:
user_df_v1.head()

Unnamed: 0,book_code,user_id,category_name,theme_name,reading_skill_name,book_create_dt,total_pages,max_read_pages
0,9781645805953,0000a39c-1e05-4a7f-827b-03c7d1d8b373,"Fantasy & Adventure, Growing Up","Adventure Stories, Crafts & Hobbies",unk,2025-05-02 18:52:56.792455,18.0,2.0
1,9781634401821,0000a39c-1e05-4a7f-827b-03c7d1d8b373,Growing Up,Social Topics,unk,2025-04-28 18:57:12.626122,24.0,13.0
2,9781645806226,0000a39c-1e05-4a7f-827b-03c7d1d8b373,Growing Up,Family & Friends,unk,2024-08-30 18:50:06.521011,12.0,12.0
3,9781039624191,0000a39c-1e05-4a7f-827b-03c7d1d8b373,Early Learning,"Alphabet, Early Concepts",unk,2024-08-30 18:46:38.124909,13.0,13.0
4,9781634401623,0000c0ac-8d7d-4cbb-99bf-8cbcd313476e,Science & Nature,"The Natural World, Fun Science",unk,2024-11-23 21:42:40.253250,32.0,21.0


In [133]:
user_raw_df =  user_df_v1.merge(user_loc_v1, how ='left' ,on = 'user_id')
user_raw_df.head()

Unnamed: 0,book_code,user_id,category_name,theme_name,reading_skill_name,book_create_dt,total_pages,max_read_pages,country,state,zipcode,klass_grade_name,teacher_id,school_id,class_activation_bucket
0,9781645805953,0000a39c-1e05-4a7f-827b-03c7d1d8b373,"Fantasy & Adventure, Growing Up","Adventure Stories, Crafts & Hobbies",unk,2025-05-02 18:52:56.792455,18.0,2.0,US,TX,76015,kindergarten,4F34D86AE8C14F52A2FAB2B75C1FED1D,7E27495739944A9281AFA65446A708C9,AC3
1,9781634401821,0000a39c-1e05-4a7f-827b-03c7d1d8b373,Growing Up,Social Topics,unk,2025-04-28 18:57:12.626122,24.0,13.0,US,TX,76015,kindergarten,4F34D86AE8C14F52A2FAB2B75C1FED1D,7E27495739944A9281AFA65446A708C9,AC3
2,9781645806226,0000a39c-1e05-4a7f-827b-03c7d1d8b373,Growing Up,Family & Friends,unk,2024-08-30 18:50:06.521011,12.0,12.0,US,TX,76015,kindergarten,4F34D86AE8C14F52A2FAB2B75C1FED1D,7E27495739944A9281AFA65446A708C9,AC3
3,9781039624191,0000a39c-1e05-4a7f-827b-03c7d1d8b373,Early Learning,"Alphabet, Early Concepts",unk,2024-08-30 18:46:38.124909,13.0,13.0,US,TX,76015,kindergarten,4F34D86AE8C14F52A2FAB2B75C1FED1D,7E27495739944A9281AFA65446A708C9,AC3
4,9781634401623,0000c0ac-8d7d-4cbb-99bf-8cbcd313476e,Science & Nature,"The Natural World, Fun Science",unk,2024-11-23 21:42:40.253250,32.0,21.0,US,IA,51503,grade 2,938D4290473F47FE94BEA2D9CBCE8988,7B7A12E7CDB1481983C3501B88F9AAEA,AC2


In [134]:
user_platform_final['book_code'] = user_platform_final['book_code'].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_platform_final['book_code'] = user_platform_final['book_code'].astype('str')


In [135]:
user_raw_df_v1 = user_raw_df.merge(user_platform_final, how ='left' ,on = ['user_id','book_code'])

In [136]:
user_raw_df_v1.head()


Unnamed: 0,book_code,user_id,category_name,theme_name,reading_skill_name,book_create_dt,total_pages,max_read_pages,country,state,...,school_id,class_activation_bucket,cumulative_web_during_school_hour,cumulative_web_after_school_hour,cumulative_apple_during_school_hour,cumulative_apple_after_school_hour,cumulative_android_during_school_hour,cumulative_android_after_school_hour,cumulative_unk_during_school_hour,cumulative_unk_after_school_hour
0,9781645805953,0000a39c-1e05-4a7f-827b-03c7d1d8b373,"Fantasy & Adventure, Growing Up","Adventure Stories, Crafts & Hobbies",unk,2025-05-02 18:52:56.792455,18.0,2.0,US,TX,...,7E27495739944A9281AFA65446A708C9,AC3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9781634401821,0000a39c-1e05-4a7f-827b-03c7d1d8b373,Growing Up,Social Topics,unk,2025-04-28 18:57:12.626122,24.0,13.0,US,TX,...,7E27495739944A9281AFA65446A708C9,AC3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9781645806226,0000a39c-1e05-4a7f-827b-03c7d1d8b373,Growing Up,Family & Friends,unk,2024-08-30 18:50:06.521011,12.0,12.0,US,TX,...,7E27495739944A9281AFA65446A708C9,AC3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9781039624191,0000a39c-1e05-4a7f-827b-03c7d1d8b373,Early Learning,"Alphabet, Early Concepts",unk,2024-08-30 18:46:38.124909,13.0,13.0,US,TX,...,7E27495739944A9281AFA65446A708C9,AC3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9781634401623,0000c0ac-8d7d-4cbb-99bf-8cbcd313476e,Science & Nature,"The Natural World, Fun Science",unk,2024-11-23 21:42:40.253250,32.0,21.0,US,IA,...,7B7A12E7CDB1481983C3501B88F9AAEA,AC2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [137]:
user_raw_df_v1.columns

Index(['book_code', 'user_id', 'category_name', 'theme_name',
       'reading_skill_name', 'book_create_dt', 'total_pages', 'max_read_pages',
       'country', 'state', 'zipcode', 'klass_grade_name', 'teacher_id',
       'school_id', 'class_activation_bucket',
       'cumulative_web_during_school_hour', 'cumulative_web_after_school_hour',
       'cumulative_apple_during_school_hour',
       'cumulative_apple_after_school_hour',
       'cumulative_android_during_school_hour',
       'cumulative_android_after_school_hour',
       'cumulative_unk_during_school_hour',
       'cumulative_unk_after_school_hour'],
      dtype='object')

In [122]:
# import pandas as pd

# # Example DataFrame
# df = user_raw_df_v1[['book_code', 'user_id', 'book_create_dt']].copy()


# # Sort by user & time
# df = df.sort_values(['user_id', 'book_create_dt'])

# # Create lag columns for last 3 books
# df['prev1'] = df.groupby('user_id')['book_code'].shift(1)
# df['prev2'] = df.groupby('user_id')['book_code'].shift(2)
# df['prev3'] = df.groupby('user_id')['book_code'].shift(3)

# # Fill NaN with 'unk'
# for i in range(1, 4):
#     df[f'prev{i}'] = df.groupby('user_id')['book_code'].shift(i).astype('string')

# # Fill NaN with 'unk'
# df[['prev1','prev2','prev3']] = df[['prev1','prev2','prev3']].fillna('unk')

# # Combine into comma-separated string, skipping 'unk'
# df['last_books'] = (
#     df[['prev3','prev2','prev1']]
#     .apply(lambda row: ','.join([b for b in row if b != 'unk']), axis=1)
#     .replace('', 'unk')
# )

# # Combine into comma-separated string
# df['last_books'] = (
#     df[['prev3','prev2','prev1']]
#     .apply(lambda x: ','.join([b for b in x if b != 'unk']), axis=1)
#     .replace('', 'unk')  # If all were unk
# )

# # Format timestamp
# # df['timestamp'] = df['timestamp'].dt.strftime('%d %b %Y')

# # Keep only desired columns
# df = df[['user_id','book_code','book_create_dt','last_books']]

# print(df)


In [None]:
import pandas as pd

def last_10_books_fast(df):
    df = df.copy()
    df['book_create_dt'] = pd.to_datetime(df['book_create_dt'])
    df = df.sort_values(['user_id', 'book_create_dt']).reset_index(drop=True)

    # Helper to join last 10 values for each row in a group
    def last_10_join(series):
        out = []
        hist = []
        for val in series:
            out.append(','.join(hist[-10:]) if hist else 'unk')
            hist.append(val)
        return pd.Series(out, index=series.index)

    # Precompute category/theme strings
    # df['cat_str'] = df['category_name'].apply(lambda x: ','.join(x))
    # print(df['cat_str'])
    # df['theme_str'] = df['theme_name'].apply(lambda x: ','.join(x))
    df['cat_str'] = df['category_name']
    # print(df['cat_str'])
    df['theme_str'] = df['theme_name']
    df['rs_str'] = df['reading_skill_name']


    # Vectorized per-group computation (one Python loop per group, not per row globally)
    grouped = df.groupby('user_id', group_keys=False)
    df['last_books'] = grouped['book_code'].transform(last_10_join)
    df['last_category_name'] = grouped['cat_str'].transform(last_10_join)
    df['last_theme_name'] = grouped['theme_str'].transform(last_10_join)
    df['last_reading_skill_name'] = grouped['rs_str'].transform(last_10_join)

    return df.drop(columns=['cat_str', 'theme_str','rs_str'])


# Example



In [139]:

import time 

x = time.time()
cv = user_raw_df_v1 .copy()

user_raw_df_v2 = last_10_books_fast(cv)

y= time.time()

print(y-x)
user_raw_df_v2.head()

121.91849184036255


Unnamed: 0,book_code,user_id,category_name,theme_name,reading_skill_name,book_create_dt,total_pages,max_read_pages,country,state,...,cumulative_apple_during_school_hour,cumulative_apple_after_school_hour,cumulative_android_during_school_hour,cumulative_android_after_school_hour,cumulative_unk_during_school_hour,cumulative_unk_after_school_hour,last_books,last_category_name,last_theme_name,last_reading_skill_name
0,9781634401647,00000ac6-1a89-415c-8d67-177e17aa1aae,Science & Nature,"The Natural World, Fun Science",unk,2024-09-23 20:25:42.214126,32.0,0.0,US,CA,...,0.0,0.0,0.0,0.0,0.0,0.0,unk,unk,unk,unk
1,9781039673106,00000ac6-1a89-415c-8d67-177e17aa1aae,Early Learning,"Alphabet, Language Arts","Illustrations or other Visual Elements, Decodi...",2024-09-24 20:01:06.548406,9.0,9.0,US,CA,...,0.0,0.0,0.0,0.0,0.0,0.0,9781634401647,Science & Nature,"The Natural World, Fun Science",unk
2,9781039837843,00000ac6-1a89-415c-8d67-177e17aa1aae,Growing Up,"Occupations, Social Studies","Illustrations or other Visual Elements, Decodi...",2024-09-24 20:02:48.414486,10.0,10.0,US,CA,...,0.0,0.0,0.0,0.0,0.0,0.0,97816344016479781039673106,"Science & Nature,Early Learning","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De..."
3,9781634409636,00000ac6-1a89-415c-8d67-177e17aa1aae,People & Places,Biography & Memoir,"Problem and Solution Relationship, Illustratio...",2024-09-24 20:04:49.676354,23.0,0.0,US,CA,...,0.0,0.0,0.0,0.0,0.0,0.0,978163440164797810396731069781039837843,"Science & Nature,Early Learning,Growing Up","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De..."
4,9781634409759,00000ac6-1a89-415c-8d67-177e17aa1aae,People & Places,Biography & Memoir,"Making Inferences, Illustrations or other Visu...",2024-09-24 20:05:17.458130,23.0,23.0,US,CA,...,0.0,0.0,0.0,0.0,0.0,0.0,"9781634401647,9781039673106,9781039837843,9781...","Science & Nature,Early Learning,Growing Up,Peo...","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De..."


In [None]:
# user_raw_df_v2 = user_raw_df_v1.merge(df, how ='left' ,on = ['user_id','book_code','book_create_dt'])

In [31]:
user_raw_df_v2.shape

(4953951, 22)

In [32]:
user_raw_df_v2.head(20)

Unnamed: 0,book_code,user_id,category_name,book_create_dt,total_pages,max_read_pages,country,state,zipcode,klass_grade_name,...,class_activation_bucket,cumulative_web_during_school_hour,cumulative_web_after_school_hour,cumulative_apple_during_school_hour,cumulative_apple_after_school_hour,cumulative_android_during_school_hour,cumulative_android_after_school_hour,cumulative_unk_during_school_hour,cumulative_unk_after_school_hour,last_books
0,9781645805953,0000a39c-1e05-4a7f-827b-03c7d1d8b373,"Fantasy & Adventure, Growing Up",2025-05-02 18:52:56.792455,18.0,2.0,US,TX,76015,kindergarten,...,AC3,4,0,0,0,0,0,0,0,978103962419197816458062269781634401821
1,9781634401821,0000a39c-1e05-4a7f-827b-03c7d1d8b373,Growing Up,2025-04-28 18:57:12.626122,24.0,13.0,US,TX,76015,kindergarten,...,AC3,3,0,0,0,0,0,0,0,97810396241919781645806226
2,9781645806226,0000a39c-1e05-4a7f-827b-03c7d1d8b373,Growing Up,2024-08-30 18:50:06.521011,12.0,12.0,US,TX,76015,kindergarten,...,AC3,2,0,0,0,0,0,0,0,9781039624191
3,9781039624191,0000a39c-1e05-4a7f-827b-03c7d1d8b373,Early Learning,2024-08-30 18:46:38.124909,13.0,13.0,US,TX,76015,kindergarten,...,AC3,1,0,0,0,0,0,0,0,unk
4,9781634401623,0000c0ac-8d7d-4cbb-99bf-8cbcd313476e,Science & Nature,2024-11-23 21:42:40.253250,32.0,21.0,US,IA,51503,grade 2,...,AC2,0,2,0,0,0,0,0,0,unk
5,9781645806851,0000d725-0891-49f3-a6f4-15f533093e48,Fairy Tales & Folklore,2025-05-07 19:18:03.938737,25.0,25.0,US,AR,71964,grade 2,...,AC3,11,0,0,0,0,0,2,0,978103965526397815324022589781427151841
6,9781427151841,0000d725-0891-49f3-a6f4-15f533093e48,"Animals, Fairy Tales & Folklore",2024-11-19 20:32:29.600148,13.0,12.0,US,AR,71964,grade 2,...,AC3,9,0,0,0,0,0,2,0,978193616394697810396552639781532402258
7,9781532402258,0000d725-0891-49f3-a6f4-15f533093e48,"Early Learning, Funny Stories",2024-11-13 19:48:11.543280,15.0,4.0,US,AR,71964,grade 2,...,AC3,8,0,0,0,0,0,1,0,978162581551497819361639469781039655263
8,9781039655263,0000d725-0891-49f3-a6f4-15f533093e48,Growing Up,2024-11-13 19:03:00.652951,29.0,12.0,US,AR,71964,grade 2,...,AC3,8,0,0,0,0,0,2,0,978142717737797816258155149781936163946
9,9781936163946,0000d725-0891-49f3-a6f4-15f533093e48,Fairy Tales & Folklore,2024-10-29 18:04:22.543274,32.0,28.0,US,AR,71964,grade 2,...,AC3,5,0,0,0,0,0,1,0,978153244267497814271773779781625815514


In [140]:
user_raw_df_v2.columns

Index(['book_code', 'user_id', 'category_name', 'theme_name',
       'reading_skill_name', 'book_create_dt', 'total_pages', 'max_read_pages',
       'country', 'state', 'zipcode', 'klass_grade_name', 'teacher_id',
       'school_id', 'class_activation_bucket',
       'cumulative_web_during_school_hour', 'cumulative_web_after_school_hour',
       'cumulative_apple_during_school_hour',
       'cumulative_apple_after_school_hour',
       'cumulative_android_during_school_hour',
       'cumulative_android_after_school_hour',
       'cumulative_unk_during_school_hour', 'cumulative_unk_after_school_hour',
       'last_books', 'last_category_name', 'last_theme_name',
       'last_reading_skill_name'],
      dtype='object')

In [141]:
user_raw_df_v2.klass_grade_name.unique()

array(['grade 1', 'grade 4', 'kindergarten', 'grade 2', 'grade 3',
       'grade 5', nan], dtype=object)

In [142]:
user_raw_df_v2 = pd.get_dummies(user_raw_df_v2, columns=['klass_grade_name'], prefix='grade')
user_raw_df_v2 = pd.get_dummies(user_raw_df_v2, columns=['class_activation_bucket'], prefix='class_activation_bucket')

user_raw_df_v2.head()

Unnamed: 0,book_code,user_id,category_name,theme_name,reading_skill_name,book_create_dt,total_pages,max_read_pages,country,state,...,grade_grade 3,grade_grade 4,grade_grade 5,grade_kindergarten,class_activation_bucket_AC,class_activation_bucket_AC0,class_activation_bucket_AC1,class_activation_bucket_AC2,class_activation_bucket_AC3,class_activation_bucket_unk
0,9781634401647,00000ac6-1a89-415c-8d67-177e17aa1aae,Science & Nature,"The Natural World, Fun Science",unk,2024-09-23 20:25:42.214126,32.0,0.0,US,CA,...,False,False,False,False,False,False,False,False,True,False
1,9781039673106,00000ac6-1a89-415c-8d67-177e17aa1aae,Early Learning,"Alphabet, Language Arts","Illustrations or other Visual Elements, Decodi...",2024-09-24 20:01:06.548406,9.0,9.0,US,CA,...,False,False,False,False,False,False,False,False,True,False
2,9781039837843,00000ac6-1a89-415c-8d67-177e17aa1aae,Growing Up,"Occupations, Social Studies","Illustrations or other Visual Elements, Decodi...",2024-09-24 20:02:48.414486,10.0,10.0,US,CA,...,False,False,False,False,False,False,False,False,True,False
3,9781634409636,00000ac6-1a89-415c-8d67-177e17aa1aae,People & Places,Biography & Memoir,"Problem and Solution Relationship, Illustratio...",2024-09-24 20:04:49.676354,23.0,0.0,US,CA,...,False,False,False,False,False,False,False,False,True,False
4,9781634409759,00000ac6-1a89-415c-8d67-177e17aa1aae,People & Places,Biography & Memoir,"Making Inferences, Illustrations or other Visu...",2024-09-24 20:05:17.458130,23.0,23.0,US,CA,...,False,False,False,False,False,False,False,False,True,False


In [138]:
# import pandas as pd

# # Example input
# df = user_raw_df_v2[['book_code', 'user_id', 'category_name', 'book_create_dt']].copy()

# # Convert to datetime before sorting
# df['book_create_dt'] = pd.to_datetime(df['book_create_dt'])

# # Ensure category_name is a list
# df['category_name'] = df['category_name'].apply(lambda x: x if isinstance(x, list) else x.split(','))

# # Unique values
# all_categories = sorted(set(cat for sublist in df['category_name'] for cat in sublist))
# # all_book_types = sorted(df['book_type'].unique())
# # all_languages = sorted(df['language'].unique())

# # Sort
# df = df.sort_values(by=['user_id', 'book_create_dt']).reset_index(drop=True)

# # Output list
# output = []

# # Process per user
# for user_id, user_df in df.groupby('user_id', sort=False):
#     last_read_cat = {cat: None for cat in all_categories}
#     read_count_cat = {cat: 0 for cat in all_categories}
    
#     # last_read_type = {btype: None for btype in all_book_types}
#     # read_count_type = {btype: 0 for btype in all_book_types}

#     # last_read_lang = {lang: None for lang in all_languages}
#     # read_count_lang = {lang: 0 for lang in all_languages}

#     total_books_read = 0  # Track number of books read so far by user

#     for _, row in user_df.iterrows():
#         book_dt = row['book_create_dt']
#         # book_type = row['book_type']
#         # language = row['language']
#         current_categories = row['category_name']

#         row_out = {
#             'user_id': user_id,
#             'book_code': row['book_code'],
#             'book_create_dt': book_dt,
#         }

#         # Categories
#         for cat in all_categories:
#             count = read_count_cat[cat]
#             row_out[f'{cat}_count'] = count
#             row_out[f'{cat}_days'] = 1/(1+(book_dt - last_read_cat[cat]).days) if last_read_cat[cat] else 0
#             row_out[f'{cat}_pct'] = (count / total_books_read) if total_books_read > 0 else 0

#         # Book types
#         # for btype in all_book_types:
#         #     count = read_count_type[btype]
#         #     row_out[f'{btype}_count'] = count
#         #     row_out[f'{btype}_days'] = (book_dt - last_read_type[btype]).days if last_read_type[btype] else 0
#         #     row_out[f'{btype}_pct'] = (count / total_books_read) if total_books_read > 0 else 0

#         # Languages
#         # for lang in all_languages:
#         #     count = read_count_lang[lang]
#         #     row_out[f'{lang}_count'] = count
#         #     row_out[f'{lang}_days'] = (book_dt - last_read_lang[lang]).days if last_read_lang[lang] else 0
#         #     row_out[f'{lang}_pct'] = (count / total_books_read) if total_books_read > 0 else 0

#         # Update state
#         for cat in current_categories:
#             read_count_cat[cat] += 1
#             last_read_cat[cat] = book_dt

#         # read_count_type[book_type] += 1
#         # last_read_type[book_type] = book_dt

#         # read_count_lang[language] += 1
#         # last_read_lang[language] = book_dt

#         total_books_read += 1

#         output.append(row_out)

# # Final DataFrame
# result_df = pd.DataFrame(output)


In [144]:
user_raw_df_v2[['total_pages', 'max_read_pages']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4933837 entries, 0 to 4933836
Data columns (total 2 columns):
 #   Column          Dtype  
---  ------          -----  
 0   total_pages     float64
 1   max_read_pages  float64
dtypes: float64(2)
memory usage: 75.3 MB


In [133]:
import numpy as np

In [146]:
user_raw_df_v2['completion_rate'] = user_raw_df_v2['max_read_pages']/user_raw_df_v2['total_pages']
user_raw_df_v2['label'] = np.where(user_raw_df_v2['completion_rate']>0.5,1,0)
user_raw_df_v2.columns

Index(['book_code', 'user_id', 'category_name', 'theme_name',
       'reading_skill_name', 'book_create_dt', 'total_pages', 'max_read_pages',
       'country', 'state', 'zipcode', 'teacher_id', 'school_id',
       'cumulative_web_during_school_hour', 'cumulative_web_after_school_hour',
       'cumulative_apple_during_school_hour',
       'cumulative_apple_after_school_hour',
       'cumulative_android_during_school_hour',
       'cumulative_android_after_school_hour',
       'cumulative_unk_during_school_hour', 'cumulative_unk_after_school_hour',
       'last_books', 'last_category_name', 'last_theme_name',
       'last_reading_skill_name', 'grade_grade 1', 'grade_grade 2',
       'grade_grade 3', 'grade_grade 4', 'grade_grade 5', 'grade_kindergarten',
       'class_activation_bucket_AC', 'class_activation_bucket_AC0',
       'class_activation_bucket_AC1', 'class_activation_bucket_AC2',
       'class_activation_bucket_AC3', 'class_activation_bucket_unk',
       'completion_rate', 'labe

In [149]:
user_columns = ['book_code', 'user_id', 'book_create_dt','country', 'state', 'zipcode',
       'teacher_id', 'school_id','cumulative_web_during_school_hour', 'cumulative_web_after_school_hour',
       'cumulative_apple_during_school_hour',
       'cumulative_apple_after_school_hour',
       'cumulative_android_during_school_hour',
       'cumulative_android_after_school_hour',
       'cumulative_unk_during_school_hour', 'cumulative_unk_after_school_hour',
       'last_books', 'grade_grade 1', 'grade_grade 2', 'grade_grade 3',
       'grade_grade 4', 'grade_grade 5', 'grade_kindergarten', 
       'class_activation_bucket_AC', 'class_activation_bucket_AC0',
       'class_activation_bucket_AC1', 'class_activation_bucket_AC2',
       'class_activation_bucket_AC3', 'class_activation_bucket_unk', 'last_books', 'last_category_name', 'last_theme_name',
       'last_reading_skill_name','label']

In [150]:
user_features_df = user_raw_df_v2[user_columns]
user_features_df.shape

(4933837, 34)

In [151]:
user_features_df.to_csv('user_features_final_transformed.csv')

# END feature Engineering

In [99]:
user_df['book_code'] = user_df['book_code'].astype('str')
user_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 4933837 entries, 0 to 4953950
Data columns (total 12 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Unnamed: 0          int64  
 1   book_code           object 
 2   user_id             object 
 3   book_create_dt      object 
 4   total_pages         float64
 5   max_read_pages      float64
 6   latest_to_old_rank  int64  
 7   theme_name          object 
 8   category_name       object 
 9   reading_skill_name  object 
 10  language_book       object 
 11  book_series         object 
dtypes: float64(2), int64(2), object(8)
memory usage: 489.3+ MB


In [118]:
import pandas as pd

def last_10_books_fast(df):
    df = df.copy()
    df['book_create_dt'] = pd.to_datetime(df['book_create_dt'])
    df = df.sort_values(['user_id', 'book_create_dt']).reset_index(drop=True)

    # Helper to join last 10 values for each row in a group
    def last_10_join(series):
        out = []
        hist = []
        for val in series:
            out.append(','.join(hist[-10:]) if hist else 'unk')
            hist.append(val)
        return pd.Series(out, index=series.index)

    # Precompute category/theme strings
    # df['cat_str'] = df['category_name'].apply(lambda x: ','.join(x))
    # print(df['cat_str'])
    # df['theme_str'] = df['theme_name'].apply(lambda x: ','.join(x))
    df['cat_str'] = df['category_name']
    # print(df['cat_str'])
    df['theme_str'] = df['theme_name']
    df['rs_str'] = df['reading_skill_name']


    # Vectorized per-group computation (one Python loop per group, not per row globally)
    grouped = df.groupby('user_id', group_keys=False)
    df['last_books'] = grouped['book_code'].transform(last_10_join)
    df['last_category_name'] = grouped['cat_str'].transform(last_10_join)
    df['last_theme_name'] = grouped['theme_str'].transform(last_10_join)
    df['last_reading_skill_name'] = grouped['rs_str'].transform(last_10_join)

    return df.drop(columns=['cat_str', 'theme_str','rs_str'])


# Example



In [119]:
import time 

x = time.time()
df = pd.DataFrame({
    'user_id': ['A', 'A', 'A', 'A'],
    'book_code': ['1223', '232', '231', '312'],
    'book_create_dt': ['2025-07-01', '2025-07-04', '2025-07-06', '2025-07-08'],
    'category_name': [['C1', 'C2'], ['C2', 'C3'], ['C3'], ['C2', 'C4']],
    'theme_name': [['T1', 'T2'], ['T2', 'T3'], ['T3'], ['T4']]
})

print(last_10_books_fast(df))

y= time.time()

print(y-x)

KeyError: 'reading_skill_name'

In [120]:
df = user_df[:10].copy()

last_10_books_fast(df).head()

Unnamed: 0.1,Unnamed: 0,book_code,user_id,book_create_dt,total_pages,max_read_pages,latest_to_old_rank,theme_name,category_name,reading_skill_name,language_book,book_series,last_books,last_category_name,last_theme_name,last_reading_skill_name
0,3,9781039624191,0000a39c-1e05-4a7f-827b-03c7d1d8b373,2024-08-30 18:46:38.124909,13.0,13.0,4,"Alphabet, Early Concepts",Early Learning,unk,English,My Phonics Readers - I See My ABCs,unk,unk,unk,unk
1,2,9781645806226,0000a39c-1e05-4a7f-827b-03c7d1d8b373,2024-08-30 18:50:06.521011,12.0,12.0,3,Family & Friends,Growing Up,unk,English,Life Skills For Kids,9781039624191,Early Learning,"Alphabet, Early Concepts",unk
2,1,9781634401821,0000a39c-1e05-4a7f-827b-03c7d1d8b373,2025-04-28 18:57:12.626122,24.0,13.0,2,Social Topics,Growing Up,unk,English,Growing Up,97810396241919781645806226,"Early Learning,Growing Up","Alphabet, Early Concepts,Family & Friends","unk,unk"
3,0,9781645805953,0000a39c-1e05-4a7f-827b-03c7d1d8b373,2025-05-02 18:52:56.792455,18.0,2.0,1,"Adventure Stories, Crafts & Hobbies","Fantasy & Adventure, Growing Up",unk,English,Fun Experiences For Kids,978103962419197816458062269781634401821,"Early Learning,Growing Up,Growing Up","Alphabet, Early Concepts,Family & Friends,Soci...","unk,unk,unk"
4,4,9781634401623,0000c0ac-8d7d-4cbb-99bf-8cbcd313476e,2024-11-23 21:42:40.253250,32.0,21.0,1,"The Natural World, Fun Science",Science & Nature,unk,English,Imagine That!,unk,unk,unk,unk


In [113]:
import time 

x = time.time()
df = user_df[:10].copy()

print(last_10_books_fast(df))

y= time.time()

print(y-x)

   Unnamed: 0      book_code                               user_id  \
0           3  9781039624191  0000a39c-1e05-4a7f-827b-03c7d1d8b373   
1           2  9781645806226  0000a39c-1e05-4a7f-827b-03c7d1d8b373   
2           1  9781634401821  0000a39c-1e05-4a7f-827b-03c7d1d8b373   
3           0  9781645805953  0000a39c-1e05-4a7f-827b-03c7d1d8b373   
4           4  9781634401623  0000c0ac-8d7d-4cbb-99bf-8cbcd313476e   
5           9  9781936163946  0000d725-0891-49f3-a6f4-15f533093e48   
6           8  9781039655263  0000d725-0891-49f3-a6f4-15f533093e48   
7           7  9781532402258  0000d725-0891-49f3-a6f4-15f533093e48   
8           6  9781427151841  0000d725-0891-49f3-a6f4-15f533093e48   
9           5  9781645806851  0000d725-0891-49f3-a6f4-15f533093e48   

              book_create_dt  total_pages  max_read_pages  latest_to_old_rank  \
0 2024-08-30 18:46:38.124909         13.0            13.0                   4   
1 2024-08-30 18:50:06.521011         12.0            12.0          

In [121]:
import time 

x = time.time()
df = user_df.copy()

cv = last_10_books_fast(df)

y= time.time()

print(y-x)
cv.head()

109.19962978363037


Unnamed: 0.1,Unnamed: 0,book_code,user_id,book_create_dt,total_pages,max_read_pages,latest_to_old_rank,theme_name,category_name,reading_skill_name,language_book,book_series,last_books,last_category_name,last_theme_name,last_reading_skill_name
0,13515,9781634401647,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-23 20:25:42.214126,32.0,0.0,9,"The Natural World, Fun Science",Science & Nature,unk,English,Imagine That!,unk,unk,unk,unk
1,13514,9781039673106,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:01:06.548406,9.0,9.0,8,"Alphabet, Language Arts",Early Learning,"Illustrations or other Visual Elements, Decodi...",English,My Phonics Words - Word Families,9781634401647,Science & Nature,"The Natural World, Fun Science",unk
2,13513,9781039837843,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:02:48.414486,10.0,10.0,7,"Occupations, Social Studies",Growing Up,"Illustrations or other Visual Elements, Decodi...",English,Words in My World,97816344016479781039673106,"Science & Nature,Early Learning","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De..."
3,13512,9781634409636,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:04:49.676354,23.0,0.0,6,Biography & Memoir,People & Places,"Problem and Solution Relationship, Illustratio...",English,LOOK! Books: Beginner Biographies,978163440164797810396731069781039837843,"Science & Nature,Early Learning,Growing Up","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De..."
4,13511,9781634409759,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:05:17.458130,23.0,23.0,5,Biography & Memoir,People & Places,"Making Inferences, Illustrations or other Visu...",English,LOOK! Books: Beginner Biographies,"9781634401647,9781039673106,9781039837843,9781...","Science & Nature,Early Learning,Growing Up,Peo...","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De..."


# Feature Loader

In [25]:
import pandas as pd

In [26]:
child_df = pd.read_csv('user_features_final_transformed.csv')
item_df =  pd.read_csv('final_book_level_data.csv')


  item_df =  pd.read_csv('final_book_level_data.csv')


In [27]:
print(f"user data shape {child_df.shape}")
print(f"book data shape {item_df.shape}")

user data shape (4933837, 35)
book data shape (12189, 1168)


In [28]:
item_df.rename(columns={'book_isbn':'book_code'},inplace=True)

In [29]:
child_columns = ['book_code', 'user_id', 'book_create_dt', 'country',
       'state', 'zipcode', 'teacher_id', 'school_id',
       'cumulative_web_during_school_hour', 'cumulative_web_after_school_hour',
       'cumulative_apple_during_school_hour',
       'cumulative_apple_after_school_hour',
       'cumulative_android_during_school_hour',
       'cumulative_android_after_school_hour',
       'cumulative_unk_during_school_hour', 'cumulative_unk_after_school_hour',
        'grade_grade 1', 'grade_grade 2', 'grade_grade 3',
       'grade_grade 4', 'grade_grade 5', 'grade_kindergarten',
       'class_activation_bucket_AC', 'class_activation_bucket_AC0',
       'class_activation_bucket_AC1', 'class_activation_bucket_AC2',
       'class_activation_bucket_AC3', 'class_activation_bucket_unk',
       'last_books.1', 'last_category_name', 'last_theme_name',
       'last_reading_skill_name', 'label']

item_columns =  list(item_df.columns)




In [7]:
final_interaction_df =  child_df.merge(item_df, on="book_code", how="left")

: 

In [35]:
item_df.theme_name.values

array(['History of America, Family & Friends, Social Topics',
       'Healthy Habits, Family & Friends, Occupations',
       'Places of Interest, The Natural World, History', ...,
       'Funny Stories, Our Friends in Nature, Family & Friends',
       'Our Friends in Nature', 'Funny Stories, Our Friends in Nature'],
      shape=(12189,), dtype=object)

In [32]:
item_df_index = item_df.set_index('book_code')
item_df_index.head()

Unnamed: 0_level_0,emb_title_author_0,emb_title_author_1,emb_title_author_2,emb_title_author_3,emb_title_author_4,emb_title_author_5,emb_title_author_6,emb_title_author_7,emb_title_author_8,emb_title_author_9,...,lang_English,lang_French,lang_Haitian French Creole,lang_Mandarin,lang_Portuguese,lang_Spanish,grades,reading_skill_name,theme_name,category_name
book_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9781427162748,-0.154799,0.00972,-0.115303,-0.006852,-0.02777,0.077718,0.076082,-0.064394,-0.022539,0.013681,...,True,False,False,False,False,False,345,"Illustrations or other Visual Elements, Fact a...","History of America, Family & Friends, Social T...","People & Places, Growing Up"
9780778798057,-0.027353,0.045183,-0.05682,-0.030893,-0.008478,0.04454,0.01523,-0.058684,0.006394,0.02675,...,True,False,False,False,False,False,345,"Point of View, Making Inferences, Main Idea & ...","Healthy Habits, Family & Friends, Occupations",Growing Up
9781427165275,-0.03881,0.080689,0.025751,-0.001341,0.029363,0.023188,0.042705,-0.079479,-0.081301,0.019871,...,True,False,False,False,False,False,345,"Making Inferences, Illustrations or other Visu...","Places of Interest, The Natural World, History","People & Places, Science & Nature"
9781427120335,0.001933,-0.01636,0.033735,0.097107,-0.01865,-0.050314,0.041386,-0.066138,-0.034151,-0.068921,...,True,False,False,False,False,False,345,"Making Inferences, Illustrations or other Visu...","Occupations, Technology","Growing Up, Science & Nature"
9781427197719,-0.068801,0.027607,-0.060682,-0.027829,0.004315,0.038051,0.043159,0.000348,-0.097616,-0.017611,...,True,False,False,False,False,False,345,"Making Inferences, Main Idea & Key Details, Il...","Sports & Games, Fitness, Healthy Habits",Growing Up


In [36]:
# Step 1: Drop missing values (optional)
df_clean = book_df['category_name'].dropna()

# Step 2: Split by comma, strip spaces, and flatten
unique_values = set(
    val.strip()
    for sublist in df_clean.str.split(',')
    for val in sublist
)

print(unique_values)
print(len(unique_values))


{'Early Learning', 'Fantasy & Adventure', 'People & Places', 'Science & Nature', 'Fairy Tales & Folklore', 'Animals', 'Funny Stories', 'Growing Up'}
8


In [37]:
sd=  book_df[['theme_name']].copy()
sd.head()

Unnamed: 0,theme_name
0,"History of America, Family & Friends, Social T..."
1,"Healthy Habits, Family & Friends, Occupations"
2,"Places of Interest, The Natural World, History"
3,"Occupations, Technology"
4,"Sports & Games, Fitness, Healthy Habits"


In [43]:
# Step 1: Preprocess themes (split on commas)
item_df['themes'] = item_df['theme_name'].fillna('').apply(
    lambda x: [t.strip().lower() for t in x.split(',') if t.strip()]
)
# Step 2: Build theme vocabulary
from itertools import chain
all_themes = sorted(set(chain.from_iterable(item_df['themes'])))
theme_to_idx = {theme: idx for idx, theme in enumerate(all_themes)}
if 'unk' not in theme_to_idx:
    theme_to_idx['unk'] = len(theme_to_idx)

# Step 3: Map themes to indices
item_df['theme_ids'] = item_df['themes'].apply(
    lambda theme_list: [theme_to_idx[t] for t in theme_list if t in theme_to_idx]
)
item_df.head()

Unnamed: 0,emb_title_author_0,emb_title_author_1,emb_title_author_2,emb_title_author_3,emb_title_author_4,emb_title_author_5,emb_title_author_6,emb_title_author_7,emb_title_author_8,emb_title_author_9,...,lang_Mandarin,lang_Portuguese,lang_Spanish,book_code,grades,reading_skill_name,theme_name,category_name,themes,theme_ids
0,-0.154799,0.00972,-0.115303,-0.006852,-0.02777,0.077718,0.076082,-0.064394,-0.022539,0.013681,...,False,False,False,9781427162748,345,"Illustrations or other Visual Elements, Fact a...","History of America, Family & Friends, Social T...","People & Places, Growing Up","[history of america, family & friends, social ...","[52, 34, 91]"
1,-0.027353,0.045183,-0.05682,-0.030893,-0.008478,0.04454,0.01523,-0.058684,0.006394,0.02675,...,False,False,False,9780778798057,345,"Point of View, Making Inferences, Main Idea & ...","Healthy Habits, Family & Friends, Occupations",Growing Up,"[healthy habits, family & friends, occupations]","[48, 34, 73]"
2,-0.03881,0.080689,0.025751,-0.001341,0.029363,0.023188,0.042705,-0.079479,-0.081301,0.019871,...,False,False,False,9781427165275,345,"Making Inferences, Illustrations or other Visu...","Places of Interest, The Natural World, History","People & Places, Science & Nature","[places of interest, the natural world, history]","[79, 98, 51]"
3,0.001933,-0.01636,0.033735,0.097107,-0.01865,-0.050314,0.041386,-0.066138,-0.034151,-0.068921,...,False,False,False,9781427120335,345,"Making Inferences, Illustrations or other Visu...","Occupations, Technology","Growing Up, Science & Nature","[occupations, technology]","[73, 97]"
4,-0.068801,0.027607,-0.060682,-0.027829,0.004315,0.038051,0.043159,0.000348,-0.097616,-0.017611,...,False,False,False,9781427197719,345,"Making Inferences, Main Idea & Key Details, Il...","Sports & Games, Fitness, Healthy Habits",Growing Up,"[sports & games, fitness, healthy habits]","[95, 38, 48]"


In [None]:
'grades','reading_skill_name','theme_name', 'category_name'

In [44]:
# Step 1: Preprocess themes (split on commas)
item_df['categories'] = item_df['category_name'].fillna('').apply(
    lambda x: [t.strip().lower() for t in x.split(',') if t.strip()]
)
# Step 2: Build theme vocabulary
from itertools import chain
all_themes = sorted(set(chain.from_iterable(item_df['categories'])))
category_to_idx = {theme: idx for idx, theme in enumerate(all_themes)}
if 'unk' not in category_to_idx:
    category_to_idx['unk'] = len(category_to_idx)

# Step 3: Map themes to indices
item_df['category_ids'] = item_df['categories'].apply(
    lambda theme_list: [category_to_idx[t] for t in theme_list if t in category_to_idx]
)
item_df.head()

Unnamed: 0,emb_title_author_0,emb_title_author_1,emb_title_author_2,emb_title_author_3,emb_title_author_4,emb_title_author_5,emb_title_author_6,emb_title_author_7,emb_title_author_8,emb_title_author_9,...,lang_Spanish,book_code,grades,reading_skill_name,theme_name,category_name,themes,theme_ids,categories,category_ids
0,-0.154799,0.00972,-0.115303,-0.006852,-0.02777,0.077718,0.076082,-0.064394,-0.022539,0.013681,...,False,9781427162748,345,"Illustrations or other Visual Elements, Fact a...","History of America, Family & Friends, Social T...","People & Places, Growing Up","[history of america, family & friends, social ...","[52, 34, 91]","[people & places, growing up]","[6, 5]"
1,-0.027353,0.045183,-0.05682,-0.030893,-0.008478,0.04454,0.01523,-0.058684,0.006394,0.02675,...,False,9780778798057,345,"Point of View, Making Inferences, Main Idea & ...","Healthy Habits, Family & Friends, Occupations",Growing Up,"[healthy habits, family & friends, occupations]","[48, 34, 73]",[growing up],[5]
2,-0.03881,0.080689,0.025751,-0.001341,0.029363,0.023188,0.042705,-0.079479,-0.081301,0.019871,...,False,9781427165275,345,"Making Inferences, Illustrations or other Visu...","Places of Interest, The Natural World, History","People & Places, Science & Nature","[places of interest, the natural world, history]","[79, 98, 51]","[people & places, science & nature]","[6, 7]"
3,0.001933,-0.01636,0.033735,0.097107,-0.01865,-0.050314,0.041386,-0.066138,-0.034151,-0.068921,...,False,9781427120335,345,"Making Inferences, Illustrations or other Visu...","Occupations, Technology","Growing Up, Science & Nature","[occupations, technology]","[73, 97]","[growing up, science & nature]","[5, 7]"
4,-0.068801,0.027607,-0.060682,-0.027829,0.004315,0.038051,0.043159,0.000348,-0.097616,-0.017611,...,False,9781427197719,345,"Making Inferences, Main Idea & Key Details, Il...","Sports & Games, Fitness, Healthy Habits",Growing Up,"[sports & games, fitness, healthy habits]","[95, 38, 48]",[growing up],[5]


In [45]:
# Step 1: Preprocess themes (split on commas)
item_df['reading_skills'] = item_df['reading_skill_name'].fillna('').apply(
    lambda x: [t.strip().lower() for t in x.split(',') if t.strip()]
)
# Step 2: Build theme vocabulary
from itertools import chain
all_themes = sorted(set(chain.from_iterable(item_df['reading_skills'])))
reading_skill_to_idx = {theme: idx for idx, theme in enumerate(all_themes)}
if 'unk' not in reading_skill_to_idx:
    reading_skill_to_idx['unk'] = len(reading_skill_to_idx)

# Step 3: Map themes to indices
item_df['reading_skill_ids'] = item_df['reading_skills'].apply(
    lambda theme_list: [reading_skill_to_idx[t] for t in theme_list if t in reading_skill_to_idx]
)
item_df.head()


Unnamed: 0,emb_title_author_0,emb_title_author_1,emb_title_author_2,emb_title_author_3,emb_title_author_4,emb_title_author_5,emb_title_author_6,emb_title_author_7,emb_title_author_8,emb_title_author_9,...,grades,reading_skill_name,theme_name,category_name,themes,theme_ids,categories,category_ids,reading_skills,reading_skill_ids
0,-0.154799,0.00972,-0.115303,-0.006852,-0.02777,0.077718,0.076082,-0.064394,-0.022539,0.013681,...,345,"Illustrations or other Visual Elements, Fact a...","History of America, Family & Friends, Social T...","People & Places, Growing Up","[history of america, family & friends, social ...","[52, 34, 91]","[people & places, growing up]","[6, 5]","[illustrations or other visual elements, fact ...","[7, 6, 5, 4, 3, 1]"
1,-0.027353,0.045183,-0.05682,-0.030893,-0.008478,0.04454,0.01523,-0.058684,0.006394,0.02675,...,345,"Point of View, Making Inferences, Main Idea & ...","Healthy Habits, Family & Friends, Occupations",Growing Up,"[healthy habits, family & friends, occupations]","[48, 34, 73]",[growing up],[5],"[point of view, making inferences, main idea &...","[10, 9, 8, 7, 6, 5, 4, 3]"
2,-0.03881,0.080689,0.025751,-0.001341,0.029363,0.023188,0.042705,-0.079479,-0.081301,0.019871,...,345,"Making Inferences, Illustrations or other Visu...","Places of Interest, The Natural World, History","People & Places, Science & Nature","[places of interest, the natural world, history]","[79, 98, 51]","[people & places, science & nature]","[6, 7]","[making inferences, illustrations or other vis...","[9, 7, 6, 5, 4, 3, 1]"
3,0.001933,-0.01636,0.033735,0.097107,-0.01865,-0.050314,0.041386,-0.066138,-0.034151,-0.068921,...,345,"Making Inferences, Illustrations or other Visu...","Occupations, Technology","Growing Up, Science & Nature","[occupations, technology]","[73, 97]","[growing up, science & nature]","[5, 7]","[making inferences, illustrations or other vis...","[9, 7, 6, 5, 3, 1, 0]"
4,-0.068801,0.027607,-0.060682,-0.027829,0.004315,0.038051,0.043159,0.000348,-0.097616,-0.017611,...,345,"Making Inferences, Main Idea & Key Details, Il...","Sports & Games, Fitness, Healthy Habits",Growing Up,"[sports & games, fitness, healthy habits]","[95, 38, 48]",[growing up],[5],"[making inferences, main idea & key details, i...","[9, 8, 7, 5, 3]"


In [47]:
# Step 1: Preprocess themes (split on commas)
item_df['grades_list'] = item_df['grades'].fillna('').apply(
    lambda x: [t.strip().lower() for t in x.split(',') if t.strip()]
)
# Step 2: Build theme vocabulary
from itertools import chain
all_themes = sorted(set(chain.from_iterable(item_df['grades_list'])))
grades_to_idx = {theme: idx for idx, theme in enumerate(all_themes)}
if 'unk' not in grades_to_idx:
    grades_to_idx['unk'] = len(grades_to_idx)

# Step 3: Map themes to indices
item_df['grades_ids'] = item_df['grades_list'].apply(
    lambda theme_list: [grades_to_idx[t] for t in theme_list if t in grades_to_idx]
)
item_df.head()


Unnamed: 0,emb_title_author_0,emb_title_author_1,emb_title_author_2,emb_title_author_3,emb_title_author_4,emb_title_author_5,emb_title_author_6,emb_title_author_7,emb_title_author_8,emb_title_author_9,...,theme_name,category_name,themes,theme_ids,categories,category_ids,reading_skills,reading_skill_ids,grades_list,grades_ids
0,-0.154799,0.00972,-0.115303,-0.006852,-0.02777,0.077718,0.076082,-0.064394,-0.022539,0.013681,...,"History of America, Family & Friends, Social T...","People & Places, Growing Up","[history of america, family & friends, social ...","[52, 34, 91]","[people & places, growing up]","[6, 5]","[illustrations or other visual elements, fact ...","[7, 6, 5, 4, 3, 1]","[3, 4, 5]","[2, 3, 4]"
1,-0.027353,0.045183,-0.05682,-0.030893,-0.008478,0.04454,0.01523,-0.058684,0.006394,0.02675,...,"Healthy Habits, Family & Friends, Occupations",Growing Up,"[healthy habits, family & friends, occupations]","[48, 34, 73]",[growing up],[5],"[point of view, making inferences, main idea &...","[10, 9, 8, 7, 6, 5, 4, 3]","[3, 4, 5]","[2, 3, 4]"
2,-0.03881,0.080689,0.025751,-0.001341,0.029363,0.023188,0.042705,-0.079479,-0.081301,0.019871,...,"Places of Interest, The Natural World, History","People & Places, Science & Nature","[places of interest, the natural world, history]","[79, 98, 51]","[people & places, science & nature]","[6, 7]","[making inferences, illustrations or other vis...","[9, 7, 6, 5, 4, 3, 1]","[3, 4, 5]","[2, 3, 4]"
3,0.001933,-0.01636,0.033735,0.097107,-0.01865,-0.050314,0.041386,-0.066138,-0.034151,-0.068921,...,"Occupations, Technology","Growing Up, Science & Nature","[occupations, technology]","[73, 97]","[growing up, science & nature]","[5, 7]","[making inferences, illustrations or other vis...","[9, 7, 6, 5, 3, 1, 0]","[3, 4, 5]","[2, 3, 4]"
4,-0.068801,0.027607,-0.060682,-0.027829,0.004315,0.038051,0.043159,0.000348,-0.097616,-0.017611,...,"Sports & Games, Fitness, Healthy Habits",Growing Up,"[sports & games, fitness, healthy habits]","[95, 38, 48]",[growing up],[5],"[making inferences, main idea & key details, i...","[9, 8, 7, 5, 3]","[3, 4, 5]","[2, 3, 4]"


In [51]:

book_code_to_idx = {theme: idx for idx, theme in enumerate(list((item_df.book_code)))}
if 'unk' not in book_code_to_idx:
    book_code_to_idx['unk'] = len(book_code_to_idx)

# Step 3: Map themes to indices
item_df['book_code_ids'] = item_df['book_code'].apply(
    lambda theme_list: [book_code_to_idx[t] for t in [theme_list] if t in book_code_to_idx]
)
item_df.head()


Unnamed: 0,emb_title_author_0,emb_title_author_1,emb_title_author_2,emb_title_author_3,emb_title_author_4,emb_title_author_5,emb_title_author_6,emb_title_author_7,emb_title_author_8,emb_title_author_9,...,category_name,themes,theme_ids,categories,category_ids,reading_skills,reading_skill_ids,grades_list,grades_ids,book_code_ids
0,-0.154799,0.00972,-0.115303,-0.006852,-0.02777,0.077718,0.076082,-0.064394,-0.022539,0.013681,...,"People & Places, Growing Up","[history of america, family & friends, social ...","[52, 34, 91]","[people & places, growing up]","[6, 5]","[illustrations or other visual elements, fact ...","[7, 6, 5, 4, 3, 1]","[3, 4, 5]","[2, 3, 4]",[0]
1,-0.027353,0.045183,-0.05682,-0.030893,-0.008478,0.04454,0.01523,-0.058684,0.006394,0.02675,...,Growing Up,"[healthy habits, family & friends, occupations]","[48, 34, 73]",[growing up],[5],"[point of view, making inferences, main idea &...","[10, 9, 8, 7, 6, 5, 4, 3]","[3, 4, 5]","[2, 3, 4]",[1]
2,-0.03881,0.080689,0.025751,-0.001341,0.029363,0.023188,0.042705,-0.079479,-0.081301,0.019871,...,"People & Places, Science & Nature","[places of interest, the natural world, history]","[79, 98, 51]","[people & places, science & nature]","[6, 7]","[making inferences, illustrations or other vis...","[9, 7, 6, 5, 4, 3, 1]","[3, 4, 5]","[2, 3, 4]",[2]
3,0.001933,-0.01636,0.033735,0.097107,-0.01865,-0.050314,0.041386,-0.066138,-0.034151,-0.068921,...,"Growing Up, Science & Nature","[occupations, technology]","[73, 97]","[growing up, science & nature]","[5, 7]","[making inferences, illustrations or other vis...","[9, 7, 6, 5, 3, 1, 0]","[3, 4, 5]","[2, 3, 4]",[3]
4,-0.068801,0.027607,-0.060682,-0.027829,0.004315,0.038051,0.043159,0.000348,-0.097616,-0.017611,...,Growing Up,"[sports & games, fitness, healthy habits]","[95, 38, 48]",[growing up],[5],"[making inferences, main idea & key details, i...","[9, 8, 7, 5, 3]","[3, 4, 5]","[2, 3, 4]",[4]


In [54]:
# [ 'book_type','min_grade', 'max_grade','readable_page_count','fiction_nonfiction', 'reading_skill_name','theme_name', 'category_name','language_book']

columns_author_title =[f"emb_title_author_{i}" for i in range(emb.shape[1])]
columns_long_description = [f"emb_desc_{i}" for i in range(emb_desc.shape[1])]
columns_book_series = [f"emb_book_series_{i}" for i in range(emb_book_series.shape[1])]
columns_add = ['readable_page_count','book_type_binary', 'fn_Fiction', 'fn_Non-Fiction', 'fn_unk',
       'lang_English', 'lang_French', 'lang_Haitian French Creole',
       'lang_Mandarin', 'lang_Portuguese', 'lang_Spanish']

columns_learn_emb = [ 'book_code','book_code_ids','grades_ids','reading_skill_ids', 'category_ids','theme_ids']

final_columns = columns_author_title + columns_long_description + columns_book_series + columns_add + columns_learn_emb

len(final_columns)

1169

In [57]:
count_variables = {'themes':len(theme_to_idx), 
                   'book_count': len(book_code_to_idx), 
                   'grade_count': len(grades_to_idx),
                   'reading_skills_count':len(reading_skill_to_idx),
                   'category_count':len(category_to_idx)
                   }
count_variables

{'themes': 108,
 'book_count': 12190,
 'grade_count': 11,
 'reading_skills_count': 13,
 'category_count': 9}

In [55]:
item_df_final =  item_df[final_columns]
item_df_final.shape

(12189, 1169)

In [63]:
item_df_index = item_df_final.set_index('book_code')
item_df_index.head()

Unnamed: 0_level_0,emb_title_author_0,emb_title_author_1,emb_title_author_2,emb_title_author_3,emb_title_author_4,emb_title_author_5,emb_title_author_6,emb_title_author_7,emb_title_author_8,emb_title_author_9,...,lang_French,lang_Haitian French Creole,lang_Mandarin,lang_Portuguese,lang_Spanish,book_code_ids,grades_ids,reading_skill_ids,category_ids,theme_ids
book_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9781427162748,-0.154799,0.00972,-0.115303,-0.006852,-0.02777,0.077718,0.076082,-0.064394,-0.022539,0.013681,...,False,False,False,False,False,[0],"[2, 3, 4]","[7, 6, 5, 4, 3, 1]","[6, 5]","[52, 34, 91]"
9780778798057,-0.027353,0.045183,-0.05682,-0.030893,-0.008478,0.04454,0.01523,-0.058684,0.006394,0.02675,...,False,False,False,False,False,[1],"[2, 3, 4]","[10, 9, 8, 7, 6, 5, 4, 3]",[5],"[48, 34, 73]"
9781427165275,-0.03881,0.080689,0.025751,-0.001341,0.029363,0.023188,0.042705,-0.079479,-0.081301,0.019871,...,False,False,False,False,False,[2],"[2, 3, 4]","[9, 7, 6, 5, 4, 3, 1]","[6, 7]","[79, 98, 51]"
9781427120335,0.001933,-0.01636,0.033735,0.097107,-0.01865,-0.050314,0.041386,-0.066138,-0.034151,-0.068921,...,False,False,False,False,False,[3],"[2, 3, 4]","[9, 7, 6, 5, 3, 1, 0]","[5, 7]","[73, 97]"
9781427197719,-0.068801,0.027607,-0.060682,-0.027829,0.004315,0.038051,0.043159,0.000348,-0.097616,-0.017611,...,False,False,False,False,False,[4],"[2, 3, 4]","[9, 8, 7, 5, 3]",[5],"[95, 38, 48]"


In [64]:
item_df_index.loc[9781427162748]

emb_title_author_0             -0.154799
emb_title_author_1               0.00972
emb_title_author_2             -0.115303
emb_title_author_3             -0.006852
emb_title_author_4              -0.02777
                             ...        
book_code_ids                        [0]
grades_ids                     [2, 3, 4]
reading_skill_ids     [7, 6, 5, 4, 3, 1]
category_ids                      [6, 5]
theme_ids                   [52, 34, 91]
Name: 9781427162748, Length: 1168, dtype: object

In [195]:
# [ 'book_type','min_grade', 'max_grade','readable_page_count','fiction_nonfiction', 'reading_skill_name','theme_name', 'category_name','language_book']

columns_author_title =[f"emb_title_author_{i}" for i in range(emb.shape[1])]
columns_long_description = [f"emb_desc_{i}" for i in range(emb_desc.shape[1])]
columns_book_series = [f"emb_book_series_{i}" for i in range(emb_book_series.shape[1])]
columns_add = ['readable_page_count','book_type_binary', 'fn_Fiction', 'fn_Non-Fiction', 'fn_unk',
       'lang_English', 'lang_French', 'lang_Haitian French Creole',
       'lang_Mandarin', 'lang_Portuguese', 'lang_Spanish']

columns_learn_emb = ['book_code_ids','grades_ids','reading_skill_ids', 'category_ids','theme_ids']

book_feature_cols = columns_author_title + columns_long_description + columns_book_series + columns_add

len(book_feature_cols)

1163

In [None]:
['country',
 'state',
 'zipcode',
 'teacher_id',
 'school_id',
 'last_books.1',
 'last_category_name',
 'last_theme_name',
 'last_reading_skill_name',]

In [196]:
interaction_feature_cols = ['cumulative_web_during_school_hour',
 'cumulative_web_after_school_hour',
 'cumulative_apple_during_school_hour',
 'cumulative_apple_after_school_hour',
 'cumulative_android_during_school_hour',
 'cumulative_android_after_school_hour',
 'cumulative_unk_during_school_hour',
 'cumulative_unk_after_school_hour',
 'grade_grade 1',
 'grade_grade 2',
 'grade_grade 3',
 'grade_grade 4',
 'grade_grade 5',
 'grade_kindergarten',
 'class_activation_bucket_AC',
 'class_activation_bucket_AC0',
 'class_activation_bucket_AC1',
 'class_activation_bucket_AC2',
 'class_activation_bucket_AC3',
 'class_activation_bucket_unk',]



In [69]:
child_columns[10:]

['cumulative_apple_during_school_hour',
 'cumulative_apple_after_school_hour',
 'cumulative_android_during_school_hour',
 'cumulative_android_after_school_hour',
 'cumulative_unk_during_school_hour',
 'cumulative_unk_after_school_hour',
 'grade_grade 1',
 'grade_grade 2',
 'grade_grade 3',
 'grade_grade 4',
 'grade_grade 5',
 'grade_kindergarten',
 'class_activation_bucket_AC',
 'class_activation_bucket_AC0',
 'class_activation_bucket_AC1',
 'class_activation_bucket_AC2',
 'class_activation_bucket_AC3',
 'class_activation_bucket_unk',
 'last_books.1',
 'last_category_name',
 'last_theme_name',
 'last_reading_skill_name',
 'label']

In [70]:
child_df.head()

Unnamed: 0.1,Unnamed: 0,book_code,user_id,book_create_dt,country,state,zipcode,teacher_id,school_id,cumulative_web_during_school_hour,...,class_activation_bucket_AC0,class_activation_bucket_AC1,class_activation_bucket_AC2,class_activation_bucket_AC3,class_activation_bucket_unk,last_books.1,last_category_name,last_theme_name,last_reading_skill_name,label
0,0,9781634401647,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-23 20:25:42.214126,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,False,False,True,False,unk,unk,unk,unk,0
1,1,9781039673106,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:01:06.548406,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,False,False,True,False,9781634401647,Science & Nature,"The Natural World, Fun Science",unk,1
2,2,9781039837843,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:02:48.414486,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,False,False,True,False,97816344016479781039673106,"Science & Nature,Early Learning","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De...",1
3,3,9781634409636,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:04:49.676354,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,False,False,True,False,978163440164797810396731069781039837843,"Science & Nature,Early Learning,Growing Up","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De...",0
4,4,9781634409759,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:05:17.458130,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,False,False,True,False,"9781634401647,9781039673106,9781039837843,9781...","Science & Nature,Early Learning,Growing Up,Peo...","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De...",1


In [71]:
child_df['last_categories'] = child_df['last_category_name'].fillna('').apply(
    lambda x: [t.strip().lower() for t in x.split(',') if t.strip()]
)
child_df['category_ids'] = child_df['last_categories'].apply(
    lambda theme_list: [category_to_idx[t] for t in theme_list if t in category_to_idx]
)
child_df.head()

Unnamed: 0.1,Unnamed: 0,book_code,user_id,book_create_dt,country,state,zipcode,teacher_id,school_id,cumulative_web_during_school_hour,...,class_activation_bucket_AC2,class_activation_bucket_AC3,class_activation_bucket_unk,last_books.1,last_category_name,last_theme_name,last_reading_skill_name,label,last_categories,category_ids
0,0,9781634401647,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-23 20:25:42.214126,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,True,False,unk,unk,unk,unk,0,[unk],[8]
1,1,9781039673106,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:01:06.548406,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,True,False,9781634401647,Science & Nature,"The Natural World, Fun Science",unk,1,[science & nature],[7]
2,2,9781039837843,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:02:48.414486,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,True,False,97816344016479781039673106,"Science & Nature,Early Learning","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De...",1,"[science & nature, early learning]","[7, 1]"
3,3,9781634409636,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:04:49.676354,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,True,False,978163440164797810396731069781039837843,"Science & Nature,Early Learning,Growing Up","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De...",0,"[science & nature, early learning, growing up]","[7, 1, 5]"
4,4,9781634409759,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:05:17.458130,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,True,False,"9781634401647,9781039673106,9781039837843,9781...","Science & Nature,Early Learning,Growing Up,Peo...","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De...",1,"[science & nature, early learning, growing up,...","[7, 1, 5, 6]"


In [74]:
child_df.head()

Unnamed: 0.1,Unnamed: 0,book_code,user_id,book_create_dt,country,state,zipcode,teacher_id,school_id,cumulative_web_during_school_hour,...,class_activation_bucket_unk,last_books.1,last_category_name,last_theme_name,last_reading_skill_name,label,last_categories,category_ids,book_code_ids,last_books_list
0,0,9781634401647,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-23 20:25:42.214126,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,unk,unk,unk,unk,0,[unk],[8],[12189],[unk]
1,1,9781039673106,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:01:06.548406,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,9781634401647,Science & Nature,"The Natural World, Fun Science",unk,1,[science & nature],[7],[],[9781634401647]
2,2,9781039837843,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:02:48.414486,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,97816344016479781039673106,"Science & Nature,Early Learning","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De...",1,"[science & nature, early learning]","[7, 1]",[],"[9781634401647, 9781039673106]"
3,3,9781634409636,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:04:49.676354,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,978163440164797810396731069781039837843,"Science & Nature,Early Learning,Growing Up","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De...",0,"[science & nature, early learning, growing up]","[7, 1, 5]",[],"[9781634401647, 9781039673106, 9781039837843]"
4,4,9781634409759,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:05:17.458130,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,"9781634401647,9781039673106,9781039837843,9781...","Science & Nature,Early Learning,Growing Up,Peo...","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De...",1,"[science & nature, early learning, growing up,...","[7, 1, 5, 6]",[],"[9781634401647, 9781039673106, 9781039837843, ..."


In [None]:
[9781634401647, 9781039673106]

In [88]:
for i in child_df.last_books_list[2]:
    print(book_code_to_idx[int(i)])

7423
86


In [86]:
book_code_to_idx[9781634401647]

7423

In [89]:
child_df['last_books_list'] = child_df['last_books.1'].fillna('').apply(
    lambda x: [t.strip().lower() for t in x.split(',') if t.strip()]
)

child_df['book_code_ids'] = child_df['last_books_list'].apply(
    lambda theme_list: [book_code_to_idx[int(t)] if t!='unk' else book_code_to_idx[t]  for t in theme_list ]
)
child_df.head()

Unnamed: 0.1,Unnamed: 0,book_code,user_id,book_create_dt,country,state,zipcode,teacher_id,school_id,cumulative_web_during_school_hour,...,class_activation_bucket_unk,last_books.1,last_category_name,last_theme_name,last_reading_skill_name,label,last_categories,category_ids,book_code_ids,last_books_list
0,0,9781634401647,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-23 20:25:42.214126,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,unk,unk,unk,unk,0,[unk],[8],[12189],[unk]
1,1,9781039673106,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:01:06.548406,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,9781634401647,Science & Nature,"The Natural World, Fun Science",unk,1,[science & nature],[7],[7423],[9781634401647]
2,2,9781039837843,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:02:48.414486,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,97816344016479781039673106,"Science & Nature,Early Learning","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De...",1,"[science & nature, early learning]","[7, 1]","[7423, 86]","[9781634401647, 9781039673106]"
3,3,9781634409636,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:04:49.676354,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,978163440164797810396731069781039837843,"Science & Nature,Early Learning,Growing Up","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De...",0,"[science & nature, early learning, growing up]","[7, 1, 5]","[7423, 86, 4479]","[9781634401647, 9781039673106, 9781039837843]"
4,4,9781634409759,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:05:17.458130,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,False,"9781634401647,9781039673106,9781039837843,9781...","Science & Nature,Early Learning,Growing Up,Peo...","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De...",1,"[science & nature, early learning, growing up,...","[7, 1, 5, 6]","[7423, 86, 4479, 10908]","[9781634401647, 9781039673106, 9781039837843, ..."


In [91]:
# Step 1: Preprocess themes (split on commas)
child_df['reading_skills'] = child_df['last_reading_skill_name'].fillna('').apply(
    lambda x: [t.strip().lower() for t in x.split(',') if t.strip()]
)
# Step 2: Build theme vocabulary

# Step 3: Map themes to indices
child_df['reading_skill_ids'] = child_df['reading_skills'].apply(
    lambda theme_list: [reading_skill_to_idx[t] for t in theme_list if t in reading_skill_to_idx]
)
child_df.head()

Unnamed: 0.1,Unnamed: 0,book_code,user_id,book_create_dt,country,state,zipcode,teacher_id,school_id,cumulative_web_during_school_hour,...,last_category_name,last_theme_name,last_reading_skill_name,label,last_categories,category_ids,book_code_ids,last_books_list,reading_skills,reading_skill_ids
0,0,9781634401647,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-23 20:25:42.214126,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,unk,unk,unk,0,[unk],[8],[12189],[unk],[unk],[12]
1,1,9781039673106,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:01:06.548406,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,Science & Nature,"The Natural World, Fun Science",unk,1,[science & nature],[7],[7423],[9781634401647],[unk],[12]
2,2,9781039837843,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:02:48.414486,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,"Science & Nature,Early Learning","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De...",1,"[science & nature, early learning]","[7, 1]","[7423, 86]","[9781634401647, 9781039673106]","[unk, illustrations or other visual elements, ...","[12, 7, 5, 1]"
3,3,9781634409636,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:04:49.676354,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,"Science & Nature,Early Learning,Growing Up","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De...",0,"[science & nature, early learning, growing up]","[7, 1, 5]","[7423, 86, 4479]","[9781634401647, 9781039673106, 9781039837843]","[unk, illustrations or other visual elements, ...","[12, 7, 5, 1, 7, 5, 1]"
4,4,9781634409759,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:05:17.458130,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,"Science & Nature,Early Learning,Growing Up,Peo...","The Natural World, Fun Science,Alphabet, Langu...","unk,Illustrations or other Visual Elements, De...",1,"[science & nature, early learning, growing up,...","[7, 1, 5, 6]","[7423, 86, 4479, 10908]","[9781634401647, 9781039673106, 9781039837843, ...","[unk, illustrations or other visual elements, ...","[12, 7, 5, 1, 7, 5, 1, 11, 7, 6, 5, 3, 1]"


In [93]:
# Step 1: Preprocess themes (split on commas)
child_df['themes'] = child_df['last_theme_name'].fillna('').apply(
    lambda x: [t.strip().lower() for t in x.split(',') if t.strip()]
)

# Step 3: Map themes to indices
child_df['theme_ids'] = child_df['themes'].apply(
    lambda theme_list: [theme_to_idx[t] for t in theme_list if t in theme_to_idx]
)
child_df.head()

Unnamed: 0.1,Unnamed: 0,book_code,user_id,book_create_dt,country,state,zipcode,teacher_id,school_id,cumulative_web_during_school_hour,...,last_reading_skill_name,label,last_categories,category_ids,book_code_ids,last_books_list,reading_skills,reading_skill_ids,themes,theme_ids
0,0,9781634401647,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-23 20:25:42.214126,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,unk,0,[unk],[8],[12189],[unk],[unk],[12],[unk],[107]
1,1,9781039673106,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:01:06.548406,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,unk,1,[science & nature],[7],[7423],[9781634401647],[unk],[12],"[the natural world, fun science]","[98, 41]"
2,2,9781039837843,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:02:48.414486,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,"unk,Illustrations or other Visual Elements, De...",1,"[science & nature, early learning]","[7, 1]","[7423, 86]","[9781634401647, 9781039673106]","[unk, illustrations or other visual elements, ...","[12, 7, 5, 1]","[the natural world, fun science, alphabet, lan...","[98, 41, 2, 57]"
3,3,9781634409636,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:04:49.676354,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,"unk,Illustrations or other Visual Elements, De...",0,"[science & nature, early learning, growing up]","[7, 1, 5]","[7423, 86, 4479]","[9781634401647, 9781039673106, 9781039837843]","[unk, illustrations or other visual elements, ...","[12, 7, 5, 1, 7, 5, 1]","[the natural world, fun science, alphabet, lan...","[98, 41, 2, 57, 73, 90]"
4,4,9781634409759,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:05:17.458130,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,"unk,Illustrations or other Visual Elements, De...",1,"[science & nature, early learning, growing up,...","[7, 1, 5, 6]","[7423, 86, 4479, 10908]","[9781634401647, 9781039673106, 9781039837843, ...","[unk, illustrations or other visual elements, ...","[12, 7, 5, 1, 7, 5, 1, 11, 7, 6, 5, 3, 1]","[the natural world, fun science, alphabet, lan...","[98, 41, 2, 57, 73, 90, 9]"


In [216]:
# Step 1: Preprocess themes (split on commas)
child_df['countries'] = child_df['country'].fillna('').apply(
    lambda x: [t.strip().lower() for t in x.split(',') if t.strip()]
)
from itertools import chain
all_themes = sorted(set(chain.from_iterable(child_df['countries'])))
country_to_idx = {theme: idx for idx, theme in enumerate(all_themes)}
if 'unk' not in country_to_idx:
    country_to_idx['unk'] = len(country_to_idx)


# Step 3: Map themes to indices
child_df['countries_ids'] = child_df['countries'].apply(
    lambda theme_list: [country_to_idx[t] if t in country_to_idx else country_to_idx['unk'] for t in theme_list]

)
child_df.head()

Unnamed: 0.1,Unnamed: 0,book_code,user_id,book_create_dt,country,state,zipcode,teacher_id,school_id,cumulative_web_during_school_hour,...,countries,countries_ids,states,states_ids,zipcodes,zipcode_ids,teacher_ids,teacher_code_ids,school_ids,school_code_ids
0,0,9781634401647,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-23 20:25:42.214126,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
1,1,9781039673106,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:01:06.548406,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
2,2,9781039837843,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:02:48.414486,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
3,3,9781634409636,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:04:49.676354,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
4,4,9781634409759,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:05:17.458130,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]


In [217]:
# Step 1: Preprocess themes (split on commas)
child_df['states'] = child_df['state'].fillna('').apply(
    lambda x: [t.strip().lower() for t in x.split(',') if t.strip()]
)
from itertools import chain
all_themes = sorted(set(chain.from_iterable(child_df['states'])))
state_to_idx = {theme: idx for idx, theme in enumerate(all_themes)}
if 'unk' not in state_to_idx:
    state_to_idx['unk'] = len(state_to_idx)


# Step 3: Map themes to indices
child_df['states_ids'] = child_df['states'].apply(
    lambda theme_list: [state_to_idx[t] if t in state_to_idx else state_to_idx['unk'] for t in theme_list]
)
child_df.head()


Unnamed: 0.1,Unnamed: 0,book_code,user_id,book_create_dt,country,state,zipcode,teacher_id,school_id,cumulative_web_during_school_hour,...,countries,countries_ids,states,states_ids,zipcodes,zipcode_ids,teacher_ids,teacher_code_ids,school_ids,school_code_ids
0,0,9781634401647,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-23 20:25:42.214126,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
1,1,9781039673106,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:01:06.548406,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
2,2,9781039837843,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:02:48.414486,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
3,3,9781634409636,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:04:49.676354,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
4,4,9781634409759,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:05:17.458130,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]


In [218]:
# Step 1: Preprocess themes (split on commas)
child_df['zipcodes'] = child_df['zipcode'].fillna('').apply(
    lambda x: [t.strip().lower() for t in x.split(',') if t.strip()]
)
from itertools import chain
all_themes = sorted(set(chain.from_iterable(child_df['zipcodes'])))
zipcode_to_idx = {theme: idx for idx, theme in enumerate(all_themes)}
if 'unk' not in zipcode_to_idx:
    zipcode_to_idx['unk'] = len(zipcode_to_idx)


# Step 3: Map themes to indices
child_df['zipcode_ids'] = child_df['zipcodes'].apply(
    lambda theme_list: [zipcode_to_idx[t] if t in zipcode_to_idx else zipcode_to_idx['unk'] for t in theme_list]
)
child_df.head()



Unnamed: 0.1,Unnamed: 0,book_code,user_id,book_create_dt,country,state,zipcode,teacher_id,school_id,cumulative_web_during_school_hour,...,countries,countries_ids,states,states_ids,zipcodes,zipcode_ids,teacher_ids,teacher_code_ids,school_ids,school_code_ids
0,0,9781634401647,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-23 20:25:42.214126,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
1,1,9781039673106,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:01:06.548406,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
2,2,9781039837843,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:02:48.414486,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
3,3,9781634409636,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:04:49.676354,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
4,4,9781634409759,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:05:17.458130,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]


In [219]:
# Step 1: Preprocess themes (split on commas)
child_df['teacher_ids'] = child_df['teacher_id'].fillna('').apply(
    lambda x: [t.strip().lower() for t in x.split(',') if t.strip()]
)
from itertools import chain
all_themes = sorted(set(chain.from_iterable(child_df['teacher_ids'])))
teacher_to_idx = {theme: idx for idx, theme in enumerate(all_themes)}
if 'unk' not in teacher_to_idx:
    teacher_to_idx['unk'] = len(teacher_to_idx)


# Step 3: Map themes to indices
child_df['teacher_code_ids'] = child_df['teacher_ids'].apply(
    lambda theme_list: [teacher_to_idx[t] if t in teacher_to_idx else teacher_to_idx['unk'] for t in theme_list]
)
child_df.head()




Unnamed: 0.1,Unnamed: 0,book_code,user_id,book_create_dt,country,state,zipcode,teacher_id,school_id,cumulative_web_during_school_hour,...,countries,countries_ids,states,states_ids,zipcodes,zipcode_ids,teacher_ids,teacher_code_ids,school_ids,school_code_ids
0,0,9781634401647,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-23 20:25:42.214126,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
1,1,9781039673106,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:01:06.548406,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
2,2,9781039837843,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:02:48.414486,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
3,3,9781634409636,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:04:49.676354,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
4,4,9781634409759,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:05:17.458130,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]


In [220]:
# Step 1: Preprocess themes (split on commas)
child_df['school_ids'] = child_df['school_id'].fillna('').apply(
    lambda x: [t.strip().lower() for t in x.split(',') if t.strip()]
)
from itertools import chain
all_themes = sorted(set(chain.from_iterable(child_df['school_ids'])))
school_to_idx = {theme: idx for idx, theme in enumerate(all_themes)}
if 'unk' not in school_to_idx:
    school_to_idx['unk'] = len(school_to_idx)


# Step 3: Map themes to indices
child_df['school_code_ids'] = child_df['school_ids'].apply(
    lambda theme_list: [school_to_idx[t] if t in school_to_idx else school_to_idx['unk'] for t in theme_list]
)
child_df.head()



Unnamed: 0.1,Unnamed: 0,book_code,user_id,book_create_dt,country,state,zipcode,teacher_id,school_id,cumulative_web_during_school_hour,...,countries,countries_ids,states,states_ids,zipcodes,zipcode_ids,teacher_ids,teacher_code_ids,school_ids,school_code_ids
0,0,9781634401647,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-23 20:25:42.214126,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
1,1,9781039673106,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:01:06.548406,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
2,2,9781039837843,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:02:48.414486,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
3,3,9781634409636,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:04:49.676354,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]
4,4,9781634409759,00000ac6-1a89-415c-8d67-177e17aa1aae,2024-09-24 20:05:17.458130,US,CA,93725,DAF7C9F038C84DE3ACD47E581663CDFA,2707F78AAF23426A819369FFA8E0A5D8,1.0,...,[us],[147],[ca],[59],[93725],[13839],[daf7c9f038c84de3acd47e581663cdfa],[96910],[2707f78aaf23426a819369ffa8e0a5d8],[13621]


In [98]:
item_df_index["theme_ids"][9781427162748]

[52, 34, 91]

In [99]:
import torch
torch.tensor(item_df_index["theme_ids"][9781427162748], dtype=torch.long)

tensor([52, 34, 91])

In [225]:
from torch.utils.data import Dataset
import torch
import numpy as np

class BookInteractionDataset(Dataset):
    def __init__(self, interactions_df, book_features_df, book_feature_cols, interaction_feature_cols):
        """
        interactions_df: includes user_id, book_code, label, and interaction-level features
        book_features_df: indexed by book_code, contains theme_ids, category_ids, and other features
        book_feature_cols: list of book feature column names
        interaction_feature_cols: list of interaction-level feature column names
        """
        self.interactions_df = interactions_df.reset_index(drop=True)
        self.book_features_df = book_features_df.set_index("book_code")
        self.book_feature_cols = book_feature_cols
        self.interaction_feature_cols = interaction_feature_cols

    def __len__(self):
        return len(self.interactions_df)

    def __getitem__(self, idx):
        row = self.interactions_df.iloc[idx]
        book_code = row["book_code"]

        # --- 1. Get book-level features ---
        book_info = self.book_features_df.loc[book_code]

        theme_ids = book_info["theme_ids"]  # already list[int]
        category_ids = book_info["category_ids"]  # already list[int]
        reading_skill_ids = book_info["reading_skill_ids"]  # already list[int]
        grades_ids = book_info['grades_ids']  # already list[int]
        book_code_ids= book_info['book_code_ids']  # already list[int]

        last_book_ids = row['book_code_ids']
        last_theme_ids = row['theme_ids']
        last_category_ids = row['category_ids']
        last_reading_skills_id = row['reading_skill_ids']

        countries_ids = row['countries_ids']
        states_ids = row['states_ids']
        zipcode_ids = row['zipcode_ids']
        teacher_code_ids = row['teacher_code_ids']
        school_code_ids = row['school_code_ids']

        'countries_ids','states_ids','zipcode_ids','teacher_ids','school_ids'
        

        book_features = np.array(book_info[self.book_feature_cols], dtype=np.float32)

        # --- 2. Get interaction-level features ---
        user_features = np.array(row[self.interaction_feature_cols], dtype=np.float32)

        # # --- 3. Merge into one "other_features" vector ---
        # other_features = torch.tensor(
        #     np.concatenate([book_features, interaction_features]),
        #     dtype=torch.float32
        # )

        return {
            "book_code": book_code,
            "theme_ids": torch.tensor(theme_ids, dtype=torch.long),
            "category_ids": torch.tensor(category_ids, dtype=torch.long),
            "reading_skill_ids":torch.tensor(reading_skill_ids, dtype=torch.long),
            "grades_ids" : torch.tensor(grades_ids , dtype=torch.long) ,
            "book_code_ids": torch.tensor(book_code_ids , dtype=torch.long),

            "last_book_ids" : torch.tensor(last_book_ids , dtype=torch.long),
            "last_theme_ids" : torch.tensor(last_theme_ids , dtype=torch.long),
            "last_category_ids" : torch.tensor(last_category_ids , dtype=torch.long),
            "last_reading_skills_id" : torch.tensor(last_reading_skills_id , dtype=torch.long),

            "countries_ids" : torch.tensor(countries_ids , dtype=torch.long),
            "states_ids" : torch.tensor(states_ids, dtype=torch.long),
            "zipcode_ids" : torch.tensor(zipcode_ids , dtype=torch.long),
            "teacher_code_ids" : torch.tensor(teacher_code_ids , dtype=torch.long),
            "school_code_ids" : torch.tensor(school_code_ids , dtype=torch.long),

            "book_features": torch.tensor(book_features, dtype=torch.float32),
            "user_features": torch.tensor(user_features, dtype=torch.float32),
            "label": torch.tensor(row["label"], dtype=torch.float32)
        }


In [211]:
# from torch.nn.utils.rnn import pad_sequence

# def book_collate_fn(batch):

#     theme_ids = [item["theme_ids"] for item in batch]
#     category_ids = [item["category_ids"] for item in batch]

#     reading_skill_ids = [item["reading_skill_ids"]  for item in batch]
#     grades_ids = [item["grades_ids"]  for item in batch]
#     book_code_ids= [item["book_code_ids"] for item in batch]

#     last_book_ids = [item["last_book_ids"] for item in batch]
#     last_theme_ids = [item["last_theme_ids"] for item in batch]
#     last_category_ids = [item["last_category_ids"] for item in batch]
#     last_reading_skills_id = [item["last_reading_skills_id"] for item in batch]


#     book_features = torch.stack([item["book_features"] for item in batch])
#     user_features = torch.stack([item["user_features"] for item in batch])
#     labels = torch.stack([item["label"] for item in batch])

#     return {
#         "theme_ids": theme_ids,
#         "category_ids": category_ids,
#         "reading_skill_ids" :reading_skill_ids,
#         "grades_ids" :grades_ids,
#         "book_code_ids" : book_code_ids,

#         "last_book_ids" : last_book_ids,
#         "last_theme_ids" : last_theme_ids,
#         "last_category_ids" : last_category_ids,
#         "last_reading_skills_id" :last_reading_skills_id,
#         "book_features": book_features,
#         "user_features": user_features,
#         "labels": labels

#     }


In [226]:
from torch.nn.utils.rnn import pad_sequence
import torch

def book_collate_fn(batch):
    # --------- Helper to pad & mask any list-of-tensors field ----------
    def pad_and_mask(key):
        seqs = [torch.as_tensor(item[key], dtype=torch.long) for item in batch]
        
        padded = pad_sequence(seqs, batch_first=True, padding_value=0)  # [B, max_len]
        mask = (padded != 0).long()  # [B, max_len] boolean mask
        return padded, mask

    # Book-level multi-ID fields
    theme_ids, theme_mask = pad_and_mask("theme_ids")
    category_ids, category_mask = pad_and_mask("category_ids")
    reading_skill_ids, reading_skill_mask = pad_and_mask("reading_skill_ids")
    grades_ids, grades_mask = pad_and_mask("grades_ids")
    book_code_ids, book_code_mask = pad_and_mask("book_code_ids")

    # Interaction-level multi-ID fields
    last_book_ids, last_book_mask = pad_and_mask("last_book_ids")
    last_theme_ids, last_theme_mask = pad_and_mask("last_theme_ids")
    last_category_ids, last_category_mask = pad_and_mask("last_category_ids")
    last_reading_skills_id, last_reading_skills_mask = pad_and_mask("last_reading_skills_id")

    

    countries_ids, countries_mask = pad_and_mask("countries_ids")
    states_ids, states_mask = pad_and_mask("states_ids")
    zipcode_ids, zipcode_mask = pad_and_mask( "zipcode_ids")
    teacher_ids, teacher_mask = pad_and_mask("teacher_code_ids")
    school_ids, school_mask = pad_and_mask("school_code_ids")

    # Scalar / dense features
    book_features = torch.stack([torch.as_tensor(item["book_features"], dtype=torch.float32) for item in batch])
    user_features = torch.stack([torch.as_tensor(item["user_features"], dtype=torch.float32) for item in batch])
    labels = torch.stack([torch.as_tensor(item["label"], dtype=torch.float32) for item in batch])

    return {
        # --- Book-level IDs ---
        "theme_ids": theme_ids, "theme_mask": theme_mask,
        "category_ids": category_ids, "category_mask": category_mask,
        "reading_skill_ids": reading_skill_ids, "reading_skill_mask": reading_skill_mask,
        "grades_ids": grades_ids, "grades_mask": grades_mask,
        "book_code_ids": book_code_ids, "book_code_mask": book_code_mask,

        # --- Interaction-level IDs ---
        "last_book_ids": last_book_ids, "last_book_mask": last_book_mask,
        "last_theme_ids": last_theme_ids, "last_theme_mask": last_theme_mask,
        "last_category_ids": last_category_ids, "last_category_mask": last_category_mask,
        "last_reading_skills_id": last_reading_skills_id, "last_reading_skills_mask": last_reading_skills_mask,

        "countries_ids": countries_ids, "countries_mask": countries_mask,
        "states_ids": states_ids, "states_mask": states_mask,
        "zipcode_ids": zipcode_ids, "zipcode_mask": zipcode_mask,
        "teacher_ids": teacher_ids, "teacher_mask": teacher_mask,
        "school_ids": school_ids, "school_mask": school_mask,

        # --- Dense features & labels ---
       
        "book_features": book_features,
        "user_features": user_features,
        "labels": labels
    }


In [145]:
# def book_collate_fn(batch):
#     return {
#         'user_id': [x['user_id'] for x in batch],
#         'book_code': [x['book_code'] for x in batch],
#         'theme_ids': [x['theme_ids'] for x in batch],
#         'category_ids': [x['category_ids'] for x in batch],
#         'other_features': torch.stack([x['other_features'] for x in batch]),
#         'labels': torch.stack([x['label'] for x in batch])
#     }


In [227]:
from torch.utils.data import DataLoader

dataset = BookInteractionDataset(child_df[:16], item_df, book_feature_cols, interaction_feature_cols)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=book_collate_fn)


In [228]:
for batch in dataloader:
    print(len(batch))
    print(batch.keys())
    break

31
dict_keys(['theme_ids', 'theme_mask', 'category_ids', 'category_mask', 'reading_skill_ids', 'reading_skill_mask', 'grades_ids', 'grades_mask', 'book_code_ids', 'book_code_mask', 'last_book_ids', 'last_book_mask', 'last_theme_ids', 'last_theme_mask', 'last_category_ids', 'last_category_mask', 'last_reading_skills_id', 'last_reading_skills_mask', 'countries_ids', 'countries_mask', 'states_ids', 'states_mask', 'zipcode_ids', 'zipcode_mask', 'teacher_ids', 'teacher_mask', 'school_ids', 'school_mask', 'book_features', 'user_features', 'labels'])


In [174]:
(cat(batch['category_ids']) * batch['category_mask'].unsqueeze(-1))

tensor([[[-0.5784, -0.2894, -0.8126, -0.8563],
         [ 0.0000,  0.0000,  0.0000,  0.0000]],

        [[-1.3898,  0.9079,  1.1671,  0.8939],
         [ 0.0000,  0.0000,  0.0000,  0.0000]],

        [[-0.9718, -1.5750, -0.1490, -0.5427],
         [ 0.0000,  0.0000,  0.0000,  0.0000]],

        [[-0.9718, -1.5750, -0.1490, -0.5427],
         [ 0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 2.2753, -0.4973, -0.0440, -1.4458],
         [ 0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 2.2753, -0.4973, -0.0440, -1.4458],
         [ 0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 2.2753, -0.4973, -0.0440, -1.4458],
         [ 0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 2.2753, -0.4973, -0.0440, -1.4458],
         [-0.5784, -0.2894, -0.8126, -0.8563]]], grad_fn=<MulBackward0>)

In [169]:
cat = nn.Embedding(8, 4, padding_idx=0)

summed = (cat(batch['category_ids']) * batch['category_mask'].unsqueeze(-1)).sum(dim=1)
lengths = batch['category_mask'].sum(dim=1).clamp(min=1).unsqueeze(-1)
summed / lengths

tensor([[-0.5784, -0.2894, -0.8126, -0.8563],
        [-1.3898,  0.9079,  1.1671,  0.8939],
        [-0.9718, -1.5750, -0.1490, -0.5427],
        [-0.9718, -1.5750, -0.1490, -0.5427],
        [ 2.2753, -0.4973, -0.0440, -1.4458],
        [ 2.2753, -0.4973, -0.0440, -1.4458],
        [ 2.2753, -0.4973, -0.0440, -1.4458],
        [ 0.8485, -0.3933, -0.4283, -1.1511]], grad_fn=<DivBackward0>)

In [148]:
torch.stack(batch['category_ids'] )

RuntimeError: stack expects each tensor to be equal size, but got [1] at entry 0 and [2] at entry 7

In [150]:
import torch.nn as nn
cat = nn.Embedding(8, 4)
# self.category_emb(category_ids).mean(dim=1)
cat(batch['category_ids'][6]).mean(dim=0)

tensor([-0.1422, -0.4583,  0.3487, -0.6856], grad_fn=<MeanBackward1>)

In [138]:
cat(batch['category_ids'][6])

tensor([[ 0.2405, -1.4271,  1.1299,  0.6182],
        [-1.0203, -0.5868,  1.1684,  0.3732]], grad_fn=<EmbeddingBackward0>)

-0.3899

In [None]:
# import torch
# import torch.nn as nn

# class BookTower(nn.Module):
#     def __init__(self, 
#                  num_themes, num_categories, num_reading_skills, num_grades, num_books, 
#                  embedding_dim, book_feature_dim):
#         super().__init__()
        
#         # Book ID embeddings
#         self.theme_emb = nn.Embedding(num_themes, embedding_dim)
#         self.category_emb = nn.Embedding(num_categories, embedding_dim)
#         self.reading_skill_emb = nn.Embedding(num_reading_skills, embedding_dim)
#         self.grades_emb = nn.Embedding(num_grades, embedding_dim)
#         self.book_code_emb = nn.Embedding(num_books, embedding_dim)
        
#         # Projection layer
#         self.fc = nn.Sequential(
#             nn.Linear(embedding_dim * 5 + book_feature_dim, 128),
#             nn.ReLU(),
#             nn.Linear(128, embedding_dim)
#         )
    
#     def forward(self, theme_ids, category_ids, reading_skill_ids, grades_ids, book_code_ids, book_features):
#         # Mean-pool if sequences
#         theme_vec = self.theme_emb(theme_ids).mean(dim=1)
#         category_vec = self.category_emb(category_ids).mean(dim=1)
#         reading_skill_vec = self.reading_skill_emb(reading_skill_ids).mean(dim=1)
#         grades_vec = self.grades_emb(grades_ids).mean(dim=1)
#         book_code_vec = self.book_code_emb(book_code_ids).mean(dim=1)
        
#         x = torch.cat([theme_vec, category_vec, reading_skill_vec, grades_vec, book_code_vec, book_features], dim=1)
#         return self.fc(x)


# class UserTower(nn.Module):
#     def __init__(self, num_books, num_themes, num_categories, num_reading_skills, embedding_dim, user_feature_dim):
#         super().__init__()
        
#         # Interaction history embeddings
#         self.last_book_emb = nn.Embedding(num_books, embedding_dim)
#         self.last_theme_emb = nn.Embedding(num_themes, embedding_dim)
#         self.last_category_emb = nn.Embedding(num_categories, embedding_dim)
#         self.last_reading_skill_emb = nn.Embedding(num_reading_skills, embedding_dim)
        
#         self.fc = nn.Sequential(
#             nn.Linear(embedding_dim * 4 + user_feature_dim, 128),
#             nn.ReLU(),
#             nn.Linear(128, embedding_dim)
#         )
    
#     def forward(self, last_book_ids, last_theme_ids, last_category_ids, last_reading_skills_id, user_features):
#         last_book_vec = self.last_book_emb(last_book_ids).mean(dim=1)
#         last_theme_vec = self.last_theme_emb(last_theme_ids).mean(dim=1)
#         last_category_vec = self.last_category_emb(last_category_ids).mean(dim=1)
#         last_reading_skill_vec = self.last_reading_skill_emb(last_reading_skills_id).mean(dim=1)
        
#         x = torch.cat([last_book_vec, last_theme_vec, last_category_vec, last_reading_skill_vec, user_features], dim=1)
#         return self.fc(x)


# class TwoTowerModel(nn.Module):
#     def __init__(self, 
#                  num_themes, num_categories, num_reading_skills, num_grades, num_books, 
#                  embedding_dim, book_feature_dim, user_feature_dim):
#         super().__init__()
        
#         self.book_tower = BookTower(num_themes, num_categories, num_reading_skills, num_grades, num_books, embedding_dim, book_feature_dim)
#         self.user_tower = UserTower(num_books, num_themes, num_categories, num_reading_skills, embedding_dim, user_feature_dim)
    
#     def forward(self, batch):
#         book_vec = self.book_tower(
#             batch["theme_ids"], batch["category_ids"], batch["reading_skill_ids"], 
#             batch["grades_ids"], batch["book_code_ids"], batch["book_features"]
#         )
        
#         user_vec = self.user_tower(
#             batch["last_book_ids"], batch["last_theme_ids"], batch["last_category_ids"], 
#             batch["last_reading_skills_id"], batch["user_features"]
#         )
        
#         # Similarity score
#         scores = (user_vec * book_vec).sum(dim=1)
#         return scores


In [191]:
book_feature_count =  {'themes_count':len(theme_to_idx), 
                   'book_count': len(book_code_to_idx), 
                   'grade_count': len(grades_to_idx),
                   'reading_skills_count':len(reading_skill_to_idx),
                   'category_count':len(category_to_idx)
                   }
emb_count = {
                'themes_count':8, 
                'book_count': 16, 
                'grade_count': 4,
                'reading_skills_count':4,
                'category_count':4

}

In [187]:
user_feature_count =  {'themes_count':len(theme_to_idx), 
                   'book_count': len(book_code_to_idx), 
                   'reading_skills_count':len(reading_skill_to_idx),
                   'category_count':len(category_to_idx),
                   'country_count': len(country_to_idx) , 
                    'state_count': len(state_to_idx),
                    'zipcode_count': len(zipcode_to_idx),
                    'teacher_count': len(teacher_to_idx),
                    'school_count': len(school_to_idx)
                   }

user_feature_count

{'themes_count': 108,
 'book_count': 12190,
 'reading_skills_count': 13,
 'category_count': 9,
 'country_count': 159,
 'state_count': 562,
 'zipcode_count': 16065,
 'teacher_count': 113348,
 'school_count': 85547}

In [190]:
user_feature_count =  {'themes_count':len(theme_to_idx), 
                   'book_count': len(book_code_to_idx), 
                   'reading_skills_count':len(reading_skill_to_idx),
                   'category_count':len(category_to_idx),
                   'country_count': len(country_to_idx) , 
                    'state_count': len(state_to_idx),
                    'zipcode_count': len(zipcode_to_idx),
                    'teacher_count': len(teacher_to_idx),
                    'school_count': len(school_to_idx)
                   }
user_emb_count = {
                'themes_count':8, 
                'book_count': 16, 
                'reading_skills_count':4,
                'category_count':4,
                'country_count': 8 , 
                'state_count': 10,
                'zipcode_count': 14,
                'teacher_count': 16,
                'school_count': 16

}


In [177]:
book_feature_count

{'themes_count': 108,
 'book_count': 12190,
 'grade_count': 11,
 'reading_skills_count': 13,
 'category_count': 9}

In [243]:
import torch
import torch.nn as nn

def masked_mean(embeddings, mask):
    """
    embeddings: [B, L, D]
    mask: [B, L] (1 where valid, 0 where padded)
    Returns: [B, D]
    """
    summed = (embeddings * mask.unsqueeze(-1)).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1).unsqueeze(-1)
    return summed / counts


class BookTower(nn.Module):
    def __init__(self, book_feature_count,emb_count, book_feature_dim):
        super().__init__()
        
        # Book ID embeddings
        self.theme_emb = nn.Embedding(book_feature_count['themes_count'], emb_count['themes_count'] , padding_idx=0)
        self.category_emb = nn.Embedding(book_feature_count['category_count'], emb_count['category_count'] , padding_idx=0)
        self.reading_skill_emb = nn.Embedding(book_feature_count['reading_skills_count'], emb_count['reading_skills_count'] , padding_idx=0)
        self.grades_emb = nn.Embedding(book_feature_count['grade_count'], emb_count['grade_count'] , padding_idx=0)
        self.book_code_emb = nn.Embedding(book_feature_count['book_count'], emb_count['book_count'] , padding_idx=0)
        
        self.embedding_dim = emb_count['themes_count'] + emb_count['category_count'] + emb_count['reading_skills_count'] + emb_count['grade_count'] + emb_count['book_count']
        # Projection layer
        self.fc = nn.Sequential(
            nn.Linear(self.embedding_dim  + book_feature_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64)
        )
    
    def forward(self, theme_ids, theme_mask, 
                category_ids, category_mask, 
                reading_skill_ids, reading_skill_mask, 
                grades_ids, grades_mask, 
                book_code_ids, book_code_mask, 
                book_features):
        
        theme_vec = masked_mean(self.theme_emb(theme_ids), theme_mask)
        category_vec = masked_mean(self.category_emb(category_ids), category_mask)
        reading_skill_vec = masked_mean(self.reading_skill_emb(reading_skill_ids), reading_skill_mask)
        grades_vec = masked_mean(self.grades_emb(grades_ids), grades_mask)
        book_code_vec = masked_mean(self.book_code_emb(book_code_ids), book_code_mask)
        
        x = torch.cat([theme_vec, category_vec, reading_skill_vec, grades_vec, book_code_vec, book_features], dim=1)
        return self.fc(x)


class UserTower(nn.Module):
    def __init__(self, user_feature_count, user_emb_count, user_feature_dim):
        super().__init__()

        # All categorical features as embeddings
        self.last_book_emb = nn.Embedding(user_feature_count['book_count'], user_emb_count['book_count'], padding_idx=0)
        self.last_theme_emb = nn.Embedding(user_feature_count['themes_count'], user_emb_count['themes_count'], padding_idx=0)
        self.last_category_emb = nn.Embedding(user_feature_count['category_count'], user_emb_count['category_count'], padding_idx=0)
        self.last_reading_skill_emb = nn.Embedding(user_feature_count['reading_skills_count'], emb_count['reading_skills_count'], padding_idx=0)

        self.country_emb = nn.Embedding(user_feature_count['country_count'], user_emb_count['country_count'], padding_idx=0)
        self.state_emb = nn.Embedding(user_feature_count['state_count'], user_emb_count['state_count'], padding_idx=0)
        self.zipcode_emb = nn.Embedding(user_feature_count['zipcode_count'], user_emb_count['zipcode_count'], padding_idx=0)
        self.teacher_emb = nn.Embedding(user_feature_count['teacher_count'], user_emb_count['teacher_count'], padding_idx=0)
        self.school_emb = nn.Embedding(user_feature_count['school_count'], user_emb_count['school_count'], padding_idx=0)

        # Compute total embedding dim dynamically
        total_emb_dim = sum(user_emb_count.values())

        self.fc = nn.Sequential(
            nn.Linear(total_emb_dim + user_feature_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64)
        )

    def forward(self,
                last_book_ids, last_book_mask,
                last_theme_ids, last_theme_mask,
                last_category_ids, last_category_mask,
                last_reading_skills_id, last_reading_skills_mask,
                country_ids, country_mask,
                state_ids, state_mask,
                zipcode_ids, zipcode_mask,
                teacher_ids, teacher_mask,
                school_ids, school_mask,
                user_features):

        # Apply masked mean pooling to all categorical features
        last_book_vec = masked_mean(self.last_book_emb(last_book_ids), last_book_mask)
        last_theme_vec = masked_mean(self.last_theme_emb(last_theme_ids), last_theme_mask)
        last_category_vec = masked_mean(self.last_category_emb(last_category_ids), last_category_mask)
        last_reading_skill_vec = masked_mean(self.last_reading_skill_emb(last_reading_skills_id), last_reading_skills_mask)

        country_vec = masked_mean(self.country_emb(country_ids), country_mask)
        state_vec = masked_mean(self.state_emb(state_ids), state_mask)
        zipcode_vec = masked_mean(self.zipcode_emb(zipcode_ids), zipcode_mask)
        teacher_vec = masked_mean(self.teacher_emb(teacher_ids), teacher_mask)
        school_vec = masked_mean(self.school_emb(school_ids), school_mask)

        x = torch.cat([
            last_book_vec, last_theme_vec, last_category_vec, last_reading_skill_vec,
            country_vec, state_vec, zipcode_vec, teacher_vec, school_vec,
            user_features
        ], dim=1)

        return self.fc(x)


class TwoTowerModel(nn.Module):
    def __init__(self,
                 book_feature_count, user_feature_count,
                 emb_count, user_emb_count, book_feature_dim, user_feature_dim):
        super().__init__()

        self.book_tower = BookTower(book_feature_count, emb_count, book_feature_dim)
        self.user_tower = UserTower(user_feature_count, user_emb_count, user_feature_dim)

    def forward(self, batch):
        book_vec = self.book_tower(
            batch["theme_ids"], batch["theme_mask"],
            batch["category_ids"], batch["category_mask"],
            batch["reading_skill_ids"], batch["reading_skill_mask"],
            batch["grades_ids"], batch["grades_mask"],
            batch["book_code_ids"], batch["book_code_mask"],
            batch["book_features"]
        )

        user_vec = self.user_tower(
            batch["last_book_ids"], batch["last_book_mask"],
            batch["last_theme_ids"], batch["last_theme_mask"],
            batch["last_category_ids"], batch["last_category_mask"],
            batch["last_reading_skills_id"], batch["last_reading_skills_mask"],
            batch["countries_ids"], batch["countries_mask"],
            batch["states_ids"], batch["states_mask"],
            batch["zipcode_ids"], batch["zipcode_mask"],
            batch["teacher_ids"], batch["teacher_mask"],
            batch["school_ids"], batch["school_mask"],
            batch["user_features"]
        )

        # Dot product similarity
        scores = (user_vec * book_vec).sum(dim=1)
        return scores


In [265]:
for batch in dataloader:
    theme_ids = batch['theme_ids']
    category_ids = batch['category_ids']
    # other_features = batch['other_features']
    labels = batch['labels']

    # book_emb = book_model(theme_ids, category_ids, other_features)

    # If using two-tower: get user_emb from user_tower
    # dot_product = (book_emb * user_emb).sum(dim=1)

    # Then: compute loss (e.g., BCEWithLogitsLoss), backward, optimizer.step()


In [267]:
batch.keys()

dict_keys(['theme_ids', 'theme_mask', 'category_ids', 'category_mask', 'reading_skill_ids', 'reading_skill_mask', 'grades_ids', 'grades_mask', 'book_code_ids', 'book_code_mask', 'last_book_ids', 'last_book_mask', 'last_theme_ids', 'last_theme_mask', 'last_category_ids', 'last_category_mask', 'last_reading_skills_id', 'last_reading_skills_mask', 'countries_ids', 'countries_mask', 'states_ids', 'states_mask', 'zipcode_ids', 'zipcode_mask', 'teacher_ids', 'teacher_mask', 'school_ids', 'school_mask', 'book_features', 'user_features', 'labels'])

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import os
import copy
from tqdm import tqdm
import numpy as np


def recall_at_k(pred_ranks, k):
    """ pred_ranks: list of ranks for the true positive item (1 = best) """
    return np.mean([1 if r <= k else 0 for r in pred_ranks])


def ndcg_at_k(pred_ranks, k):
    """ NDCG@K: Normalized Discounted Cumulative Gain """
    return np.mean([1 / np.log2(r + 1) if r <= k else 0 for r in pred_ranks])


def train_two_tower(model,
                    train_loader,
                    val_loader,
                    epochs=10,
                    lr=1e-3,
                    weight_decay=1e-5,
                    patience=3,
                    device="cuda" if torch.cuda.is_available() else "cpu",
                    checkpoint_dir="checkpoints",
                    checkpoint_name="two_tower_best.pt",
                    eval_k_list=[5, 10]):

    os.makedirs(checkpoint_dir, exist_ok=True)

    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.BCEWithLogitsLoss()

    best_val_loss = float("inf")
    best_model_wts = copy.deepcopy(model.state_dict())
    patience_counter = 0

    for epoch in range(1, epochs + 1):
        print(f"\nEpoch {epoch}/{epochs}")
        print("-" * 30)

        # ----------------
        # Train
        # ----------------
        model.train()
        train_loss = 0.0
        for batch in tqdm(train_loader, desc="Training"):
            batch = {k: v.to(device) if torch.is_tensor(v) else v for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(batch)
            loss = criterion(outputs, batch["labels"].float())
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * batch["labels"].size(0)
        train_loss /= len(train_loader.dataset)

        # ----------------
        # Validation
        # ----------------
        model.eval()
        val_loss = 0.0
        user_vecs = []
        book_vecs = []
        labels_list = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                batch = {k: v.to(device) if torch.is_tensor(v) else v for k, v in batch.items()}
                outputs = model(batch)
                loss = criterion(outputs, batch["labels"].float())
                val_loss += loss.item() * batch["labels"].size(0)

                # Store embeddings for retrieval metrics
                u_vec = model.user_tower(
                    batch["last_book_ids"], batch["last_book_mask"],
                    batch["last_theme_ids"], batch["last_theme_mask"],
                    batch["last_category_ids"], batch["last_category_mask"],
                    batch["last_reading_skills_id"], batch["last_reading_skills_mask"],
                    batch["user_features"]
                )
                b_vec = model.book_tower(
                    batch["last_book_ids"], batch["last_book_mask"],
                    batch["last_theme_ids"], batch["last_theme_mask"],
                    batch["last_category_ids"], batch["last_category_mask"],
                    batch["last_reading_skills_id"], batch["last_reading_skills_mask"],
                    batch["countries_ids"], batch["countries_mask"],
                    batch["states_ids"], batch["states_mask"],
                    batch["zipcode_ids"], batch["zipcode_mask"],
                    batch["teacher_ids"], batch["teacher_mask"],
                    batch["school_ids"], batch["school_mask"],
                    batch["user_features"]
                )
                user_vecs.append(u_vec.cpu())
                book_vecs.append(b_vec.cpu())
                labels_list.append(batch["labels"].cpu())

        val_loss /= len(val_loader.dataset)

        # Retrieval metrics
        user_vecs = torch.cat(user_vecs)
        book_vecs = torch.cat(book_vecs)
        labels_list = torch.cat(labels_list)

        sim_matrix = torch.matmul(user_vecs, book_vecs.T)  # [num_users, num_books]

        pred_ranks = []
        for i in range(sim_matrix.size(0)):
            # Higher score = better rank
            scores = sim_matrix[i]
            true_idx = (labels_list[i] == 1).nonzero(as_tuple=True)[0]
            if len(true_idx) == 0:
                continue
            sorted_indices = torch.argsort(scores, descending=True)
            rank = (sorted_indices == true_idx[0]).nonzero(as_tuple=True)[0].item() + 1
            pred_ranks.append(rank)

        metrics = {}
        for k in eval_k_list:
            metrics[f"Recall@{k}"] = recall_at_k(pred_ranks, k)
            metrics[f"NDCG@{k}"] = ndcg_at_k(pred_ranks, k)

        # Logging
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
        for k in eval_k_list:
            print(f"Recall@{k}: {metrics[f'Recall@{k}']:.4f} | NDCG@{k}: {metrics[f'NDCG@{k}']:.4f}")

        # ----------------
        # Early stopping & checkpoint
        # ----------------
        if val_loss < best_val_loss:
            print(f"✅ Validation loss improved from {best_val_loss:.4f} → {val_loss:.4f}. Saving model...")
            best_val_loss = val_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(best_model_wts, os.path.join(checkpoint_dir, checkpoint_name))
            patience_counter = 0
        else:
            patience_counter += 1
            print(f"⏳ No improvement. Patience: {patience_counter}/{patience}")
            if patience_counter >= patience:
                print("⛔ Early stopping triggered.")
                break

    # Load best weights
    model.load_state_dict(best_model_wts)
    return model


In [260]:
for i  in trainloader:
    print(i.keys())
    break

dict_keys(['theme_ids', 'theme_mask', 'category_ids', 'category_mask', 'reading_skill_ids', 'reading_skill_mask', 'grades_ids', 'grades_mask', 'book_code_ids', 'book_code_mask', 'last_book_ids', 'last_book_mask', 'last_theme_ids', 'last_theme_mask', 'last_category_ids', 'last_category_mask', 'last_reading_skills_id', 'last_reading_skills_mask', 'countries_ids', 'countries_mask', 'states_ids', 'states_mask', 'zipcode_ids', 'zipcode_mask', 'teacher_ids', 'teacher_mask', 'school_ids', 'school_mask', 'book_features', 'user_features', 'labels'])


In [261]:
from torch.utils.data import DataLoader

dataset = BookInteractionDataset(child_df[:16], item_df, book_feature_cols, interaction_feature_cols)
trainloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=book_collate_fn)

dataset = BookInteractionDataset(child_df[23:41], item_df, book_feature_cols, interaction_feature_cols)
valloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=book_collate_fn)

In [262]:
model = TwoTowerModel(book_feature_count, user_feature_count,
                 emb_count, user_emb_count,
                  book_feature_dim=len(book_feature_cols), user_feature_dim=len(interaction_feature_cols))

In [263]:
model = train_two_tower(
    model=model,
    train_loader=trainloader,      # Your PyTorch DataLoader for training
    val_loader=valloader,          # Your PyTorch DataLoader for validation
    epochs=2,                      # Number of epochs
    lr=1e-3,                        # Learning rate
    weight_decay=1e-5,              # L2 regularization
    patience=5,                     # Early stopping patience                  # "cuda" or "cpu"
    checkpoint_dir="./checkpoints", # Where to save the best model
    checkpoint_name="two_tower_best.pt",
    eval_k_list=[3, 5]              # Compute Recall@5, Recall@10, NDCG@5, NDCG@10
)


Epoch 1/2
------------------------------


Training: 100%|██████████| 2/2 [00:00<00:00, 16.61it/s]
Validation:   0%|          | 0/3 [00:00<?, ?it/s]


TypeError: UserTower.forward() missing 10 required positional arguments: 'country_mask', 'state_ids', 'state_mask', 'zipcode_ids', 'zipcode_mask', 'teacher_ids', 'teacher_mask', 'school_ids', 'school_mask', and 'user_features'