In [36]:
import pandas as pd
import joblib 
import os
from scipy.sparse import coo_matrix, save_npz

##### Define reusable functions

In [37]:
def save_artifact(artifact, artifact_name, artifact_dir='../recommender_app/artifacts/'):
    """Persist ``artifact`` to ``<artifact_dir>/<artifact_name>.pkl`` using joblib.

    Parameters
    ----------
    artifact : object
        Object handed to ``joblib.dump``.
    artifact_name : str
        File name without extension; ``.pkl`` is appended.
    artifact_dir : str, optional
        Target directory; created (including parents) when it does not exist yet.

    Prints the path of the written file and returns nothing.
    """
    # Make sure the target directory is there before writing into it
    os.makedirs(artifact_dir, exist_ok=True)

    target_file = "{}.pkl".format(artifact_name)
    destination = os.path.join(artifact_dir, target_file)

    joblib.dump(artifact, destination)
    print("Artifact saved to {}".format(destination))

##### Load <code>interactions.csv</code>

In [38]:
# Read the cleaned interaction records and preview the first five rows
df_interactions = pd.read_csv(
    "../data/cleaned_data/interactions.csv",
)
df_interactions.head(n=5)

Unnamed: 0,user_id,context_device_type,story_id,user_duration,user_preferred_language,user_region,lang_id
0,76bbb56c-b657-4973-ad9a-9cdcff38a2ef,android,9639,61.0,nl,NL,2
1,8af25ada-d0be-4edf-a41f-ddd4dae82996,android,6715,63.411765,de,DE,5
2,8af25ada-d0be-4edf-a41f-ddd4dae82996,android,6721,54.0,de,DE,5
3,76bbb56c-b657-4973-ad9a-9cdcff38a2ef,android,5344,49.176471,nl,NL,2
4,76bbb56c-b657-4973-ad9a-9cdcff38a2ef,android,5343,13.0,nl,NL,2


##### Load <code>story_categories_v1.csv</code>

In [39]:
# Read the per-story category ids and preview the first five rows
df_story_categories = pd.read_csv(
    "../data/cleaned_data/story_categories_v1.csv",
)
df_story_categories.head(n=5)

Unnamed: 0,story_id,generic_category_id,basic_category_id,difficulty_id
0,5,2,,
1,6,2,,
2,7,2,,
3,8,2,,
4,9,2,,


In [40]:
# Attach each story's generic category to its interactions (left join keeps every interaction)
df_interactions = df_interactions.merge(
    df_story_categories[['story_id', 'generic_category_id']],
    how='left',
    on='story_id',
)

# Exclude info stories (generic category 0), since they will not be part of the recommendation system.
# NOTE(review): stories without an entry in df_story_categories get NaN here and pass this filter.
df_interactions = df_interactions.loc[df_interactions['generic_category_id'] != 0]

# The category was only needed for filtering, so remove it again
df_interactions = df_interactions.drop(columns=['generic_category_id'])

In [41]:
# Inspect row count, dtypes and non-null counts of the filtered interactions
df_interactions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 350351 entries, 0 to 486120
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   user_id                  350351 non-null  object 
 1   context_device_type      350351 non-null  object 
 2   story_id                 350351 non-null  int64  
 3   user_duration            350351 non-null  float64
 4   user_preferred_language  350351 non-null  object 
 5   user_region              350351 non-null  object 
 6   lang_id                  350351 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 21.4+ MB


In [42]:
# Distinct user ids in ascending order
user_ids = sorted(df_interactions['user_id'].unique())

# Give every user id a consecutive internal LightFM index (0..n-1) following that order
user_id_mapping = dict(zip(user_ids, range(len(user_ids))))

##### Load <code>tour_stories_v1.csv</code>

In [43]:
# Read the story / tour item / tour table and preview the first five rows
df_tour_stories = pd.read_csv(
    "../data/cleaned_data/tour_stories_v1.csv",
)
df_tour_stories.head(n=5)

Unnamed: 0,story_id,story_title,tour_item_id,tour_item_name,tour_id,tour_name
0,5,Lambros Eftaxias (1915-1996),512,Lambros Eftaxias,1,King Otto's first palace
1,347,His career,512,Lambros Eftaxias,1,King Otto's first palace
2,348,The collector,512,Lambros Eftaxias,1,King Otto's first palace
3,349,Donations to Museums,512,Lambros Eftaxias,1,King Otto's first palace
4,350,The Friends of Music Association (1953),512,Lambros Eftaxias,1,King Otto's first palace


In [44]:
# Merge tour stories dataframe with story categories dataframe.
# Category columns left over from an earlier run of this cell are dropped first:
# without that, re-running the cell would merge them a second time, pandas would
# suffix them (generic_category_id_x / generic_category_id_y) and the filter below
# would fail with a KeyError. On a fresh dataframe the drop is a no-op.
df_tour_stories = (
    df_tour_stories
    .drop(columns=df_story_categories.columns.drop('story_id'), errors='ignore')
    .merge(df_story_categories, on='story_id', how='left')
)

# Exclude info stories (generic category 0)
df_tour_stories = df_tour_stories[df_tour_stories['generic_category_id'] != 0]

In [45]:
# Inspect row count, dtypes and non-null counts after merging categories and removing info stories
df_tour_stories.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16921 entries, 0 to 19771
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   story_id             16921 non-null  int64  
 1   story_title          16921 non-null  object 
 2   tour_item_id         16921 non-null  int64  
 3   tour_item_name       16921 non-null  object 
 4   tour_id              16921 non-null  int64  
 5   tour_name            16921 non-null  object 
 6   generic_category_id  16921 non-null  int64  
 7   basic_category_id    257 non-null    float64
 8   difficulty_id        5772 non-null   float64
dtypes: float64(2), int64(4), object(3)
memory usage: 1.3+ MB


In [46]:
# Distinct story ids in ascending order; these are the items of the recommender
item_ids = sorted(df_tour_stories['story_id'].unique())

# Give every story id a consecutive internal LightFM index (0..n-1) following that order
item_id_mapping = {item_ids[position]: position for position in range(len(item_ids))}

In [47]:
# Persist both id mappings into the default artifact directory
save_artifact(artifact=user_id_mapping, artifact_name="user_id_mapping")
save_artifact(artifact=item_id_mapping, artifact_name="item_id_mapping")

Artifact saved to ../recommender_app/artifacts/user_id_mapping.pkl
Artifact saved to ../recommender_app/artifacts/item_id_mapping.pkl


### Build interactions matrix

##### Load <code>story_lang_content_v1.csv</code>

In [48]:
# Read the per-(story, language) reading time / track duration table and preview the first five rows
df_story_lang_content = pd.read_csv(
    "../data/cleaned_data/story_lang_content_v1.csv",
)
df_story_lang_content.head(n=5)

Unnamed: 0,story_id,lang_id,story_reading_time,track_duration
0,5,1,18.0,
1,6,1,30.0,
2,7,1,31.0,
3,8,1,13.0,
4,9,1,27.0,


In [49]:
# Calculate total listening time per (user, story, language). The language is kept at this
# stage because the story length is looked up per (story_id, lang_id) below.
df_agg_interactions = (
    df_interactions
    .groupby(['user_id', 'story_id', 'lang_id'], as_index=False)['user_duration']
    .sum()
    .merge(df_story_lang_content, on=['story_id', 'lang_id'], how='left')
)

df_agg_interactions = (
    df_agg_interactions
    .assign(
        # Use story audio track duration as story length when available, otherwise use story reading time
        story_length=lambda d: d['track_duration'].combine_first(d['story_reading_time']),
        # Calculate story completion ratio, clipped to [0, 1]
        completion_ratio=lambda d: (d['user_duration'] / d['story_length']).clip(lower=0.0, upper=1.0),
    )
    # Collapse to exactly one row per (user, story). A user who listened to the same story in
    # several languages would otherwise appear more than once, and duplicate (user, story)
    # entries are summed when the sparse interactions matrix is built, pushing the value
    # above 1. The best completion across languages is kept.
    .groupby(['user_id', 'story_id'], as_index=False)['completion_ratio']
    .max()
)

df_agg_interactions.head()


Unnamed: 0,user_id,story_id,completion_ratio
0,00021015-59DD-4051-8B2B-E9599FAF445C,18089,1.0
1,00030D04-7CF1-4235-8A2F-227CDCDAE12F,5345,1.0
2,00030D04-7CF1-4235-8A2F-227CDCDAE12F,5350,0.09434
3,00030D04-7CF1-4235-8A2F-227CDCDAE12F,6713,1.0
4,00030D04-7CF1-4235-8A2F-227CDCDAE12F,6720,1.0


In [50]:
# Inspect row count, dtypes and non-null counts of the aggregated interactions
df_agg_interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286093 entries, 0 to 286092
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   user_id           286093 non-null  object 
 1   story_id          286093 non-null  int64  
 2   completion_ratio  286093 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.5+ MB


In [51]:
# Summary statistics of the completion ratio (quick check of its range and distribution)
df_agg_interactions['completion_ratio'].describe()

count    286093.000000
mean          0.895094
std           0.254378
min           0.049020
25%           1.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: completion_ratio, dtype: float64

In [52]:
# Map actual ids to internal LightFM indices. Ids that are missing from a mapping become NaN.
row = df_agg_interactions['user_id'].map(user_id_mapping)
col = df_agg_interactions['story_id'].map(item_id_mapping)

# The item mapping is built from df_tour_stories while the interactions come from a
# different table, so an interaction may reference a story without an internal index.
# NaN is not a valid sparse index, so such rows are skipped (and reported) instead of
# breaking or corrupting the matrix construction.
mappable = row.notna() & col.notna()
if not mappable.all():
    print(f"Skipping {int((~mappable).sum())} interactions whose user or story has no internal index")

row = row[mappable].astype('int64')
col = col[mappable].astype('int64')
data = df_agg_interactions.loc[mappable, 'completion_ratio']

# Create interactions matrix as sparse COO matrix and convert to CSR for efficiency.
# NOTE: duplicate (row, col) entries are summed by the COO -> CSR conversion.
interactions_matrix = coo_matrix(
    (data.to_numpy(), (row.to_numpy(), col.to_numpy())),
    shape=(len(user_id_mapping), len(item_id_mapping)),
).tocsr()

### Build item features matrix

In [53]:
# Re-inspect the df_tour_stories columns that are available for building item features
df_tour_stories.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16921 entries, 0 to 19771
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   story_id             16921 non-null  int64  
 1   story_title          16921 non-null  object 
 2   tour_item_id         16921 non-null  int64  
 3   tour_item_name       16921 non-null  object 
 4   tour_id              16921 non-null  int64  
 5   tour_name            16921 non-null  object 
 6   generic_category_id  16921 non-null  int64  
 7   basic_category_id    257 non-null    float64
 8   difficulty_id        5772 non-null   float64
dtypes: float64(2), int64(4), object(3)
memory usage: 1.3+ MB


#### Load csv files with necessary story metadata

In [55]:
# Generic category metadata; joined to the item features on generic_category_id in the following cell
df_generic_category = pd.read_csv("../data/cleaned_data/generic_categories_v1.csv")

In [None]:
# Keep the id columns that describe a story, then attach the generic category metadata
# (left join keeps every story row)
df_item_features = (
    df_tour_stories
    .loc[:, ['story_id', 'tour_id', 'generic_category_id', 'basic_category_id', 'difficulty_id']]
    .merge(df_generic_category, how='left', on='generic_category_id')
)



KeyError: "[''] not in index"