In [2]:
import os
# Move from the notebook folder up to the project root so that the relative
# "data/..." paths used by later cells resolve.
# The guard makes this cell idempotent: an unconditional os.chdir("..") climbs
# one more directory every time the cell is re-run, which silently breaks the
# relative paths. NOTE(review): assumes the project root is the folder that
# contains "data/" - confirm for your layout.
if not os.path.isdir("data"):
    os.chdir("..")
os.getcwd()

'd:\\Courses\\DataScience\\AI_Librarian\\Librarian_Recommender_DSML'

In [23]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tqdm.auto import tqdm
tqdm.pandas()

# --- Read the three input CSV files (paths are relative to the working directory) ---
interactions = pd.read_csv("data/interactions_train.csv")
items = pd.read_csv("data/items.csv")
sample_sub = pd.read_csv("data/sample_submission.csv")

# Report the (rows, columns) shape of every table
for label, frame in [
    ("Interactions:", interactions),
    ("Items:", items),
    ("Sample submission:", sample_sub),
]:
    print(label, frame.shape)

# Preview the first rows of each table, in the same order as above
display(interactions.head(), items.head(), sample_sub.head())


Interactions: (87047, 3)
Items: (15291, 6)
Sample submission: (7838, 2)


Unnamed: 0,u,i,t
0,4456,8581,1687541000.0
1,142,1964,1679585000.0
2,362,3705,1706872000.0
3,1809,11317,1673533000.0
4,4384,1323,1681402000.0


Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,i
0,Classification décimale universelle : édition ...,,9782871303336; 2871303339,Ed du CEFAL,Classification décimale universelle; Indexatio...,0
1,Les interactions dans l'enseignement des langu...,"Cicurel, Francine, 1947-",9782278058327; 2278058320,Didier,didactique--langue étrangère - enseignement; d...,1
2,Histoire de vie et recherche biographique : pe...,,2343190194; 9782343190198,L'Harmattan,Histoires de vie en sociologie; Sciences socia...,2
3,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain, 1980-",9782365350020; 236535002X; 9782365350488; 2365...,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,3
4,Les années glorieuses : roman /,"Lemaitre, Pierre, 1951-",9782702180815; 2702180817; 9782702183618; 2702...,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,4


Unnamed: 0,user_id,recommendation
0,0,3758 11248 9088 9895 5101 6074 9295 14050 1096...
1,1,3263 726 1589 14911 6432 10897 6484 7961 8249 ...
2,2,13508 9848 12244 2742 11120 2893 2461 5439 116...
3,3,2821 10734 6357 5934 2085 12608 12539 10551 10...
4,4,12425 219 11602 1487 14178 489 13888 2110 4413...


In [24]:
# Replace missing metadata with empty strings so the concatenation below
# never produces NaN
for text_col in ("Title", "Author", "Subjects"):
    items[text_col] = items[text_col].fillna("")

# One free-text document per item: "<Title> <Author> <Subjects>"
items["text"] = items["Title"].astype(str).str.cat(
    [items["Author"].astype(str), items["Subjects"].astype(str)],
    sep=" ",
)

items[["i", "Title", "Author", "Subjects", "text"]].head()


Unnamed: 0,i,Title,Author,Subjects,text
0,0,Classification décimale universelle : édition ...,,Classification décimale universelle; Indexatio...,Classification décimale universelle : édition ...
1,1,Les interactions dans l'enseignement des langu...,"Cicurel, Francine, 1947-",didactique--langue étrangère - enseignement; d...,Les interactions dans l'enseignement des langu...
2,2,Histoire de vie et recherche biographique : pe...,,Histoires de vie en sociologie; Sciences socia...,Histoire de vie et recherche biographique : pe...
3,3,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain, 1980-",Moyen-Orient; Bandes dessinées autobiographiqu...,Ce livre devrait me permettre de résoudre le c...
4,4,Les années glorieuses : roman /,"Lemaitre, Pierre, 1951-",France--1945-1975; Roman historique; Roman fra...,"Les années glorieuses : roman / Lemaitre, Pier..."


In [25]:
# Order the catalogue by item ID and renumber the rows, so that "row position"
# is well defined for everything built from `items` afterwards
items = items.sort_values(by="i").reset_index(drop=True)

# Lookup table: item ID -> row position in the sorted items dataframe
item_id_to_pos = dict(zip(items["i"].values, range(len(items))))

# Attach the item's row position to every interaction
# (IDs absent from the lookup become NaN and are counted below)
interactions["item_pos"] = interactions["i"].map(item_id_to_pos)

print("Missing item positions:", interactions["item_pos"].isna().sum())
interactions.head()


Missing item positions: 0


Unnamed: 0,u,i,t,item_pos
0,4456,8581,1687541000.0,8581
1,142,1964,1679585000.0,1964
2,362,3705,1706872000.0,3705
3,1809,11317,1673533000.0,11317
4,4384,1323,1681402000.0,1323


In [26]:
# TF-IDF over the combined item text:
#   min_df=2        -> drop very rare words
#   max_df=0.9      -> drop extremely common words
#   stop_words=None -> texts are mostly French, so no English stop-word list
tfidf = TfidfVectorizer(min_df=2, max_df=0.9, stop_words=None)

# Rows follow the current row order of `items`
tfidf_matrix = tfidf.fit_transform(items["text"])
tfidf_matrix.shape


(15291, 14863)

In [27]:
def get_user_profile(user_id):
    """Build the TF-IDF profile of a user as the mean of their items' vectors.

    Reads the module-level ``interactions`` dataframe (columns ``u`` and
    ``item_pos``) and the module-level ``tfidf_matrix``.

    Parameters
    ----------
    user_id
        Value matched against ``interactions["u"]``.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(1, n_features)``, or ``None`` when the user has no
        interaction with a known item position.
    """
    # Row positions (into tfidf_matrix) of everything this user interacted with;
    # interactions without a mapped position are ignored
    is_user_row = interactions["u"] == user_id
    positions = interactions.loc[is_user_row, "item_pos"].dropna().astype(int).values

    if positions.size == 0:
        return None

    # Column-wise mean over the user's item vectors
    mean_vector = tfidf_matrix[positions].mean(axis=0)

    # Normalise whatever the mean returned into a plain (1, n_features) ndarray
    return np.asarray(mean_vector).reshape(1, -1)



In [28]:
def recommend_tfidf(user_id, top_k=10):
    """
    Recommend up to top_k unseen items for user_id using TF-IDF content-based
    filtering.

    The user's profile (from ``get_user_profile``) is compared by cosine
    similarity with every row of the module-level ``tfidf_matrix``; items the
    user already interacted with are excluded.

    Parameters
    ----------
    user_id
        Value matched against ``interactions["u"]``.
    top_k : int, default 10
        Maximum number of items to return. Values <= 0 yield an empty list.

    Returns
    -------
    list
        Item IDs (column 'i' of ``items``), best match first. The list is
        empty for a cold-start user and may be shorter than ``top_k`` when
        fewer than ``top_k`` unseen items exist.
    """
    # Without this guard, `order[-0:]` below would select *every* item
    # instead of none.
    if top_k <= 0:
        return []

    user_profile = get_user_profile(user_id)

    # Cold start: no interactions in training
    if user_profile is None:
        return []

    # Cosine similarity between user profile and all items: 1 x n_items -> (n_items,)
    scores = cosine_similarity(np.asarray(user_profile), tfidf_matrix).ravel()

    # Push already-seen items to the very bottom of the ranking
    seen_pos = interactions.loc[interactions["u"] == user_id, "item_pos"].dropna().astype(int).values
    scores[seen_pos] = -np.inf

    # Sort ascending, take the last k, reverse -> best first
    top_indices = np.argsort(scores)[-top_k:][::-1]

    # When top_k exceeds the number of unseen items, the slice above reaches
    # into the seen items; drop them explicitly so they are never recommended.
    already_seen = np.zeros(scores.shape[0], dtype=bool)
    already_seen[seen_pos] = True
    top_indices = top_indices[~already_seen[top_indices]]

    # Convert row positions back to item IDs
    return items.iloc[top_indices]["i"].tolist()


In [29]:
# Smoke test on one randomly drawn user. The fixed random_state keeps the
# chosen user (and therefore this cell's output) the same on every re-run;
# without it the cell shows a different user each time.
test_user = interactions["u"].sample(1, random_state=42).iloc[0]
print("Test user:", test_user)
recommend_tfidf(test_user)



Test user: 2268


[2551, 11376, 2513, 6892, 4635, 3456, 9913, 12514, 7525, 92]

In [30]:
# Item IDs ranked from most to least frequent in the training interactions;
# used as the fallback list when a user gets fewer TF-IDF recommendations
# than required
popular_items = interactions["i"].value_counts(ascending=False).index.tolist()

popular_items[:10]


[11366, 3055, 10715, 8999, 611, 4426, 53, 2820, 13885, 14555]

In [31]:
# Number of recommendations written per user. Kept in one place so the
# request size, the "needs padding" test and the padding target cannot drift
# apart.
TOP_K = 10

submission_rows = []

for user_id in tqdm(sample_sub["user_id"], desc="Building TF-IDF recommendations"):
    recs = recommend_tfidf(user_id, top_k=TOP_K)

    # Cold start (or too few recs): pad with the most popular items
    if len(recs) < TOP_K:
        filled = list(recs)
        already_in = set(filled)  # O(1) duplicate check while padding
        for item_id in popular_items:
            if len(filled) >= TOP_K:
                break
            if item_id not in already_in:
                filled.append(item_id)
                already_in.add(item_id)
        recs = filled

    # Turn into space-separated string
    submission_rows.append(" ".join(map(str, recs)))

# One row per user, in the same order as the sample submission
submission_tfidf = pd.DataFrame({
    "user_id": sample_sub["user_id"],
    "recommendation": submission_rows
})

submission_tfidf.head()


Building TF-IDF recommendations:   0%|          | 0/7838 [00:00<?, ?it/s]

Unnamed: 0,user_id,recommendation
0,0,4512 13261 8404 1583 6890 3765 9759 6726 5332 ...
1,1,2551 4635 6892 11376 2513 4866 3456 9913 7157 ...
2,2,4432 8587 8050 15067 15013 4426 4635 6892 2513...
3,3,7431 2513 6892 2551 11376 4635 3456 9913 3596 ...
4,4,248 6726 3765 5925 1052 3752 1737 8796 43 1014


In [32]:
# Write the submission into the current working directory, without the
# dataframe index column
submission_tfidf.to_csv(path_or_buf="submission_tfidf.csv", index=False)
"Saved submission_tfidf.csv"


'Saved submission_tfidf.csv'

In [21]:
# Sanity check: distribution of the number of whitespace-separated item IDs per row
submission_tfidf["recommendation"].str.split().str.len().value_counts()



recommendation
10    7838
Name: count, dtype: int64

In [22]:
# Sanity check: number of missing values in each submission column
submission_tfidf.isnull().sum()


user_id           0
recommendation    0
dtype: int64