# 2026-01-05

In [21]:
"""
Shared utilities for the recommendation system pipeline.
Provides common functions for data loading, logging, and validation.
"""

import logging
import os
import sys
from typing import Optional

import pandas as pd
from pydantic import ConfigDict, validate_call


def setup_logging(stage_name: str, log_file: str, level=logging.INFO):
    """Configure logging for a pipeline stage."""
    os.makedirs(os.path.dirname(log_file), exist_ok=True)

    logging.basicConfig(
        level=level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        filename=log_file,
        filemode="w",
        encoding="utf-8",
    )
    logger = logging.getLogger(stage_name)

    # Prevent duplicate handlers if main() is called multiple times
    if logger.hasHandlers():
        logger.handlers.clear()

    # File Handler
    file_handler = logging.FileHandler(log_file, mode="a", encoding="utf-8")
    file_handler.setFormatter(
        logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s")
    )
    logger.addHandler(file_handler)

    # Console Handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(
        logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")
    )

    # Explicitly ensure the stream is flushed after every write
    console_handler.flush = sys.stdout.flush
    logger.addHandler(console_handler)

    return logger


@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def safe_read_csv(filepath: str, usecols: Optional[list[str]] = None) -> pd.DataFrame:
    """Safely read CSV file"""
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")

    try:
        df = pd.read_csv(filepath).astype(str).fillna("")

        df.columns = df.columns.str.lower()
        if usecols:
            missing_cols = [c for c in usecols if c.lower() not in df.columns]
            if missing_cols:
                raise ValueError(f"Missing columns in input CSV: {missing_cols}")
            return df[usecols]
        return df
    except pd.errors.ParserError as e:
        raise pd.errors.ParserError(f"Error parsing {filepath}: {e}")


@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def safe_read_feather(
    filepath: str, usecols: Optional[list[str]] = None
) -> pd.DataFrame:
    """Safely read Feather file"""
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")

    try:
        df = pd.read_feather(filepath).astype(str).fillna("")

        df.columns = df.columns.str.lower()
        if usecols:
            missing_cols = [c for c in usecols if c.lower() not in df.columns]
            if missing_cols:
                raise ValueError(f"Missing columns in input Feather: {missing_cols}")
            return df[usecols]
        return df
    except Exception as e:
        raise ValueError(f"Error reading or processing feather file {filepath}: {e}")

In [22]:
"""
Centralized configuration for the recommendation system pipeline.
Defines all paths, hyperparameters, and constants used across stages.
"""

import os
from pathlib import Path

# ============================================================
# Project Structure
# ============================================================
PROJECT_ROOT = Path("/Users/sayalimoghe/Documents/Career/GitHub/conversational-book-recommendation-agent")

# Data paths
DATA_DIR = PROJECT_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
CLEAN_DATA_DIR = DATA_DIR / "clean"
EMBEDDINGS_DIR = DATA_DIR / "embeddings"
MATRICES_DIR = DATA_DIR / "matrices"
PKL_DIR = DATA_DIR / "pkl"
FACTORS_DIR = DATA_DIR / "factors"
LOGS_DIR = PROJECT_ROOT / "logs"
LOG_FILE = str(LOGS_DIR / "app.log")

# Create directories if they don't exist
for directory in [
    CLEAN_DATA_DIR,
    EMBEDDINGS_DIR,
    MATRICES_DIR,
    PKL_DIR,
    FACTORS_DIR,
    LOGS_DIR,
]:
    os.makedirs(directory, exist_ok=True)

# ============================================================
# Stage 1: Data Preprocessing
# ============================================================
# Input files
INPUT_BOOKS = str(RAW_DATA_DIR / "books_data.csv")
INPUT_RATINGS = str(RAW_DATA_DIR / "books_rating.csv")
INPUT_COLS_BOOKS = ["title", "description", "authors", "infolink", "categories"]
INPUT_COLS_RATINGS = [
    "title",
    "user_id",
    "profilename",
    "review/helpfulness",
    "review/score",
    "review/time",
    "review/summary",
    "review/text",
]

# Output files
OUTPUT_BOOKS = str(CLEAN_DATA_DIR / "cleaned_books_data.ftr")
OUTPUT_COLS_BOOKS = ["book_id", "title", "authors", "description", "genres", "infolink"]
OUTPUT_RATINGS = str(CLEAN_DATA_DIR / "cleaned_ratings_data.ftr")
OUTPUT_COLS_RATINGS = [
    "book_id",
    "user_id",
    "review/score",
    "confidence",
    "datetime",
    "review/summary",
    "review/text",
]

# Configuration
MIN_DESC_LENGTH = 10
TOP_N_GENRES = 50
COMMON_DELIMS = [";", "|", "/", "•"]
MIN_USER_INTERACTIONS = 5
MAX_USER_INTERACTIONS = 500
MIN_BOOK_INTERACTIONS = 5

# ============================================================
# Stage 2: Semantic Search (Embeddings)
# ============================================================
# Output files
OUTPUT_CATALOG_BOOKS_INDEX = str(EMBEDDINGS_DIR / "catalog_books.index")
OUTPUT_CATALOG_BOOKS_EMBEDDINGS = str(EMBEDDINGS_DIR / "catalog_books.npy")

# Configuration
BATCH_SIZE = 64
EMBEDDING_MODEL = "all-MiniLM-L6-v2"  # or "all-mpnet-base-v2"
# Example output: (num_rows, 384) for all-MiniLM-L6-v2
# Example output: (num_rows, 768) for all-mpnet-base-v2

# ============================================================
# Stage 3: Build Interaction Matrix
# ============================================================
# Input files
USER_IDX_PKL = str(PKL_DIR / "user_to_idx.pkl")
BOOK_IDX_PKL = str(PKL_DIR / "book_to_idx.pkl")

# Output files
OUTPUT_TRAIN_MATRIX = str(MATRICES_DIR / "train_matrix.npz")
OUTPUT_VAL_MATRIX = str(MATRICES_DIR / "val_matrix.npz")
OUTPUT_TEST_MATRIX = str(MATRICES_DIR / "test_matrix.npz")

# Configuration
TRAIN_TEST_SPLIT = 0.8
VAL_TEST_SPLIT = 0.5
RANDOM_STATE = 42

# ============================================================
# Stage 4: Train Collaborative Filtering
# ============================================================
# Output files
OUTPUT_ALS_MODEL = str(PKL_DIR / "als_model.pkl")
OUTPUT_USER_FACTORS = str(FACTORS_DIR / "user_factors.npy")
OUTPUT_BOOK_FACTORS = str(FACTORS_DIR / "book_factors.npy")


In [None]:
import ast
import html

import pandas as pd
from pydantic import ConfigDict, validate_call

# from config import *


# ----------------------
# Helper methods
# ----------------------
def parse_list_value(val):
    """
    Parse a string value into a list of cleaned strings.
    Args: val (str or NaN): The input string representing a list or comma-separated values.
    Returns: list[str]: A list of stripped strings, empty if input is NaN, empty, or cannot be parsed.
    """
    if pd.isna(val) or val.strip() == "":
        return []
    val = val.strip()

    # Try to parse as Python list
    if val.startswith("[") and val.endswith("]"):
        try:
            parsed = ast.literal_eval(val)
            return [c.strip() for c in parsed if isinstance(c, str)]
        except Exception:
            return []

    # Otherwise, comma-separated
    for sep in COMMON_DELIMS:
        val = val.replace(sep, ",")

    return [c.strip() for c in val.split(",") if c.strip()]


def normalize_author_column(author_col):
    """
    Normalize and clean a DataFrame column containing author names.
    Args: author_col (pd.Series): Column of raw author strings.
    Returns: pd.Series: Column where each row is a sorted list of valid author names in title case.
    """
    # basic cleanup before parsing
    cleaned_col = (
        author_col.fillna("")
        .astype(str)
        .str.replace(";", ",", regex=False)
        .str.replace("&", ",", regex=False)
        .str.replace(r"\s+and\s+", ",", regex=True)
        .str.replace(r"\(.*?editor.*?\)", "", regex=True, case=False)
        .str.replace(r"\beditor\b", "", regex=True, case=False)
        .str.replace(r"\bed\.\b", "", regex=True, case=False)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )

    # parse each row value in the column
    cleaned_col = cleaned_col.apply(parse_list_value)

    # convert valid authors to title case and remove invalid ones
    invalid_authors = {"unknown"}
    cleaned_col = cleaned_col.apply(
        lambda author_row: sorted(
            {item.title() for item in author_row if item.lower() not in invalid_authors}
        )
    )

    return cleaned_col


def normalize_genre_column(genre_col):
    """
    Normalize and clean a column of genres, converting to lowercase, trimming, removing short/invalid entries, and sorting.
    Args: genre_col (pd.Series): Column of genre strings.
    Returns: pd.Series: Column where each row is a sorted list of valid genres.
    """
    cleaned_col = (
        genre_col.fillna("")
        .astype(str)
        .str.strip()
        .str.replace("&", "and", regex=False)
    )

    # parse each row value in the column
    cleaned_col = cleaned_col.apply(parse_list_value)

    # lowercase, trim, remove very short/invalid entries
    cleaned_col = cleaned_col.apply(
        lambda lst: [g.lower().strip() for g in lst if len(g) > 2 and "..." not in g]
    )

    # sort
    cleaned_col = cleaned_col.apply(lambda lst: sorted(set(lst)))

    return cleaned_col


def map_genres_to_top_or_other(genres_row, top_genres):
    """
    Map genres to a top-N list, replacing non-top genres with 'other'.
    Args:
        genres_row (list[str]): List of genres for a book.
        top_genres (set[str]): Set of top-N genres.
    Returns: list[str]: Sorted list with genres mapped to top or 'other'.
    """
    # any genre not in top_genres → "other"
    genre_list = set()
    for g in genres_row:
        if g in top_genres:
            genre_list.add(g)
        else:
            genre_list.add("other")

    return sorted(genre_list)


def reduce_to_top_genres(genre_col):
    """
    Reduce genres in a column to top-N most common genres, mapping all other genres to 'other'.
    Args: genre_col (pd.Series): Column of lists of genres.
    Returns: pd.Series: Column of lists with top-N genres or 'other'.
    """
    all_genres = genre_col.explode()
    top_genres = set(all_genres.value_counts().head(TOP_N_GENRES).index)
    return genre_col.apply(map_genres_to_top_or_other, args=(top_genres,))


def normalize_text_field(cleaned_col):
    """
    Clean a text column by removing HTML tags, unescaping HTML entities, removing escaped characters and control characters, and collapsing whitespace.
    Args: cleaned_col (pd.Series): Column of strings.
    Returns: pd.Series: Cleaned string column.
    """
    cleaned_col = cleaned_col.fillna("").astype(str)
    cleaned_col = cleaned_col.str.replace(r"<[^>]+>", "", regex=True)
    cleaned_col = cleaned_col.apply(html.unescape)
    cleaned_col = cleaned_col.str.replace(r"[\n\t\r]", " ", regex=True)
    cleaned_col = cleaned_col.str.strip().str.replace(r"\s+", " ", regex=True)
    cleaned_col = cleaned_col.apply(
        lambda s: "".join(ch for ch in s if ch.isprintable())
    )

    return cleaned_col


def keep_usable_books(df):
    """
    Filter books to retain only those with complete and valid metadata: title, at least one author, at least one genre, and description length >= MIN_DESC_LENGTH.
    Args: df (pd.DataFrame): DataFrame containing book metadata.
    Returns: tuple: (filtered DataFrame of usable books, fraction of books that are usable)
    """
    mask = (
        (df["title"].notna() & df["title"].str.strip() != "")
        & (df["description"].str.strip().str.len() >= MIN_DESC_LENGTH)
        & (df["authors"].notna() & df["authors"].apply(lambda lst: len(lst) > 0))
        & (df["genres"].apply(lambda lst: len(lst) > 0))
    )

    return df[mask].copy(), mask.mean()


@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def clean_books_data(logger, books_df) -> pd.DataFrame:
    """
    Clean and normalize the books DataFrame including authors, genres, titles, and descriptions.
    Args: books_df (pd.DataFrame): Raw books DataFrame.
    Returns: pd.DataFrame: Cleaned catalog of usable books.
    """
    books_df["authors"] = normalize_author_column(books_df["authors"])
    books_df["genres"] = reduce_to_top_genres(
        normalize_genre_column(books_df["categories"])
    )
    books_df["title"] = normalize_text_field(books_df["title"])
    books_df["description"] = normalize_text_field(books_df["description"])

    # A usable book must have: title, at least one author, description length ≥ X, at least one category
    books_df, usable_ratio = keep_usable_books(books_df)
    logger.info(f"Usable books: {usable_ratio:.2%}")

    # NOTE: No title + author duplicates were found in this dataset

    # Create a unique sequential integer starting from 0 for each book
    books_df["book_id"] = range(1, len(books_df) + 1)

    return books_df


@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def clean_ratings_data(logger, ratings_df, cleaned_books_df) -> pd.DataFrame:
    """Clean ratings DataFrame, normalize titles, convert timestamps, deduplicate, transform scores to confidence, and filter by user/item thresholds."""

    # Drop rows with missing title, user_id, review/score
    ratings_df = ratings_df[ratings_df["title"].notna()]
    ratings_df = ratings_df[ratings_df["user_id"].notna()]
    ratings_df = ratings_df[ratings_df["review/score"].notna()]

    # Normalize titles with same techniques as for books data
    ratings_df["title"] = normalize_text_field(ratings_df["title"])
    ratings_df = pd.merge(
        ratings_df, cleaned_books_df[["book_id", "title"]], on="title", how="inner"
    )
    logger.info(f"ratings_df.columns: {ratings_df.columns.tolist()}")

    # Parse review/time from epoch time to datetime UTC
    ratings_df["datetime"] = pd.to_datetime(
        pd.to_numeric(ratings_df["review/time"]), unit="s", utc=True
    )

    # Deduplicate (user, book) pairs, group by (user_id, title) and keep one with most recent review/time
    ratings_df = ratings_df.sort_values("review/time").drop_duplicates(
        subset=["user_id", "title"], keep="last"
    )

    # Transform 1-5 ratings into confidence weights: scores ≤3 become 0, 4 becomes 1, and 5 becomes 2
    ratings_df["review/score"] = pd.to_numeric(
        ratings_df["review/score"], errors="coerce"
    )
    ratings_df["confidence"] = ratings_df["review/score"].clip(lower=3) - 3

    # Filter out interactions with zero confidence (ratings ≤ 3)
    logger.info(f"Ratings data size before filtering confidence=0: {len(ratings_df):,}")
    len_confidence_0 = len(ratings_df[ratings_df["confidence"] == 0])
    ratings_df = ratings_df[ratings_df["confidence"] > 0].copy()

    logger.info(f"Ratings data size after filtering confidence=0: {len(ratings_df):,}")
    logger.info(f"Removed {len_confidence_0:,} zero-confidence rows")
    return ratings_df


In [4]:
logger = setup_logging("analysis", "logs/analysis.log")

books_df = safe_read_csv(INPUT_BOOKS, INPUT_COLS_BOOKS)
catalog_books_df = clean_books_data(logger, books_df)

ratings_df = safe_read_csv(INPUT_RATINGS, INPUT_COLS_RATINGS)
ratings_df = clean_ratings_data(logger, ratings_df, catalog_books_df)
logger.info(f"✓ Ratings data cleaned: shape={ratings_df.shape}")

2026-01-05 19:46:53,470 [INFO] Usable books: 67.71%
2026-01-05 19:47:36,744 [INFO] ratings_df.columns: ['title', 'user_id', 'profilename', 'review/helpfulness', 'review/score', 'review/time', 'review/summary', 'review/text', 'book_id']
2026-01-05 19:47:41,207 [INFO] Ratings data size before filtering confidence=0: 1,705,374
2026-01-05 19:47:41,822 [INFO] Ratings data size after filtering confidence=0: 1,360,308
2026-01-05 19:47:41,822 [INFO] Removed 345,066 zero-confidence rows
2026-01-05 19:47:42,097 [INFO] ✓ Ratings data cleaned: shape=(1360308, 11)


In [24]:
while True:
    prev_len = len(ratings_df)

    user_counts = ratings_df["user_id"].value_counts()
    ratings_df = ratings_df[
        ratings_df["user_id"].isin(
            user_counts[
                (user_counts >= MIN_USER_INTERACTIONS)
                & (user_counts <= MAX_USER_INTERACTIONS)
            ].index
        )
    ]

    book_counts = ratings_df["book_id"].value_counts()
    ratings_df = ratings_df[
        ratings_df["book_id"].isin(
            book_counts[book_counts >= MIN_BOOK_INTERACTIONS].index
        )
    ]

    if len(ratings_df) == prev_len:
        break


# Create Index Mappings
unique_users = ratings_df["user_id"].unique()
unique_books = ratings_df["book_id"].unique()
# Example: unique_books = [5, 12, 8, 100, 7, ...]  (in order of appearance)

n_users = len(unique_users)
n_cf_books = len(unique_books)

logger.info("Users: %s", f"{n_users:,}")
logger.info("CF-trainable books: %s", f"{n_cf_books:,}")
# 2026-01-05 19:44:45,116 [INFO] Users: 25,999
# 2026-01-05 19:44:45,117 [INFO] CF-trainable books: 12,183

2026-01-09 23:51:19,228 [INFO] Users: 25,999
2026-01-09 23:51:19,233 [INFO] CF-trainable books: 12,183


In [29]:
import pickle

def load_index_mappings(pkl_file):
    """
    Load item index mappings from pickle files.

    Returns:
        item_to_idx: dict mapping item_id (book or user) → CF matrix column index
        idx_to_item_id: dict mapping CF matrix column index → item_id
    """
    with open(pkl_file, "rb") as f:
        item_to_idx = pickle.load(f)

    # Create reverse mapping: CF index → item_id
    idx_to_item_id = {cf_idx: item_id for item_id, cf_idx in item_to_idx.items()}

    return item_to_idx, idx_to_item_id

def build_cf_to_catalog_mapping(idx_to_book_id):
    """
    Build mapping from CF book indices to catalog indices.
    
    Args:
        idx_to_book_id: Mapping from CF index to book_id
    
    Returns:
        dict: CF index → catalog index mapping
    """
    cf_to_catalog_map = {}
    for cf_idx, book_id in idx_to_book_id.items():
        catalog_idx = book_id - 1  # book_id starts at 1, catalog indices start at 0
        cf_to_catalog_map[cf_idx] = catalog_idx
    
    return cf_to_catalog_map

user_to_idx, idx_to_user_id = load_index_mappings(USER_IDX_PKL)
book_to_idx, idx_to_book_id = load_index_mappings(BOOK_IDX_PKL)
cf_to_catalog_map = build_cf_to_catalog_mapping(idx_to_book_id)

In [16]:
catalog_books_df.head()
catalog_books_df.iloc[0]

title                                   Dr. Seuss: American Icon
description    Philip Nel takes a fascinating look into the k...
authors                                             [Philip Nel]
infolink       http://books.google.nl/books?id=IjvHQsCn_pgC&d...
categories                         ['Biography & Autobiography']
genres                             [biography and autobiography]
book_id                                                        1
Name: 1, dtype: object

In [6]:
catalog_books_df[catalog_books_df["book_id"] == 47723]
catalog_books_df.iloc[0]

title                                   Dr. Seuss: American Icon
description    Philip Nel takes a fascinating look into the k...
authors                                             [Philip Nel]
infolink       http://books.google.nl/books?id=IjvHQsCn_pgC&d...
categories                         ['Biography & Autobiography']
genres                             [biography and autobiography]
book_id                                                        1
Name: 1, dtype: object

In [None]:
print(len(book_to_idx)) # n_cf_books
print(len(cf_to_catalog_map)) # n_cf_books

print(cf_to_catalog_map[199])
print(idx_to_book_id[199])

12183
12183


In [None]:
top_10_users = (
    ratings_df
    .groupby('user_id')
    .size()
    .sort_values(ascending=False)
    .head(10)
)
top_10_users

user_id
A1G37DFO8MQW0M    479
A1EKTLUL24HDG8    381
A1T17LMQABMBN5    314
A1NC9AGZOBI0M1    306
AHXAPVSHPJ6OJ     304
A1MC6BFHWY6WC3    296
A2GBJQ9THOYDAJ    290
A2ODBHT4URXVXQ    289
A319KYEIAZ3SON    289
A30KEXFT9SILL6    287
dtype: int64

In [32]:
user_indices = [user_to_idx[user_id] for user_id in top_10_users.index]
user_indices

[5647, 238, 1242, 7158, 2, 3897, 594, 3309, 131, 145]

In [33]:
bottom_10_users = (
    ratings_df
    .groupby('user_id')
    .size()
    .sort_values(ascending=True)
    .head(10)
)

bottom_10_users

user_id
A24MDW8RHNBRON    5
A5JWW7BDKKQWW     5
A5JWI3NM9N97D     5
A29PQDCZSDEULN    5
A29QQSMOLH34P6    5
A29QZZUCOZQSAS    5
A5JIJQ50U3829     5
A29ROQMSJIXZ2M    5
A29OFQ7ZDJLCI2    5
A29S5DRKSYUIHU    5
dtype: int64

In [34]:
bottom_user_indices = [user_to_idx[user_id] for user_id in bottom_10_users.index]
bottom_user_indices

[9393, 21756, 4570, 8260, 25932, 3818, 17023, 20445, 118, 18286]

In [1]:
import numpy as np

# Load the .npy file
data = np.load('/Users/sayalimoghe/Documents/Career/GitHub/conversational-book-recommendation-agent/data/embeddings/catalog_books_384.npy')

# Get the dimensions
shape = data.shape  # Returns a tuple (e.g., (100, 50) for 100 rows and 50 columns)
num_dims = data.ndim  # Returns an integer (e.g., 2)

print(f"Shape: {shape}")
print(f"Number of dimensions: {num_dims}")


Shape: (143815, 768)
Number of dimensions: 2


In [2]:
import faiss

# Load the .index file
index = faiss.read_index('/Users/sayalimoghe/Documents/Career/GitHub/conversational-book-recommendation-agent/data/embeddings/catalog_books_768.index')

# Get key properties
dimension = index.d        # The size of each vector (e.g., 768)
total_vectors = index.ntotal # The number of vectors currently in the index

print(f"Dimension (d): {dimension}")
print(f"Total Vectors (ntotal): {total_vectors}")


Dimension (d): 768
Total Vectors (ntotal): 143815


In [7]:
import pandas as pd
ratings_df = pd.read_feather('/Users/sayalimoghe/Documents/Career/GitHub/conversational-book-recommendation-agent/data/clean/cleaned_ratings_data.ftr')
books_df = pd.read_feather('/Users/sayalimoghe/Documents/Career/GitHub/conversational-book-recommendation-agent/data/clean/cleaned_books_data.ftr')

In [None]:
ratings_df = ratings_df[ratings_df['review/score'] > 3]
top_users = (ratings_df.groupby('user_id').size().reset_index(name='review/score').sort_values(by='review/score', ascending=False))
print(top_users)
# A14OJS0VWMOSWO

                      user_id  review/score
701770                    nan         45158
24354          A14OJS0VWMOSWO          4248
597650            AFVQZQ8PW0L          2775
605360          AHD101501WCN1          1060
103317         A1K1JW1C5CUSUZ           780
...                       ...           ...
271734         A2GR3JJ113IPNO             1
271736         A2GR3SJIXGILE5             1
271737         A2GR4AYN9175CS             1
271738         A2GR4MU0DZ133O             1
0       A00109803PZJ91RLT7DPN             1

[701771 rows x 2 columns]


In [2]:
import pickle
file_path = '/Users/sayalimoghe/Documents/Career/GitHub/conversational-book-recommendation-agent/data/pkl/user_to_idx.pkl'
with open(file_path, 'rb') as f:
    user_pkl = pickle.load(f)

In [5]:
from itertools import islice

for k, v in user_pkl.items():
    if k == 'A14OJS0VWMOSWO':
        print(k, v)

A14OJS0VWMOSWO 16


In [None]:
idx_map = {val: key for key, val in user_pkl.items()}
idx_map[100]

# unique key to cf idx

'A8DJ9EU2QP2JM'