## Baseline
### Рекомендации с основой на жанровое сходство книг (Jaccard similarity) и популярность
**Пока:**
- Предобработали данные, создали словари, добавили кэширование для ускорения работы
- Сделали гибридную модель скоринга (w1*Jaccard + w2*popularity), так же сделали адаптивные веса для популярности (пользователь читает чаще популярные книги -> больше вес для популярности во время скоринга)
- Немного протестировали

**Далее:**
- Больше тестировать (разное количество пользователей и разное количество книг в истории)
- Собрать больше метрик (количество хитов довольно мало, попробуем посмотреть на данные "оценим своими глазами", попробуем другие метрики)

In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

### Data preparation

In [97]:
interactions = pd.read_csv('data/interactions.csv')
items = pd.read_csv('data/items.csv')
users = pd.read_csv('data/users.csv')

In [98]:
def preprocess_items(items_df):
    
    items_processed = items_df.copy()

    items_processed['genres_list'] = items_processed['genres'].fillna('').apply(
        lambda x: [genre.strip() for genre in str(x).split(',') if genre.strip() and genre.strip() != 'nan']
    )
    
    items_processed['authors_list'] = items_processed['authors'].fillna('').apply(
        lambda x: [author.strip() for author in str(x).split(',') if author.strip() and author.strip() != 'nan']
    )
    
    def clean_year(year_value):
        if pd.isna(year_value):
            return np.nan
        
        year_str = str(year_value).strip()
        
        if not year_str:
            return np.nan
        
        try:
            year_num = int(float(year_str))
            if year_num <= 2025:
                return year_num
            else:
                return np.nan
        except ValueError:
            return np.nan
    
    items_processed['year_clean'] = items_processed['year'].apply(clean_year)
    
    
    return items_processed
def preprocess_interactions(interactions_df):
    
    interactions_processed = interactions_df.copy()
    
    interactions_processed['start_date_dt'] = pd.to_datetime(interactions_processed['start_date'])
    
    interactions_processed['start_year'] = interactions_processed['start_date_dt'].dt.year
    interactions_processed['start_month'] = interactions_processed['start_date_dt'].dt.month
    interactions_processed['start_day_of_week'] = interactions_processed['start_date_dt'].dt.dayofweek
    interactions_processed['start_quarter'] = interactions_processed['start_date_dt'].dt.quarter
    interactions_processed['is_weekend'] = interactions_processed['start_day_of_week'].isin([5, 6]).astype(int)
    
    def categorize_progress(progress):
        if progress == 0:
            return 'not_started'
        elif progress < 25:
            return 'barely_started'
        elif progress < 50:
            return 'quarter_read'
        elif progress < 75:
            return 'half_read'
        elif progress < 100:
            return 'almost_finished'
        else:
            return 'completed'
    
    interactions_processed['progress_category'] = interactions_processed['progress'].apply(categorize_progress)
    
    interactions_processed['season'] = interactions_processed['start_month'].map({
        12: 'winter', 1: 'winter', 2: 'winter',
        3: 'spring', 4: 'spring', 5: 'spring',
        6: 'summer', 7: 'summer', 8: 'summer',
        9: 'autumn', 10: 'autumn', 11: 'autumn'
    })
    
    
    return interactions_processed

In [99]:
interactions = preprocess_interactions(interactions)
items = preprocess_items(items)

In [100]:
min_interactions = 50
book_count = interactions.groupby('user_id').size()
valid_users = book_count[book_count >= min_interactions].index

print(f"Users with с >= {min_interactions} 'readed books': {len(valid_users)}")
print(f"Total number of users: {len(book_count)}")


Users with с >= 50 'readed books': 2542
Total number of users: 151600


In [101]:
filtered_interactions = interactions[interactions['user_id'].isin(valid_users)].copy()
filtered_interactions = filtered_interactions.sort_values(['user_id', 'start_date_dt'])
filtered_interactions.head()

Unnamed: 0,user_id,item_id,progress,rating,start_date,start_date_dt,start_year,start_month,start_day_of_week,start_quarter,is_weekend,progress_category,season
34587,21,193711,27,5.0,2018-01-17,2018-01-17,2018,1,2,1,0,quarter_read,winter
249157,21,24213,54,,2018-05-01,2018-05-01,2018,5,1,2,0,half_read,spring
293131,21,32399,85,5.0,2018-05-22,2018-05-22,2018,5,1,2,0,almost_finished,spring
378675,21,38538,7,,2018-07-02,2018-07-02,2018,7,0,3,0,barely_started,summer
693090,21,155015,16,1.0,2018-11-29,2018-11-29,2018,11,3,4,0,barely_started,autumn


In [102]:
all_genres = set()
for genres_list in items['genres_list']:
    all_genres.update(genres_list)
genres_encoding = {genre: idx for idx, genre in enumerate(all_genres)}

items['genres_list_idx'] = items['genres_list'].apply(lambda x: [genres_encoding[genre] for genre in x])

In [103]:
items.head()

Unnamed: 0,id,title,genres,authors,year,genres_list,authors_list,year_clean,genres_list_idx
0,128115,Ворон-челобитчик,"Зарубежные детские книги,Сказки,Зарубежная кла...",Михаил Салтыков-Щедрин,1886,"[Зарубежные детские книги, Сказки, Зарубежная ...",[Михаил Салтыков-Щедрин],1886.0,"[547, 580, 227, 247, 5]"
1,210979,Скрипка Ротшильда,"Классическая проза,Литература 19 века,Русская ...",Антон Чехов,1894,"[Классическая проза, Литература 19 века, Русск...",[Антон Чехов],1894.0,"[83, 247, 5]"
2,95632,Испорченные дети,"Зарубежная классика,Классическая проза,Литерат...",Михаил Салтыков-Щедрин,1869,"[Зарубежная классика, Классическая проза, Лите...",[Михаил Салтыков-Щедрин],1869.0,"[227, 83, 247, 5]"
3,247906,Странный человек,"Пьесы и драматургия,Литература 19 века",Михаил Лермонтов,1831,"[Пьесы и драматургия, Литература 19 века]",[Михаил Лермонтов],1831.0,"[472, 247]"
4,294280,Господа ташкентцы,"Зарубежная классика,Классическая проза,Литерат...",Михаил Салтыков-Щедрин,1873,"[Зарубежная классика, Классическая проза, Лите...",[Михаил Салтыков-Щедрин],1873.0,"[227, 83, 247, 5]"


In [104]:
items_cp = items.copy()
items_cp.set_index('id', inplace=True)

In [105]:
genre_index = {}
for idx, row in items.iterrows():
    item_id = row['id']
    genres = row['genres_list_idx']
    for genre in genres:
        if genre not in genre_index:
            genre_index[genre] = []
        genre_index[genre].append(item_id)

In [106]:
# Encoding of genres
genres_encoding

{'Социальная фантастика': 0,
 'Классика фэнтези': 1,
 'Юридические журналы': 2,
 'Логика': 3,
 'Корпоративные финансы': 4,
 'Русская классика': 5,
 'Зарубежная литература о культуре и искусстве': 6,
 'Дзэн-буддизм': 7,
 'Поиск и подбор персонала': 8,
 'IT-менеджмент': 9,
 'Психологические тренинги': 10,
 'Классические любовные романы': 11,
 'Логистика': 12,
 'Маркетинг и реклама': 13,
 'Проектирование в строительстве': 14,
 'Школьные учебники по литературе': 15,
 'Микроэкономика': 16,
 'Литература 11 класс': 17,
 'Зарубежная драматургия': 18,
 'Привлечение клиентов': 19,
 'Семейная психология': 20,
 'Строительные конструкции': 21,
 'образцы': 22,
 'ЕГЭ по литературе': 23,
 'Список школьной литературы 7-8 класс': 24,
 'Конституция РФ': 25,
 'Список школьной литературы 9 класс': 26,
 'Домашние животные': 27,
 'Техническая литература': 28,
 'Forex': 29,
 'Компьютерное железо': 30,
 'Эротическое фэнтези': 31,
 'ГИА по географии (ОГЭ': 32,
 'Мистика': 33,
 'Православная медицина': 34,
 'Мат

In [107]:
# reverted index: genre -> items
genre_index

{547: [128115,
  135463,
  307,
  133138,
  164470,
  231627,
  71261,
  216637,
  32534,
  14876,
  12987,
  273933,
  172403,
  66088,
  202908,
  122203,
  193212,
  64152,
  269753,
  3638,
  217410,
  43662,
  6722,
  214363,
  33042,
  182969,
  79124,
  225951,
  293630,
  169398,
  56838,
  110811,
  14770,
  163761,
  2669,
  197375,
  84694,
  31238,
  126091,
  303414,
  213812,
  125339,
  33294,
  268952,
  198950,
  273855,
  244039,
  104138,
  26377,
  11163,
  141296,
  226612,
  208725,
  168987,
  117071,
  122766,
  44084,
  127840,
  309997,
  285466,
  268874,
  248921,
  262678,
  270975,
  84417,
  215093,
  9683,
  231431,
  216552,
  271987,
  742,
  233987,
  226587,
  189942,
  130184,
  68530,
  311070,
  34914,
  166260,
  290544,
  202331,
  148730,
  260010,
  299359,
  151088,
  287688,
  261785,
  316591,
  138106,
  24211,
  235184,
  176567,
  276682,
  228911,
  265315,
  300687,
  55615,
  31765,
  284467,
  146681,
  221775,
  57989,
  43439,
  68

In [108]:
item_to_genres = dict(zip(items_cp.index, items_cp['genres_list_idx']))
item_to_genres_list = dict(zip(items_cp.index, items_cp['genres_list_idx']))

### Main algorithm

In [109]:
top_n = 15
decay_rate = 0.02

In [110]:
def split_user_history(user_interactions: pd.DataFrame, top_n=15):
    train = user_interactions.iloc[:-top_n]
    test = user_interactions.iloc[-top_n:]
    return train, test

**Weights with exponential decay** (last one have more)

In [111]:
def get_candidates(user_interactions: pd.DataFrame) -> set:

    user_genres = set()
    for item in user_interactions['item_id']:
        item_genres = item_to_genres.get(item_id, [])
        user_genres.update(item_genres)

    candidate_item_ids = set()
    for genre in user_genres:
        candidate_item_ids.update(genre_index.get(genre, []))

    candidate_item_ids.difference_update(user_interactions['item_id'])

    return candidate_item_ids

In [112]:
weights_cache = {}

def user_interactions_weights(n_books, rate=0.02) -> list:
    if n_books in weights_cache:
        return weights_cache[n_books]

    weights = []
    for idx in range(n_books):
        weight = np.exp(-rate * (n_books - 1 - idx))
        weights.append(weight)

    weights_cache[n_books] = weights
    return weights


def adaptive_popularity_weight(user_interactions: pd.DataFrame, 
                              interactions_df: pd.DataFrame) -> float:
    
    # Avg weighted popularity of user items
    user_books = user_interactions['item_id'].tolist()
    popularity_counts = interactions_df['item_id'].value_counts()
    
    user_avg_popularity = np.mean([popularity_counts.get(book, 0) for book in user_books])
    max_popularity = popularity_counts.max()
    
    # if user reads more popular books -> popularity weight increase
    normalized_user_pop = user_avg_popularity / max_popularity
    
    return min(0.5, normalized_user_pop)  # limit

In [None]:
def jaccard_similarity_inverted(list1: list, list2: list) -> float:
    set1 = set(list1)
    set2 = set(list2)
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union) if union else 0


def weighted_jaccard_score(user_interactions: pd.DataFrame, candidate_item_id: int) -> float:
    candidate_genres = item_to_genres.get(candidate_item_id, [])

    if not isinstance(candidate_genres, list):
        return 0.0

    result_score = 0.0
    result_weight = 0.0
    weights = user_interactions_weights(user_interactions.shape[0], rate=decay_rate)

    item_ids = user_interactions['item_id'].values

    for idx, item_id in enumerate(item_ids):
        user_genres = item_to_genres.get(item_id, [])
        if not user_genres:
            continue

        jaccard_sim = jaccard_similarity_inverted(candidate_genres, user_genres)

        result_score += jaccard_sim * weights[idx]
        result_weight += weights[idx]

    # normalization by total weight
    return result_score / result_weight if result_weight > 0 else 0.0

max_popularity = 0

popularity_cache = {}

def compute_popularity_score(item_id: int, interactions_df: pd.DataFrame) -> float:
    if item_id in popularity_cache:
        return popularity_cache[item_id]

    popularity_counts = interactions_df['item_id'].value_counts()
    item_popularity = popularity_counts.get(item_id, 0)
    
    max_popularity = popularity_counts.max()
    score = item_popularity / max_popularity if max_popularity > 0 else 0
    popularity_cache[item_id] = score
    return score

def hybrid_weighted_score(user_interactions: pd.DataFrame, candidate_item_id: int, interactions_df: pd.DataFrame, jaccard_weight=0.7, popularity_weight=0.3) -> float:
    
    jaccard_score = weighted_jaccard_score(user_interactions, candidate_item_id)
    
    popularity_score = compute_popularity_score(candidate_item_id, interactions_df)
    
    final_score = (jaccard_weight * jaccard_score + 
                   popularity_weight * popularity_score)
    
    return final_score




In [None]:
def recommend_items(user_interactions: pd.DataFrame, interactions_df: pd.DataFrame, top_n=10) -> list[tuple[int, float]]:
    candidate_items = get_candidates(user_interactions)


    popularity_weight = adaptive_popularity_weight(user_interactions, interactions_df)
    jaccard_weight = 1 - popularity_weight

    scores = []
    for item_id in candidate_items:
        hybrid_score = hybrid_weighted_score(user_interactions, item_id, interactions_df, jaccard_weight, popularity_weight)
        scores.append((item_id, hybrid_score))

    scores.sort(key=lambda x: x[1], reverse=True) 

    return scores[:top_n]

In [115]:
def recommender(interactions_df: pd.DataFrame, valid_users: np.ndarray, top_n=10, select_n=100) -> pd.DataFrame:

    selected_users = np.random.choice(valid_users, size=select_n, replace=False).tolist()
    results = []

    for user_id in tqdm(selected_users, desc="Users processed"):
        user_interactions = interactions_df[interactions_df['user_id'] == user_id]
        train, test = split_user_history(user_interactions, top_n=top_n)

        recommended = recommend_items(train, interactions_df, top_n=top_n)
        recommended_item_ids = set([item_id for item_id, score in recommended])
        scores = [score for item_id, score in recommended]

        actual_item_ids = set(test['item_id'])
        previous_item_ids = train['item_id']

        hits = recommended_item_ids.intersection(actual_item_ids)

        results.append({
            'user_id': user_id,
            'previous_item_ids': list(previous_item_ids),
            'recommended_item_ids': list(recommended_item_ids),
            'actual_item_ids': list(actual_item_ids),
            'hits': list(hits),
            'scores': scores
        })

    recommendations = pd.DataFrame(results)
    return recommendations

In [116]:
np.random.seed(42)

In [117]:
recommendations = recommender(filtered_interactions, valid_users, top_n=top_n, select_n=100)

Users processed: 100%|██████████| 100/100 [00:34<00:00,  2.87it/s]


In [118]:
recommendations

Unnamed: 0,user_id,previous_item_ids,recommended_item_ids,actual_item_ids,hits,scores
0,102557,"[202166, 27599, 147676, 40236, 196913, 92867, ...","[205378, 222915, 168708, 118693, 260234, 11949...","[198593, 80195, 61478, 302317, 296205, 312432,...",[],"[0.08692477623285626, 0.08111231196560474, 0.0..."
1,46273,"[5697, 217859, 201262, 236876, 319332, 279975,...","[217527, 222915, 80515, 233256, 260234, 302797...","[28864, 49986, 308613, 153032, 215721, 67946, ...",[],"[0.06865781614185286, 0.05933226558359958, 0.0..."
2,32856,"[259241, 125427, 4442, 152378, 148424, 268138,...","[218725, 133158, 318694, 76429, 272178, 258483...","[295936, 320258, 47307, 76429, 157422, 69327, ...",[76429],"[0.18985492864997433, 0.17670165748459532, 0.1..."
3,90919,"[45266, 305076, 169297, 112312, 252831, 65490,...","[139394, 88771, 168708, 55687, 309544, 254504,...","[25633, 267813, 282759, 274761, 223818, 241228...",[],"[0.11395509166000892, 0.11395509166000892, 0.1..."
4,36783,"[97371, 49845, 265809, 16703, 98837, 161454, 1...","[205378, 168708, 118693, 245642, 119499, 29092...","[265984, 135490, 151172, 95814, 7271, 221191, ...",[],"[0.20993217277943596, 0.1970238605709256, 0.18..."
...,...,...,...,...,...,...
95,70885,"[223633, 15169, 15032, 238820, 121391, 159130,...","[128672, 114177, 84032, 68295, 192009, 73417, ...","[2028, 88397, 187501, 225135, 75951, 39727, 22...",[],"[0.3242030810138769, 0.32390550923241634, 0.32..."
96,147018,"[259980, 118939, 318418, 189902, 32504, 310587...","[128672, 114177, 68295, 192009, 73417, 292210,...","[97954, 50340, 30053, 40997, 296998, 262775, 1...",[201237],"[0.31095026205257414, 0.3092179618362479, 0.30..."
97,85357,"[11062, 159580, 175906, 109088, 9806, 228424, ...","[318694, 173161, 305706, 64811, 187406, 35859,...","[145444, 318886, 115389, 296840, 3722, 139916,...",[],"[0.09348101408902046, 0.08859772207568646, 0.0..."
98,82999,"[91955, 231042, 112937, 78244, 247773, 262881,...","[158369, 88771, 55687, 309544, 254504, 33481, ...","[278465, 64259, 21444, 76359, 71527, 318442, 2...",[],"[0.2803539505558586, 0.28003182343084687, 0.28..."


In [119]:
max(len(hits) for hits in recommendations['hits'])

3

In [121]:
np.mean([len(hits) for hits in recommendations['hits']])

np.float64(0.1)

### Iterations
- 1: Simple Jaccard with same weights for books
- 2: Added exponential weights to the books in 'train' set of user interactions
- 3: Hybrid scoring, added popularity and adaprive weights for users
- 4: Optimization: caching, dicts, data access optimization

#### Next
### Content-based using item vectors