In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
from tqdm.auto import tqdm
tqdm.pandas()

from russian_names import RussianNames
import matplotlib.pyplot as plt


In [10]:
users = pd.read_csv('research/processed_data_1_stage/users.csv', sep=';')
groups = pd.read_csv('research/processed_data_1_stage/groups.csv', sep=';')
attend = pd.read_csv('research/processed_data_1_stage/attend.csv', sep=';')
test = pd.read_csv('research/raw_data/test.csv')
group_to_recommend = pd.read_csv('research/raw_data/group_list_to_recommend.csv', header=None)

In [11]:
weekday_to_number = {'Пн': 0,
                    'Вт': 1,
                    'Ср': 2,
                    'Чт': 3,
                    'Пт': 4,
                    'Сб': 5,
                    'Вс': 6}
number_to_weekday = {0: 'Пн',
                    1: 'Вт',
                    2: 'Ср',
                    3: 'Чт',
                    4: 'Пт',
                    5: 'Сб',
                    6: 'Вс'}

In [12]:
# определение активных групп. Будем считать активными те группы, на которые пришел хотя бы один человек с начала года
attend['lesson_date'] = pd.to_datetime(attend['lesson_date'])

In [13]:
users = users[users['user_id'].isin(attend['user_id'])]

attend = attend[attend['user_id'].isin(users['user_id'])]
attend = attend[attend['group_id'].isin(groups['group_id'])]

In [14]:
groups['category 1'] = groups['category 1'].str.lower().str.strip()
groups['category 2'] = groups['category 2'].str.lower().str.strip()
groups['name'] = groups['name'].str.lower().str.strip()
groups['weekday_1'] = groups['weekday_1'].map(number_to_weekday)
groups['weekday_2'] = groups['weekday_2'].map(number_to_weekday)

In [15]:
groups = groups[~groups['weekday_1'].isna()].reset_index(drop=True)

In [16]:
groups_to_backend = groups[groups.group_id.isin(group_to_recommend.values.reshape(1, -1).tolist()[0])][['group_id',
                                                                                                        'category 1',
                                                                                                        'category 2',
                                                                                                        'name',
                                                                                                        'district',
                                                                                                        'region',
                                                                                                        'street',
                                                                                                        'home',
                                                                                                        'online',
                                                                                                        'description',
                                                                                                        'weekday_1',
                                                                                                        'weekday_2',
                                                                                                        'active_shedule']].copy()

In [17]:
locations = groups_to_backend[['district', 'region']].drop_duplicates()
categories = groups_to_backend[['category 1', 'category 2', 'name']].drop_duplicates()

In [18]:
groups_to_backend.to_csv('research/backend_data/groups.csv', sep=';', index=False)
locations.to_csv('research/backend_data/locations.csv', sep=';', index=False)
categories.to_csv('research/backend_data/categories.csv', sep=';', index=False)

# Data Transforming

In [22]:
rare_regions = {'поселение михайлово-ярцевское': 'поселение вороновское',
 'поселение киевский': 'поселение вороновское',
 'поселение "мосрентген"': 'теплый стан',
 'поселение кокошкино':'поселение вороновское',
 'поселение роговское':'поселение вороновское',
 'поселение новофедоровское':'поселение вороновское',
 'поселение марушкинское': 'поселение вороновское',
 'поселение краснопахорское': 'поселение вороновское',
 'поселение кленовское': 'поселение вороновское',
                'вание савелки': 'савелки'}
groups['region'] = groups['region'].apply(lambda x: rare_regions[x] if x in rare_regions.keys() else x)

In [23]:
rare_categories_1 = {'Спецпроект / Интеллектуальный клуб': 'Игры',
                    'Спецпроект / Серебряный университет': 'Образование',
                    'Спецпроект / Московский театрал"': 'Творчество',
                    'Спецпроект / Тренировки долголетия (спецпроект по медицинской реабилитации)':'Физическая активность'
                    }
groups['category 1'] = groups['category 1'].apply(lambda x: rare_categories_1[x] if x in rare_categories_1.keys() else x)

In [24]:
groups['delta'] = (groups['delta_2'].fillna(0) + groups['delta_1']) / 2
groups['start_hour'] = (groups['start_hour_2'].fillna(0) + groups['start_hour_1']) / 2
groups['finish_hour'] = (groups['finish_hour_2'].fillna(0) + groups['finish_hour_1']) / 2

In [25]:
groups = groups[groups['category 1'] != 'центры московского долголетия']
groups = groups[['group_id', 'category 1', 'category 2', 'name', 'district', 'region',
                 'online', 'marker', 'topic', 'weekday_1', 'delta', 'weekday_2',
                 'start_hour', 'finish_hour']]

### Filling NaNs

In [26]:
users = users[['user_id', 'sex', 'age', 'active_in_months', 'active_in_years', 'user_district', 'user_region']]

In [27]:
attend['lesson_weekday'] = attend['lesson_weekday'].map(number_to_weekday)

In [28]:
attend = attend[['group_id', 'user_id', 'lesson_date','lesson_weekday', 'delta', 'start_hour', 'finish_hour']]

In [29]:
grouped_attend = attend.groupby(by=['user_id', 'group_id'], as_index=False)[['user_id', 'group_id']].first()

In [30]:
lesson_week_min = attend.groupby(by=['user_id', 'group_id'])['lesson_weekday'].min().values
lesson_week_max = attend.groupby(by=['user_id', 'group_id'])['lesson_weekday'].max().values
delta_mean = attend.groupby(by=['user_id', 'group_id'])['delta'].mean().values
start_hour_mean = attend.groupby(by=['user_id', 'group_id'])['start_hour'].mean().values
finish_hour_mean = attend.groupby(by=['user_id', 'group_id'])['finish_hour'].mean().values
number_of_lessons = attend.groupby(by=['user_id', 'group_id'])['finish_hour'].count().values

In [31]:
grouped_attend['weekday_1'] = lesson_week_min
grouped_attend['weekday_2'] = lesson_week_max
grouped_attend['delta'] = delta_mean
grouped_attend['start_hour'] = start_hour_mean
grouped_attend['finish_hour'] = finish_hour_mean
grouped_attend['number_of_lessons'] = number_of_lessons

In [32]:
grouped_attend['weekday_2'] = grouped_attend['weekday_2'].mask(grouped_attend['weekday_1'] == grouped_attend['weekday_2']).fillna('неизвестно')

In [33]:
groups['district'].fillna('неизвестно', inplace=True)
groups['region'].fillna('неизвестно', inplace=True)
groups['weekday_2'].fillna('неизвестно', inplace=True)
users['user_district'].fillna('неизвестно', inplace=True)
users['user_region'].fillna('неизвестно', inplace=True)

In [34]:
grouped_attend

Unnamed: 0,user_id,group_id,weekday_1,weekday_2,delta,start_hour,finish_hour,number_of_lessons
0,101346549,801357282,Вт,неизвестно,2.0,16.0,18.0,1
1,101346549,801361690,Пн,неизвестно,2.0,11.0,13.0,1
2,101346549,801365191,Пн,Ср,1.0,11.0,12.0,2
3,101346549,801366199,Ср,неизвестно,2.0,15.0,17.0,1
4,101346549,801367532,Чт,неизвестно,1.0,18.0,19.0,2
...,...,...,...,...,...,...,...,...
600385,101449498,801368116,Пн,Ср,1.0,16.0,17.0,3
600386,101449549,801346695,Пн,неизвестно,1.0,12.0,13.0,1
600387,101449549,801351787,Пн,неизвестно,1.0,11.0,12.0,1
600388,101449549,801367908,Пн,Чт,1.0,18.0,19.0,2


In [35]:
groups.to_csv('research/processed_data_2_stage/groups.csv', sep=';', index=False)
users.to_csv('research/processed_data_2_stage/users.csv', sep=';', index=False)
grouped_attend.to_csv('research/processed_data_2_stage/grouped_attend.csv', sep=';', index=False)
attend.to_csv('research/processed_data_2_stage/attend.csv', sep=';', index=False)

## Feature engineering

In [36]:
groups = pd.read_csv('research/processed_data_2_stage/groups.csv', sep=';')
users = pd.read_csv('research/processed_data_2_stage/users.csv', sep=';')
attend = pd.read_csv('research/processed_data_2_stage/attend.csv', sep=';')
grouped_attend = pd.read_csv('research/processed_data_2_stage/grouped_attend.csv', sep=';')

In [37]:
users = users.sort_values(by='user_id').reset_index(drop=True)

In [38]:
users[['pop_start_hour','pop_finish_hour']] = \
    round(attend.groupby('user_id')[['start_hour', 'finish_hour']].mean(), 1).values

In [39]:
users[['prefer_weekday_1','prefer_weekday_2']] = \
    grouped_attend.groupby('user_id')[['weekday_1', 'weekday_2']].agg(lambda x:x.value_counts().index[0]).values

In [40]:
grouped_attend = grouped_attend[grouped_attend['group_id'].isin(groups['group_id'])].reset_index(drop=True)

In [41]:
groups = groups.merge(round(grouped_attend.merge(users, on='user_id', how='left')[['group_id', 'sex',
                                                             'age',
                                                             'active_in_months',
                                                             'active_in_years']].groupby('group_id', as_index=False).mean(), 2).rename(columns={'sex': 'mean_sex',
                                                                                                                                                'age': 'mean_age', 'active_in_months': 'mean_act_in_m', 'active_in_years': 'mean_act_in_y'}), how='left', on='group_id')

In [42]:
groups.fillna({"mean_sex": groups['mean_sex'].mean(),
               "mean_age": groups['mean_age'].mean(),
               "mean_act_in_m": groups['mean_act_in_m'].mean(),
               "mean_act_in_y": groups['mean_act_in_y'].mean()}, inplace=True)

### name and history

In [54]:
preferences = attend[attend.user_id.isin(users.user_id)].merge(groups[['group_id', 'name']], on='group_id').rename(columns={'name': 'history'}).groupby('user_id')['history'].apply(lambda x: list(set(x))[:10]).reset_index()

users = users.merge(preferences, how='left', on='user_id')
users['history'] = users['history'].apply(lambda x: ', '.join(x) if type(x) == list else x)

TypeError: can only join an iterable

In [65]:
rn = RussianNames(count=users.shape[0], patronymic=False)

In [68]:
users['name'] = rn.get_batch()

In [70]:
groups.to_csv('research/processed_data_2_stage/groups.csv', sep=';', index=False)
users.to_csv('research/processed_data_2_stage/users.csv', sep=';', index=False)
grouped_attend.to_csv('research/processed_data_2_stage/grouped_attend.csv', sep=';', index=False)
attend.to_csv('research/processed_data_2_stage/attend.csv', sep=';', index=False)

## Model

In [8]:
groups = pd.read_csv('research/processed_data_2_stage/groups.csv', sep=';')
users = pd.read_csv('research/processed_data_2_stage/users.csv', sep=';')
attend = pd.read_csv('research/processed_data_2_stage/attend.csv', sep=';')
grouped_attend = pd.read_csv('research/processed_data_2_stage/grouped_attend.csv', sep=';')
test = pd.read_csv('research/raw_data/test.csv')

In [72]:
def apk(actual, predicted, k=10):
    if not actual:
        return 0.0

    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [4]:
attend = attend[attend['user_id'].isin(users['user_id'])]
grouped_attend = grouped_attend[grouped_attend['user_id'].isin(users['user_id'])]

In [74]:
cat_features = ['weekday_1', 'weekday_2',
                'user_district', 'user_region',
                'prefer_weekday_1',
                'prefer_weekday_2',
                'category 1',
                'category 2', 'name', 'district',
                'region', 'marker', 'topic']

cat_u_features = ['user_district', 'user_region',
                  'prefer_weekday_1',
                  'prefer_weekday_2',]
cat_g_features = ['category 1',
                  'category 2', 'name', 'district',
                  'region', 'marker', 'weekday_1', 'weekday_2', 'topic']

In [75]:
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import SVDpp
from surprise import KNNBasic
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [76]:
train = grouped_attend.merge(users, how='left', on='user_id')\
    .merge(groups.drop(columns=['weekday_1', 'weekday_2', 'delta', 'start_hour', 'finish_hour']), how='left', on='group_id')

In [77]:
dummy_groups = pd.concat([pd.get_dummies(groups[cat_g_features]), groups.drop(columns=cat_g_features)], axis=1)
dummy_users = pd.concat([pd.get_dummies(users[cat_u_features]), users.drop(columns=cat_u_features)], axis=1)

scaler_g = StandardScaler()
scaler_u = StandardScaler()

pca_users = scaler_u.fit_transform(dummy_users.drop(columns=['user_id']))
pca_groups = scaler_g.fit_transform(dummy_groups.drop(columns=['group_id']))

pca_g = PCA(n_components=380)
pca_u = PCA(n_components=100)

pca_users = pca_u.fit_transform(pca_users)
pca_groups = pca_g.fit_transform(pca_groups)

ValueError: could not convert string to float: 'московский театрал, ландшафтный дизайн, суставная гимнастика, кулинарные курсы, физкультурно-оздоровительные занятия, история, культура россии, акварельная живопись, здоровая спина, литература, авторские курсы/маршруты'

In [None]:
pca_groups = pd.DataFrame(pca_groups, columns=[f'latent_{i}' for i in range(pca_groups.shape[1])])
pca_users = pd.DataFrame(pca_users, columns=[f'latent_{i}' for i in range(pca_users.shape[1])])
pca_groups['group_id'] = dummy_groups['group_id']
pca_users['user_id'] = dummy_users['user_id']

In [None]:
neigh_groups = NearestNeighbors(n_neighbors=5, radius=0.5)
neigh_groups.fit(pca_groups.drop(columns=['group_id']))

neigh_users = NearestNeighbors(n_neighbors=5, radius=0.5)
neigh_users.fit(pca_users.drop(columns=['user_id']))

In [None]:
def get_similar_items(user_items):
    x = pca_groups[pca_groups.group_id.isin(user_items)].drop(columns=['group_id'])
    indexes = neigh_groups.kneighbors(x, return_distance=False).reshape(1, -1).tolist()[0]
    similar_items = pca_groups.iloc[indexes]['group_id'].drop_duplicates().values
    return similar_items

def get_similar_users(user):
    x = pca_users[pca_users.user_id.isin([user])].drop(columns=['user_id'])
    indexes = neigh_users.kneighbors(x, return_distance=False).reshape(1, -1).tolist()[0]
    similar_users = pca_users.iloc[indexes]['user_id'].drop_duplicates().values
    return similar_users

In [None]:
reader = Reader(rating_scale=(1, 94))

data = Dataset.load_from_df(train[['user_id', 'group_id', 'number_of_lessons']], reader)
# test set is made of 25% of the ratings.

In [None]:
trainSet = data.build_full_trainset()
algo = SVD(n_factors=2000, random_state=1)
# cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
algo.fit(trainSet)

In [None]:
popular_groups = list(train[train['group_id'].isin(group_to_recommend.values.reshape(1, -1).tolist()[0])].groupby('group_id')['user_id'].count()[:10].index)

In [None]:
def predict_for_users(users, exclude_user_items=False):
    predicted_groups = []
    fillValue = trainSet.global_mean

    model_items = [trainSet.to_raw_iid(item) for item in trainSet.all_items()]
    for uid in tqdm(users):
        if uid not in [trainSet.to_raw_uid(uid) for uid in trainSet.all_users()]:
            predictions = popular_groups
        else:
            uid = trainSet.to_inner_uid(uid)
            objects_to_predict = []
            user_item_ratings = trainSet.ur[uid]
            user_items = [item for (item,_) in (user_item_ratings)]

            similar_items = get_similar_items([trainSet.to_raw_iid(item_id) for item_id in user_items])
            similar_items = [trainSet.to_inner_iid(item_id) for item_id in similar_items if item_id in model_items]
            for iid in similar_items:
                # get a prediction for specific users and items.
                if exclude_user_items:
                    if iid not in user_items:
                        objects_to_predict.append((trainSet.to_raw_uid(uid), trainSet.to_raw_iid(iid), fillValue))
                else:
                    objects_to_predict.append((trainSet.to_raw_uid(uid), trainSet.to_raw_iid(iid), fillValue))
            predictions = algo.test(objects_to_predict)
            predictions = sorted(predictions, key=lambda x: x.est, reverse=True)
            predictions = [value.iid for value in predictions[:10]]
        predicted_groups.append(predictions)
    return predicted_groups

predicted_train_groups = predict_for_users([trainSet.to_raw_uid(uid) for uid in trainSet.all_users()[:1000]])

In [78]:
actual_train_groups = grouped_attend.groupby('user_id')['group_id'].apply(list)
actual_train_groups = list(actual_train_groups.values)[:1000]

In [79]:
mapk(actual_train_groups, predicted_train_groups, k=10)

NameError: name 'predicted_train_groups' is not defined

In [121]:
train[['user_id', 'group_id', 'number_of_lessons']].to_csv('research/model_data/merged_historical_data.csv', sep=';', index=False)
pca_users.to_csv('research/model_data/pca_users.csv', sep=';', index=False)
pca_groups.to_csv('research/model_data/pca_groups.csv', sep=';', index=False)

In [122]:
import pickle

with open("research/models/SVD.pkl", "wb") as f:
    pickle.dump(algo, f)