In [1]:
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from typing import Tuple, Union, Optional, List, Dict, Set
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender
from numpy.core.multiarray import ndarray
from scipy.sparse import coo_matrix
from ml_metrics import mapk
from tqdm import tqdm
from implicit.als import AlternatingLeastSquares
from sklearn.model_selection import GroupShuffleSplit
import lightgbm as lgb
from sklearn import preprocessing

In [2]:
def get_target(data: pd.DataFrame) -> Dict[int, List[int]]:
    target = dict()
    for row in data.itertuples():
        target[int(row.user_id)] = [int(row.course_id)]
    return target

def train_val_split(dataset: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[int, List[int]]]:
    X = dataset.copy()
    X.reset_index(inplace=True)
    X.sort_values(["user_id", "index"], inplace=True)
    
    X["order"] = 1
    X["order"] = X.groupby("user_id")["order"].cumsum()
    
    max_order = X.groupby("user_id", as_index=False)["order"].max()
    max_order["is_max_order"] = 1
    
    X = X.merge(max_order, "left", on=["user_id", "order"])
    X["is_max_order"] = X["is_max_order"].fillna(0).astype(int)
    
    courses_in_train = X.loc[(X["is_max_order"] == 0) | ((X["is_max_order"] == 1) & (X["order"] == 1)), "course_id"].unique()
    
    mask = (X["is_max_order"] == 1) & (X["order"] != 1) & (X["course_id"].isin(courses_in_train))
    
    X_train = X.loc[~mask, ['user_id','course_id']].copy()
    y_train = get_target(X.loc[mask, ['user_id','course_id']])
    return X_train, y_train

def transform_with_transformation_inplace(column, data, transformation):
    _drop_non_transformed_rows(column, data, transformation)

    non_null_values_mask = ~data[column].isnull().values
    non_null_values = data.loc[non_null_values_mask, column].values.astype(int, copy=False)
    transformed = transformation.values[np.searchsorted(transformation.index.values, non_null_values)]
    del non_null_values
    gc.collect()

    data.loc[non_null_values_mask, column] = transformed
    del non_null_values_mask
    gc.collect()

    data[column] = data[column].astype(int, errors="ignore")


def _drop_non_transformed_rows(column, data, transformation):
    not_transformed = ~fast_isin_for_sorted_test_elements(data[column], transformation.index)
    data.drop(index=data.index[~pd.isnull(data[column]) & not_transformed], inplace=True)


def fast_isin_for_sorted_test_elements(elements: np.ndarray, sorted_test_elements: Union[np.ndarray, pd.Series]) -> np.ndarray:
    if isinstance(sorted_test_elements, pd.Series):
        sorted_test_elements = sorted_test_elements.values

    if sorted_test_elements.size == 0:
        return np.zeros(elements.size, dtype=np.bool)
    if sorted_test_elements.size == 1:
        return elements == sorted_test_elements[0]
    ss_result_left = np.searchsorted(sorted_test_elements, elements, side="left")
    ss_result_left.clip(max=len(sorted_test_elements) - 1, out=ss_result_left)
    result = elements == sorted_test_elements[ss_result_left]
    return result


def get_id_transformations(data: pd.DataFrame, column: str) -> Tuple[pd.Series, pd.Series]:
    """E.g.
        data = pd.DataFrame({'id': [10, None, 30], 'value': [0.1, 0.3, 0.2]})
        column = 'id'

        Returns:
            pd.Series([10, 30, NONE_CONSTANT], index=[0, 1, 2], name='id')
        pd.Series([0, 1, 2], index=[10, 30, NONE_CONSTANT], name='id')
    """
    add_nan = False
    values = data[column].unique()

    if any(pd.isnull(values)):
        values = values[~pd.isnull(values)]
        add_nan = True

    index = np.arange(len(values))
    transformation = pd.Series(values, index=index, name="_" + column)

    if add_nan:
        transformation = transformation.append(pd.Series([NONE_CONSTANT], index=[transformation.index.max() + 1]))

    revert_transformation = pd.Series(transformation.index, index=transformation.values, name="_" + column)
    revert_transformation.sort_index(inplace=True)

    return transformation, revert_transformation


def get_course_user_matrix(
    products: pd.Series, users: pd.Series, matrix_shape: Tuple[int, int], max_value: Optional[int] = None
) -> coo_matrix:
    if len(products) != len(users):
        raise ValueError("Series with products and users must be the same length")

    data = np.ones(products.shape[0])
    matrix = coo_matrix((data, (products, users)), shape=matrix_shape, dtype=np.float32)
    if max_value is not None:
        matrix = matrix.tocsr()
        matrix.data = matrix.data.clip(max=max_value)
        matrix = matrix.tocoo()
    return matrix

def get_recommendations(model: Union[CosineRecommender, TFIDFRecommender], product_user_matrix: coo_matrix, user_ids_to_predict: ndarray, user_id_to_compressed_user_id: pd.Series, compressed_product_id_to_product_id: pd.Series, n: int = 3) -> List[ndarray]:
    recommendations = list()
    user_product_matrix = product_user_matrix.T.tocsr()

    for user_id, transformed_user_id in tqdm(zip(user_ids_to_predict, user_id_to_compressed_user_id[user_ids_to_predict].values)):
        recommendation = model.recommend(userid=transformed_user_id, user_items=user_product_matrix, N=n, recalculate_user=False)
        recommendation = [id for (id, score) in recommendation]
        recommendation = compressed_product_id_to_product_id[np.array(recommendation)].values.tolist()
        recommendations.append(recommendation)
    return recommendations

def add_missing_recommendations(reco: List[int], reco_2: List[int]) -> List[int]:
    for i in reco_2:
        if len(reco) == 3:
            break
        if i in reco:
            continue
        reco += [i]
    return reco

def get_submission(test: pd.DataFrame) -> pd.DataFrame:
    submission = pd.DataFrame({"Id": test["user_id"].values})
    recommendations_as_str = list()

    for row in test.itertuples():
        if row.Predicted is np.nan:
            recommendations_as_str.append("7 1 15")
        else:
            recommendations_as_str.append(" ".join([str(i) for i in row.Predicted]))
    submission["Predicted"] = recommendations_as_str
    return submission

def convert_to_datetime(creation_datetime: str) -> datetime:
    if "\ufeff" in creation_datetime:
        creation_datetime = creation_datetime.replace("\ufeff", "")
    return datetime.strptime(creation_datetime, '%d-%m-%Y %H:%M')

In [3]:
N_RECO = 50

In [4]:
assessment_train = pd.read_csv("assessment_train.csv", index_col=0)
course = pd.read_csv("course.csv", delimiter=";") 
lessons_homework_train = pd.read_csv("lessons_homework_train.csv", index_col=0)
user_course_train = pd.read_csv("user_course_train.csv", index_col=0)
test = pd.read_csv("test_ids.txt", header=None, names=["user_id"])

In [5]:
columns = [
    "id",
    "salary",
    "enabled",
    "shortname",
    "visible",
    "assessment_enabled",
    "status",
    "is_partner",
    "is_success_experiment",
    "created_date",
    "is_specialization"]

In [6]:
course_features = course[columns].copy()

In [7]:
course_features[["salary", "enabled"]] = course_features[["salary", "enabled"]].fillna(-1).astype(int)
course_features["shortname"] = course_features["shortname"].fillna("Unknown")
course_features["created_date"] = course_features["created_date"].fillna("2022-01-01")

course_features["created_date"] = pd.to_datetime(course_features["created_date"])
course_features["created_date"] = (pd.datetime(2021, 5, 15) - course_features["created_date"]).dt.days

le = preprocessing.LabelEncoder()
course_features["shortname"] = le.fit_transform(course_features["shortname"].values)
course_features["status"] = le.fit_transform(course_features["status"].values)
course_features.rename(columns={"id": "course_id"}, inplace=True)

  course_features["created_date"] = (pd.datetime(2021, 5, 15) - course_features["created_date"]).dt.days


In [8]:
user_features = user_course_train[["user_id", "created"]].copy()

In [9]:
user_features["created"] = user_features["created"].str.replace(".", "-")
user_features["created"] = user_features["created"].apply(convert_to_datetime)
user_features["created"] = (pd.datetime(2021, 5, 15) - user_features["created"]).dt.days

  user_features["created"] = (pd.datetime(2021, 5, 15) - user_features["created"]).dt.days


In [10]:
user_features = user_features.groupby(["user_id"], as_index=False).agg({"created": ["min", "max"]})
user_features.columns = ["user_id", "first_date", "last_date"]

In [11]:
X = user_course_train[['user_id','course_id']].copy().drop_duplicates()

In [12]:
assessment_ids = assessment_train["user_id"].unique()
user_course_train_ids = user_course_train["user_id"].unique()
test_ids = test["user_id"].unique()

In [13]:
assessment_ids = np.intersect1d(assessment_ids, test_ids)
group_1 = np.intersect1d(user_course_train_ids, test_ids)
group_2 = assessment_ids[~np.isin(assessment_ids, group_1)]
group_3 = test_ids[~np.isin(test_ids, np.union1d(group_1, group_2))]

# Рекомендации для пользователей, по которым есть информация по оплате и оценкам

### Обучение

In [14]:
# Train
X_train, y_val = train_val_split(X)

n_users = X_train["user_id"].nunique()
n_courses = X_train["course_id"].nunique()

compressed_user_id_to_user_id, user_id_to_compressed_user_id = get_id_transformations(X_train, "user_id")
compressed_course_id_to_course_id, course_id_to_compressed_course_id = get_id_transformations(X_train, "course_id")

transform_with_transformation_inplace("user_id", X_train, user_id_to_compressed_user_id)
transform_with_transformation_inplace("course_id", X_train, course_id_to_compressed_course_id)

course_user_matrix_shape = (len(compressed_course_id_to_course_id), len(compressed_user_id_to_user_id))
course_user_matrix = get_course_user_matrix(X_train["course_id"], X_train["user_id"], course_user_matrix_shape)

model_1 = TFIDFRecommender(K=150)
model_1.fit(course_user_matrix)

model_2 = CosineRecommender(K=150)
model_2.fit(course_user_matrix)

actual = [course for course in y_val.values()]
user_ids_to_predict = np.array([user_id for user_id in y_val.keys()])
predicted_1_1 = get_recommendations(model_1, course_user_matrix, user_ids_to_predict, user_id_to_compressed_user_id, compressed_course_id_to_course_id, N_RECO)
predicted_1_2 = get_recommendations(model_2, course_user_matrix, user_ids_to_predict, user_id_to_compressed_user_id, compressed_course_id_to_course_id, N_RECO)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=173.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=173.0), HTML(value='')))

148it [00:00, 1423.74it/s]




3017it [00:01, 2543.31it/s]
3017it [00:01, 2469.76it/s]


In [15]:
mapk(actual, predicted_1_1, 3)

0.1851729090708209

In [16]:
mapk(actual, predicted_1_2, 3)

0.1858358192464921

In [17]:
candidates = list()

for i, j in zip(predicted_1_1, predicted_1_2):
    all_candidates = list(set(i) | set(j))
    if len(all_candidates) < 3:
        for wtf in [7, 1, 15]:
            if wtf in all_candidates:
                continue
            if len(all_candidates) == 3:
                break
            all_candidates += [wtf]
    candidates.append(all_candidates)

In [18]:
minn = 10
for i in candidates:
    minn = min(len(i), minn)
print(minn)

3


In [19]:
def get_recommendations_2(model_1: CosineRecommender, model_2: TFIDFRecommender, product_user_matrix: coo_matrix, user_ids_to_predict: ndarray, candidates: List[List[int]], user_id_to_compressed_user_id: pd.Series, course_id_to_compressed_course_id: pd.Series, compressed_course_id_to_course_id: pd.Series) -> List[ndarray]:
    result_1 = list()
    result_2 = list()
    user_product_matrix = product_user_matrix.T.tocsr()
    print(user_product_matrix.shape)

    for user_id, transformed_user_id, cand in tqdm(zip(user_ids_to_predict, user_id_to_compressed_user_id[user_ids_to_predict].values, candidates)):
        selected_items = course_id_to_compressed_course_id[cand].values
        
        rank_score_1 = model_1.rank_items(userid=transformed_user_id, user_items=user_product_matrix, selected_items=selected_items, recalculate_user=False)
        reco_1 = [reco for (reco, score) in rank_score_1]
        reco_1 = compressed_course_id_to_course_id[reco_1].values
        rank_1 = np.arange(len(reco_1))
        score_1 = [score for (reco, score) in rank_score_1]
        
        rank_score_2 = model_2.rank_items(userid=transformed_user_id, user_items=user_product_matrix, selected_items=selected_items, recalculate_user=False)
        reco_2 = [reco for (reco, score) in rank_score_2]
        reco_2 = compressed_course_id_to_course_id[reco_2].values
        rank_2 = np.arange(len(reco_2))
        score_2 = [score for (reco, score) in rank_score_2]
        
        result_1.append(pd.DataFrame({"user_id": user_id, "course_id": reco_1, "rank_1": rank_1, "score_1": score_1}))
        result_2.append(pd.DataFrame({"user_id": user_id, "course_id": reco_2, "rank_2": rank_2, "score_2": score_2}))
    return result_1, result_2

In [20]:
result_1, result_2 = get_recommendations_2(model_1, model_2, course_user_matrix, user_ids_to_predict, candidates, user_id_to_compressed_user_id, course_id_to_compressed_course_id, compressed_course_id_to_course_id)

15it [00:00, 139.04it/s]

(5830, 173)


3017it [00:18, 160.26it/s]


In [21]:
result_1 = pd.concat(result_1, ignore_index=True)
result_2 = pd.concat(result_2, ignore_index=True)
result = result_1.merge(result_2, "left", ["user_id", "course_id"])

In [22]:
assert np.intersect1d(result_1["user_id"].unique(), user_ids_to_predict).shape[0] == 3017
assert np.intersect1d(result_2["user_id"].unique(), user_ids_to_predict).shape[0] == 3017

In [23]:
result["mean_rank"] = result[["rank_1", "rank_2"]].mean(axis=1)
result = result.sort_values(["user_id", "mean_rank"]).groupby("user_id", as_index=False).head(3)
result = result.groupby("user_id", as_index=False).agg({"course_id": lambda x: list(x.values)})

res = list()
for row in result.itertuples():
    res.append(row.course_id)
    
mapk(actual, res, 3)

0.18456524140978894

### Предсказание

In [24]:
X = user_course_train[['user_id','course_id']].copy().drop_duplicates()

In [25]:
n_users = X["user_id"].nunique()
n_courses = X["course_id"].nunique()

compressed_user_id_to_user_id, user_id_to_compressed_user_id = get_id_transformations(X, "user_id")
compressed_course_id_to_course_id, course_id_to_compressed_course_id = get_id_transformations(X, "course_id")

transform_with_transformation_inplace("user_id", X, user_id_to_compressed_user_id)
transform_with_transformation_inplace("course_id", X, course_id_to_compressed_course_id)

course_user_matrix_shape = (len(compressed_course_id_to_course_id), len(compressed_user_id_to_user_id))
course_user_matrix = get_course_user_matrix(X["course_id"], X["user_id"], course_user_matrix_shape)

model_1 = TFIDFRecommender(K=150)
model_1.fit(course_user_matrix)

model_2 = CosineRecommender(K=150)
model_2.fit(course_user_matrix)

predicted_1_1 = get_recommendations(model_1, course_user_matrix, group_1, user_id_to_compressed_user_id, compressed_course_id_to_course_id, N_RECO)
predicted_1_2 = get_recommendations(model_2, course_user_matrix, group_1, user_id_to_compressed_user_id, compressed_course_id_to_course_id, N_RECO)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=173.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=173.0), HTML(value='')))

173it [00:00, 1713.93it/s]




803it [00:00, 2396.20it/s]
803it [00:00, 2749.55it/s]


In [26]:
candidates = list()

for i, j in zip(predicted_1_1, predicted_1_2):
    all_candidates = list(set(i) | set(j))
    if len(all_candidates) < 3:
        for wtf in [7, 1, 15]:
            if wtf in all_candidates:
                continue
            if len(all_candidates) == 3:
                break
            all_candidates += [wtf]
    candidates.append(all_candidates)

In [27]:
minn = 10
for i in candidates:
    minn = min(len(i), minn)
print(minn)

10


In [28]:
result_1, result_2 = get_recommendations_2(model_1, model_2, course_user_matrix, group_1, candidates, user_id_to_compressed_user_id, course_id_to_compressed_course_id, compressed_course_id_to_course_id)

0it [00:00, ?it/s]

(5830, 173)


803it [00:04, 175.74it/s]


In [29]:
result_1 = pd.concat(result_1, ignore_index=True)
result_2 = pd.concat(result_2, ignore_index=True)
result = result_1.merge(result_2, "left", ["user_id", "course_id"])

In [30]:
assert np.intersect1d(result_1["user_id"].unique(), group_1).shape[0] == 803
assert np.intersect1d(result_2["user_id"].unique(), group_1).shape[0] == 803

In [31]:
result["mean_rank"] = result[["rank_1", "rank_2"]].mean(axis=1)
result = result.sort_values(["user_id", "mean_rank"]).groupby("user_id", as_index=False).head(3)
predicted_1 = result.groupby("user_id", as_index=False).agg({"course_id": lambda x: list(x.values)})
predicted_1.rename(columns={"course_id": "Predicted"}, inplace=True)

### Добавляем к ркомендациям оценки

In [32]:
group_1_with_assessment = assessment_train.loc[assessment_train["user_id"].isin(group_1)]["user_id"].unique()

In [33]:
dat_a = assessment_train.loc[assessment_train["user_id"].isin(group_1_with_assessment)].copy()
dat_a["course_id"] = dat_a["course_id"].astype(int)
dat_a = dat_a.groupby("user_id")["course_id"].apply(lambda x: set(x.values))
dat_c = user_course_train.loc[user_course_train["user_id"].isin(group_1_with_assessment)].groupby("user_id")["course_id"].apply(lambda x: set(x.values))

In [34]:
dat = dat_a - dat_c

In [35]:
predicted_1_1 = predicted_1.loc[predicted_1["user_id"].isin(group_1_with_assessment)].copy()

In [36]:
def add_missing_recommendations2(reco1: Set[int], reco2: List[int]) -> List[int]:
    reco = list(reco1)[:3]
    
    for i in reco2:
        if len(reco) == 3:
            break
        if i in reco:
            continue
        reco += [i]
    return reco

In [37]:
reco = list()

for i, j in zip(dat, predicted_1_1["Predicted"]):
    reco.append(add_missing_recommendations2(i, j))

In [38]:
predicted_1_2 = predicted_1.loc[predicted_1["user_id"].isin(group_1[~np.isin(group_1, group_1_with_assessment)])].copy()

In [39]:
predicted_1_1 = pd.DataFrame({"user_id": predicted_1_1["user_id"].values, "Predicted": reco})
predicted_1 = pd.concat((predicted_1_1, predicted_1_2), ignore_index=True)

## Рекомендации для пользователей, по которым есть информация только про оценки

In [40]:
X_2 = assessment_train.loc[assessment_train["user_id"].isin(group_2), ["user_id", "course_id"]].copy()
X_2.drop_duplicates(inplace=True)

In [41]:
X = user_course_train[['user_id','course_id']].copy().drop_duplicates()
X = pd.concat((X, X_2), ignore_index=True)
X["course_id"] = X["course_id"].astype(int)

In [42]:
n_users = X["user_id"].nunique()
n_courses = X["course_id"].nunique()

compressed_user_id_to_user_id, user_id_to_compressed_user_id = get_id_transformations(X, "user_id")
compressed_course_id_to_course_id, course_id_to_compressed_course_id = get_id_transformations(X, "course_id")

transform_with_transformation_inplace("user_id", X, user_id_to_compressed_user_id)
transform_with_transformation_inplace("course_id", X, course_id_to_compressed_course_id)

course_user_matrix_shape = (len(compressed_course_id_to_course_id), len(compressed_user_id_to_user_id))
course_user_matrix = get_course_user_matrix(X["course_id"], X["user_id"], course_user_matrix_shape)

model_1 = TFIDFRecommender(K=150)
model_1.fit(course_user_matrix)

# model_2 = CosineRecommender(K=150)
# model_2.fit(course_user_matrix)

predicted_2_1 = get_recommendations(model_1, course_user_matrix, group_2, user_id_to_compressed_user_id, compressed_course_id_to_course_id, 3)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=174.0), HTML(value='')))

356it [00:00, 2254.53it/s]







In [43]:
X_2["order"] = 1
X_2["order"] = X_2.groupby("user_id")["order"].cumsum()
X_2 = X_2.loc[X_2["order"] <= 3].copy()
X_2["course_id"] = X_2["course_id"].astype(int)
X_2 = X_2.groupby("user_id")["course_id"].apply(lambda x: list(x.values)).reset_index()

In [44]:
assessment_ = list()
for row in X_2.itertuples():
    assessment_.append(row.course_id)

In [45]:
predicted_2 = list()
for i, j in zip(assessment_, predicted_2_1):
    predicted_2.append(add_missing_recommendations(i, j))

X_2.rename(columns={"course_id": "Predicted"}, inplace=True)

## Рекомендации для пользователей, по которым нет информации

In [46]:
prediction = pd.concat((predicted_1, X_2), ignore_index=True)

In [47]:
X = user_course_train[["user_id", "course_id", "created"]].drop_duplicates(subset="user_id").copy()
X["created"] = X["created"].str.replace(".", "-")
X["created"] = X["created"].apply(convert_to_datetime)
X["date"] = X["created"].dt.date

In [48]:
a = X.copy()
a["cnt"] = 1
a = a.pivot_table(values="cnt", index="date", columns="course_id", aggfunc="sum", fill_value=0, dropna=False)
a = a.reset_index()
a["date"] = pd.to_datetime(a["date"])

In [49]:
b = pd.DataFrame({"date": pd.date_range('2018-10-10', periods=888, freq='D')})
b = b.merge(a, "left", "date")
b = b.fillna(0)

In [50]:
b = b.rolling(5, min_periods=0).sum()

In [51]:
b["date"] = pd.DataFrame({"date": pd.date_range('2018-10-10', periods=888, freq='D')})
b = b.set_index("date")
b = b.rank(axis=1, method="min")

In [52]:
course_popularity = list()

In [53]:
for col in b.T.columns:
    top_3 = b.T.nlargest(3, col).index.values.tolist()
    course_popularity.append(top_3) # " ".join([str(i) for i in top_3])

In [54]:
course_popularity = pd.DataFrame({"date": b.T.columns.values, "Predicted": course_popularity})

In [55]:
X = X.groupby("date").agg({"user_id": "max"})
X["cum"] = X["user_id"].cummax()
X.drop(columns="user_id", inplace=True)
X = X.reset_index()

In [56]:
c = pd.DataFrame({"date": np.nan, "cum": group_3})

In [57]:
c = pd.concat((X, c), ignore_index=True)
c.sort_values("cum", inplace=True)
c["date"] = c["date"].fillna(method='backfill')
c["date"] = pd.to_datetime(c["date"])

In [58]:
c = c.merge(course_popularity, "left", "date")
c.rename(columns={"cum": "user_id"}, inplace=True)
c = c.loc[c["user_id"].isin(group_3)].copy()
c.drop(columns="date", inplace=True)

In [59]:
prediction = pd.concat((prediction, c), ignore_index=True)

In [60]:
test = test.merge(prediction, "left", "user_id")

In [61]:
submission = get_submission(test) 
submission.to_csv("submission_25.csv", index=False)