In [17]:
import numpy as np
import pandas as pd
from itertools import chain
from collections import Counter

In [18]:
df = pd.read_csv("./coursera_sessions_train.txt", sep=";", header=None)
df.columns = ["watched", "bought"]

In [19]:
df.head()

Unnamed: 0,watched,bought
0,012345,
1,9101191112911,
2,161718192021,
3,2425262724,
4,343536343735363738393839,


In [20]:
def handle_row(x):
    if x == x:
        return list(map(int, filter(None, x.split(','))))
    else:
        return np.nan

In [21]:
df["watched"] = df.watched.apply(handle_row)

In [22]:
df["bought"] = df.bought.apply(handle_row)

In [23]:
df.head()

Unnamed: 0,watched,bought
0,"[0, 1, 2, 3, 4, 5]",
1,"[9, 10, 11, 9, 11, 12, 9, 11]",
2,"[16, 17, 18, 19, 20, 21]",
3,"[24, 25, 26, 27, 24]",
4,"[34, 35, 36, 34, 37, 35, 36, 37, 38, 39, 38, 39]",


In [24]:
cnt_watched = Counter(
     list(chain.from_iterable(
        df.watched.dropna().values
    ))
)

In [25]:
cnt_bought = Counter(
     list(chain.from_iterable(
        df.bought.dropna().values
    ))
)

In [26]:
df.head()

Unnamed: 0,watched,bought
0,"[0, 1, 2, 3, 4, 5]",
1,"[9, 10, 11, 9, 11, 12, 9, 11]",
2,"[16, 17, 18, 19, 20, 21]",
3,"[24, 25, 26, 27, 24]",
4,"[34, 35, 36, 34, 37, 35, 36, 37, 38, 39, 38, 39]",


In [38]:
def recommendation(arr, recommendor, k=1):
    rec = np.array(list(map(
            lambda x: recommendor[x], 
            arr
        )))
    k = min(k, len(rec))
    rec = rec.argsort()[-k:][::-1]
    return list(map(lambda x: arr[x], rec))

In [39]:
def prec_and_rec_at_k(watched, bought, recommendor, k):
    rec = recommendation(watched, recommendor, k)
    k = len(rec)
    right_pred = sum([recommend in rec for recommend in bought])
    return right_pred/k, right_pred/len(bought)

In [40]:
df.dropna(inplace=True)

In [48]:
def handle_k(k):
    df["precision_recall_watched"] = \
        df[["watched", "bought"]].apply(
            lambda x: prec_and_rec_at_k(x["watched"], x["bought"], 
                                        cnt_watched, 1),
            axis=1
        )
    df["precision_watched"] = df["precision_recall_watched"].apply(
        lambda x: x[0]
    )
    df["recall_watched"] = df["precision_recall_watched"].apply(
        lambda x: x[1]
    )
    return df["precision_watched"].mean(), df["recall_watched"].mean()

In [51]:
def results():
    res1, res2 = handle_k(1)
    res3, res4 = handle_k(5)
    return res1, res2, res3, res4

In [52]:
results()

(0.5171840354767184,
 0.4473807134907463,
 0.5171840354767184,
 0.4473807134907463)

In [72]:
lookat_res = [
    df["recall_1_lookat"].mean(),
    df["precision_1_lookat"].mean(),
    df["recall_5_lookat"].mean(),
    df["precision_5_lookat"].mean(),
]

In [73]:
buy_res = [
    df["recall_1_buy"].mean(),
    df["precision_1_buy"].mean(),
    df["recall_5_buy"].mean(),
    df["precision_5_buy"].mean(),
]

In [74]:
# Функция сохранения в файл ответа, представленного массивом
def save_answerArray(fname,array):
    with open(fname,"w") as fout:
        fout.write(" ".join([str(el) for el in array]))

In [75]:
results = list(map(lambda x: "{0:.2f}".format(x), lookat_res))

In [76]:
save_answerArray("test1", results)

In [77]:
results = list(map(lambda x: "{0:.2f}".format(x), buy_res))

In [78]:
save_answerArray("test2", results)