In [21]:
import sys
import warnings

import pandas as pd
import numpy as np

warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None

In [22]:
# If the library is not installed - unlock the field (for download from google disk)
!{sys.executable} -m pip install gdown

[0m

In [23]:
# Download lides_data.zip
!gdown --id 18tjWtz1dDj0tW7o1NUNmDeyiLyGYQOtt

Downloading...
From: https://drive.google.com/uc?id=18tjWtz1dDj0tW7o1NUNmDeyiLyGYQOtt
To: /kaggle/working/hidden_task.zip
100%|██████████████████████████████████████| 35.6k/35.6k [00:00<00:00, 31.0MB/s]


In [24]:
!unzip hidden_task.zip

Archive:  hidden_task.zip
  inflating: hostid_url.tsv          
  inflating: qid_query.tsv           
  inflating: qid_url_rating.tsv      


# PFound


### 1.Стандартное решение от Yandex

Исходные данные - Yandex Cup 2022 Analytics
Задача - написать функцию, которая принимает на вход dataframe (после join), а на выходе дает средний pFound по всем query.
Запрещается использовать циклы for для расчет метрики (как полностью, так и ее частей).
Усложнение, если задача показалась легкой - попробуйте обойтись без groupby (не уверен, что это возможно, но вдруг вы справитесь)

In [25]:
# считываем данные
qid_query = pd.read_csv("qid_query.tsv", sep="\t", names=["qid", "query"])
qid_url_rating = pd.read_csv("qid_url_rating.tsv", sep="\t", names=["qid", "url", "rating"])
hostid_url = pd.read_csv("hostid_url.tsv", sep="\t", names=["hostid", "url"])

# делаем join двух таблиц, чтобы было просто брать url с максимальным рейтингом
qid_url_rating_hostid = pd.merge(qid_url_rating, hostid_url, on="url")

In [26]:
qid_url_rating_hostid.head()

Unnamed: 0,qid,url,rating,hostid
0,402111,http://24-job.com/board/job_australia/232-1-2-...,0.07,7
1,402111,http://24-job.com/board/job_australia/232-1-2-...,0.07,7
2,402111,http://802351.info/5964-v-avstralii.html,0.0,13
3,402111,http://auscommunity.com/blog/jobs/,0.0,53
4,402111,http://auscommunity.com/tag/%D1%84%D0%BE%D1%82...,0.0,53


In [27]:
def plook(ind, rels):
    if ind == 0:
        return 1
    return plook(ind-1, rels)*(1-rels[ind-1])*(1-0.15)


def pfound(group):
    max_by_host = group.groupby("hostid")["rating"].max() # максимальный рейтинг хоста
    top10 = max_by_host.sort_values(ascending=False)[:10] # берем топ10 урлов с наивысшим рейтингом
    pfound = 0
    for ind, val in enumerate(top10):
        pfound += val*plook(ind, top10.values)
    return pfound

In [28]:
%%time
qid_pfound = qid_url_rating_hostid.groupby('qid').apply(pfound)

# группируем по qid и вычисляем pfound

pfound_mean = qid_pfound.mean()
print(f'Mean_Pfound (Yandex Solutinon): {pfound_mean}')



Mean_Pfound (Yandex Solutinon): 0.5822199638393889
CPU times: user 23.6 ms, sys: 3.94 ms, total: 27.5 ms
Wall time: 33.8 ms


### 2. Решение без цикла for


In [29]:
#Посмотрим наши данные
df = qid_url_rating_hostid[['qid', 'hostid', 'rating']]
df.head()

Unnamed: 0,qid,hostid,rating
0,402111,7,0.07
1,402111,7,0.07
2,402111,13,0.0
3,402111,53,0.0
4,402111,53,0.0


Получается, у нас есть юзер, и есть его запросы. нам нужны только 10 записей по пользователю которые были с максимальными рейтингами.
Все последующие записи можно не использовать.

In [30]:
df_sort = df.sort_values(by=['qid', 'rating'], ascending=False)

In [31]:
df_top10 = df_sort.groupby(['qid']).head(10)

In [32]:
df_top10.head(15)

Unnamed: 0,qid,hostid,rating
561,692308,1035,0.61
549,692308,551,0.41
569,692308,1155,0.41
537,692308,33,0.14
538,692308,70,0.14
541,692308,259,0.14
542,692308,259,0.14
547,692308,392,0.14
548,692308,393,0.14
552,692308,617,0.14


In [33]:
# Создаем ранг внутри юзера
df_top10['rank'] = df_top10.groupby(['qid']).cumcount()

In [34]:
df_top10.head(15)

Unnamed: 0,qid,hostid,rating,rank
561,692308,1035,0.61,0
549,692308,551,0.41,1
569,692308,1155,0.41,2
537,692308,33,0.14,3
538,692308,70,0.14,4
541,692308,259,0.14,5
542,692308,259,0.14,6
547,692308,392,0.14,7
548,692308,393,0.14,8
552,692308,617,0.14,9


Создадим коэффициенты pBreak, 1 - rels[ ind - 1 ]
Коэфф pBreak зависит от позиции в рейтинге для User, для первого итема 1,для всех остальных - 0.85
Коэфф 1 - rels[ ind - 1 ] зависит от рейтинга, который был у товара

In [35]:
df_top10['pBreak'] = np.where(df_top10['rank'] == 0, 1, 0.85)

In [36]:
df_top10.head()

Unnamed: 0,qid,hostid,rating,rank,pBreak
561,692308,1035,0.61,0,1.0
549,692308,551,0.41,1,0.85
569,692308,1155,0.41,2,0.85
537,692308,33,0.14,3,0.85
538,692308,70,0.14,4,0.85


In [37]:
df_top10['1_pred_rating'] = (1 - df_top10['rating']).shift(1)
df_top10['1_pred_rating'] = df_top10['1_pred_rating'].fillna(1)

In [38]:
df_top10.head()

Unnamed: 0,qid,hostid,rating,rank,pBreak,1_pred_rating
561,692308,1035,0.61,0,1.0,1.0
549,692308,551,0.41,1,0.85,0.39
569,692308,1155,0.41,2,0.85,0.59
537,692308,33,0.14,3,0.85,0.59
538,692308,70,0.14,4,0.85,0.86


Создадим фичу, в которой будет храниться произведение pBreak и 1_pred_rating
Дальше мы их перемножаем на все предыдущие значения для определенного юзера

In [39]:
df_top10['pBreak * 1_pred_rating'] = df_top10['pBreak'] * df_top10['1_pred_rating']

In [40]:
df_top10.head()

Unnamed: 0,qid,hostid,rating,rank,pBreak,1_pred_rating,pBreak * 1_pred_rating
561,692308,1035,0.61,0,1.0,1.0,1.0
549,692308,551,0.41,1,0.85,0.39,0.3315
569,692308,1155,0.41,2,0.85,0.59,0.5015
537,692308,33,0.14,3,0.85,0.59,0.5015
538,692308,70,0.14,4,0.85,0.86,0.731


In [41]:
# Cоздадим часть переменной Plook
df_top10['Plook'] = df_top10.groupby('qid')['pBreak * 1_pred_rating'].cumprod()

In [42]:
df_top10.head()

# Проверка для себя
# 1 * 0.3315 = 0.3315
# 1 * 0.3315 * 0.5015 = 0.166
# 0.166 * 0.5015 = 0.0833
#

Unnamed: 0,qid,hostid,rating,rank,pBreak,1_pred_rating,pBreak * 1_pred_rating,Plook
561,692308,1035,0.61,0,1.0,1.0,1.0,1.0
549,692308,551,0.41,1,0.85,0.39,0.3315,0.3315
569,692308,1155,0.41,2,0.85,0.59,0.5015,0.166247
537,692308,33,0.14,3,0.85,0.59,0.5015,0.083373
538,692308,70,0.14,4,0.85,0.86,0.731,0.060946


In [43]:
# Cоздадим Pfound
df_top10['Pfound'] =  df_top10['Plook'] * df_top10['rating']

df_top10.head()

Unnamed: 0,qid,hostid,rating,rank,pBreak,1_pred_rating,pBreak * 1_pred_rating,Plook,Pfound
561,692308,1035,0.61,0,1.0,1.0,1.0,1.0,0.61
549,692308,551,0.41,1,0.85,0.39,0.3315,0.3315,0.135915
569,692308,1155,0.41,2,0.85,0.59,0.5015,0.166247,0.068161
537,692308,33,0.14,3,0.85,0.59,0.5015,0.083373,0.011672
538,692308,70,0.14,4,0.85,0.86,0.731,0.060946,0.008532


In [44]:
pfound_mean = df_top10.groupby('qid')["Pfound"].sum().mean()
print(f'Mean_Pfound (Our Solution): {pfound_mean}')

Mean_Pfound (Our Solution): 0.5436870277272979


Обернем наш Pfound в функцию. Увеличим ее производтельность, принятием на вход разные pBreak, и top_K user

In [45]:
def pfound(df: pd.DataFrame, pB: float=0.85, top_k: int=10) -> float:
    """
    This function calculate meanPfound
    DataFrame must include features : 'qid, hostid, rating'
    :param df: pdDataFrame - dataframe.
    :param pB: float - value for pBreak
    :param top_k: int - amount best scores by user
    :return: meanPfound for data
    """

    check_1 = ('qid' in df.columns,
               'Column qid NotFound in df'
              )
    check_2 = ('hostid' in df.columns,
               'Column hostid NotFound in df'
              )
    check_3 = ('rating' in df.columns,
               'Column rating NotFound in df'
              )

    assert check_1
    assert check_2
    assert check_3


    df = df.sort_values(by=['qid', 'rating'], ascending=False)
    df = df_sort.groupby(['qid']).head(top_k)
    df['rank'] = df.groupby(['qid']).cumcount()
    df['pBreak'] = np.where(df['rank'] == 0, 1, pB)
    df['1_pred_rating'] = (1 - df['rating']).shift(1)
    df['1_pred_rating'] = df['1_pred_rating'].fillna(1)
    df['pBreak * 1_pred_rating'] = df['pBreak'] * df['1_pred_rating']
    df['Plook'] = df.groupby('qid')['pBreak * 1_pred_rating'].cumprod()
    df['Pfound'] =  df['Plook'] * df['rating']

    pfound_mean = df.groupby('qid')["Pfound"].sum().mean()

    return pfound_mean


In [46]:
%%time

result = pfound(qid_url_rating_hostid)
print(f'Mean_Pfound (Our Solution): {result}')


Mean_Pfound (Our Solution): 0.5436870277272979
CPU times: user 14.3 ms, sys: 34 µs, total: 14.3 ms
Wall time: 13.6 ms
