Model popularnościowy.

Wszystkie produkty w sklepie oceniane są zgodnie z metryką opartą na popularności (wzorujemy się na metryce wykorzystywanej w serwisie IMDb).

In [14]:
import pandas as pd
import numpy as np

In [15]:
def calculate_raiting(raiting, popularity, minPopularity, avgRaiting) -> pd.Series:
    """Blend a product's own rating with the average rating, weighted by popularity.

    The result is ``p / (p + m) * r + m / (p + m) * a`` where ``r`` is
    ``raiting``, ``p`` is ``popularity``, ``m`` is ``minPopularity`` and ``a``
    is ``avgRaiting``. The two weights sum to one, so the larger ``p`` is
    relative to ``m``, the closer the score is to the product's own rating.

    The arithmetic is element-wise, so ``raiting`` and ``popularity`` may be
    aligned pandas Series (or plain numbers).
    """
    total = popularity + minPopularity
    ownWeight = popularity / total
    averageWeight = minPopularity / total
    return ownWeight * raiting + averageWeight * avgRaiting

def assigne_raiting(productsDF : pd.DataFrame, percentile : float = 80) -> pd.DataFrame:
    """Score the most popular products with the popularity-weighted rating.

    Args:
        productsDF: frame with at least the ``user_rating`` and ``count``
            columns.
        percentile: percentile (0-100) of ``count`` used as the popularity
            threshold; products below it are excluded. Defaults to 80.

    Returns:
        A new DataFrame holding only the products whose ``count`` reaches the
        threshold, with an added ``score`` column. ``productsDF`` itself is
        not modified.
    """
    # The average is taken over ALL products, not only the ones kept below.
    avgRaiting = productsDF['user_rating'].mean()
    minPopularity = np.percentile(productsDF['count'], percentile)
    # Deep copy because the filtered frame is modified and returned as the result.
    products = productsDF[productsDF['count'] >= minPopularity].copy(deep=True)

    products['score'] = calculate_raiting(
        products['user_rating'], products['count'], minPopularity, avgRaiting
    )
    return products

def preprocess_data(productsDF : pd.DataFrame, sessionsDF : pd.DataFrame) -> pd.DataFrame:
    """Join products with the number of session rows that reference them.

    Counts the occurrences of every ``product_id`` in ``sessionsDF`` and
    inner-joins that ``count`` onto ``productsDF`` after removing the
    ``product_name``, ``category_path`` and ``price`` columns. Products that
    never occur in ``sessionsDF`` are therefore absent from the result.
    """
    occurrences = sessionsDF['product_id'].value_counts()
    popularity = occurrences.rename_axis('product_id').reset_index(name='count')
    slimProducts = productsDF.drop(columns=['product_name', 'category_path', 'price'])
    return slimProducts.merge(popularity, how='inner', on='product_id')


In [16]:
# Both inputs are JSON Lines files, hence lines=True when reading them.
productsDataPath = '../notebooks/data/v2/products.jsonl'
productsDF = pd.read_json(productsDataPath, lines=True)

sessionsDataPath = '../notebooks/data/v2/sessions.jsonl'
sessionsDF = pd.read_json(sessionsDataPath, lines=True)

In [17]:
# Count product occurrences in the sessions, then score the popular products.
raiting = assigne_raiting(
    preprocess_data(productsDF, sessionsDF)
)
# Display only: the sorted frame is not stored, `raiting` keeps its original order.
raiting.sort_values(by='score', ascending=False)

Unnamed: 0,product_id,user_rating,count,score
49,1050,4.959925,8849,4.870842
53,1054,4.628316,2903,4.413290
76,1077,4.822859,1182,4.322345
52,1053,4.378637,9644,4.317473
315,1316,4.287721,1351,3.948933
...,...,...,...,...
78,1079,0.695480,1005,1.215417
305,1306,0.501814,955,1.094410
12,1013,0.166660,848,0.911414
54,1055,0.518008,3687,0.708988


Budowanie profilu użytkowników

In [20]:
# Delimiter between the segments of a category path string.
separator = ';'
# Groups that a category path is mapped onto.
newGroups = ['Gry komputerowe', 'Gry na konsole', 'Sprzęt RTV', 'Komputery', 'Telefony i akcesoria']

def castCategoryPath(categoryPath):
    """Map a category path onto the single group from ``newGroups`` it contains.

    The path is split on ``separator`` and its segments are compared for exact
    equality with the entries of ``newGroups``.

    Raises:
        RuntimeError: if the path contains none of the groups, or more than one.
    """
    pathSegments = categoryPath.split(separator)
    matches = []
    for candidate in newGroups:
        if candidate in pathSegments:
            matches.append(candidate)
    if len(matches) == 1:
        return matches[0]
    raise RuntimeError('wrong group cast: {}'.format(matches))

In [50]:
# Attach the category path to every scored product. Only the two columns that
# are needed are selected up front, so no suffixed duplicate columns appear
# and nothing has to be dropped afterwards.
test = raiting[['product_id']].merge(
    productsDF[['product_id', 'category_path']], how='left', on='product_id'
)
# Replace the full category path with the matching group from newGroups.
test['category_path'] = test['category_path'].apply(castCategoryPath)

# After count(), `product_id` holds the number of scored products per group;
# '%' is that group's share of all scored products.
g = test.groupby(['category_path']).count().reset_index()
g['%'] = 100 * g['product_id'] / g['product_id'].sum()
g

Unnamed: 0,category_path,product_id,%
0,Gry komputerowe,19,29.6875
1,Gry na konsole,14,21.875
2,Komputery,16,25.0
3,Sprzęt RTV,12,18.75
4,Telefony i akcesoria,3,4.6875
