In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import fasttext
from fasttext import util
import numpy as np

In [2]:
orders = pd.read_csv('data/Orders.csv')
products = pd.read_csv('data/Products.csv')

In [3]:
products.head()

Unnamed: 0,Name,BarCode,Price,LoaltyCoeff,Point_id
0,Chocolate Sandwich Cookies,903304753514,21.6,0.99,6191
1,All-Seasons Salt,401309075522,25.9,0.58,6870
2,Robust Golden Unsweetened Oolong Tea,851892036179,8.47,0.66,406
3,Smart Ones Classic Favorites Mini Rigatoni Wit...,445454311328,13.53,0.77,5459
4,Green Chile Anytime Sauce,304753955703,22.73,0.24,4954


In [4]:
orders.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_last_order,product_id,reordered
0,2539329,1,1,2,8,,196,0
1,2539329,1,1,2,8,,14084,0
2,2539329,1,1,2,8,,12427,0
3,2539329,1,1,2,8,,26088,0
4,2539329,1,1,2,8,,26405,0


In [5]:
orders_product = orders.merge(products, left_on='product_id', left_index=False, right_index=True)

In [6]:
orders_product

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_last_order,product_id,reordered,Name,BarCode,Price,LoaltyCoeff,Point_id
0,2539329,1,1,2,8,,196,0,Cold Brew Coffee Tahitian Vanilla,284125606973,16.19,0.39,7309
5,2398795,1,2,3,7,15.0,196,1,Cold Brew Coffee Tahitian Vanilla,284125606973,16.19,0.39,7309
11,473747,1,3,3,12,21.0,196,1,Cold Brew Coffee Tahitian Vanilla,284125606973,16.19,0.39,7309
16,2254736,1,4,4,7,29.0,196,1,Cold Brew Coffee Tahitian Vanilla,284125606973,16.19,0.39,7309
21,431534,1,5,4,15,28.0,196,1,Cold Brew Coffee Tahitian Vanilla,284125606973,16.19,0.39,7309
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31866630,1320836,202557,17,2,15,1.0,43553,1,Max Fresh Whitening Fluoride Toothpaste With M...,280170585114,4.35,0.28,7080
31866648,31526,202557,18,5,11,3.0,43553,1,Max Fresh Whitening Fluoride Toothpaste With M...,280170585114,4.35,0.28,7080
32011605,758936,203436,1,2,7,,42338,0,Italian Salad Dressing & Seasoning Mix Packets,603660604622,4.28,0.42,7364
32011621,2745165,203436,2,3,5,15.0,42338,1,Italian Salad Dressing & Seasoning Mix Packets,603660604622,4.28,0.42,7364


In [7]:
products.head()

Unnamed: 0,Name,BarCode,Price,LoaltyCoeff,Point_id
0,Chocolate Sandwich Cookies,903304753514,21.6,0.99,6191
1,All-Seasons Salt,401309075522,25.9,0.58,6870
2,Robust Golden Unsweetened Oolong Tea,851892036179,8.47,0.66,406
3,Smart Ones Classic Favorites Mini Rigatoni Wit...,445454311328,13.53,0.77,5459
4,Green Chile Anytime Sauce,304753955703,22.73,0.24,4954


In [8]:
comments = pd.read_csv('data/RatingsComments.csv')
comments2 = pd.read_csv('data/RatingsComments2.csv')

In [9]:
points = pd.read_csv('data/Points.csv')

In [10]:
users = pd.read_csv('data/Users.csv')
users.head()

Unnamed: 0,ID,Email Address,LastName,FirstName,Username,DateJoined
0,1,Ronald_Ross9106@yvu30.app,Ross,Ronald,Ronald Ross,2022-05-09 14:55:54Z
1,2,Blake_Mcneill4917@lyvnc.space,Mcneill,Blake,Blake Mcneill,2022-03-13 18:06:07Z
2,3,Kenzie_Sherwood2227@d9un8.space,Sherwood,Kenzie,Kenzie Sherwood,2022-05-10 05:37:21Z
3,4,Matthew_Exton9191@chkzl.mobi,Exton,Matthew,Matthew Exton,2022-03-06 18:00:55Z
4,5,Chuck_Hill4239@jh02o.design,Hill,Chuck,Chuck Hill,2022-11-11 06:58:49Z


In [11]:
model_en = fasttext.load_model('models/cc.en.25.bin')

In [12]:
def text2vector(text: np.array):
    vectors = []

    for subtext in text:
        try:
            vector = model_en.get_word_vector(subtext)
        except Exception as e:
            continue

        vectors.append(vector)
    
    return vectors

## Рекомендация доступного товара вместо отсутствующего

In [13]:
def recommend_other_products(product_id=3, n_neighbors_amount=10):
    product_name = products[products.index == product_id].Name.values[0]
    product_vector = model_en.get_word_vector(product_name)
    
    other_products_vectors = text2vector(products[products.index != product_id].Name.values)

    knn = NearestNeighbors(n_neighbors=n_neighbors_amount)
    knn.fit(np.array(other_products_vectors))
    return knn.kneighbors(product_vector.reshape(1, -1), return_distance=False)[0]

In [14]:
recommend_other_products(3, 5)


array([ 5578, 24656, 41113, 23539,  8883])

## Поиск похожих пользователей

In [15]:
user_id = 3
comments2.head()

Unnamed: 0,name,rating,review,user_id
0,#FeelTheROLL,5.0,Had an egg chicken roll and a paneer roll Real...,1
1,#FeelTheROLL,5.0,Not just the Roll but the filling tastes great...,2
2,#FeelTheROLL,4.5,Very nice place complete value for money High...,3
3,#FeelTheROLL,5.0,Had an amazing mouthwatering chicken rollWorth...,4
4,#L-81 Cafe,4.0,This little cafe is set in a very beautiful lo...,5


In [16]:
orders_product

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_last_order,product_id,reordered,Name,BarCode,Price,LoaltyCoeff,Point_id
0,2539329,1,1,2,8,,196,0,Cold Brew Coffee Tahitian Vanilla,284125606973,16.19,0.39,7309
5,2398795,1,2,3,7,15.0,196,1,Cold Brew Coffee Tahitian Vanilla,284125606973,16.19,0.39,7309
11,473747,1,3,3,12,21.0,196,1,Cold Brew Coffee Tahitian Vanilla,284125606973,16.19,0.39,7309
16,2254736,1,4,4,7,29.0,196,1,Cold Brew Coffee Tahitian Vanilla,284125606973,16.19,0.39,7309
21,431534,1,5,4,15,28.0,196,1,Cold Brew Coffee Tahitian Vanilla,284125606973,16.19,0.39,7309
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31866630,1320836,202557,17,2,15,1.0,43553,1,Max Fresh Whitening Fluoride Toothpaste With M...,280170585114,4.35,0.28,7080
31866648,31526,202557,18,5,11,3.0,43553,1,Max Fresh Whitening Fluoride Toothpaste With M...,280170585114,4.35,0.28,7080
32011605,758936,203436,1,2,7,,42338,0,Italian Salad Dressing & Seasoning Mix Packets,603660604622,4.28,0.42,7364
32011621,2745165,203436,2,3,5,15.0,42338,1,Italian Salad Dressing & Seasoning Mix Packets,603660604622,4.28,0.42,7364


In [17]:
point_vectors = []
for point_id in points.index.values:
    point_vector = np.mean(text2vector(products[products.Point_id == 5459].Name.values), axis=0)
    point_vectors.append(point_vector)
point_vectors = np.array(point_vectors)

In [18]:
def get_related_users(user_id, n_neighbors_amount=10):
    vectors_num = 0
    comments_vector = np.zeros(25)
    product_vector = np.zeros(25)
    point_vector = np.zeros(25)

    def get_user_vector(user_id):
        nonlocal comments_vector, product_vector, point_vector, vectors_num

        user_comments2_df = comments2[comments2.user_id == user_id]
        _vectors = text2vector(user_comments2_df.review.values)
        
        if len(_vectors) != 0:
            comments_vector = np.mean(_vectors, axis=0)
            vectors_num += 1 

        product_names = products[products.index.isin(orders[orders.user_id == user_id].product_id.values)].Name.values
        if len(product_names) != 0:
            product_vector = np.mean(text2vector(product_names), axis=0)
            vectors_num += 1 

        points_attended = orders_product[orders_product.user_id == user_id].Point_id.values
        if len(points_attended) != 0:
            point_vector = np.mean(point_vectors[points_attended], axis=0)
            vectors_num += 1 

        if vectors_num == 0:
            vectors_num = 1
        return (comments_vector + product_vector + point_vector) / vectors_num

    other_users_vectors = []
    for user_id in users[users.index != user_id].index.values:
        vectors_num = 0

        comments_vector = np.zeros(25)
        product_vector = np.zeros(25)
        point_vector = np.zeros(25)

        user_vector = get_user_vector(user_id)
        other_users_vectors.append(user_vector)
    
    vectors_num = 0
    comments_vector = np.zeros(25)
    product_vector = np.zeros(25)
    point_vector = np.zeros(25)

    knn = NearestNeighbors(n_neighbors=n_neighbors_amount)
    # print(np.array(other_users_vectors))
    # display(np.array(other_users_vectors))
    knn.fit(np.array(other_users_vectors))
    
    return knn.kneighbors(get_user_vector(user_id).reshape(1, -1), return_distance=False)[0]



In [20]:
get_related_users(2)

array([998, 981, 979, 893, 983, 961, 960, 910, 899, 895])

## Преобразование данных

In [None]:
# df_dict = {
#     'name': [],
#     'rating': [],
#     'review': [],
#     'user_id': []
# }

In [None]:
# sorted_df = comments.sort_values('name')
# first_row = sorted_df.iloc[0]
# prev_name = first_row['name']
# user_id, max_user_id = 1, 10**3

# for i, row in sorted_df.iterrows():
#     df_dict['name'].append(row['name'])
#     df_dict['rating'].append(row['rating'])
#     df_dict['review'].append(row['review'])

#     id = user_id if user_id <= 10**3 else None
#     df_dict['user_id'].append(id)

#     user_id += 1
#     if prev_name != row['name']:
#         user_id = 1
#         prev_name = row['name']

# df = pd.DataFrame(df_dict).sort_index()

In [None]:
# df.to_csv('data/RatingsComments2.csv', index=False)

## Рекомендация товаров пользователям

In [23]:
def recommend_products(user_id=2, product_id=3, amount=10):
    related_users = get_related_users(user_id, n_neighbors_amount=3)

    all_products = orders[orders.user_id.isin(related_users)].product_id.values
    return np.unique(all_products)[:amount]


In [24]:
recommend_products(user_id=2, product_id=3)

array([  66,  277,  339,  397, 1034, 1244, 1688, 1940, 2050, 2099])

## Рекомендация точек пользователям

In [25]:
def recommend_points(user_id=2, point_id=3, amount=10):
    related_users = get_related_users(user_id, n_neighbors_amount=3)

    all_points = orders_product[orders_product.user_id.isin(related_users)].Point_id.values
    return np.unique(all_points)[:amount]

In [26]:
recommend_points(user_id=2, point_id=3)

array([ 47, 107, 112, 117, 204, 206, 238, 243, 271, 315])