In [305]:
# import pandas as pd
# train_data = pd.read_csv('train.csv')
# user_data = pd.read_csv('user.csv')
# video_data = pd.read_csv('video.csv')
# owner_data = pd.read_csv('owner.csv')

In [326]:
from pprint import pprint

import numpy as np
import pandas as pd


from tqdm.auto import tqdm

from implicit.nearest_neighbours import TFIDFRecommender, BM25Recommender
from implicit.als import AlternatingLeastSquares

from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.models import ImplicitItemKNNWrapperModel, RandomModel, PopularModel
from rectools.model_selection import TimeRangeSplitter, cross_validate


In [329]:
# Interaction log, with columns renamed to the RecTools standard names.
# `header=0` together with `names` makes pandas discard the file's first line
# and use the supplied names instead. This replaces the previous
# "read the first line as data, then drop row 0" approach, which forced every
# column to be parsed as strings whenever that first line is a textual header.
# The resulting index now starts at 0 instead of 1.
ratings = pd.read_csv(
    "train2.csv",
    header=0,
    low_memory=False,
    names=[Columns.Weight, Columns.Datetime, Columns.User, Columns.Item],
)

# User attributes.
# NOTE(review): unlike `ratings`, no first line is skipped for this file or
# for the item file below - presumably they have no header row; confirm.
users = pd.read_csv(
    "user_cleaned.csv",
    low_memory=False,
    names=[Columns.User, "gender", "age", "language", "city", "time"],
)

# Item (video + owner) attributes; undecodable bytes are silently skipped.
movies = pd.read_csv(
    "vid_n_own.csv",
    low_memory=False,
    names=[Columns.Item, "owner_id", "dur", "st", "subs", "ls_date", "city", "create_date"],
    encoding_errors="ignore",
)


In [330]:
# Cast every column of the three tables to the platform integer dtype.
# The casts are evaluated in the order ratings, movies, users.
ratings, movies, users = (
    table.astype(int) for table in (ratings, movies, users)
)

In [331]:
# Interaction timestamps are interpreted as epoch milliseconds and converted
# to pandas datetimes.
ratings[Columns.Datetime] = pd.to_datetime(ratings[Columns.Datetime], unit="ms")

# NOTE(review): the original cell contained `users['time'] = users['time']`,
# a self-assignment with no effect, which has been removed. If a unit
# conversion was intended for users["time"], add it explicitly.

# Integer-divide the item time columns by 1000
# (presumably milliseconds -> seconds; TODO confirm the source units).
# This mutates `movies` in place, so re-running the cell divides again.
for column in ("create_date", "ls_date", "st"):
    movies[column] = movies[column] // 1000

In [332]:
# Restrict the attribute tables to users / items that occur at least once in
# the interaction log.
seen_user_mask = users["user_id"].isin(ratings["user_id"])
seen_item_mask = movies["item_id"].isin(ratings["item_id"])

users = users.loc[seen_user_mask].copy()
movies = movies.loc[seen_item_mask].copy()

In [333]:
# Reshape the wide user table into long format: one (id, value, feature) row
# per user per attribute, stacked attribute after attribute.
user_features_frames = [
    users.reindex(columns=["user_id", name])
    .set_axis(["id", "value"], axis="columns")
    .assign(feature=name)
    for name in ["gender", "age", "language", "city", "time"]
]
user_features = pd.concat(user_features_frames)

In [334]:
# Reshape the wide item table into long format: one (id, value, feature) row
# per item per attribute, stacked attribute after attribute.
# "create_date" is not included in the feature list.
movies_features_frames = [
    movies.reindex(columns=["item_id", name])
    .set_axis(["id", "value"], axis="columns")
    .assign(feature=name)
    for name in ["owner_id", "dur", "st", "subs", "ls_date", "city"]
]
movies_features = pd.concat(movies_features_frames)

In [335]:
# Assemble the RecTools dataset from the interaction log plus the long-format
# (id, value, feature) user and item feature tables. Only the features named
# in cat_*_features are declared categorical, and both feature matrices are
# requested in sparse (non-dense) form.
dataset = Dataset.construct(
    ratings,
    # user side
    user_features_df=user_features,
    cat_user_features=["gender"],
    make_dense_user_features=False,
    # item side
    item_features_df=movies_features,
    cat_item_features=["city", "owner_id"],
    make_dense_item_features=False,
)

In [336]:
# Show the dataset's repr as the cell output (id maps, interactions, features).
dataset

Dataset(user_id_map=IdMap(external_ids=array([126492, 117764,  11347, ...,  37071,   3642, 146066])), item_id_map=IdMap(external_ids=array([228525, 204343, 201337, ...,  17872,  53341, 135163])), interactions=Interactions(df=         user_id  item_id  weight                datetime
1              0        0     1.0 2023-10-02 18:24:39.748
2              1        1     1.0 2023-10-02 18:24:39.760
3              2        2     1.0 2023-10-02 18:24:39.988
4              3        3     1.0 2023-10-02 18:24:40.451
5              3        3     1.0 2023-10-02 18:24:40.451
...          ...      ...     ...                     ...
5657320    45079    41511     1.0 2023-10-10 14:00:05.821
5657321    87648    52566     1.0 2023-10-10 14:00:06.884
5657322    12957    11657     1.0 2023-10-10 14:00:07.292
5657323    12646      524     1.0 2023-10-10 14:00:07.461
5657324    88477    14776    10.0 2023-10-10 14:00:07.541

[5657324 rows x 4 columns]), user_features=SparseFeatures(values=<152911x6 spa

In [337]:
from rectools.models import ImplicitALSWrapperModel

In [338]:
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import TFIDFRecommender, BM25Recommender
from rectools.models import ImplicitItemKNNWrapperModel, RandomModel, PopularModel

In [339]:
import rectools

In [340]:
# n_splits = 2
# 
# splitter = TimeRangeSplitter(
#     test_size="15D",
#     n_splits=n_splits,
#     filter_already_seen=True,
#     filter_cold_items=True,
#     filter_cold_users=True,
# )

In [341]:
# splitter.get_test_fold_borders(dataset.interactions)

[(Timestamp('2023-10-02 00:00:00', freq='15D'),
  Timestamp('2023-10-17 00:00:00', freq='15D')),
 (Timestamp('2023-10-17 00:00:00', freq='15D'),
  Timestamp('2023-11-01 00:00:00', freq='15D'))]

In [342]:
# Take a few simple models to compare
# models = {
#     "random": RandomModel(random_state=42),
#     "popular": PopularModel(),
#     "most_raited": PopularModel(popularity="sum_weight"),
#     "tfidf_k=5": ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=5)),
#     "tfidf_k=10": ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=10)),
#     "bm25_k=10_k1=0.05_b=0.1": ImplicitItemKNNWrapperModel(model=BM25Recommender(K=10, K1=0.05, B=0.1)),
#     }

# models = {
#     "bm25_k=25_k1=0.05_b=0.2": ImplicitItemKNNWrapperModel(model=BM25Recommender(K=25, K1=0.05, B=0.2))
# }
# # We will calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
# metrics = {
#     "prec@1": Precision(k=1),
#     "prec@10": Precision(k=10),
#     "recall": Recall(k=10),
#     "novelty": MeanInvUserFreq(k=10),
#     "serendipity": Serendipity(k=10),
# }
# 
# K_RECS = 10
# "bm25_k=50_k1=0.5_b=0.01": ImplicitItemKNNWrapperModel(model=BM25Recommender(K=50, K1=0.5, B=0.01)) 0.049

In [343]:
# cv_results = cross_validate(
#     dataset=dataset,
#     splitter=splitter,
#     models=models,
#     metrics=metrics,
#     k=K_RECS,
#     filter_viewed=True,
# )

KeyboardInterrupt: 

In [None]:
# pd.DataFrame(cv_results["splits"])

In [None]:
# pd.DataFrame(cv_results["metrics"]).head(100)

In [280]:
# pivot_results = (
#     pd.DataFrame(cv_results["metrics"])
#     .drop(columns="i_split")
#     .groupby(["model"], sort=False)
#     .agg(["mean", "std"])
# )
# mean_metric_subset = [(metric, "mean") for metric in pivot_results.columns.levels[0]]
# (
#     pivot_results.style
#     .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
#     .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
# )
# "bm25_k=20_k1=0.5_b=0.01" : 0.048, 0.032, 0.036
# "bm25_k=10_k1=0.5_b=0.03": ImplicitItemKNNWrapperModel(model=BM25Recommender(K=10, K1=0.5, B=0.03))
# "bm25_k=10_k1=0.5_b=0.05": ImplicitItemKNNWrapperModel(model=BM25Recommender(K=10, K1=0.5, B=0.05))


Unnamed: 0_level_0,prec@1,prec@1,prec@10,prec@10,recall,recall,novelty,novelty,serendipity,serendipity
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
bm25_k=25_k1=0.05_b=0.2,0.04494,0.002075,0.030753,0.00624,0.034236,0.00783,8.065087,3.441802,6.4e-05,4.5e-05
bm25_k=25_k1=0.05_b=0.23,0.044572,0.003226,0.030677,0.006559,0.034105,0.008091,8.147286,3.392254,6.5e-05,4.4e-05
bm25_k=25_k1=0.052_b=0.2,0.044993,0.00229,0.030812,0.006401,0.034291,0.007996,8.084979,3.428042,6.5e-05,4.4e-05
bm25_k=25_k1=0.052_b=0.23,0.044777,0.003733,0.0307,0.00661,0.034145,0.008182,8.170645,3.37548,6.6e-05,4.3e-05
bm25_k=22_k1=0.05_b=0.2,0.044842,0.001976,0.030706,0.006247,0.034117,0.007763,8.088164,3.412911,6.4e-05,4.4e-05
bm25_k=22_k1=0.05_b=0.23,0.04461,0.003338,0.03067,0.006587,0.034083,0.008096,8.175183,3.356423,6.6e-05,4.3e-05
bm25_k=22_k1=0.052_b=0.2,0.044814,0.002077,0.030728,0.006338,0.034174,0.007887,8.10974,3.396973,6.5e-05,4.4e-05
bm25_k=22_k1=0.052_b=0.23,0.044675,0.003649,0.030678,0.006622,0.034062,0.008109,8.195929,3.343871,6.6e-05,4.3e-05
bm25_k=25_k1=0.05_b=0.17,0.044936,0.001315,0.03088,0.006064,0.034458,0.007757,7.973529,3.503612,6.3e-05,4.6e-05
bm25_k=25_k1=0.05_b=0.19,0.044981,0.001935,0.030833,0.006252,0.034348,0.007893,8.035474,3.460658,6.4e-05,4.5e-05


In [138]:
# recos = model.recommend(
#     users=ratings[Columns.User].unique(),
#     dataset=dataset,
#     k=10,
#     filter_viewed=True,
# )

In [139]:
# user_id = 126492

In [140]:
# recos

NameError: name 'recos' is not defined

In [None]:
# recos.to_csv('ffffflllsst.csv', index=False)

In [None]:
# from tqdm import tqdm
# import numpy as np
# 
# us = pd.read_csv('user_cleaned_copy.csv').iterrows()
# dt = []
# for idx, row in tqdm(recos):
#     f = [row[1], []]
#     user_id = row[1]
#     user_recos = recos.query(f"user_id == @user_id").merge(movies, on="item_id")
#     for idx2, row2 in user_recos.iterrows():
#         f[1].append(row2[1])
#     dt.append(f)
# usr = []
# vi = []

In [359]:
# Item-to-item kNN with BM25 weighting, wrapped for use with RecTools.
bm25_knn = BM25Recommender(K=5, K1=0.5, B=0.03)
model = ImplicitItemKNNWrapperModel(model=bm25_knn)
model.fit(dataset)

# Top-10 recommendations for every user present in the interaction log,
# excluding items the user has already interacted with.
all_user_ids = ratings[Columns.User].unique()
recos = model.recommend(
    users=all_user_ids,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)

In [360]:
# Map each user id to the list of item ids recommended for that user, kept in
# the order the rows appear in `recos`.
# Grouping the column directly avoids materialising one Series per row, which
# is what `list(recos.iterrows())` did over the whole recommendation frame.
# `sort=False` keeps users in first-appearance order, matching the dict the
# row-by-row loop produced; ids are cast to plain Python ints as before.
ans = {
    int(user_id): [int(item_id) for item_id in item_ids]
    for user_id, item_ids in recos.groupby("user_id", sort=False)["item_id"]
}


In [361]:
# Build the final per-user recommendation lists as [user_id, [item_id, ...]]
# pairs, one per row of 'user_cleaned_copy.csv' (user ids are read from its
# column 'a').
#
# Fix: the model's recommendations used to be de-duplicated with
# `list(set(...))`, which discards the ranking order before truncating to 10.
# `dict.fromkeys` removes duplicates while keeping the first occurrence of
# each item, so the ranking order is preserved.
dt = []
us = pd.read_csv('user_cleaned_copy.csv')

# Fallback items used to pad short lists, or as the full list for users with
# no model recommendations.
# NOTE(review): presumably a precomputed list of popular items - where these
# ids come from is not shown in this notebook; confirm.
g = [188589, 131005, 51799, 166282, 34204, 217384, 169519, 132648, 45311, 24671]

for raw_user_id in us['a']:
    i = int(raw_user_id)

    # Ranked, de-duplicated model output, or an empty list for unknown users.
    f = list(dict.fromkeys(ans.get(i, [])))[:10]

    # Top up to 10 items from the fallback list, skipping items already there.
    for fallback_item in g:
        if len(f) >= 10:
            break
        if fallback_item not in f:
            f.append(fallback_item)

    dt.append([i, f])

In [362]:
# Flatten `dt` ([user_id, [item_id, ...]] pairs) into the submission table:
# one row per user, with the item ids joined into a space-separated string.
# The original cell also evaluated `np.array(usr).astype('int')` as a bare
# statement and discarded the result; that dead line has been removed.
usr = [int(user_id) for user_id, _ in dt]
vi = [' '.join(str(int(item_id)) for item_id in item_ids) for _, item_ids in dt]

recommendation = pd.DataFrame(
    {
        "user_id": np.array(usr).astype('int'),
        "recommendation": vi,
    }
)

# NOTE(review): this filename advertises K=25, K1=0.05, B=0.2, but the model
# cell in this notebook fits BM25Recommender(K=5, K1=0.5, B=0.03). Confirm
# which parameters produced this file and rename accordingly.
recommendation.to_csv("sol_K=25_K1=0-05_B=0-2.csv", index=False)

In [358]:
# Writes the same `recommendation` frame to a second CSV file.
# NOTE(review): this cell's execution count (358) is lower than that of the
# cells above which build `recommendation` (359-362), so on a fresh
# top-to-bottom run it writes whatever those cells just produced - confirm
# this extra export is still wanted.
recommendation.to_csv("123123123123sol_K=26_K1=0-05_B=0-2.csv", index=False)