<a href="https://colab.research.google.com/github/SwordForShinobi/Retail-recommender-system/blob/main/RecSys_implicit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import numpy as np
import pandas as pd
import pickle

In [None]:
!pip install implicit # if you get lucky, you'll be able to install it like this
!sudo -H python3 -m pip install implicit --no-cache --force-reinstall --log ./implicit.txt # fallback if the plain install above fails
# these precautions are for Windows

# Data preparation class

In [None]:
# This class prepares the data before it is fed to the model
class PrepareData:
  '''Load transaction/product CSV files and derive per-user purchase counts.

  IMPORTANT! Use .read_transactions() and/or .read_products() before the other
  methods, otherwise they have no data to work with.
  '''

  # Column layout of the frames returned by to_fit_pure() / to_fit_coded().
  _PURE_COLUMNS = ['user_id', 'product_id', 'number_of_orders']
  _CODED_COLUMNS = ['user_id(coded)', 'product_id(coded)', 'number_of_orders(coded)']

  def __init__(self):
    self.trans_df = None
    self.prods_df = None
    self.purchases = None
    # Defined up front so save_user_list() can test it instead of raising AttributeError.
    self.user_list_trans = None
    # Column names remembered by each_user_orders().
    self.users = None
    self.products = None
    self.number_of_orders = None

  def read_transactions(self, trans_dir=None):
    '''Read the transactions CSV at path `trans_dir` (string) into self.trans_df.

    Returns the DataFrame, or prints a hint and returns None when no path is given.
    '''
    if trans_dir is None:
      print('Please, input directory')
      return None
    self.trans_df = pd.read_csv(trans_dir, sep=',')
    return self.trans_df

  def read_products(self, prods_dir=None):
    '''Read the products CSV at path `prods_dir` (string) into self.prods_df.

    Returns the DataFrame, or prints a hint and returns None when no path is given.
    '''
    if prods_dir is None:
      print('Please, input directory')
      return None
    self.prods_df = pd.read_csv(prods_dir, sep=',')
    return self.prods_df

  def create_user_list(self):
    '''Store and return the list of unique user_id values found in the transactions.

    Raises ValueError when no transactions have been loaded.
    '''
    if self.trans_df is None:
      raise ValueError('Load data 1st! Read some dataframes!')
    self.user_list_trans = self.trans_df.user_id.unique().tolist()
    return self.user_list_trans

  def save_user_list(self, directory):
    '''Save the unique-user list with np.save; `directory` is the full path (string).

    The list is created from the transactions first if it does not exist yet.
    '''
    if self.user_list_trans is None:
      self.create_user_list()
    np.save(directory, self.user_list_trans)
    print("Saved successfully!")

  def read_user_list(self, directory):
    '''Load a user list written by save_user_list(); `directory` is the full path (string).

    The '.npy' suffix is appended only when missing, mirroring what np.save does
    on write, so the same path works for saving and loading.
    '''
    path = str(directory)
    if not path.endswith('.npy'):
      path += '.npy'
    # NOTE: allow_pickle=True can execute arbitrary code from a crafted file;
    # only load files you created yourself.
    self.user_list_trans = np.load(path, allow_pickle=True).tolist()
    return list(self.user_list_trans)

  @staticmethod
  def _append_row(df, values):
    '''Return a new DataFrame: `df` plus one row holding `values` in column order.'''
    if df is None:
      raise ValueError('Load data 1st! Read some dataframes!')
    columns = list(df.columns)
    if len(values) != len(columns):
      raise ValueError(f'Expected {len(columns)} values, got {len(values)}')
    # A list-of-rows frame keeps each value's own type (routing the values through
    # one np.array would coerce them all to a single dtype), and pd.concat replaces
    # DataFrame.append, which no longer exists in pandas 2.x.
    new_row = pd.DataFrame([list(values)], columns=columns)
    return pd.concat([df, new_row], ignore_index=True)

  def trans_add_string(self, order_id, user_id, order_number, order_dow,
                       order_hour_of_day, days_since_prior_order, product_id,
                       add_to_cart_order, reordered):
    '''Return a copy of the transactions with one new row appended.

    self.trans_df itself is left unchanged. Raises ValueError when transactions
    are not loaded or their column count differs from the number of values.
    '''
    values = [order_id, user_id, order_number, order_dow, order_hour_of_day,
              days_since_prior_order, product_id, add_to_cart_order, reordered]
    return self._append_row(self.trans_df, values)

  def prods_add_string(self, product_id, product_name, aisle_id, department_id,
                       aisle, department):
    '''Return a copy of the products with one new row appended.

    self.prods_df itself is left unchanged. Raises ValueError when products
    are not loaded or their column count differs from the number of values.
    '''
    values = [product_id, product_name, aisle_id, department_id, aisle, department]
    return self._append_row(self.prods_df, values)

  def each_user_orders(self, users, products, number_of_orders):
    '''Count how many transaction rows each (user, product) pair has.

    All arguments are strings:
        users - name of the column with user ids,
        products - name of the column with product ids,
        number_of_orders - name of the column whose grouped size is taken;
                           the resulting count column keeps this name.
    Stores the result in self.purchases and returns it.
    '''
    if self.trans_df is None:
      raise ValueError('Load data 1st! Read some dataframes!')
    self.users = users
    self.products = products
    self.number_of_orders = number_of_orders

    self.purchases = self.trans_df.groupby([users, products])[number_of_orders].size().reset_index()
    return self.purchases

  def save_user_orders(self, directory):
    '''Write the per-user purchase counts to a CSV file at `directory`.'''
    if self.purchases is None:
      raise ValueError('Nothing to save! Create dataframe 1st!')
    self.purchases.to_csv(directory, index=False, sep=',', encoding='utf-8')
    print("Saved successfully!")

  def read_user_orders(self, directory):
    '''Load previously saved per-user purchase counts from a CSV file.'''
    self.purchases = pd.read_csv(directory, sep=',')
    return self.purchases

  def _pure_frame(self):
    '''Return a copy of the first three purchase columns under the standard names.'''
    if self.purchases is None:
      raise ValueError('Use each_user_orders() method 1st')
    if self.purchases.shape[1] < len(self._PURE_COLUMNS):
      raise ValueError('Purchases must contain user, product and count columns')
    # Positional selection + copy: works for frames loaded via read_user_orders()
    # and keeps self.purchases untouched, so repeated calls stay valid.
    df = self.purchases.iloc[:, :len(self._PURE_COLUMNS)].copy()
    df.columns = list(self._PURE_COLUMNS)
    return df

  def to_fit_coded(self):
    '''Return user/product/count columns plus integer category codes for each.

    Result columns: user_id, product_id, number_of_orders, user_id(coded),
    product_id(coded), number_of_orders(coded). self.purchases is not modified.
    Raises ValueError when no purchases have been computed or loaded.
    '''
    df = self._pure_frame()
    for source, coded in zip(self._PURE_COLUMNS, self._CODED_COLUMNS):
      # NOTE(review): for the count column the code is the category index of the
      # count, not the count itself, so the smallest count is encoded as 0 -
      # confirm this is the weight the models are meant to receive.
      df[coded] = df[source].astype('category').cat.codes
    return df

  def to_fit_pure(self):
    '''Return user/product/count columns under the standard names, without coding.

    self.purchases is not modified. Raises ValueError when no purchases have
    been computed or loaded.
    '''
    return self._pure_frame()

In [None]:
# Create the data-preparation helper; it holds no data until a read_* method is called
data = PrepareData()

In [None]:
# Replace 'directory' with the path to a transactions CSV
# (to try it out, use the file from this repository: main/Data/transactions.csv)
data.read_transactions(trans_dir='directory')

In [None]:
# Count how many times each user bought each product, then create the dataframe
# with coded transactions of each user to work with. to_fit_coded() raises
# ValueError unless each_user_orders() has been called first.
# NOTE(review): the column names below are assumed from trans_add_string()'s
# parameters - confirm them against your transactions file.
data.each_user_orders('user_id', 'product_id', 'order_id')
to_fit = data.to_fit_coded()

# Model based on ALS

In [None]:
class ModelALS:
  '''Recommender wrapper around an ALS model.

  Expects the coded DataFrame layout produced by PrepareData.to_fit_coded():
  columns 0-2 hold the original user id, product id and count, columns 3-5 hold
  their integer codes. Columns are always addressed by position.
  '''

  # dtype of the sparse matrix built by prepare_matrix_to_fit().
  matrix_dtype = np.float32

  def __init__(self, factors=None, iterations=None):
    '''Create the underlying model; both `factors` and `iterations` are required.

    Raises ValueError when either of them is missing.
    '''
    self.factors = factors
    self.iterations = iterations

    if self.factors is None or self.iterations is None:
      raise ValueError('Please set parameters when create class')
    # NOTE(review): `ALS` is not imported in the visible part of this file;
    # presumably implicit.als.AlternatingLeastSquares - confirm. That class takes
    # `regularization` as its second positional parameter, so both values are
    # passed by keyword to make sure `iterations` really sets the iteration count.
    self.model = ALS(factors=self.factors, iterations=self.iterations)

  def prepare_matrix_to_fit(self, df=None):
    '''Build, store and return the sparse user x item matrix from the CODED columns.

    Rows come from column 3 (user codes), columns from column 4 (product codes)
    and values from column 5. Raises ValueError when `df` is missing.
    '''
    # Local import: the name `csr` used previously is not imported anywhere in the file.
    from scipy.sparse import csr_matrix

    if df is None:
      raise ValueError('Please pass dataframe to work with')
    self.df = df
    rows = df.iloc[:, 3].to_numpy(dtype=np.int64)  # users
    cols = df.iloc[:, 4].to_numpy(dtype=np.int64)  # products
    weights = df.iloc[:, 5].to_numpy()  # how many purchased
    self.matrix = csr_matrix((weights, (rows, cols)), dtype=self.matrix_dtype)
    return self.matrix

  def fit_model(self):
    '''Fit the model on the transposed (item x user) matrix.

    Raises ValueError when prepare_matrix_to_fit() has not been called yet.
    '''
    if getattr(self, 'matrix', None) is None:
      raise ValueError('Please pass dataframe to work with')
    self.model.fit(self.matrix.T)

  def _user_codes(self):
    '''Return {original user id: user code} built from self.df.'''
    return dict(zip(self.df.iloc[:, 0].tolist(), self.df.iloc[:, 3].tolist()))

  def _item_decoder(self):
    '''Return {product code: original product id} built from self.df.'''
    return dict(zip(self.df.iloc[:, 4].tolist(), self.df.iloc[:, 1].tolist()))

  @staticmethod
  def _decode_items(decoder, codes):
    '''Translate product codes to product ids, keeping the order of `codes`.'''
    return [decoder[code] for code in codes if code in decoder]

  def predict_user(self, user, n, filter_already_liked_items=True):
    '''Recommend `n` items for one user and return them as a message string.

    user = particular user id (integer)
    n = amount of items to predict (integer)
    The ids are listed in the order the model returned them.
    Raises IndexError when the user id is not present in the fitted data.
    '''
    matches = self.df.loc[self.df.iloc[:, 0] == user].iloc[:, 3]
    if matches.empty:
      raise IndexError(f'user_id={user} is not present in the fitted data')
    code_user = int(matches.iloc[0])
    recommendations = [t[0] for t in self.model.recommend(code_user, self.matrix, n, filter_already_liked_items)]
    # Decode through a mapping rather than isin(), which would return the ids in
    # DataFrame order and lose the ranking.
    recs = self._decode_items(self._item_decoder(), recommendations)
    return f'For user_id={user}, we recommend next items, ids: {recs}'

  def predict_many(self, users, n, filter_already_liked_items=True):
    '''Recommend `n` item codes for every user in `users`.

    users = users you wish to predict (list)
    n = amount of items to predict for each user (integer)
    Returns one list of product codes per entry of `users`, in the same order,
    so the result lines up with `users` in decode_predictions(). Users that are
    not present in the fitted data get an empty list.
    '''
    self.users = list(users)
    self.preds_list = []
    user_codes = self._user_codes()
    for user in self.users:
      code = user_codes.get(user)
      if code is None:
        self.preds_list.append([])
        continue
      ranked = self.model.recommend(code, self.matrix, n, filter_already_liked_items, recalculate_user=True)
      self.preds_list.append([t[0] for t in ranked])
    return self.preds_list

  def decode_predictions(self, preds_list=None):
    '''Turn coded predictions into a DataFrame with user_id / product_id columns.

    `preds_list` defaults to the result of the last predict_many() call. Each
    product_id cell is a space-separated string of product ids (no brackets or
    commas, ready for a Kaggle submission), in recommendation order.
    Raises ValueError when there is nothing to decode or when the predictions
    do not match the users of the last predict_many() call.
    '''
    if preds_list is None:
      preds_list = getattr(self, 'preds_list', None)
    users = getattr(self, 'users', None)
    if preds_list is None or users is None:
      raise ValueError('Use predict_many() method 1st then pass predictions')
    if len(preds_list) != len(users):
      raise ValueError('Predictions do not match the users passed to predict_many()')
    self.preds_list = preds_list

    decoder = self._item_decoder()  # built once; per-row DataFrame filtering is much slower
    clear_items_list = [
      ' '.join(str(item) for item in self._decode_items(decoder, codes))
      for codes in preds_list
    ]

    answer = pd.DataFrame({'user_id': list(users)})
    answer['product_id'] = clear_items_list
    return answer

  def save_model(self, path=None):
    '''Pickle the underlying model to `path`; raises ValueError when no path is given.'''
    if path is None:
      raise ValueError('Please input desirable path/filename to save to')
    with open(path, 'wb') as directory:
      pickle.dump(self.model, directory)

  def load_model(self, path=None):
    '''Replace the underlying model with one unpickled from `path`.

    Raises ValueError when no path is given.
    '''
    if path is None:
      raise ValueError('Please input desirable path/filename to load from')
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as model_probe:
      self.model = pickle.load(model_probe)

In [None]:
# ALS-based model with 30 factors and 8 iterations
model = ModelALS(factors=30, iterations=8)

In [None]:
# Build the sparse user/item weight matrix from the coded dataframe
matrix = model.prepare_matrix_to_fit(df=to_fit)

In [None]:
# Fit the model on the matrix prepared in the previous cell
model.fit_model()

In [None]:
# Pickle the fitted model; replace 'anywhere' with the desired path/filename
model.save_model(path='anywhere')

In [None]:
# Load the pickled model back from the same path/filename
model.load_model(path='anywhere')

In [None]:
# Top-k recommendations (k=10 here) for the user with id 2653
model.predict_user(user=2653, n=10)

In [None]:
# Top-10 recommendations for every user present in the coded dataframe
model.predict_many(users=to_fit['user_id'].unique().tolist(), n=10)

# Adding kNN method

In [None]:
class kNN(ModelALS):
  '''Nearest-neighbour recommender that reuses the ModelALS prediction methods.'''

  def __init__(self, k=50):
    '''Create the underlying model with `k` passed as its first argument.

    The parent constructor is deliberately not called: it requires ALS
    parameters and would build an ALS model.
    '''
    self.k = k
    # NOTE(review): `CR` is not imported in the visible part of this file;
    # presumably a nearest-neighbour recommender from `implicit` - confirm.
    self.model = CR(self.k)

  def prepare_matrix_to_fit(self, df=None):
    '''Build, store and return the sparse user x item matrix from the CODED columns.

    Same layout as the parent method (column 3 = user codes, column 4 = product
    codes, column 5 = values); the method is overridden only to use np.double
    as the matrix dtype. Raises ValueError when `df` is missing.
    '''
    # Local import: the name `csr` used previously is not imported anywhere in the file.
    from scipy.sparse import csr_matrix

    if df is None:
      raise ValueError('Please pass dataframe to work with')
    self.df = df
    rows = df.iloc[:, 3].to_numpy(dtype=np.int64)  # users
    cols = df.iloc[:, 4].to_numpy(dtype=np.int64)  # products
    weights = df.iloc[:, 5].to_numpy()  # how many purchased
    self.matrix = csr_matrix((weights, (rows, cols)), dtype=np.double)
    return self.matrix

In [None]:
# kNN model that uses 10 neighbours
model_knn = kNN(k=10)

In [None]:
# Build the sparse matrix for the kNN model from the same coded dataframe
matrix_knn = model_knn.prepare_matrix_to_fit(df=to_fit)

In [None]:
# Fit the kNN model on the matrix prepared in the previous cell
model_knn.fit_model()

In [None]:
# Top-10 recommendations from the kNN model for every user in the coded dataframe
model_knn.predict_many(users=to_fit['user_id'].unique().tolist(), n=10)

In [None]:
# The kNN model can also be saved, loaded, and used to predict for a single user, just like the ALS model