## Warning
Здесь и далее возможна некоторая временная и причинно-следственная неразбериха, так как я приступаю к данному домашнему заданию чуть ли не вперёд первого, однако это будет обязательно исправлено (но, к сожалению, уже после дедлайна).  

## Setup

In [1]:
!pip install xlearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os

import numpy as np
import datetime as dt

import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

import scipy as sp
from scipy.sparse import csr_matrix, csc_matrix

import xlearn as xl

import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Project root folder and the CSV file with the source data (read below).
ROOT_PATH = "/content/drive/MyDrive/Colab Notebooks/HSE/HSE RecSys"
DATA_PATH = os.path.join(os.path.join(ROOT_PATH, "data"), "data.csv")

## Data Preparation

__Read data__

In [4]:
# Load the CSV, then discard the columns that are not used in this notebook
excess_data = [
  "banner_id0", "banner_id1",
  "rate0", "rate1",
  "g0", "g1",
  "coeff_sum0", "coeff_sum1",
]
data = pd.read_csv(DATA_PATH)
data = data.drop(columns=excess_data)
data.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,5664530014561852622,0,0,0,1,1
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,0,1,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,3,0,0,1,1
3,2021-09-27 00:04:30.000000,3,3,6262169206735077204,0,1,1,1,1
4,2021-09-27 00:06:21.000000,4,4,4778985830203613115,0,1,0,1,1


__Feature engineering__

In [5]:
# Datetime features: parse the timestamp and keep the hour of day
data["date_time"] = pd.to_datetime(data["date_time"])
data["hour"] = data["date_time"].dt.hour

# Log-scale of 'campaign_clicks' feature.
# np.log with `where=` only writes the positions where the mask is True; without
# an explicit `out=` buffer the remaining positions are left uninitialized.
# Pre-fill the output with zeros so rows with campaign_clicks <= 0 get 0.0.
campaign_clicks = data["campaign_clicks"].to_numpy(dtype=float)
data["campaign_clicks_log"] = np.log(
  campaign_clicks,
  out=np.zeros_like(campaign_clicks),
  where=campaign_clicks > 0,
)

Для использования FFM из библиотеки _xlearn_ необходимо привести данные к _libffm формату_, то есть к файлу вида:  
`label field_1:index_1:value_1 field_2:index_2:value_2 ... field_n:index_n:value_n`
  

__FFM dataset creation__

In [6]:
# Support function
def create_ffm_dataset(df, cat_cols, num_cols):
  """
  Build a sparse feature matrix and the per-column field ids for FFM.

  Numerical columns are copied as-is and each one gets its own field
  (0 .. len(num_cols) - 1). Every categorical column is one-hot encoded;
  all one-hot columns produced from it share a single field id, assigned
  in the order of cat_cols right after the numerical fields.

  Input:
    df – original dataframe
    cat_cols – columns names of categorical features
    num_cols – columns names of numerical features

  Returns:
    mtrx – sparse features matrix in CSR format (numerical columns first,
           then the one-hot columns of each categorical feature)
    fields – features fields for ffm: fields[j] is the field id of
             column j of mtrx
  """
  n_num = len(num_cols)

  # Numerical features: one column and one field per feature
  blocks = [csr_matrix(df[num_cols].to_numpy())]
  field_chunks = [np.arange(n_num)]

  # Categorical features: one field per feature, one column per category
  for field_id, cat_feature in enumerate(cat_cols, start=n_num):
    oh_col = OneHotEncoder().fit_transform(df[[cat_feature]])
    blocks.append(oh_col)
    field_chunks.append(np.repeat(field_id, oh_col.shape[1]))

  # Stack everything once (instead of re-copying the growing matrix on every
  # iteration) and return CSR so the result can be row-sliced directly.
  mtrx = sp.sparse.hstack(blocks, format="csr")
  fields = np.concatenate(field_chunks)

  return mtrx, fields

In [7]:
# Build the sparse feature matrix, the field ids and the target vector
num_cols = ["campaign_clicks_log"]
cat_cols = ["zone_id", "banner_id", "oaid_hash", "os_id", "country_id", "hour"]

y = data["clicks"]
X, fields = create_ffm_dataset(df=data, cat_cols=cat_cols, num_cols=num_cols)

In [8]:
# Train-val-test split by positional row indexes.
# Rows dated 2021-10-02 form the test set; the remaining rows are split 80/20
# into train and validation, stratified by the click label.
SPLIT_SEED = 42  # fixed seed so the train/val split is reproducible between runs

# Compute the date mask once and turn it into positional indexes, so that the
# same indexes are valid for the sparse matrix, `y` and `iloc`, regardless of
# the dataframe's index labels.
is_test = (data["date_time"].dt.date == dt.date(2021, 10, 2)).to_numpy()
test_idx = np.flatnonzero(is_test)
train_val_idxs = np.flatnonzero(~is_test)

train_idx, val_idx = train_test_split(
  train_val_idxs,
  test_size=0.2,
  stratify=y.iloc[train_val_idxs],
  random_state=SPLIT_SEED,
)

# Row slicing needs CSR; convert once and reuse it for all three subsets.
X_csr = X.tocsr()
y_all = y.to_numpy().ravel()

X_train = X_csr[train_idx]
y_train = y_all[train_idx]

X_val = X_csr[val_idx]
y_val = y_all[val_idx]

X_test = X_csr[test_idx]
y_test = y_all[test_idx]

In [11]:
# Support function
def to_libffm_format(X, y, fields, fout):
  """
  Write a sparse feature matrix to a text file in libffm format.

  Every row of X becomes one line of the form
  `label field:index:value field:index:value ...`, terminated by a newline.
  Rows are written in order and none is skipped: a row without non-zero
  entries is written as a label-only line, so line i always corresponds
  to row i of X and to y[i].

  Input:
    X – scipy sparse features matrix (converted to CSR if needed)
    y – labels, one per row of X, in row order
    fields – fields[j] is the field id of column j of X
    fout – path of the output file (overwritten if it already exists)
  """
  X = X.tocsr()
  labels = np.asarray(y)
  indptr, indices, values = X.indptr, X.indices, X.data

  with open(fout, "w") as file:
    for row in range(X.shape[0]):
      # Entries of this row live in indices/values[start:stop]
      start, stop = indptr[row], indptr[row + 1]
      tokens = [f"{labels[row]}"]
      tokens.extend(
        f"{fields[col]}:{col}:{value}"
        for col, value in zip(indices[start:stop], values[start:stop])
        if value != 0  # explicitly stored zeros carry no information
      )
      # One write per row instead of one per entry
      file.write(" ".join(tokens) + "\n")

  print(f"{fout} file with data in libffm format created")

In [None]:
# Conversion to libffm format: one text file per split inside the HW_2 folder
train_file, val_file, test_file = (
  os.path.join(ROOT_PATH, "HW_2", file_name)
  for file_name in ("train.txt", "val.txt", "test.txt")
)

for split_X, split_y, split_file in (
  (X_train, y_train, train_file),
  (X_val, y_val, val_file),
  (X_test, y_test, test_file),
):
  to_libffm_format(split_X, split_y, fields, split_file)

## FFM

Не успеваю, но вроде должно работать)  
Обязательно проверю, но, опять же, к сожалению, после дедлайна


__Train__

In [None]:
# Create an xlearn FFM model and register the train / validation files
# (the libffm-format text files written by to_libffm_format above).
ffm_model = xl.create_ffm()
ffm_model.setTrain(train_file)
ffm_model.setValidate(val_file)

In [None]:
# File the trained model is saved to; the test cell loads it from the same path
model_path = os.path.join(ROOT_PATH, "HW_2", "ffm_model.out")

# Binary classification trained with AdaGrad for 20 epochs
param = {"task":"binary", "lr":0.2, "lambda":0.0001, "epoch":20, "opt":"adagrad"}
# Save to model_path (not to a hard-coded "model_2.out"), otherwise
# predict(model_path, ...) would look for a file that was never written.
ffm_model.fit(param, model_path)

__Test__

In [None]:
# File the predictions are written to. It must differ from the model file:
# the previous value pointed at "ffm_model.out" and was never used.
output = os.path.join(ROOT_PATH, "HW_2", "output.txt")

# NOTE(review): per the xlearn docs setSigmoid() maps predictions to (0, 1) – confirm
ffm_model.setSigmoid()
ffm_model.setTest(test_file)
ffm_model.predict(model_path, output)