# SAR Model

## Importamos librerias necesarias
> Divididas por uso

In [1]:
# Spark DF
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, DataFrame
from pyspark.sql.functions import lit

In [2]:
# Model
from pysarplus import SARPlus

In [5]:
!pip list

Package             Version
------------------- --------
ansiwrap            0.8.4
asttokens           2.2.1
attrs               23.1.0
backcall            0.2.0
Bottleneck          1.3.7
category-encoders   1.3.0
certifi             2023.5.7
charset-normalizer  3.1.0
click               8.1.3
comm                0.1.3
contourpy           1.0.7
cornac              1.15.4
cycler              0.11.0
debugpy             1.6.7
decorator           5.1.1
entrypoints         0.4
exceptiongroup      1.1.1
executing           1.2.0
fastjsonschema      2.16.3
filelock            3.12.0
fonttools           4.39.4
fsspec              2023.5.0
huggingface-hub     0.14.1
hypothesis          6.75.3
idna                3.4
importlib-metadata  6.6.0
importlib-resources 5.12.0
ipykernel           6.23.1
ipython             8.13.2
jedi                0.18.2
Jinja2              3.0.3
joblib              1.2.0
jsonschema          4.17.3
jupyter_client      8.2.0
jupyter_core        5.3.0
kiwisolver        

In [None]:
# Evaluators
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k

## 1. Lectura de datos
> Leemos los datos del Cassandra.

In [3]:
load_options = {"table": "view_events", "keyspace": "clarovideo"}
df = spark.read.format("org.apache.spark.sql.cassandra").options(**load_options).load()

In [4]:
dataset = df.withColumn("view", lit(1))
dataset.count()

                                                                                

160708

In [5]:
(train, test) = dataset.randomSplit([.8, .2]) 

## 2. Inicializar el modelo
> Inicializacion del modelo SARPLUS

In [6]:
header = {
    "col_user": "user_id",
    "col_item": "group_id",
    "col_rating": "view",
    "col_timestamp": "event_time"
}

In [7]:
model = SARPlus(
    spark,
    similarity_type="jaccard",
    timedecay_formula=True,
    time_decay_coefficient=30,
    **header
)

## 3. Entrenamiento del modelo

> model.fit(train:DataFrame) arma en spark un DataFrame con una columna del tipo |User|Item|Score|, segun la multiplicacion de las matrices de afinidad.

In [8]:
with timer() as t:
    model.fit(train)
print("* Time: ", t.elapse)

NameError: name 'timer' is not defined

## 4. Recomendaciones 

In [10]:
TOP_K = 4
recommendations = model.recommend_k_items(test, top_k=TOP_K, remove_seen=False)

                                                                                

In [11]:
recommendations.show()



+-------+--------+--------------------+
|user_id|group_id|               score|
+-------+--------+--------------------+
|2047319|  769573|  0.9862464188748867|
|2047319|  933034| 0.04109360078645361|
|2047319| 1108920| 0.02817846911071105|
|2047319|  966910|0.014720095804102787|
|2077551|  834948|  1.7721307068269678|
|2077551|  835061|  1.7528912821296077|
|2077551|  835081|   1.637055649710639|
|2077551|  835088|  1.6094968218326975|
|2158715| 1032512|  0.9940463520546271|
|2158715| 1038292| 0.49702317602731355|
|2158715| 1038280| 0.37276738202048515|
|2158715|  835011|  0.1656743920091045|
|2160345| 1112156|  0.9893261613904413|
|2160345| 1112950|  0.3297753871301471|
|2160345| 1112953| 0.24733154034761032|
|2160345| 1113707| 0.19786523227808828|
|2283039| 1097586|  2.9529769995234973|
|2283039| 1101689|  2.9529769995234973|
|2283039| 1100203|  2.9529769995234973|
|2283039| 1117512|  0.9806659687470285|
+-------+--------+--------------------+
only showing top 20 rows




[Stage 45:>                                                         (0 + 1) / 1]

                                                                                

## 5. Test

### 5.1 Bajo la logica del SAR:

In [18]:
args = [test, top_k]
kwargs = dict(col_user='user_id', 
              col_item='group_id', 
              col_rating='view', 
              col_prediction='score', 
              relevancy_method='top_k', 
              k=TOPK)

eval_map = map_at_k(*args, **kwargs)
eval_ndcg = ndcg_at_k(*args, **kwargs)
eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)

AttributeError: __base__

In [None]:
print(f"Model:",
      f"Top K:\t\t {TOP_K}",
      f"MAP:\t\t {eval_map:f}",
      f"NDCG:\t\t {eval_ndcg:f}",
      f"Precision@K:\t {eval_precision:f}",
      f"Recall@K:\t {eval_recall:f}", sep='\n')

### 5.2 Como interseccion del test y las recomendaciones:
UsersInRecs = UsersInTrain $\cap$ UsersInTests

In [43]:
users_in_recs = set(recommendations.rdd.map(lambda x: x.user_id).collect()) 
print(users_in_recs == users_in_tests.intersection(users_in_train))

True


In [21]:
Views = dict() # user_id: [itemIds seen in test]

for user_id in list(users_in_recs):
    user_views = set(test_data.where(test_data.user_id == user_id).\
                      rdd.map(lambda x: x.group_id).collect())
    Views.update({user_id : user_views})

                                                                                

In [22]:
Views

{44603296: {553221, 554102},
 29080963: {1114857, 1114907},
 38198886: {1109110},
 79273447: {1119257},
 66520777: {583163},
 81934995: {925522},
 72086164: {952535, 954052},
 61385205: {928978},
 77273878: {1109700},
 29888861: {1071326, 1072130}}

In [30]:
Recommendations = dict()

for user_id in list(users_in_recs):
    user_views = set(recommendations.where(recommendations.user_id == user_id).\
                      rdd.map(lambda x: x.group_id).collect())
    Recommendations.update({user_id : user_views})

In [31]:
Recommendations

{44603296: {553981},
 29080963: {1113411, 1113412},
 38198886: {1110700},
 79273447: {610966, 973252},
 66520777: {583184, 583188},
 81934995: {1060889, 1060890},
 72086164: {924975, 925859},
 61385205: {929072},
 77273878: {1108217, 1111412},
 29888861: {1075184, 1075190}}

In [39]:
def weak_sar_test(Views:dict, Recommendations:dict) -> float:
    error = 0
    for key in Views.keys():
        views = Views[key]
        recs = Recommendations[key]
        ok = len(recs.intersection(views))
        if ok > 0:
            error += len(views ^ recs) / ok
        else:
            error += len(views ^ recs) 
    return error

In [42]:
weak_sar_test(Views,Recommendations)

4