In [1]:
from pyspark.sql import SparkSession, functions as F

In [2]:
# Build (or reuse) a Spark session that runs locally on all available cores,
# with explicit driver/executor memory and driver result-size settings.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("for recommender")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.maxResultSize", "3g")
    .getOrCreate()
)

In [4]:
# Load the two provided parquet datasets (paths are relative to the working directory).
author_tag_nums, denormalized_tag_author = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        './author_tag_articlenum.parquet/',     # used below via id_author / tags / num_articles
        './denormalized_tag_authors.parquet/',  # used below via tags / author_ids (array)
    )
)

In [25]:
def get_recommendations(author_id: int, max_recs: int = 10, top_per_tag: int = 10):
    """Recommend authors who share at least one tag with ``author_id``.

    Looks up the tags recorded for the author in ``author_tag_nums``, takes the
    first ``top_per_tag`` entries of ``author_ids`` for each of those tags in
    ``denormalized_tag_author``, and returns up to ``max_recs`` distinct author
    ids other than ``author_id`` itself.

    Args:
        author_id: Value matched against the ``id_author`` column.
        max_recs: Maximum number of recommendations to return (default 10).
        top_per_tag: How many leading entries of each tag's ``author_ids``
            array are considered as candidates (default 10).

    Returns:
        A pandas DataFrame with a single ``id_author`` column holding at most
        ``max_recs`` rows.

    Raises:
        ValueError: If ``max_recs`` or ``top_per_tag`` is less than 1.
    """
    if max_recs < 1 or top_per_tag < 1:
        raise ValueError("max_recs and top_per_tag must be positive integers")

    # Pull the author's tags straight into a Python list; a pandas round-trip
    # is unnecessary for a single column of values.
    tag_rows = (
        author_tag_nums
        .where(F.col('id_author') == author_id)
        .select('tags')
        .collect()
    )
    tags = [row['tags'] for row in tag_rows]

    # For every matching tag keep only the leading `top_per_tag` authors, then
    # flatten the arrays into one candidate id per row.
    candidates = (
        denormalized_tag_author
        .where(F.col('tags').isin(tags))
        .select(
            F.explode(F.slice(F.col('author_ids'), 1, top_per_tag)).alias('id_author')
        )
    )

    # NOTE: distinct() gives no ordering guarantee, so which rows limit() keeps
    # is not guaranteed to be the same from run to run.
    return (
        candidates
        .where(F.col('id_author') != author_id)  # never recommend the author to themselves
        .distinct()
        .limit(max_recs)
        .toPandas()
    )

In [26]:
# Fetch recommendations for one sample author.
recs = get_recommendations(author_id=469894)

In [27]:
# Show the recommended author ids.
recs

Unnamed: 0,id_author
0,530113
1,521932
2,1025474
3,542855
4,428925
5,952532
6,998320
7,425451
8,421139
9,428802


In [28]:
# Look up the recommended authors in the per-author tag table. dropDuplicates
# keeps a single (arbitrary) row per id_author, so each author appears once.
pd_df = (
    author_tag_nums
    .filter(F.col('id_author').isin(recs['id_author'].tolist()))
    .dropDuplicates(['id_author'])
    .toPandas()
)

In [29]:
# Attach the tag / article-count row to each recommended author.
recs.merge(pd_df, how='inner', on='id_author')

Unnamed: 0,id_author,tags,num_articles
0,530113,angle,150
1,521932,client,160
2,1025474,agents,108
3,542855,document,59
4,428925,alignment,81
5,952532,bits,190
6,998320,check,86
7,425451,activities,80
8,421139,architectures,91
9,428802,bandwidth,122
