In [2]:
from pyspark.sql import SparkSession, functions as F

In [3]:
# Local Spark session for the recommender experiments: runs on every local
# core with explicit driver/executor memory and driver result-size settings.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("for recommender")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.maxResultSize", "3g")
    .getOrCreate()
)

In [4]:
# Authors table; the schema is printed to document the available columns.
authors = spark.read.format('parquet').load('./fin_authors.parquet/')
authors.printSchema()

root
 |-- gid: string (nullable = true)
 |-- sid: string (nullable = true)
 |-- name: string (nullable = true)
 |-- orgid: string (nullable = true)
 |-- orgs_count: integer (nullable = true)
 |-- email: string (nullable = true)
 |-- organisation: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- id_user: integer (nullable = true)



In [5]:
# Author -> co-author pairs; the schema is printed for reference.
coauthors = spark.read.format('parquet').load('./fin_coauths.parquet/')
coauthors.printSchema()

root
 |-- id_author: integer (nullable = true)
 |-- id_coauth: integer (nullable = true)



In [6]:
# Tag table: one `tags` string per `id` (joined against `id_article` later on).
tags = spark.read.format('parquet').load('./id_tags.parquet')
tags.printSchema()

root
 |-- id: integer (nullable = true)
 |-- tags: string (nullable = true)
 |-- __index_level_0__: long (nullable = true)



In [7]:
# Author -> article link table; the schema is printed for reference.
auth_article = spark.read.format('parquet').load('./fin_auth_article.parquet/')
auth_article.printSchema()

root
 |-- id_author: integer (nullable = true)
 |-- id_article: integer (nullable = true)



In [8]:
# Quick look at a few author/article pairs.  This cell used to hold the
# unfinished expression `auth_article.`, which raised SyntaxError on Run All.
auth_article.show(5)

In [9]:
# Preview: authors ranked by how many articles they are linked to.
(
    auth_article
    .groupBy('id_author')
    .count()
    .orderBy(F.col('count').desc())
    .show()
)

+---------+-----+
|id_author|count|
+---------+-----+
|   951934|  239|
|   952532|  190|
|   531679|  162|
|   521932|  160|
|   574013|  157|
|   951203|  155|
|   530113|  150|
|   956989|  142|
|   529762|  141|
|   998379|  138|
|   469894|  131|
|   984235|  128|
|   646624|  126|
|   951375|  125|
|   428802|  122|
|   433651|  119|
|   470343|  115|
|   996987|  114|
|   258450|  114|
|   998918|  111|
+---------+-----+
only showing top 20 rows



In [42]:
# Distinct (tag, author) pairs: every non-empty tag of an article is
# attributed to each author linked to that article.
author_tag = (
    tags
    .where(F.col('tags') != '')
    .join(auth_article, on=(auth_article.id_article == tags.id), how="inner")
    .select('tags', 'id_author')
    .distinct()
)

In [43]:
# Number of distinct (tag, author) pairs produced by the join above.
author_tag.count()

29418687

In [48]:
# Number of articles per author.
# groupBy() already yields exactly one row per author, so the former
# orderBy()/distinct() pair changed nothing about the set of rows and only
# added a sort plus an extra shuffle; both are dropped.
# The key is renamed to `id_auth` so it does not clash with `id_author`
# when this frame is joined onto `author_tag`.
auth_num_articles = (
    auth_article
    .groupBy('id_author')
    .count()
    .withColumnRenamed('id_author', 'id_auth')
    .withColumnRenamed('count', 'num_articles')
)

In [49]:
# Attach every author's article count to each of their (tag, author) pairs,
# drop the duplicated join key and sort by article count, highest first.
author_tag_nums = (
    author_tag
    .join(
        auth_num_articles,
        on=(auth_num_articles.id_auth == author_tag.id_author),
        how="inner",
    )
    .drop('id_auth')
    .orderBy(F.col('num_articles').desc())
)  # This is the last normalized version

In [50]:
# Preview: for every tag, the list of author ids that carry it.
(
    author_tag_nums
    .groupBy('tags')
    .agg(F.collect_list(F.col('id_author')).alias('author_ids'))
    .show()
)

+----------------+--------------------+
|            tags|          author_ids|
+----------------+--------------------+
|          access|[951934, 952532, ...|
|          affect|[393927, 2047937,...|
|           agent|[951934, 952532, ...|
|       algorithm|[662204, 960207, ...|
|     alternative|[748586, 576419, ...|
|             art|[1991181, 1975163...|
|      assignment|[531679, 574013, ...|
|        baseline|[530113, 535163, ...|
|        becoming|[835313, 850156, ...|
|          blocks|[1025245, 469366,...|
|         bounded|[984505, 972606, ...|
|       broadcast|[531679, 984235, ...|
|         capable|[1029026, 351468,...|
|           carlo|[365790, 381530, ...|
|            cell|[36069, 94856, 12...|
|       character|[60627, 115978, 1...|
|characterization|[1725145, 251047,...|
|        circuits|[529762, 951375, ...|
|         classes|[506862, 599089, ...|
|     classifiers|[42546, 143032, 2...|
+----------------+--------------------+
only showing top 20 rows



In [51]:
# One row per tag with the ids of its authors, most prolific author first.
# collect_list() alone does not guarantee element order after the groupBy
# shuffle, so the upstream orderBy() on `author_tag_nums` cannot be relied on.
# Instead (num_articles, id_author) structs are collected and sorted in
# descending order inside the aggregation (ties fall back to id_author,
# descending), and only the id field is kept afterwards.
denormalized_tag_author = (
    author_tag_nums
    .groupBy('tags')
    .agg(
        F.sort_array(
            F.collect_list(F.struct('num_articles', 'id_author')),
            asc=False,
        ).alias('ranked_authors')
    )
    .select('tags', F.col('ranked_authors.id_author').alias('author_ids'))
)

In [83]:
# Confirm the denormalized layout: a tag plus an array of author ids.
denormalized_tag_author.printSchema()

root
 |-- tags: string (nullable = true)
 |-- author_ids: array (nullable = false)
 |    |-- element: integer (containsNull = false)



In [78]:
# Bring the single 'art' row to pandas and unnest its author id list,
# one id per output line.
pd_df = (
    denormalized_tag_author
    .where(F.col('tags') == 'art')
    .toPandas()
)
pd_df.author_ids.explode()

0    1016434
0     844007
0     289514
0     680350
0     438400
      ...   
0    2115335
0    1629341
0    1018343
0    1531472
0    2424261
Name: author_ids, Length: 228, dtype: object

In [82]:
# Spot check: tags and article counts of the first four authors from the
# 'art' list, to see whether the list follows the article-count ordering.
(
    author_tag_nums
    .where(F.col('id_author').isin([1016434, 844007, 289514, 680350]))
    .show(200)
)

+--------------+---------+------------+
|          tags|id_author|num_articles|
+--------------+---------+------------+
|classification|  1016434|          57|
|        fading|  1016434|          57|
|         radio|  1016434|          57|
|    ubiquitous|  1016434|          57|
|           snr|  1016434|          57|
|     broadcast|  1016434|          57|
|     detection|  1016434|          57|
|   cooperative|  1016434|          57|
|     transform|  1016434|          57|
|        traces|  1016434|          57|
|  technologies|  1016434|          57|
|    classifier|  1016434|          57|
|         clock|  1016434|          57|
|      students|  1016434|          57|
|   predictions|  1016434|          57|
|      ensemble|  1016434|          57|
|      delivery|  1016434|          57|
|        filter|  1016434|          57|
|        images|  1016434|          57|
|       problem|  1016434|          57|
|      learning|  1016434|          57|
|  coefficients|  1016434|          57|


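The list appears to be sorted as needed (by descending article count). That's good, but I suspect this will not always hold: Spark's `collect_list` does not guarantee the order of collected elements after a shuffle.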

Now for the tag–authors table.

In [85]:
# Row count after attaching article counts to the (tag, author) pairs.
author_tag_nums.count()

29418687

In [150]:
def get_recommendations(author_id: int, num_recs: int = 10, top_per_tag: int = 10):
    """Recommend authors who share at least one tag with ``author_id``.

    The author's tags are looked up in ``author_tag_nums``. For each of those
    tags, the first ``top_per_tag`` entries of the matching ``author_ids``
    list in ``denormalized_tag_author`` become candidates. The queried author
    is removed, repeated candidates are collapsed (an author can lead several
    tag lists), and the first ``num_recs`` survivors are returned.

    Args:
        author_id: ``id_author`` value to produce recommendations for.
        num_recs: Maximum number of recommendations to return (default 10).
        top_per_tag: How many leading authors to take from each tag's list
            (default 10).

    Returns:
        pandas.DataFrame with a single ``id_author`` column, at most
        ``num_recs`` rows and no repeated ids. It is empty when the author
        has no tags or nobody else shares them.

    Raises:
        ValueError: If ``num_recs`` or ``top_per_tag`` is smaller than 1.
    """
    # Local import: the notebook's import cell only pulls in pyspark.
    import pandas as pd

    if num_recs < 1 or top_per_tag < 1:
        raise ValueError("num_recs and top_per_tag must be positive integers")

    # Named `author_tags` so the notebook-level `tags` DataFrame is not shadowed.
    tag_rows = (
        author_tag_nums
        .where(F.col('id_author') == author_id)
        .select('tags')
        .collect()
    )
    author_tags = [row['tags'] for row in tag_rows]
    if not author_tags:
        # Nothing to match against; skip the Spark job entirely.
        return pd.DataFrame({'id_author': pd.Series(dtype='int32')})

    # The previous `.sample(False, 1.0)` step is gone: a fraction of 1.0
    # without replacement keeps every row, so it neither thinned nor shuffled
    # the candidates.
    candidates = (
        denormalized_tag_author
        .where(F.col('tags').isin(author_tags))
        .select(
            F.explode(F.slice(F.col('author_ids'), 1, top_per_tag)).alias('id_author')
        )
        .where(F.col('id_author') != author_id)
        .toPandas()
    )

    # De-duplicate on the driver (keeps first occurrence, i.e. arrival order)
    # and only then cut down to the requested number of recommendations.
    return (
        candidates
        .drop_duplicates(subset='id_author')
        .head(num_recs)
        .reset_index(drop=True)
    )

In [151]:
# Example call for author 844007, one of the 'art' authors spot-checked earlier.
recs = get_recommendations(844007)

In [152]:
# Transposed so the recommended ids are displayed on a single row.
recs.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
id_author,1016434,289514,680350,438400,919052,739584,823727,2268485,683367,951375


In [153]:
# One row per recommended author, giving a tag and the author's article
# count; which of the author's tag rows dropDuplicates keeps is not controlled here.
pd_df = (
    author_tag_nums
    .where(F.col('id_author').isin(recs['id_author'].tolist()))
    .dropDuplicates(['id_author'])
    .toPandas()
)

In [154]:
# Sanity check: show each recommended id next to a tag and its article count.
recs.merge(pd_df, on='id_author')

Unnamed: 0,id_author,tags,num_articles
0,1016434,identification,57
1,289514,task,44
2,680350,consumption,35
3,438400,recovery,31
4,919052,head,28
5,739584,decoding,24
6,823727,expressions,22
7,2268485,data,18
8,683367,interaction,18
9,951375,prediction,125


In [144]:
# NOTE(review): same expression as the previous merge cell but with an earlier
# execution count (144). Its output still lists 844007, the queried author,
# presumably from before the self-exclusion filter existed. Confirm and drop
# this cell if it is stale.
recs.merge(pd_df, on='id_author')

Unnamed: 0,id_author,tags,num_articles
0,844007,temporal,49
1,289514,task,44
2,680350,consumption,35
3,438400,recovery,31
4,919052,head,28
5,739584,decoding,24
6,823727,expressions,22
7,2268485,data,18
8,683367,interaction,18
9,863939,systems,16


In [124]:
# NOTE(review): `p` is never assigned in the visible cells, so this fails on a
# fresh kernel; it looks like leftover scratch output (a sorted id_author
# Series). Confirm and remove, or define `p` explicitly.
p

0     289514
1     438400
2     680350
3     683367
4     739584
5     823727
6     844007
7     863939
8     919052
9    2268485
Name: id_author, dtype: int32

In [157]:
# Persist the normalized (tag, author, num_articles) table.
# mode('overwrite') makes the cell re-runnable: the default save mode raises
# when the target path already exists, which breaks Restart & Run All.
author_tag_nums.write.mode('overwrite').parquet('./author_tag_articlenum.parquet')

In [158]:
# Persist the per-tag author lists.
# mode('overwrite') makes the cell re-runnable: the default save mode raises
# when the target path already exists, which breaks Restart & Run All.
denormalized_tag_author.write.mode('overwrite').parquet('./denormalized_tag_authors.parquet')