In [1]:
import os
from typing import Union

import pandas as pd
import ast

In [2]:
from scrapper import get_all_people, get_links_with_names

links_with_names = get_links_with_names(get_all_people())
pwr_people = [name for _, name in links_with_names]
pwr_people

['Piotr Bródka',
 'Tomasz Kajdanowicz',
 'Maciej Zięba',
 'Przemysław Kazienko',
 'Jan Kocoń',
 'Halina Kwaśnicka',
 'Urszula Markowska-Kaczmar',
 'Radosław Michalski',
 'Maciej Piasecki',
 'Stanisław Saganowski',
 'Jerzy Sas',
 'Piotr Szymański',
 'Martin Tabakov',
 'Piotr Syga',
 'Arkadiusz Janz',
 'Piotr Bielak',
 'Kamil Kanclerz',
 'Michał Karol',
 'Rajmund Klemiński',
 'Mateusz Nurek',
 'Bartosz Perz',
 'Krzysztof Rajda',
 'Wiktor Walentynowicz',
 'Piotr Zieliński',
 'Albert Sawczyn',
 'Denis Janiak',
 'Dominika Kunc',
 'Jakub Binkowski',
 'Joanna Baran',
 'Katarzyna Jabłońska',
 'Piotr Kawa',
 'Stanisław Woźniak']

In [3]:
def extract_coauthors(d: dict, curr_author: str):
    authors_list = d["author"].split(" and ")
    return [a for a in authors_list if curr_author.lower() not in a.lower()]

In [4]:
def load_df(path: Union[os.PathLike, str]) -> pd.DataFrame:
    tokens = path.split("/")[-1].split("_")
    author_id = tokens[0]
    author_name = tokens[1]
    df = pd.read_parquet(path, columns=["bib"])
    df["author_id"] = author_id
    df["author"] = author_name
    df["bib"] = df["bib"].apply(lambda x: ast.literal_eval(x))
    df["coauthor"] = df["bib"].apply(
        lambda d: extract_coauthors(d, curr_author=author_name)
    )
    df = df.explode("coauthor")
    return df

In [5]:
def load_all(directory_path: Union[os.PathLike, str]) -> pd.DataFrame:
    dfs = []
    for file in os.listdir(directory_path):
        file_path = f"{directory_path}/{file}"
        dfs.append(load_df(file_path))
    return pd.concat(dfs)

In [6]:
df = load_all("pubs_raw")
df

Unnamed: 0,bib,author_id,author,coauthor
0,{'title': 'Fact-checking: relevance assessment...,27,Jakub Binkowski,Albert Sawczyn
0,{'title': 'Fact-checking: relevance assessment...,27,Jakub Binkowski,Denis Janiak
0,{'title': 'Fact-checking: relevance assessment...,27,Jakub Binkowski,Łukasz Augustyniak
0,{'title': 'Fact-checking: relevance assessment...,27,Jakub Binkowski,Tomasz Kajdanowicz
1,{'title': 'Graph-level representations using e...,27,Jakub Binkowski,Albert Sawczyn
...,...,...,...,...
64,{'title': 'Contrastive Learning for Multi-Labe...,2,Maciej Zięba,Maciej Zieba
65,{'title': 'The proposal of service-oriented su...,2,Maciej Zięba,Maciej Zieba
66,{'title': 'On-line Bayesian Context Change Det...,2,Maciej Zięba,Jakub M Tomczak
67,{'title': 'Two-stage Writer Identification Usi...,2,Maciej Zięba,Maciej Zieba


In [7]:
df.isnull().sum()

bib           0
author_id     0
author        0
coauthor     88
dtype: int64

In [8]:
df = df.dropna()
df

Unnamed: 0,bib,author_id,author,coauthor
0,{'title': 'Fact-checking: relevance assessment...,27,Jakub Binkowski,Albert Sawczyn
0,{'title': 'Fact-checking: relevance assessment...,27,Jakub Binkowski,Denis Janiak
0,{'title': 'Fact-checking: relevance assessment...,27,Jakub Binkowski,Łukasz Augustyniak
0,{'title': 'Fact-checking: relevance assessment...,27,Jakub Binkowski,Tomasz Kajdanowicz
1,{'title': 'Graph-level representations using e...,27,Jakub Binkowski,Albert Sawczyn
...,...,...,...,...
64,{'title': 'Contrastive Learning for Multi-Labe...,2,Maciej Zięba,Maciej Zieba
65,{'title': 'The proposal of service-oriented su...,2,Maciej Zięba,Maciej Zieba
66,{'title': 'On-line Bayesian Context Change Det...,2,Maciej Zięba,Jakub M Tomczak
67,{'title': 'Two-stage Writer Identification Usi...,2,Maciej Zięba,Maciej Zieba


In [294]:
df["is_pwr"] = df["coauthor"].apply(lambda coauthor_name: coauthor_name in pwr_people)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["is_pwr"] = df["coauthor"].apply(lambda coauthor_name: coauthor_name in pwr_people)


In [295]:
pwr_df = df[df["is_pwr"] == True]
pwr_df

Unnamed: 0,bib,author_id,author,coauthor,is_pwr
0,{'title': 'Fact-checking: relevance assessment...,27,Jakub Binkowski,Albert Sawczyn,True
0,{'title': 'Fact-checking: relevance assessment...,27,Jakub Binkowski,Denis Janiak,True
0,{'title': 'Fact-checking: relevance assessment...,27,Jakub Binkowski,Tomasz Kajdanowicz,True
1,{'title': 'Graph-level representations using e...,27,Jakub Binkowski,Albert Sawczyn,True
1,{'title': 'Graph-level representations using e...,27,Jakub Binkowski,Denis Janiak,True
...,...,...,...,...,...
7,{'title': 'RAFEN--Regularized Alignment Framew...,15,Piotr Bielak,Tomasz Kajdanowicz,True
8,{'title': 'A deeper look at Graph Embedding Re...,15,Piotr Bielak,Jakub Binkowski,True
8,{'title': 'A deeper look at Graph Embedding Re...,15,Piotr Bielak,Albert Sawczyn,True
8,{'title': 'A deeper look at Graph Embedding Re...,15,Piotr Bielak,Tomasz Kajdanowicz,True


In [296]:
pwr_df["coauthor_id"] = pwr_df["coauthor"].apply(
    lambda coauthor_name: pwr_people.index(coauthor_name)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pwr_df["coauthor_id"] = pwr_df["coauthor"].apply(lambda coauthor_name: pwr_people.index(coauthor_name))


In [297]:
pwr_df

Unnamed: 0,bib,author_id,author,coauthor,is_pwr,coauthor_id
0,{'title': 'Fact-checking: relevance assessment...,27,Jakub Binkowski,Albert Sawczyn,True,24
0,{'title': 'Fact-checking: relevance assessment...,27,Jakub Binkowski,Denis Janiak,True,25
0,{'title': 'Fact-checking: relevance assessment...,27,Jakub Binkowski,Tomasz Kajdanowicz,True,1
1,{'title': 'Graph-level representations using e...,27,Jakub Binkowski,Albert Sawczyn,True,24
1,{'title': 'Graph-level representations using e...,27,Jakub Binkowski,Denis Janiak,True,25
...,...,...,...,...,...,...
7,{'title': 'RAFEN--Regularized Alignment Framew...,15,Piotr Bielak,Tomasz Kajdanowicz,True,1
8,{'title': 'A deeper look at Graph Embedding Re...,15,Piotr Bielak,Jakub Binkowski,True,27
8,{'title': 'A deeper look at Graph Embedding Re...,15,Piotr Bielak,Albert Sawczyn,True,24
8,{'title': 'A deeper look at Graph Embedding Re...,15,Piotr Bielak,Tomasz Kajdanowicz,True,1


In [298]:
rename_map = {
    "author_id": "source",
    "coauthor_id": "target",
    "author": "source_label",
    "coauthor": "target_label",
}

edges_list = pwr_df.drop(columns=["bib", "is_pwr"])
edges_list = edges_list.rename(columns=rename_map)
edges_list = edges_list[["source", "target", "source_label", "target_label"]]
edges_list

Unnamed: 0,source,target,source_label,target_label
0,27,24,Jakub Binkowski,Albert Sawczyn
0,27,25,Jakub Binkowski,Denis Janiak
0,27,1,Jakub Binkowski,Tomasz Kajdanowicz
1,27,24,Jakub Binkowski,Albert Sawczyn
1,27,25,Jakub Binkowski,Denis Janiak
...,...,...,...,...
7,15,1,Piotr Bielak,Tomasz Kajdanowicz
8,15,27,Piotr Bielak,Jakub Binkowski
8,15,24,Piotr Bielak,Albert Sawczyn
8,15,1,Piotr Bielak,Tomasz Kajdanowicz


In [299]:
edges_list = edges_list.sort_values(by=["source", "target"])
edges_list.to_csv("scholar_output.csv", index=False)

In [300]:
total_outgoing_edges = edges_list.groupby("source")["target"].count().to_dict()
num_outgoing_edges = (
    edges_list.groupby(["source", "target"])["target"].count().to_dict()
)
# display(total_outgoing_edges)
# display(num_outgoing_edges)

In [301]:
edges_list = edges_list.drop_duplicates()
edges_list

Unnamed: 0,source,target,source_label,target_label
7,0,1,Piotr Bródka,Tomasz Kajdanowicz
0,0,3,Piotr Bródka,Przemysław Kazienko
9,0,7,Piotr Bródka,Radosław Michalski
0,0,9,Piotr Bródka,Stanisław Saganowski
5,1,0,Tomasz Kajdanowicz,Piotr Bródka
...,...,...,...,...
24,9,4,Stanisław Saganowski,Jan Kocoń
24,9,8,Stanisław Saganowski,Maciej Piasecki
14,9,20,Stanisław Saganowski,Bartosz Perz
12,9,26,Stanisław Saganowski,Dominika Kunc


In [302]:
def calc_weight(source, target):
    return num_outgoing_edges[(source, target)] / total_outgoing_edges[source]

In [303]:
edges_list["weight"] = edges_list.apply(
    lambda row: calc_weight(row["source"], row["target"]), axis=1
)

In [304]:
edges_list = edges_list.sort_values(by=["source", "target"])
edges_list

Unnamed: 0,source,target,source_label,target_label,weight
7,0,1,Piotr Bródka,Tomasz Kajdanowicz,0.136364
0,0,3,Piotr Bródka,Przemysław Kazienko,0.500000
9,0,7,Piotr Bródka,Radosław Michalski,0.227273
0,0,9,Piotr Bródka,Stanisław Saganowski,0.136364
5,1,0,Tomasz Kajdanowicz,Piotr Bródka,0.076923
...,...,...,...,...,...
24,9,4,Stanisław Saganowski,Jan Kocoń,0.019231
24,9,8,Stanisław Saganowski,Maciej Piasecki,0.019231
14,9,20,Stanisław Saganowski,Bartosz Perz,0.096154
12,9,26,Stanisław Saganowski,Dominika Kunc,0.134615


In [305]:
edges_list.to_csv("scholar_weighted.csv", index=False)