In [3]:
# Exploratory data analysis
# We want more students to do research with professors. The incentives seem to exist: professors need students to publish and students need professors for guidance (what to ask, what papers to read, where to publish, etc).

# We have anecdotal evidence that:

# If students approach professors without any direction, chances are low
# If students approach with their own problem, chances are better
# If students approach knowing the professor's research, chances are much better.
# So our initial dataset only has:

# authors and the papers they wrote in author_and_papers.csv
# papers and the references in paper_and_references.csv


In [4]:
# Task 1 - basic exploration
# Load the data
# For both datasets, look at the first few, last few, and a random sample of records in the datasets (hint: pandas.DataFrame.sample()).
# (no code) Why do we care about looking at a few records?
# How many professors are there in our dataset? (Multiple answers!)
# How many papers are in our dataset? (Multiple answers!)

In [5]:
import pandas as pd
import seaborn as sns

In [6]:
# Load the author/paper table; each row pairs one author with one paper (see head() below).
# NOTE(review): the loaded frame shows 'Unnamed: 0' and 'Unnamed: 0.1' columns — presumably
# row indexes that were saved into the CSV; confirm and consider dropping them.
au_paper = pd.read_csv('author_and_papers.csv')

In [7]:
# Load the paper/reference table; each row links one paper (auth_id, auth_title)
# to one of its references (ref_id, ref_title).
# NOTE(review): despite the "auth_" prefix, auth_title holds paper titles, so auth_id
# is presumably the id of the citing paper — confirm against the data source.
pa_refer = pd.read_csv('paper_and_references.csv')

In [8]:
au_paper.head()

Unnamed: 0.1,Unnamed: 0,author_id,given_name,family_name,middle_name,paper_id,paper_title
0,0,2,S,Rahmann,,4,Snakemake--a scalable bioinformatics workflow ...
1,1,1678,J,Koster,,4,Snakemake--a scalable bioinformatics workflow ...
2,2,109,Victoria,Stodden,,15,Scientific tests and continuous integration st...
3,3,14,Darko,Marinov,,15,Scientific tests and continuous integration st...
4,4,12,Adhithya,Bhaskar,,15,Scientific tests and continuous integration st...


In [9]:
au_paper.tail()

Unnamed: 0.1,Unnamed: 0,author_id,given_name,family_name,middle_name,paper_id,paper_title
11737,11737,11723,H,March,De,11724,Irreducible convex paving for decomposition of...
11738,11738,3763,Belding,Scribner,H,11727,The effect in humans of extracellular ph chang...
11739,11739,3764,Ben,Uyeno,T,11727,The effect in humans of extracellular ph chang...
11740,11740,11726,Mario,Villamil,F,11727,The effect in humans of extracellular ph chang...
11741,11741,3762,James,Burnell,M,11727,The effect in humans of extracellular ph chang...


In [10]:
pa_refer.head()

Unnamed: 0.1,Unnamed: 0,auth_id,auth_title,ref_id,ref_title
0,0,20,Nonlinear evolution via spatially-dependent li...,278,"Penalized matrix decomposition for denoising, ..."
1,1,20,Nonlinear evolution via spatially-dependent li...,1392,Resting-state hemodynamics are spatiotemporall...
2,2,20,Nonlinear evolution via spatially-dependent li...,1498,Wide-field optical mapping of neural activity ...
3,3,20,Nonlinear evolution via spatially-dependent li...,2445,A quantitative description of membrane current...
4,4,20,Nonlinear evolution via spatially-dependent li...,2442,An orderly single-trial organization of popula...


In [11]:
pa_refer.tail()

Unnamed: 0.1,Unnamed: 0,auth_id,auth_title,ref_id,ref_title
3716,3716,11636,Population predictive checks,11702,Bayesianly justifiable and relevant frequency ...
3717,3717,11636,Population predictive checks,11703,Bayesian probabilistic matrix factorization us...
3718,3718,11636,Population predictive checks,11705,Scalable bayes via barycenter in wasserstein s...
3719,3719,11636,Population predictive checks,11706,Stan r&d: research and development expenditure...
3720,3720,11636,Population predictive checks,11709,Cross-validatory choice and assessment of stat...


In [12]:
# Random record from the author/paper table. With no arguments sample() returns a single row
# and is unseeded, so the row changes on every run (pass n=... / random_state=... to control this).
au_paper.sample()

Unnamed: 0.1,Unnamed: 0,author_id,given_name,family_name,middle_name,paper_id,paper_title
7421,7421,4379,Shulei,Wang,,7104,Asymptotic normality and valid inference for g...


In [13]:
# Random record from the paper/reference table. With no arguments sample() returns a single row
# and is unseeded, so the row changes on every run (pass n=... / random_state=... to control this).
pa_refer.sample()

Unnamed: 0.1,Unnamed: 0,auth_id,auth_title,ref_id,ref_title
2647,2647,8039,A sharp blockwise tensor perturbation bound fo...,10366,A simple linear time (1+/spl epsiv/)-approxima...


In [14]:
# When we need to calculate how many different values a column has, we can call .unique()
# and read .shape[0] of the result (shape is an attribute, not a method, so no parentheses).
# .unique() returns the distinct values of the column as an array.
au_paper.author_id.unique()

array([    2,  1678,   109, ..., 11712, 11723, 11726])

In [15]:
au_paper.author_id.unique().shape[0]

7098

In [16]:
# Number of professors, counted as the number of distinct author_id values.
num_prof = len(au_paper['author_id'].unique())
print('there are {} professor in the data.'.format(num_prof))

there are 7098 professor in the data.


In [17]:
# get paper count

In [19]:
# Collect the distinct titles from each of the three title columns.
# NOTE: despite the "paper_set_*" names these are plain lists (from .tolist()), not sets;
# each list is unique on its own, but the same title can appear in more than one list,
# so a set is applied later to de-duplicate across them.
paper_set_1 = au_paper.paper_title.unique().tolist()
paper_set_2 = pa_refer.auth_title.unique().tolist()
paper_set_3 = pa_refer.ref_title.unique().tolist()

In [None]:
pap_titles = paper_set_1 + paper_set_2 + paper_set_3

In [None]:
len(pap_titles)

In [None]:
# Number of distinct titles once case differences are ignored.
len({str(title).lower() for title in pap_titles})

In [None]:
# Task 2 - Some analysis
# Calculate, then graph the following:

# How many papers are associated with each professor?
# How many authors does each paper have?
# How many references does each paper have?
# Are the same references being used by multiple papers?

In [None]:
# number of papers per professor: rows per author_id that have a non-null paper_id
au_paper.groupby('author_id').count()['paper_id']
# caveat: count() counts rows, so a duplicated (author, paper) row would be counted more than once

In [None]:
len(au_paper.groupby('author_id').count()['paper_id'])

In [None]:
au_paper.groupby('paper_id').count()['author_id']

In [None]:
# method 2: using for-loops:
# For every distinct author, select that author's rows and count the distinct paper ids.
paper_count = []
for i in au_paper['author_id'].unique():
    is_target = au_paper['author_id'].eq(i)
    paps_by_author = au_paper.loc[is_target, 'paper_id'].unique()
    paper_count.append(len(paps_by_author))

In [None]:
# Histogram of the number of distinct papers per author (seaborn is imported in the top import cell).
# Label the axes so the figure can be read on its own; the trailing ';' hides the repr of set().
ax = sns.histplot(paper_count)
ax.set(xlabel='papers per author', ylabel='number of authors');
# Reading the plot: roughly 7000 authors have about 0-10 papers,
# and only a few authors have 10-20 papers.

In [None]:
# For every distinct paper, select that paper's rows and count the distinct author ids.
auth_count = []
for paper in au_paper['paper_id'].unique():
    is_target_p = au_paper['paper_id'].eq(paper)
    author_by_paper = au_paper.loc[is_target_p, 'author_id'].unique()
    auth_count.append(len(author_by_paper))

In [None]:
sns.histplot(auth_count)

In [None]:
# .drop_duplicates() is another way to get unique values: keep each (paper, author) pair once,
# then count the remaining rows per paper.
paper_author_pairs = au_paper[['paper_id', 'author_id']].drop_duplicates()
auth_cnt = paper_author_pairs.groupby('paper_id').count()

In [None]:
auth_cnt

In [None]:
auth_cnt2 = auth_cnt.reset_index().author_id

In [None]:
auth_cnt2

In [None]:
sns.histplot(auth_cnt2)

In [None]:
ref_count = pa_refer.loc[:, ['ref_id', 'auth_id']].drop_duplicates().groupby('auth_id').count()

In [None]:
sns.histplot(ref_count)

In [None]:
ref_cnt2 = ref_count.reset_index()

In [None]:
ref_cnt2.head()

In [None]:
# To sort a frame, use the .sort_values() method; the argument is the name of the column to sort by.
# Here 'author_id' holds the number of authors per paper (auth_cnt is a groupby count),
# so this orders the papers by how many authors they have.
auth_cnt.sort_values('author_id')

In [None]:
auth_count_ref = pa_refer.loc[:, ['auth_id', 'ref_id']].drop_duplicates().groupby('ref_id').count()

In [None]:
sns.histplot(auth_count_ref)

In [None]:
auth_count_ref.columns

In [None]:
# References that appear with more than 20 distinct auth_id values
# (the 'auth_id' column of auth_count_ref is a per-ref_id count, not an id).
auth_count_ref.loc[auth_count_ref.auth_id > 20,:]

In [None]:
# Look up the titles for ref_id 299 and 917
# (presumably the ids surfaced by the "> 20" filter above — confirm).
pa_refer.loc[pa_refer.ref_id.isin([299, 917]), 'ref_title'].unique()

In [None]:
# Task 3 - Targeted search
# Which and how many professors have a paper with the word "deep learning" in it? What about "bayesian"? What about "causal"?

In [None]:
# don't forget the capital/lower difference!!!
# Collect one full name per author row whose paper title mentions "deep learning".
# case=False makes the title match case-insensitive; na=False treats a missing title as "no match".
is_match = au_paper['paper_title'].astype(str).str.contains('deep learning', case=False, regex=False, na=False)
name_cols = ['given_name', 'middle_name', 'family_name']
authors = [
    # A missing name part is NaN, so presence is tested with pd.notna(); comparing the value
    # to True never matches a middle-name string, which silently dropped every middle name.
    # Parts are joined with spaces so the result reads as a name.
    ' '.join(str(part) for part in parts if pd.notna(part))
    for parts in au_paper.loc[is_match, name_cols].itertuples(index=False, name=None)
]
# `authors` has one entry per matching row, so an author with several matching papers repeats;
# the set keeps each distinct name once.
authors_u = set(authors)

In [None]:
len(authors_u)

In [None]:
len(authors)

In [None]:
# We can also write a function for this search: the keyword changes from query to query,
# so a function is the better choice.
# To check whether a string contains a particular substring we can use str.find().
# find() returns the index of the first occurrence (0 or greater) when the substring is present,
# and -1 when it is not. ('s' is the first character below, which is why that call gives 0.)
# Only the last expression of a cell is displayed, so this cell shows the result for '1'.
string = 'shaflasdfjlkasdnljkfnasdjkvn'
string.find('s')
string.find('1')

In [None]:
def get_profs(keywords, au_paper):
    """Return the name columns of every row whose paper title contains `keywords`.

    The match is a plain substring test and is case-insensitive, so
    'deep learning' also finds titles such as 'Deep Learning for ...'.

    Parameters
    ----------
    keywords : str
        Text to look for inside `paper_title`.
    au_paper : pandas.DataFrame
        Frame with at least the columns `paper_title`, `given_name` and `family_name`.

    Returns
    -------
    pandas.DataFrame
        The `given_name` and `family_name` columns of the matching rows, one row per
        matching input row (an author with several matching rows appears several times).
    """
    needle = str(keywords).lower()
    # str(x) keeps missing titles (NaN/None) from raising; they simply do not match.
    has_keywords = au_paper.paper_title.apply(lambda x: needle in str(x).lower())
    return au_paper.loc[has_keywords, ['given_name', 'family_name']]

In [None]:
get_profs('deep learning', au_paper).shape

In [None]:
# Task 4 - Joining the datasets
# If a paper has more authors, does it likely have more references? How are you testing this out?
# Do the same professors reference the same papers over multiple publications?

In [None]:
author_count2 = au_paper.loc[:, ['author_id', 'paper_id']].drop_duplicates().groupby('paper_id').count()  

In [None]:
author_count2.head(3)

In [None]:
ref_count = pa_refer.loc[:, ['auth_id', 'ref_id']].drop_duplicates().groupby('auth_id').count()

In [None]:
ref_count.head(3)

In [None]:
# when we need to join two data frames, we can use the merge() method to do it.

In [None]:
# Join the per-paper author counts with the per-paper reference counts on the paper id
# ('paper_id' on the left, 'auth_id' on the right).
# pd.merge defaults to an inner join, so papers missing from either table are dropped.
jdf = pd.merge(author_count2, ref_count, left_on = 'paper_id', right_on = 'auth_id')

In [None]:
jdf.head(3)

In [None]:
# One point per paper: number of distinct authors (x) against number of distinct references (y).
# jdf.author_id and jdf.ref_id hold counts, not ids, so the axes are labelled accordingly
# (seaborn is imported in the top import cell). The trailing ';' hides the repr of set().
ax = sns.scatterplot(x = jdf.author_id, y = jdf.ref_id)
ax.set(xlabel='authors per paper', ylabel='references per paper');