# Data Transformation with Microsoft Academic Graph 

Microsoft Academic Graph (MAG) is a large database with tables that include information about publications, authors, affiliations, journals and citations. In this notebook, we will work on a sample of MAG and transform it with Pandas.

In [1]:
# Importing packages for data transformation
import numpy as np
import pandas as pd

In [2]:
# Load the basic MAG tables from CSV: papers, paper-author-affiliation links,
# authors, affiliations and journals.
# The dataset directory is named once so it can be repointed in a single place;
# pd.read_csv raises FileNotFoundError if the directory does not exist.
MAG_DIR = '~/datasets/s4/MAG'  # the leading '~' is expanded to the user's home directory by pandas

Papers = pd.read_csv(MAG_DIR + '/Papers.csv')
PaperAuthorAffiliations = pd.read_csv(MAG_DIR + '/PaperAuthorAffiliations.csv')
Authors = pd.read_csv(MAG_DIR + '/Authors.csv')
Affiliations = pd.read_csv(MAG_DIR + '/Affiliations.csv')
Journals = pd.read_csv(MAG_DIR + '/Journals.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\lliang06/datasets/s4/MAG/Papers.csv'

In [None]:
# The "Papers" table describes publications (paper title, publication date,
# DOI number and more). Its id columns are what let us join it to the other
# MAG tables and explore how the different entities relate to each other.
# Preview the first five rows.
Papers.head(n=5)

In [None]:
# Keep only the PaperId and PaperTitle columns for index labels 0 through 1
# (.loc label slices include both endpoints).
Papers.loc[
    0:1,
    ['PaperId', 'PaperTitle'],
]

In [None]:
# Attach author names to publications: join the selected papers to
# "PaperAuthorAffiliations" on PaperId, then to "Authors" on AuthorId,
# and keep only the title and author columns.
(
    Papers.loc[4:5, ['PaperId', 'PaperTitle']]
    .merge(PaperAuthorAffiliations, how='inner', on='PaperId')
    .merge(Authors, how='inner', on='AuthorId')
    [['PaperTitle', 'OriginalAuthor']]
)

In [None]:
# Look up the affiliations behind the selected papers: join the papers to
# "PaperAuthorAffiliations" on PaperId, then to "Affiliations" on
# AffiliationId, and keep the title plus the affiliation name and page.
(
    Papers.loc[:10, ['PaperId', 'PaperTitle']]
    .merge(PaperAuthorAffiliations, how='inner', on='PaperId')
    .merge(Affiliations, how='inner', on='AffiliationId')
    [['PaperTitle', 'NormalizedName', 'OfficialPage']]
)

In [None]:
# Load the field-of-study tables: the paper-to-field link table and the
# field metadata table.
# These are read from the same '~/datasets/s4/MAG' directory as the basic MAG
# tables, so the load does not depend on the kernel's working directory.
# NOTE(review): the original used './datasets/s4/MAG' here — confirm which root is correct.

PaperFields = pd.read_csv('~/datasets/s4/MAG/PaperFieldsOfStudy.csv')
Fields = pd.read_csv('~/datasets/s4/MAG/FieldsOfStudy.csv')

In [None]:
# List the fields of study attached to each selected paper.

# Show every column when a frame is displayed (this option stays in effect
# for the rest of the session).
pd.set_option('display.max_columns', None)

# Join papers -> PaperFields (on PaperId) -> Fields (on FieldOfStudyId), then
# collect the field names of each paper title into a list.
(
    Papers.loc[:10, ['PaperId', 'PaperTitle']]
    .merge(PaperFields, how='inner', on='PaperId')
    .merge(Fields, how='inner', on='FieldOfStudyId')
    [['PaperTitle', 'NormalizedName']]
    .groupby('PaperTitle')
    .agg(list)
)

In [None]:
# The "Journals" table holds journal metadata; preview its first five rows.

Journals.head(n=5)

In [None]:
# Load the citation tables: citation contexts and the paper-to-reference
# citation links.
# These are read from the same '~/datasets/s4/MAG' directory as the basic MAG
# tables, so the load does not depend on the kernel's working directory.
# NOTE(review): the original used './datasets/s4/MAG' here — confirm which root is correct.

PaperCitationContexts = pd.read_csv('~/datasets/s4/MAG/PaperCitationContexts.csv')
PaperCitation = pd.read_csv('~/datasets/s4/MAG/PaperCitation.csv')

In [None]:
# Preview the first rows of the citation-context table rather than displaying
# the whole frame, matching how the other tables are previewed.
PaperCitationContexts.head()

In [None]:
# Resolve citations to titles: join the selected papers to "PaperCitation" on
# PaperId, then join again to a renamed copy of "Papers" on PaperReferenceId
# so each referenced paper's title appears next to the citing paper.

(
    Papers.loc[:30, ['PaperId', 'PaperTitle']]
    .merge(PaperCitation, how='inner', on='PaperId')
    .merge(
        # Papers with its id/title columns renamed to play the "reference" role.
        Papers.rename(
            columns={
                'PaperId': 'PaperReferenceId',
                'PaperTitle': 'PaperReferenceTitle',
            }
        )[['PaperReferenceId', 'PaperReferenceTitle']],
        how='inner',
        on='PaperReferenceId',
    )
)

In [None]:
# Show citation contexts for the selected papers: join the papers to
# "PaperCitationContexts" on PaperId and keep the title and context columns.

(
    Papers.loc[:30, ['PaperId', 'PaperTitle']]
    .merge(PaperCitationContexts, how='inner', on='PaperId')
    [['PaperTitle', 'CitationContext']]
)