# Packages

In [None]:
import pandas as pd

# Import data

## 1. Authors

In [None]:
# Load the publication records, reading the accession number as text and the
# publication date as a nullable integer. The two columns used as join keys
# later on are normalised right away: the 'WOS:' marker is removed from the
# accession numbers and the DOIs are lower-cased.
df_wos = pd.read_csv(
    'data/publications.csv',
    sep=',',
    encoding='UTF-8',
    dtype={'Accession Number': 'str', 'Publication Date': 'Int32'},
).assign(**{
    'Accession Number': lambda d: d['Accession Number'].str.replace('WOS:', ''),
    'DOI': lambda d: d['DOI'].str.lower(),
})
df_wos.shape

In [None]:
# Author-paper link table; both identifiers are read as text.
df_ap = pd.read_csv(
    'data/Authors/authors_papers.tsv',
    sep='\t',
    dtype={'paper_id': 'str', 'author_id': 'str'},
)
df_ap.shape

1869 UGR authors.

In [None]:
# Author table; identifier columns are read as text so they can be joined
# against the string identifiers in df_ap.
df_aut = pd.read_csv(
    'data/Authors/authors.tsv',
    sep='\t',
    dtype={'paper_id': 'str', 'author_id': 'str'},
)
df_aut.shape

In [None]:
# Size of the subset of df_aut whose author_id also occurs in df_ap.
df_aut.loc[lambda d: d['author_id'].isin(df_ap['author_id'].tolist())].shape

In [None]:
# Map every paper to the `new_author_id` of its authors: join the link table
# with the author table on author_id, keep only the (paper, new author id)
# pair and collapse repeated pairs.
df_ap_sp = (
    pd.merge(df_ap, df_aut, how='inner', on='author_id')
    .loc[:, ['paper_id', 'new_author_id']]
    .drop_duplicates()
    .reset_index(drop=True)
)
df_ap_sp.shape

1899 papers.

In [None]:
# Number of distinct paper identifiers in df_ap_sp.
len(set(df_ap_sp['paper_id']))

In [None]:
# Size of the subset of df_wos whose accession number occurs as a paper_id in df_ap.
df_wos.loc[lambda d: d['Accession Number'].isin(df_ap['paper_id'].tolist())].shape

In [None]:
# Reverse check: size of the subset of df_ap whose paper_id occurs as an
# accession number in df_wos.
df_ap.loc[lambda d: d['paper_id'].isin(df_wos['Accession Number'].tolist())].shape

# 2. Analysis

## 2.1. Twitter

In [None]:
# Load the Twitter mention records, reading the two listed columns as pandas
# strings, and lower-case the DOI so it can be compared with df_wos['DOI'].
df_tw_m = pd.read_csv(
    'data/Twitter/publications_metrics.tsv',
    sep='\t',
    encoding='UTF-8',
    dtype={'Outlet or Author': 'string', 'External Mention ID': 'string'},
).assign(DOI=lambda d: d['DOI'].str.lower())
df_tw_m.shape

In [None]:
# Size of the subset of Twitter mentions whose DOI occurs in df_wos.
df_tw_m.loc[lambda d: d['DOI'].isin(df_wos['DOI'].tolist())].shape

In [None]:
# Keep only the mentions whose DOI matches a df_wos record and attach that
# record's accession number; exact duplicate rows are then dropped.
# Records without a DOI are removed from the lookup first: pandas matches null
# merge keys against each other, so a mention with a missing DOI would
# otherwise be paired with every df_wos record that also lacks a DOI.
df_tw_m = (
    df_tw_m
    .merge(
        df_wos[['DOI', 'Accession Number']].dropna(subset=['DOI']),
        how='inner',
        on='DOI',
    )
    .drop_duplicates()
    .reset_index(drop=True)
)
df_tw_m.shape

In [None]:
# Attach the authors of each mentioned paper: a mention is repeated once per
# matching (paper_id, new_author_id) row in df_ap_sp.
df_tw_m = pd.merge(
    df_tw_m,
    df_ap_sp,
    how='inner',
    left_on='Accession Number',
    right_on='paper_id',
)

In [None]:
# Flag mentions whose `lang_y` value is 'es' with 1 and every other mention with 0.
df_tw_m['local'] = df_tw_m['lang_y'].eq('es').astype('int64')

In [None]:
# Per-author statistics computed over the rows that are NOT retweets:
# row count, distinct 'Details Page URL' values, sum of the `local` flag and
# the mean favourite / retweet / follower counts, rounded to two decimals.
df_tw_stats_1 = (
    df_tw_m.loc[lambda d: ~d['is_retweet']]
    .groupby(['new_author_id'])
    .agg(
        Tweets=pd.NamedAgg(column='new_author_id', aggfunc='count'),
        Papers=pd.NamedAgg(column='Details Page URL', aggfunc='nunique'),
        Local=pd.NamedAgg(column='local', aggfunc='sum'),
        Avg_fav=pd.NamedAgg(column='favorite_count', aggfunc='mean'),
        Avg_RT=pd.NamedAgg(column='retweet_count', aggfunc='mean'),
        Avg_fw=pd.NamedAgg(column='followers_count', aggfunc='mean'),
    )
    .round(2)
    .reset_index()
)

In [None]:
# Per-author count of the rows that ARE retweets.
df_tw_stats_2 = (
    df_tw_m.loc[lambda d: d['is_retweet']]
    .groupby(['new_author_id'])
    .agg(Retweets=pd.NamedAgg(column='new_author_id', aggfunc='count'))
    .round(2)
    .reset_index()
)

In [None]:
# Combine the tweet and retweet statistics. The outer join keeps authors that
# appear in only one of the two tables; every value missing after the join is
# set to 0 (this includes the averages of authors that only have retweets).
# Mentions is the total of original tweets and retweets.
df_tw_stats = (
    pd.merge(df_tw_stats_1, df_tw_stats_2, how='outer', on='new_author_id')
    .fillna(0)
    .assign(Mentions=lambda d: d['Tweets'] + d['Retweets'])
)

In [None]:
# Persist the per-author Twitter statistics as a tab-separated file without the index.
df_tw_stats.to_csv(
    'data/Authors/tweets_metrics.tsv',
    sep='\t',
    index=False,
)

In [None]:
# Number of Twitter mention rows per (author, Type) combination.
df_tw_stats_3 = (
    df_tw_m
    .groupby(['new_author_id', 'Type'])
    .agg(Mentions=pd.NamedAgg(column='new_author_id', aggfunc='count'))
    .round(2)
    .reset_index()
)

In [None]:
# Persist the per-author, per-Type mention counts as a tab-separated file without the index.
df_tw_stats_3.to_csv(
    'data/Authors/tweets_types.tsv',
    sep='\t',
    index=False,
)

## 2.2. News

In [None]:
# Load the news mention records, reading the two listed columns as pandas
# strings, and lower-case the DOI so it can be compared with df_wos['DOI'].
df_nw_m = pd.read_csv(
    'data/News/publications_metrics.tsv',
    sep='\t',
    encoding='UTF-8',
    dtype={'Outlet or Author': 'string', 'External Mention ID': 'string'},
).assign(DOI=lambda d: d['DOI'].str.lower())
df_nw_m.shape

In [None]:
# Size of the subset of news mentions whose DOI occurs in df_wos.
df_nw_m.loc[lambda d: d['DOI'].isin(df_wos['DOI'].tolist())].shape

In [None]:
# Keep only the mentions whose DOI matches a df_wos record and attach that
# record's accession number; exact duplicate rows are then dropped.
# Records without a DOI are removed from the lookup first: pandas matches null
# merge keys against each other, so a mention with a missing DOI would
# otherwise be paired with every df_wos record that also lacks a DOI.
df_nw_m = (
    df_nw_m
    .merge(
        df_wos[['DOI', 'Accession Number']].dropna(subset=['DOI']),
        how='inner',
        on='DOI',
    )
    .drop_duplicates()
    .reset_index(drop=True)
)
df_nw_m.shape

In [None]:
# Attach the authors of each mentioned paper: a mention is repeated once per
# matching (paper_id, new_author_id) row in df_ap_sp.
df_nw_m = pd.merge(
    df_nw_m,
    df_ap_sp,
    how='inner',
    left_on='Accession Number',
    right_on='paper_id',
)

In [None]:
# Per-author news statistics: row count, distinct 'Details Page URL' values,
# sum of the `Local` column and the mean of the four engagement columns,
# rounded to two decimals.
df_nw_stats = (
    df_nw_m
    .groupby(['new_author_id'])
    .agg(
        Mentions=pd.NamedAgg(column='new_author_id', aggfunc='count'),
        Papers=pd.NamedAgg(column='Details Page URL', aggfunc='nunique'),
        Local=pd.NamedAgg(column='Local', aggfunc='sum'),
        Visits=pd.NamedAgg(column='Engagments.Visits', aggfunc='mean'),
        Time_Visits=pd.NamedAgg(column='Engagments.TimeOnSite', aggfunc='mean'),
        Page_Visit=pd.NamedAgg(column='Engagments.PagePerVisit', aggfunc='mean'),
        Bounce=pd.NamedAgg(column='Engagments.BounceRate', aggfunc='mean'),
    )
    .round(2)
    .reset_index()
)

In [None]:
# Persist the per-author news statistics as a tab-separated file without the index.
df_nw_stats.to_csv(
    'data/Authors/news_metrics.tsv',
    sep='\t',
    index=False,
)

In [None]:
# Number of news mention rows per (author, main_category) combination.
df_nw_stats_2 = (
    df_nw_m
    .groupby(['new_author_id', 'main_category'])
    .agg(Mentions=pd.NamedAgg(column='new_author_id', aggfunc='count'))
    .round(2)
    .reset_index()
)

In [None]:
# Persist the per-author, per-category mention counts as a tab-separated file without the index.
df_nw_stats_2.to_csv(
    'data/Authors/news_types.tsv',
    sep='\t',
    index=False,
)

## 2.3. Wikipedia

In [None]:
# Load the Wikipedia mention records, reading the two listed columns as pandas
# strings, and lower-case the DOI so it can be compared with df_wos['DOI'].
df_wp_m = pd.read_csv(
    'data/Wikipedia/publications_metrics.tsv',
    sep='\t',
    encoding='UTF-8',
    dtype={'Outlet or Author': 'string', 'External Mention ID': 'string'},
).assign(DOI=lambda d: d['DOI'].str.lower())
df_wp_m.shape

In [None]:
# Size of the subset of Wikipedia mentions whose DOI occurs in df_wos.
df_wp_m.loc[lambda d: d['DOI'].isin(df_wos['DOI'].tolist())].shape

In [None]:
# Keep only the mentions whose DOI matches a df_wos record and attach that
# record's accession number; exact duplicate rows are then dropped.
# Records without a DOI are removed from the lookup first: pandas matches null
# merge keys against each other, so a mention with a missing DOI would
# otherwise be paired with every df_wos record that also lacks a DOI.
df_wp_m = (
    df_wp_m
    .merge(
        df_wos[['DOI', 'Accession Number']].dropna(subset=['DOI']),
        how='inner',
        on='DOI',
    )
    .drop_duplicates()
    .reset_index(drop=True)
)
df_wp_m.shape

In [None]:
# Attach the authors of each mentioned paper: a mention is repeated once per
# matching (paper_id, new_author_id) row in df_ap_sp.
df_wp_m = pd.merge(
    df_wp_m,
    df_ap_sp,
    how='inner',
    left_on='Accession Number',
    right_on='paper_id',
)

In [None]:
# Per-author Wikipedia statistics: row count, distinct 'Details Page URL'
# values, sum of the `local` column and the mean of the pageviews, revisions,
# words and langs columns, rounded to two decimals.
df_wp_stats = (
    df_wp_m
    .groupby(['new_author_id'])
    .agg(
        Mentions=pd.NamedAgg(column='new_author_id', aggfunc='count'),
        Papers=pd.NamedAgg(column='Details Page URL', aggfunc='nunique'),
        Local=pd.NamedAgg(column='local', aggfunc='sum'),
        Avg_views=pd.NamedAgg(column='pageviews', aggfunc='mean'),
        Avg_edits=pd.NamedAgg(column='revisions', aggfunc='mean'),
        Avg_words=pd.NamedAgg(column='words', aggfunc='mean'),
        Avg_trans=pd.NamedAgg(column='langs', aggfunc='mean'),
    )
    .round(2)
    .reset_index()
)

In [None]:
# Persist the per-author Wikipedia statistics as a tab-separated file without the index.
df_wp_stats.to_csv(
    'data/Authors/wikipedia_metrics.tsv',
    sep='\t',
    index=False,
)

In [None]:
# Number of Wikipedia mention rows per (author, major_topic) combination.
df_wp_stats_2 = (
    df_wp_m
    .groupby(['new_author_id', 'major_topic'])
    .agg(Mentions=pd.NamedAgg(column='new_author_id', aggfunc='count'))
    .round(2)
    .reset_index()
)

In [None]:
# Persist the per-author, per-topic mention counts as a tab-separated file without the index.
df_wp_stats_2.to_csv(
    'data/Authors/wikipedia_types.tsv',
    sep='\t',
    index=False,
)