# Packages

In [None]:
import pandas as pd
import seaborn as sns

# 1. Data import

## 1.1. Mentions

In [None]:
# Load the Twitter mentions table (tab-separated).
# Both identifier-like columns are read with the pandas 'string' dtype
# (presumably so long numeric IDs are kept verbatim -- confirm).
df = pd.read_csv(
    'data/twitter_mentions.tsv',
    sep='\t',
    encoding='UTF-8',
    dtype={
        'Outlet or Author': 'string',
        'External Mention ID': 'string',
    },
)
df.shape

## 1.3. Tweeters metadata

In [None]:
# Load the tweeters (account) metadata; the account id is kept as a string.
df_te = pd.read_csv(
    'data/Twitter/tweeters_metadata.tsv',
    sep='\t',
    encoding='UTF-8',
    dtype={'id_str': 'string'},
)
df_te.shape

## 1.2. Tweets metadata

In [None]:
# Load the tweets metadata; tweet id and author id are kept as strings.
df_tw = pd.read_csv(
    'data/Twitter/tweets_metadata.tsv',
    sep='\t',
    encoding='UTF-8',
    dtype={
        'id_str': 'string',
        'user.id_str': 'string',
    },
)
df_tw.shape

Tweeters' data and their tweets are matched to avoid inconsistencies.

In [None]:
# Keep only the tweets whose author id is present in the tweeters table.
author_is_known = df_tw['user.id_str'].isin(df_te['id_str'])
df_tw = df_tw.loc[author_is_known].copy()
df_tw.shape

In [None]:
# Keep only the tweeters that authored at least one tweet in df_tw.
has_tweets = df_te['id_str'].isin(df_tw['user.id_str'])
df_te = df_te.loc[has_tweets].copy()
df_te.shape

Minor adjustments to correctly identify different types of tweets.

When a tweet is a retweet, its quote flag is cleared, so it is counted only as a retweet.

In [None]:
# Retweets are counted only as retweets: reset their quote flag.
retweet_rows = df_tw['is_retweet']
df_tw.loc[retweet_rows, 'is_quote_status'] = False

A tweet is flagged as a reply when it responds to another tweet.

In [None]:
# A tweet is a reply exactly when it carries the id of the status it answers.
df_tw['is_reply'] = df_tw['in_reply_to_status_id_str'].notna()

Finally, "normal" tweets are identified when they do not fall into any of the other categories.

In [None]:
# A "normal" tweet is one that is neither a retweet, a quote nor a reply.
not_retweet = df_tw['is_retweet'].eq(False)
not_quote = df_tw['is_quote_status'].eq(False)
not_reply = df_tw['is_reply'].eq(False)
df_tw['is_tweet'] = not_retweet & not_quote & not_reply

It should be noted that there may be overlaps, especially between replies and quotes.

In [None]:
# Ids of the tweets flagged both as a reply and as a quote.
reply_and_quote = df_tw['is_reply'] & df_tw['is_quote_status']
df_tw.loc[reply_and_quote, 'id_str']

## 1.4. Researchers

In [None]:
# Load the researchers table (comma-separated); the tweeter id is kept as a string.
df_r = pd.read_csv(
    'data/Twitter/authors_tweeters_2022_08_21.csv',
    sep=',',
    encoding='UTF-8',
    dtype={'tweeter_id': 'string'},
)
df_r.shape

## 1.5. Bots

In [None]:
# Load the bot-score table (tab-separated); the user id is kept as a string.
df_b = pd.read_csv(
    'data/Twitter/tweeters_bots.tsv',
    sep='\t',
    encoding='UTF-8',
    dtype={'user_id': 'string'},
)
df_b.shape

In [None]:
# Density of the 'score_over' column, drawn as a filled curve.
sns.kdeplot(data=df_b, x='score_over', fill=True)

In [None]:
# Keep only the accounts whose 'score_over' is strictly greater than 4.
# NOTE(review): 4 is a hard-coded threshold; its rationale is not stated in
# this notebook -- confirm and document it.
above_threshold = df_b['score_over'].gt(4)
df_b = df_b.loc[above_threshold].copy()
df_b.shape

# 2. Tweeters

By default, all actors are assigned as Social, except those that have been identified as bots, researchers or journals.

In [None]:
# Every tweeter starts as 'Social'; more specific labels are applied afterwards.
df_te = df_te.assign(Type='Social')

In [None]:
# Tweeters whose id appears in the filtered bot table are labelled 'Bot'.
flagged_as_bot = df_te['id_str'].isin(df_b['user_id'])
df_te.loc[flagged_as_bot, 'Type'] = 'Bot'

In [None]:
# Tweeters whose id appears in the researchers table are labelled 'Researcher'.
# This assignment overwrites any label already stored in 'Type' for those rows.
is_researcher = df_te['id_str'].isin(df_r['tweeter_id'])
df_te.loc[is_researcher, 'Type'] = 'Researcher'

Most tweeters are classified as Social or Researcher.

In [None]:
# Number of tweeters per assigned type.
type_counts = df_te['Type'].value_counts()
type_counts

## 2.1. Profile stats

In [None]:
# Per-type profile summary: number of tweeters plus the mean of the
# followers, friends and statuses counters, rounded to two decimals.
profile_mean_spec = {
    'Tweeters': ('Type', 'count'),
    'Followers': ('followers_count', 'mean'),
    'Friends': ('friends_count', 'mean'),
    'Tweets': ('statuses_count', 'mean'),
}
df_te.groupby('Type').agg(**profile_mean_spec).round(2).reset_index()

In [None]:
# Per-type profile summary: number of tweeters plus the median of the
# followers, friends and statuses counters.
profile_median_spec = {
    'Tweeters': ('Type', 'count'),
    'Followers': ('followers_count', 'median'),
    'Friends': ('friends_count', 'median'),
    'Tweets': ('statuses_count', 'median'),
}
df_te.groupby('Type').agg(**profile_median_spec).reset_index()

## 2.2. Activity stats

In [None]:
# One row per tweet, with its author's metadata attached (inner join on the
# author id). Column names present in both tables get pandas' default
# '_x' (tweeter) / '_y' (tweet) suffixes, e.g. 'id_str_y' and 'lang_y'.
df_te_tw = pd.merge(
    df_te,
    df_tw,
    how='inner',
    left_on='id_str',
    right_on='user.id_str',
)

In [None]:
# Per-type activity: total tweets and how many of them are quotes,
# retweets, replies and "normal" tweets (sum of the boolean flags).
activity_spec = {
    'Tweets': ('Type', 'count'),
    'Quote': ('is_quote_status', 'sum'),
    'RT': ('is_retweet', 'sum'),
    'Reply': ('is_reply', 'sum'),
    'Tweet': ('is_tweet', 'sum'),
}
df_te_tw.groupby('Type').agg(**activity_spec).reset_index()

In [None]:
# Count tweets per (type, tweet language) pair, then show the Spanish ('es') rows.
# 'lang_y' is the 'lang' column coming from the tweets side of the merge.
type_and_lang = df_te_tw[['Type', 'lang_y']]
tweets_lang = type_and_lang.value_counts().reset_index()
tweets_lang.loc[tweets_lang['lang_y'].eq('es')]

## 2.3. Engagement stats

In [None]:
# Engagement per type, computed on non-retweets only: number of tweets and
# mean retweet / favourite counts, rounded to two decimals.
non_retweets = df_te_tw.loc[~df_te_tw['is_retweet']]
engagement_mean_spec = {
    'Tweets': ('Type', 'count'),
    'RT': ('retweet_count', 'mean'),
    'Favs': ('favorite_count', 'mean'),
}
non_retweets.groupby('Type').agg(**engagement_mean_spec).round(2).reset_index()

In [None]:
# Engagement per type, computed on non-retweets only: number of tweets and
# median retweet / favourite counts.
non_retweets = df_te_tw.loc[~df_te_tw['is_retweet']]
engagement_median_spec = {
    'Tweets': ('Type', 'count'),
    'RT': ('retweet_count', 'median'),
    'Favs': ('favorite_count', 'median'),
}
non_retweets.groupby('Type').agg(**engagement_median_spec).reset_index()

# 3. Publications

In [None]:
# Attach tweet and tweeter data to each mention: the mention id is matched
# against the tweet id ('id_str_y', the tweets-side 'id_str' after the merge).
# NOTE(review): this rebinds `df`, so the raw mentions table is no longer
# available after this cell runs.
mention_columns = ['External Mention ID', 'Details Page URL', 'DOI']
df = pd.merge(
    df[mention_columns],
    df_te_tw,
    how='inner',
    left_on='External Mention ID',
    right_on='id_str_y',
)
df.shape

In [None]:
# Number of mentions contributed by each tweeter type.
df.groupby('Type').size().reset_index(name='Mentions')

In [None]:
# Per (publication, tweeter type) metrics over non-retweet mentions: number of
# mentions and mean favourites, retweets and author followers (two decimals).
non_retweet_mentions = df.loc[~df['is_retweet']]
mention_metric_spec = {
    'Mentions': ('Details Page URL', 'count'),
    'Avg_fav': ('favorite_count', 'mean'),
    'Avg_RT': ('retweet_count', 'mean'),
    'Avg_fw': ('followers_count', 'mean'),
}
df_mentions = (
    non_retweet_mentions
    .groupby(['Details Page URL', 'Type'])
    .agg(**mention_metric_spec)
    .round(2)
    .reset_index()
)

In [None]:
# Number of retweet mentions per (publication, tweeter type).
retweet_mentions = df.loc[df['is_retweet']]
df_mentions_rt = (
    retweet_mentions
    .groupby(['Details Page URL', 'Type'])
    .agg(RT=('Details Page URL', 'count'))
    .round(2)
    .reset_index()
)

In [None]:
# Attach the retweet counts to the per-publication metrics. The left join keeps
# every (publication, type) row of df_mentions; rows without retweets get NaN.
df_mentions = df_mentions.merge(df_mentions_rt, how='left', on=['Details Page URL', 'Type'])
# Assign the filled column back instead of calling fillna(inplace=True) on
# `df_mentions.RT`: that in-place call on an attribute-accessed column is a
# chained assignment, which pandas warns about and which does not update the
# frame when copy-on-write is active.
df_mentions['RT'] = df_mentions['RT'].fillna(0)

In [None]:
# Export the per-publication metrics as a tab-separated file, without the index.
df_mentions.to_csv(
    'data/Twitter/publications_metrics_mean.tsv',
    sep='\t',
    index=False,
)

In [None]:
# Export the merged mention-level table as a UTF-8, tab-separated file, without the index.
df.to_csv(
    'data/Twitter/publications_metrics.tsv',
    sep='\t',
    encoding='UTF-8',
    index=False,
)