## Summary

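For each university, Wikipedia page views are split into local views (editions written in an official language of the university's country), international views (the English edition) and rest views (all remaining editions). The split is based on the official languages of each university's country and on the language edition of each Wikipedia page.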

# Packages

In [None]:
import pandas as pd

# Country - official languages

<div class="alert alert-block alert-info"> <b>More info:</b> <a href="https://wiki.openstreetmap.org/wiki/Nominatim/Country_Codes">Nominatim/Country Codes</a></div>

The bibliometric indicators (which include the country name), the Wikipedia page views by language edition, and the language tables (Wikipedia pages by language edition, and official languages by country) are imported.

In [None]:
leiden = pd.read_csv('data/leiden_indicators.tsv', sep='\t')
leiden.shape

In [None]:
langs = pd.read_csv('data/languages_clean.tsv', sep='\t')
langs.shape

In [None]:
langs_w = pd.read_csv('data/langlinks.tsv', sep='\t')
langs_w.shape

## Page views

Page views are reported per month, so the total number of views of each page is obtained by summing all months except the last one (May 2022), which is excluded because its data do not cover the whole month.

In [None]:
views = pd.read_csv('data/views.tsv', sep='\t')
views

In [None]:
views['total'] = views.iloc[:,1:83].sum(axis=1)
views = views[['university', 'total']]
views

In [None]:
views[views.university.str.contains('Ottawa')]

In [None]:
views[views.university=='https://fr.wikipedia.org/wiki/Université_Clermont-Auvergne']

In [None]:
# The 'university' column holds page URLs; call it 'url' so it can be joined with
# the langlinks table. Assign the result instead of using inplace=True: `views`
# comes from a column selection, and renaming such a frame in place can raise
# pandas' SettingWithCopyWarning.
views = views.rename(columns={'university': 'url'})

In [None]:
langs_w[~langs_w.url.isin(views.url)].drop_duplicates()

In [None]:
langs_w[langs_w.url.duplicated()]

In [None]:
views[views.url.isin(langs_w.url)].drop_duplicates()

In [None]:
# Attach the langlink columns to every page-view row that shares its URL;
# the inner join drops pages without a langlink record.
views = pd.merge(views, langs_w, on='url', how='inner')

## Local/official language identification

The country of every university (n=1225) is found in the country–language table.

In [None]:
# Shape of the universities whose country appears in the language table.
known_country = leiden['Country'].isin(langs['Country name'])
leiden.loc[known_country].shape

Their official languages are identified and selected.

In [None]:
# Keep only the language rows of countries that host at least one university.
hosts_university = langs['Country name'].isin(leiden['Country'])
local_lang = langs.loc[hosts_university]

All of them can be linked to a Wikipedia edition, except:
+ rm - No page has been collected
+ om - No page has been collected
+ zgh - There is no Wikipedia edition
+ no - No page has been collected
+ sr-Latn - There is no Wikipedia edition
+ tn - No page has been collected

In [None]:
# Official languages whose code never appears among the collected langlinks.
has_edition = local_lang['Language code'].isin(langs_w['lang'])
local_lang.loc[~has_edition]

# Total langlinks

In [None]:
# Add the total views of each page to its langlink row, then drop exact repeats.
url_totals = views[['url', 'total']]
langs_w = pd.merge(langs_w, url_totals, on='url', how='inner').drop_duplicates()
langs_w

In [None]:
leiden_local = leiden.merge(local_lang[['Country name', 'Language code']], how='inner', left_on='Country', right_on='Country name')[['id', 'short_name', 'full_name', 'Language code']].drop_duplicates()
leiden_local

In [None]:
leiden_local[leiden_local.short_name=='Univ Milan']

In [None]:
# Left join: every langlink row is kept; the university columns are filled only
# when the edition's language is an official language of the university's country.
langs_w_views = (
    pd.merge(langs_w, leiden_local, how='left',
             left_on=['id', 'lang'], right_on=['id', 'Language code'])
    .drop_duplicates()
)
langs_w_views

In [None]:
langs_w_views['type'] = ''
langs_w_views.loc[~langs_w_views.short_name.isna(), 'type'] = 'Local'

In [None]:
langs_w_views[langs_w_views.university=='Univ Milan']

In [None]:
langs_w_views[langs_w_views.university=='Univ Granada']

In [None]:
# Save the per-edition table, including the 'type' flag, as a tab-separated file.
local_international_path = 'data/local_international_views.tsv'
langs_w_views.to_csv(local_international_path, index=False, sep='\t')

In [None]:
# Number of langlink rows per university.
total_lg = (
    langs_w
    .groupby(['id', 'university'])
    .size()
    .reset_index(name='langlinks')
)
total_lg

# Total views

In [None]:
# Total views per university across all collected editions.
# Exact repeats of an (id, lang, total) row are dropped before summing — the same
# de-duplication the local and rest views apply — so a page that appears more
# than once after the merge with the langlinks table is not counted twice.
total_views = (
    views[['id', 'lang', 'total']]
    .drop_duplicates()
    .groupby('id')['total']
    .sum()
    .reset_index(name='total_views')
    .sort_values('total_views', ascending=False)
)
total_views

In [None]:
# Attach the langlink count and the total views to each university; the inner
# joins drop universities missing from either table.
leiden = (
    leiden
    .merge(total_lg[['id', 'langlinks']], on='id', how='inner')
    .merge(total_views, on='id', how='inner')
)
leiden

# Local views

In [None]:
local_views = leiden[['id', 'Country']]
local_views

In [None]:
local_views = local_views.merge(local_lang[['Country name', 'Language code']], left_on='Country', right_on='Country name')[['id', 'Country', 'Language code']].drop_duplicates()
local_views

In [None]:
local_views[local_views.id==638]

In [None]:
len(set(local_views['id'].tolist()))

In [None]:
local_views = local_views.merge(views[['id','lang','total']], how='inner', left_on=['id', 'Language code'], right_on=['id', 'lang'])[['id', 'Country', 'lang', 'total']].drop_duplicates()
local_views

In [None]:
len(set(local_views['id'].tolist()))

In [None]:
local_views[local_views['id']==638]

In [None]:
local_views[local_views['id']==1187]

In [None]:
local_views[local_views['id']==2578]

In [None]:
local_views[local_views['id']==45]

In [None]:
# Sum the local-language views per university, largest first.
local_views = (
    local_views
    .groupby('id')['total']
    .sum()
    .reset_index(name='local_views')
    .sort_values('local_views', ascending=False)
)
local_views

In [None]:
# Inner join: universities without local-language views are dropped.
leiden = pd.merge(leiden, local_views, on='id', how='inner')
leiden

# International views

In [None]:
# English-edition views per university, used as the international views.
# Exact repeats are dropped and the remaining English rows are summed per id,
# as is done for the local and rest views, so the merge on 'id' that follows
# cannot multiply the rows of `leiden`. Building the frame through groupby also
# yields a fresh object rather than a chained-indexing copy of `views`.
int_views = (
    views.loc[views['lang'] == 'en', ['id', 'lang', 'total']]
    .drop_duplicates()
    .groupby('id', as_index=False)['total']
    .sum()
)
int_views

In [None]:
# Name the column after the indicator it becomes in `leiden`. The result is
# assigned rather than renamed in place because `int_views` is derived from a
# selection of `views`, where an in-place rename can raise SettingWithCopyWarning.
int_views = int_views.rename(columns={'total': 'int_views'})

In [None]:
# Inner join: universities without an English-edition page are dropped.
leiden = pd.merge(leiden, int_views, on='id', how='inner')
leiden

# Rest views

In [None]:
rest_views = leiden[['id', 'Country']]
rest_views

In [None]:
rest_views = rest_views.merge(local_lang[['Country name', 'Language code']], left_on='Country', right_on='Country name')[['id', 'Country', 'Language code']].drop_duplicates()
rest_views

In [None]:
rest_views[rest_views.id==638]

In [None]:
rest_views = rest_views.merge(views[['id','lang','total']], how='right', left_on=['id','Language code'], right_on=['id','lang'])
rest_views

In [None]:
rest_views[rest_views.id==638]

In [None]:
rest_views = rest_views[rest_views['Language code'] != rest_views['lang']][['id', 'lang', 'total']].drop_duplicates()
rest_views = rest_views[rest_views['lang'] != 'en']
rest_views

In [None]:
rest_views[rest_views.id==638]

In [None]:
len(set(rest_views['id'].tolist()))

In [None]:
# Sum the remaining-edition views per id, largest first.
rest_views = (
    rest_views
    .groupby('id')['total']
    .sum()
    .reset_index(name='rest_views')
    .sort_values('rest_views', ascending=False)
)
rest_views

In [None]:
# Left join: universities without views in other editions are kept, with a
# missing rest_views value.
leiden = pd.merge(leiden, rest_views, on='id', how='left')
leiden

In [None]:
leiden[['rest_views']] = leiden[['rest_views']].fillna(0)
leiden[leiden['rest_views']==0]

In [None]:
leiden[leiden.id==638]

In [None]:
# Save the universities with their Wikipedia indicators as a tab-separated file.
indicators_path = 'data/leiden_wikipedia_indicators.tsv'
leiden.to_csv(indicators_path, index=False, sep='\t')