# Packages

In [None]:
import pandas as pd
import requests
import json
from functions_alt import wiki_ores, wikipedia_data, wikipedia_views

# 1. Altmetric mentions

In [None]:
# Load the tab-separated mentions export; the two columns named in `dtype`
# are forced to pandas' string dtype instead of being type-inferred.
df = pd.read_csv(
    'data/wikipedia_mentions.tsv',
    sep='\t',
    encoding='UTF-8',
    dtype={
        'Outlet or Author': 'string',
        'External Mention ID': 'string',
    },
)
df.shape

The mention URL is parsed next; it contains the ID of the Wikipedia revision that made the citation.

First, the language of Wikipedia is identified.

In [None]:
# Extract the language edition (the subdomain, e.g. 'en' in
# 'https://en.wikipedia.org/...') from every mention URL.
# - The scheme pattern accepts both http and https; stripping only 'http://'
#   left 'https://en' as the "language" for https URLs.
# - Raw strings keep '\.' a regex escape rather than an invalid Python
#   string escape.
df['Wikipedia_lg'] = (
    df['Mention URL']
    .str.replace(r'^https?://', '', regex=True)
    .str.replace(r'\.wikipedia.*$', '', regex=True)
)
df['Wikipedia_lg']

Second, the revision number is identified.

In [None]:
# Reduce each mention URL to its revision id: remove everything up to and
# including 'oldid=' as well as any trailing '#fragment'.
mention_urls = df['Mention URL']
df['Wikipedia_rev'] = mention_urls.str.replace('^.*oldid=|#.*', '', regex=True)
df['Wikipedia_rev']

# 2. Wikipedia API

## 2.1. Redirects

The user and page ID are retrieved (redirects are resolved); this allows us to identify mentions with errors.

In [None]:
# Resolve each mention's revision id to the page it belongs to and to the user
# who made that revision. Rows that cannot be resolved keep None in 'pageid'
# (and 'user') so they can be filtered out afterwards.
df['pageid'] = None
df['user'] = None

for i in range(df.shape[0]):
    lang = df['Wikipedia_lg'][i]
    revid = df['Wikipedia_rev'][i]

    # Without a language or revision id there is nothing to look up.
    if pd.isna(lang) or pd.isna(revid):
        continue

    try:
        # Let requests build (and URL-encode) the query string. TLS
        # verification is left at its default (enabled), and the timeout
        # keeps one stalled connection from hanging the whole loop.
        query_info = requests.get(
            'https://' + lang + '.wikipedia.org/w/api.php',
            params={
                'action': 'query',
                'prop': 'revisions',
                'rvprop': 'user',
                'revids': revid,
                'format': 'json',
            },
            timeout=30,
        )
        response_info = query_info.json()
        # 'pages' is keyed by page id; take the first (only expected) key.
        pageid = list(response_info['query']['pages'])[0]
        df.loc[i, 'pageid'] = pageid
        df.loc[i, 'user'] = response_info['query']['pages'][pageid]['revisions'][0]['user']

    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best effort: network failures, non-JSON bodies and responses
        # without the expected keys leave the row unresolved. Unlike a bare
        # `except:`, this lets KeyboardInterrupt stop the loop.
        continue

In [None]:
# Keep only the mentions whose revision could be resolved to a page.
df = df.dropna(subset=['pageid']).reset_index(drop=True)
df.shape

## 2.2. ORES

Finally, the quality and topic are predicted for each mention (these predictions are not available for all languages).

In [None]:
# Pass the (language, revision) pairs to the project helper `wiki_ores`.
ores_keys = ['Wikipedia_lg', 'Wikipedia_rev']
df_aux = wiki_ores(df[ores_keys])

In [None]:
# Frequency of each predicted major topic.
df_aux['major_topic'].value_counts()

In [None]:
# Sanity check: True when the de-duplicated frame has exactly one row per
# distinct revision id.
n_distinct_rows = len(df_aux.drop_duplicates())
n_distinct_revids = len(set(df_aux.revid))
n_distinct_rows == n_distinct_revids

In [None]:
# Attach the de-duplicated helper results to the mentions (inner join on
# language + revision), then discard the helper's own key columns.
df = (
    df
    .merge(
        df_aux.drop_duplicates(),
        how='inner',
        left_on=['Wikipedia_lg', 'Wikipedia_rev'],
        right_on=['wiki', 'revid'],
    )
    .drop(columns=['wiki', 'revid'])
)

In [None]:
# Persist the enriched mentions as a tab-separated file (no index column).
df.to_csv(
    'data/Wikipedia/mentions.tsv',
    sep='\t',
    encoding='UTF-8',
    index=False,
)

## 2.3. Pages metadata

The API is then queried to obtain the page ID and thereby retrieve all the metadata of the Wikipedia pages.

In [None]:
# One row per distinct (title, language, page id) combination, with the page
# id column renamed and a fresh 0..n-1 index.
df_pages = (
    df[['Mention Title', 'Wikipedia_lg', 'pageid']]
    .drop_duplicates()
    .rename(columns={'pageid': 'Wikipedia_page_id'})
    .reset_index(drop=True)
)

### 2.3.1. Languages

In [None]:
# Count the language editions each page exists in: the number of
# interlanguage links returned by the API, plus one for the page's own edition.
lang_counts = []

for i in range(df_pages.shape[0]):
    page_id = df_pages.loc[i, 'Wikipedia_page_id']
    # Let requests build (and URL-encode) the query string; the timeout keeps
    # one stalled connection from hanging the whole loop. Network errors still
    # propagate, as they did before.
    query_json = requests.get(
        'https://' + df_pages.loc[i, 'Wikipedia_lg'] + '.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'prop': 'langlinks',
            'format': 'json',
            'lllimit': 'max',
            'llprop': 'url',
            'pageids': page_id,
        },
        timeout=30,
    ).json()
    try:
        n_links = len(query_json['query']['pages'][page_id]['langlinks'])
    except (KeyError, TypeError):
        # No 'langlinks' entry (or an unexpected response shape): count the
        # page as having no other language editions.
        n_links = 0
    lang_counts.append(n_links + 1)

df_pages['langs'] = lang_counts

### 2.3.2. Metrics

Before making the API request, a URL needs to be constructed.

In [None]:
# '<lang>.wikipedia.org/<title>' for every page.
page_host = df_pages['Wikipedia_lg'] + '.wikipedia.org/'
df_pages['url'] = page_host + df_pages['Mention Title']

In [None]:
# Hand the page URLs (as a plain list) to the project helper `wikipedia_data`.
page_urls = df_pages['url'].to_list()
wikidata = wikipedia_data(page_urls)

In [None]:
# Hand the page URLs (as a plain list) to the project helper `wikipedia_views`.
view_urls = df_pages['url'].to_list()
wikiviews = wikipedia_views(view_urls)

In [None]:
# Total views per page: row-wise sum of the numeric columns of `wikiviews`.
# A 'pageviews' column left over from an earlier run of this cell is excluded
# first; otherwise re-running the cell would add the old total into the new one.
wikiviews['pageviews'] = (
    wikiviews
    .drop(columns='pageviews', errors='ignore')
    .sum(numeric_only=True, axis=1)
)

In [None]:
# Add the total page views to the page metadata (inner join: url == page_title),
# then drop the now redundant join column.
views_per_page = wikiviews[['page_title', 'pageviews']]
wikidata = (
    wikidata
    .merge(views_per_page, how='inner', left_on='url', right_on='page_title')
    .drop(columns='page_title')
)

In [None]:
# Attach the metadata to the pages; pages without a metadata row are dropped
# (inner join on 'url').
df_pages = pd.merge(df_pages, wikidata, how='inner', on='url')

Add topics.

In [None]:
# Distinct (title, language, topic) combinations taken from the mentions,
# joined onto the pages by title and language.
page_topics = df[['Mention Title', 'Wikipedia_lg', 'major_topic']].drop_duplicates()
df_pages = df_pages.merge(
    page_topics,
    how='inner',
    on=['Mention Title', 'Wikipedia_lg'],
)

## 2.4. ORES (English)

As it has not been possible to identify the topics of all pages, the topics of the respective English editions will be retrieved.

In [None]:
# Pages that still have no topic, as an independent frame with a fresh index.
missing_topic = df_pages['major_topic'].isna()
df_page_no = df_pages.loc[missing_topic].reset_index(drop=True).copy()

In [None]:
# For every page without a topic, fetch its interlanguage links and keep the
# English one, giving page_id -> English title.
# Links are collected in a list and turned into a DataFrame once; growing a
# DataFrame with pd.concat inside the loop is quadratic.
langlink_records = []

for i in range(df_page_no.shape[0]):
    lang = df_page_no['Wikipedia_lg'][i]
    title = df_page_no['Mention Title'][i]
    pageid = df_page_no['Wikipedia_page_id'][i]

    # Nothing to query without a language or a title.
    if pd.isna(lang) or pd.isna(title):
        continue

    try:
        # Passing the title through `params` URL-encodes it; concatenating it
        # into the URL broke titles containing '&', '+', '#', etc.
        # TLS verification is left at its default (enabled).
        query_info = requests.get(
            'https://' + lang + '.wikipedia.org/w/api.php',
            params={
                'action': 'query',
                'prop': 'langlinks',
                'titles': title,
                'lllimit': 500,
                'format': 'json',
            },
            timeout=30,
        )
        response_info = query_info.json()

        # The response is indexed by the page id already known for this page,
        # so a title that resolves to a different page is skipped (KeyError).
        for j in response_info['query']['pages'][pageid]['langlinks']:
            langlink_records.append({'page_id': pageid,
                                     'lang_link': j['lang'],
                                     'title_link': j['*']})
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best effort: pages without langlinks, network failures and
        # malformed responses contribute no rows. KeyboardInterrupt is no
        # longer swallowed.
        continue

df_int_lg = pd.DataFrame(langlink_records, columns=['page_id', 'lang_link', 'title_link'])
df_int_lg = df_int_lg[df_int_lg['lang_link']=='en'].reset_index(drop=True)

In [None]:
# Look up a revision id for every English page title. Titles that cannot be
# resolved keep None in 'revid'.
df_int_lg['revid'] = None

for i in range(df_int_lg.shape[0]):
    lang = df_int_lg['lang_link'][i]
    pageti = df_int_lg['title_link'][i]

    try:
        # `params` URL-encodes the title ('rvslots': '*' is sent as %2A).
        # TLS verification is left at its default (enabled).
        query_info = requests.get(
            'https://' + lang + '.wikipedia.org/w/api.php',
            params={
                'action': 'query',
                'titles': pageti,
                'prop': 'revisions',
                'rvprop': 'ids',
                'rvslots': '*',
                'format': 'json',
            },
            timeout=30,
        )
        response_info = query_info.json()
        # 'pages' is keyed by page id; take the first (only expected) key.
        pageid = list(response_info['query']['pages'])[0]
        # Use .loc instead of chained indexing (df[col][i] = ...): the chained
        # form raises SettingWithCopyWarning and is a silent no-op under
        # pandas Copy-on-Write.
        df_int_lg.loc[i, 'revid'] = response_info['query']['pages'][pageid]['revisions'][0]['revid']

    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best effort: leave 'revid' as None for this title. KeyboardInterrupt
        # is no longer swallowed.
        continue

In [None]:
# Rename the key columns to 'Wikipedia_lg' / 'Wikipedia_rev' and store the
# revision id as text.
# NOTE(review): rows whose lookup failed hold None, which astype('str') turns
# into the literal string 'None' - confirm that wiki_ores tolerates such values.
df_int_lg = df_int_lg.rename(columns={'lang_link': 'Wikipedia_lg', 'revid': 'Wikipedia_rev'})
df_int_lg['Wikipedia_rev'] = df_int_lg['Wikipedia_rev'].astype('str')

In [None]:
# Pass the English (language, revision) pairs to the project helper `wiki_ores`.
en_ores_keys = ['Wikipedia_lg', 'Wikipedia_rev']
df_int_lg_aux = wiki_ores(df_int_lg[en_ores_keys])

In [None]:
# Keep only the helper rows that carry a topic, join them onto the English
# revisions, and reduce the result to distinct (page_id, major_topic) pairs.
has_topic = df_int_lg_aux['major_topic'].notna()
en_topics = df_int_lg_aux.loc[has_topic, ['wiki', 'revid', 'major_topic']]
df_int_lg = (
    df_int_lg
    .merge(
        en_topics,
        how='inner',
        left_on=['Wikipedia_lg', 'Wikipedia_rev'],
        right_on=['wiki', 'revid'],
    )
    [['page_id', 'major_topic']]
    .drop_duplicates()
)

In [None]:
# Left-join the English-derived topics onto the pages by page id; the two
# 'major_topic' columns come out as 'major_topic_x' / 'major_topic_y'.
# NOTE(review): the join key is the page id alone, while df_pages mixes several
# language editions - page ids are presumably only unique within one edition,
# so confirm that ids from different editions cannot collide here.
df_pages = pd.merge(
    df_pages,
    df_int_lg,
    how='left',
    left_on='Wikipedia_page_id',
    right_on='page_id',
)

In [None]:
# Where the original topic is missing, fall back to the English-derived one.
df_pages['major_topic_x'] = df_pages['major_topic_x'].fillna(df_pages['major_topic_y'])

In [None]:
# Restore the 'major_topic' column name and drop the merge leftovers.
df_pages = (
    df_pages
    .rename(columns={'major_topic_x': 'major_topic'})
    .drop(columns=['major_topic_y', 'page_id'])
)

In [None]:
# Persist the page-level table as a tab-separated file (no index column).
df_pages.to_csv(
    'data/Wikipedia/pages.tsv',
    sep='\t',
    encoding='UTF-8',
    index=False,
)