In [None]:
!pip install wikipedia setuptools pywikibot mwparserfromhell pandas numpy scipy nltk tqdm seaborn pyqt5 pyqtwebengine ruamel-yaml lxml datetime

In [135]:
import json
import re
from datetime import datetime
from urllib.parse import unquote
from urllib.request import urlopen

import matplotlib.pyplot as plt
import pandas as pd
import pywikibot
import requests
import seaborn as sns
import wikipedia
from bs4 import BeautifulSoup
from tqdm import tqdm

tqdm.pandas()

In [3]:
wikipedia.set_lang("en")  # Limit the wikipedia package to the English-language Wikipedia

## Part 1. Corpus builder: define the functions

In [4]:
def get_all_the_titles(key_words : str,number_of_results):
    '''
    Get the titles of pages returned by the Wikipedia search for some keywords.

    The lower-cased keywords are sent to ``Special:Search`` on en.wikipedia.org,
    restricted to the article namespace (``ns0=1``), and the result headings are
    scraped from the returned HTML.

    :param key_words: words to run the Wikipedia search with
    :type key_words: str

    :param number_of_results: number of pages the user wants to scrape, maximum 500
    :type number_of_results: int

    :returns: titles of the pages and the total number of results reported by the
        search page (the number of scraped titles when the page reports no total)
    :rtype: list, int

    :raises requests.HTTPError: if the search page answers with an HTTP error status
    '''
    # Let requests build the query string: spaces become '+', and characters such
    # as '&' or '#' inside the keywords are percent-encoded instead of corrupting the URL.
    query = {
        'title': 'Special:Search',
        'limit': str(number_of_results),
        'offset': '0',
        'profile': 'default',
        'search': key_words.lower(),
        'ns0': '1',
    }
    response = requests.get('https://en.wikipedia.org/w/index.php', params=query, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    titles = []
    for heading in soup.find_all('div', {'class' : 'mw-search-result-heading'}):
        link = heading.find('a')
        # Skip a heading without a titled link rather than crashing on it.
        if link is not None and link.get('title'):
            titles.append(link.get('title'))

    # The block carrying the total may be missing from the page; in that case
    # fall back to the number of titles actually scraped.
    info = soup.find('div', {'class' : 'results-info'})
    total = info.get('data-mw-num-results-total') if info is not None else None
    try:
        nbr_result = int(total)
    except (TypeError, ValueError):
        nbr_result = len(titles)
    return titles, nbr_result

In [5]:
def get_section_titles(page : str):
    '''
    Return the section titles of a Wikipedia page.

    The rendered HTML of the page is scraped for ``h2``/``h3``/``h4`` headings and
    the anchor id of each heading is turned back into a readable title
    (underscores replaced by spaces).

    :param page: Title of the page
    :type page: str

    :returns: The titles of the sections, without duplicates, in order of appearance
    :rtype: list
    '''
    scraped_url = wikipedia.page(page, auto_suggest = False).url
    html_text = requests.get(scraped_url, timeout=30).text
    soup = BeautifulSoup(html_text, 'lxml')

    # A dict is used as an ordered set: duplicates are dropped, document order is kept.
    anchors = {}
    for heading in soup.find_all(['h2', 'h3', 'h4']):
        # Legacy markup: <h2><span class="mw-headline" id="..."></span></h2>
        headline = heading.find('span', {'class' : 'mw-headline'})
        anchor = headline.get('id') if headline is not None else None
        if anchor is None:
            # Newer markup: <div class="mw-heading"><h2 id="...">...</h2></div>.
            # Requiring the wrapper keeps interface headings out of the result.
            wrapper = heading.parent
            if wrapper is not None and 'mw-heading' in (wrapper.get('class') or []):
                anchor = heading.get('id')
        if anchor:
            anchors[anchor] = None
    return [anchor.replace('_', ' ') for anchor in anchors]

In [6]:
#We define a function to automatically set up a corpus of related articles
def _call_retrying_page_error(func, *args, **kwargs):
    '''Call ``func`` and retry it once if the wikipedia package raises ``PageError``.'''
    try:
        return func(*args, **kwargs)
    except wikipedia.exceptions.PageError:
        return func(*args, **kwargs)


def corpus_selection(word_keys : str , number_of_results : int, key_for_section : str):
    '''
    Select the corpus: keep a page if the keywords are in its title or in one of its section titles.

    The comparison is case-insensitive for both the page titles and the section titles.

    :param word_keys: words to launch the wikipedia search with
    :type word_keys: str

    :param number_of_results: number of results the user wants for their corpus, maximum 500
    :type number_of_results: int

    :param key_for_section: key words to be found in the titles or in the sections
    :type key_for_section: str

    :returns: a list containing the title of all the pages of the corpus
    :rtype: list
    '''
    # Lower-case the key once so the title test and the section test agree.
    needle = key_for_section.lower()
    skippable = (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError)

    # Titles of up to number_of_results articles related to word_keys.
    search_list = get_all_the_titles(word_keys, number_of_results)[0]

    # Keep only the titles that resolve to an article: disambiguation pages are
    # dropped, and a PageError is retried once before the title is dropped.
    proper_list = []
    for article in tqdm(search_list):
        try:
            _call_retrying_page_error(wikipedia.summary, article, auto_suggest = False)
        except skippable:
            continue
        proper_list.append(article)

    # Articles whose title contains the key go straight into the corpus.
    corpus = []
    left_list = []
    print("finding pages with key words in the title")
    for proper_article in tqdm(proper_list):
        if needle in proper_article.lower():
            corpus.append(proper_article)
        else:
            left_list.append(proper_article)

    # The remaining articles are kept when one of their section titles contains the key.
    print("finding pages with key words in the sections")
    for left_article in tqdm(left_list):
        try:
            sections = _call_retrying_page_error(get_section_titles, left_article)
        except skippable:
            continue
        if left_article not in corpus and any(needle in section.lower() for section in sections):
            corpus.append(left_article)

    return corpus

## Part 1. Corpus builder: run

In [9]:
%%time
# Run the corpus creation:
# put the Term Of Interest in TOI; use "key_for_section" for a sub-specification (or the same value as TOI)
# the number of results was reduced to 500 (instead of 5000) for faster checking
# NOTE: despite its name, search_list ends up holding the selected corpus (the list returned by corpus_selection)
TOI = "urban heat island" 
search_list = corpus_selection(word_keys = TOI, number_of_results = 500, key_for_section = "heat island")
search_list

100%|█████████████████████████████████████████| 500/500 [00:11<00:00, 42.94it/s]


finding pages with key words in the title


100%|██████████████████████████████████████| 498/498 [00:00<00:00, 84336.55it/s]


finding pages with key words in the sections


100%|█████████████████████████████████████████| 497/497 [17:09<00:00,  2.07s/it]

CPU times: user 1min 40s, sys: 3.48 s, total: 1min 43s
Wall time: 17min 29s





['Urban heat island']

#### Creation of a DataFrame with information on the pages

In [102]:
%%time
def _section_regex(keyword, level):
    '''Build the pattern capturing the body of a level-``level`` section whose heading contains ``keyword``.'''
    # Headings are only allowed to contain word characters and spaces.
    heading_char = r'(?:\w|\ )'
    opener = '=' * level
    # A captured body stops at the next heading of one of these levels.
    closer = {2: '==', 3: '(?:===|==)', 4: '(?:===|==|====)'}[level]
    # The first letter matches in either case; the keyword is escaped so that
    # characters such as '+' or '(' are matched literally.
    first_letter = '(?:' + re.escape(keyword[0].lower()) + '|' + re.escape(keyword[0].upper()) + ')'
    return (
        '(?:' + opener + heading_char + '*?' + first_letter + re.escape(keyword[1:])
        + heading_char + '*?' + opener + r'\n)'
        + r'((?:.|\n)*?)'
        + '(?:' + closer + heading_char + '*' + closer + r'\n)'
    )


def crea_dataframe(search_list : list,keyword : str, section_search = False)-> pd.DataFrame:
    '''
    Build a table with the name, the url and the wikicode of each page of the corpus.

    :param search_list: list containing the name of all the articles
    :param keyword: string containing the keyword that we want to scrape

    :param section_search: if True, extract only the wikicode of the section containing the keyword in its name
    :type section_search: bool

    :returns: data frame with the columns 'Name only', 'page url' and 'text'.
    With section_search=False, 'text' is the wikicode of the entire page.
    With section_search=True, 'text' is the wikicode of the entire page when the keyword is in the page title;
    otherwise it is the string form of the list of matching section bodies
    (sections are tried first, then sub-sections, then sub-sub-sections; "[]" when nothing matches).
    :rtype: DataFrame
    '''
    columns = ['Name only', 'page url', 'text']

    # Compile the section, sub-section and sub-sub-section patterns once, outside the loop.
    section_patterns = []
    if section_search:
        section_patterns = [re.compile(_section_regex(keyword, level)) for level in (2, 3, 4)]

    site = None  # created on first use, so an empty corpus does not touch pywikibot
    records = []  # one dictionary of information per page
    for title in tqdm(search_list):
        # The wikipedia package gives the clean title and the url of the page...
        wiki_page = wikipedia.page(title, auto_suggest = False)
        page_title = wiki_page.title
        page_url = wiki_page.url
        # ...and pywikibot gives its wikicode.
        if site is None:
            site = pywikibot.Site("en", "wikipedia")
        wikicode = pywikibot.Page(site, page_title).text

        if section_search and keyword.lower() not in page_title.lower():
            page_text = "[]"
            for pattern in section_patterns:
                page_text = str(pattern.findall(wikicode))
                if page_text != "[]":
                    break
        else:
            page_text = wikicode

        records.append({'Name only' : page_title, 'page url' : page_url, 'text' : page_text})

    # Passing the columns keeps them present even when the corpus is empty.
    return pd.DataFrame(records, columns=columns)


# Build the page table for the corpus; section_search keeps its default (False), so 'text' holds the wikicode of the entire page
df=crea_dataframe(search_list,TOI)
df




100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.46s/it]


CPU times: user 41.7 ms, sys: 108 ms, total: 150 ms
Wall time: 1.65 s


Unnamed: 0,Name only,page url,text
0,Urban heat island,https://en.wikipedia.org/wiki/Urban_heat_island,{{Short description|Situation where cities are...


## Adding WikiData page links

In [106]:
def get_wikidata_url(wikipedia_url):
    '''
    Return the URL of the Wikidata item linked to an English Wikipedia article.

    :param wikipedia_url: URL of the article (``https://en.wikipedia.org/wiki/<title>``)

    :returns: ``https://www.wikidata.org/wiki/<entity id>``, or '' when the URL is
        missing/empty or no item is found for the title
    :rtype: str
    '''
    if pd.isna(wikipedia_url) or not wikipedia_url:
        return ''
    # Extract the article title from the URL. It is percent-decoded here because
    # requests encodes the query parameters itself (avoids double encoding).
    title = unquote(wikipedia_url.split('/wiki/')[-1])
    # Query the Wikidata API
    response = requests.get(
        'https://www.wikidata.org/w/api.php',
        params={'action': 'wbgetentities', 'sites': 'enwiki', 'titles': title, 'format': 'json'},
        timeout=30,
    )
    entities = response.json().get('entities')
    if entities:
        entity_id = next(iter(entities))
        # The API answers with the key '-1' when no item matches the title.
        if entity_id != '-1':
            return f'https://www.wikidata.org/wiki/{entity_id}'
    return ''

# New frame: the page table plus one column with the Wikidata item URL of each article
df_WD = df.assign(wikidata_url=df['page url'].apply(get_wikidata_url))

df_WD

Unnamed: 0,Name only,page url,text,wikidata_url
0,Urban heat island,https://en.wikipedia.org/wiki/Urban_heat_island,{{Short description|Situation where cities are...,https://www.wikidata.org/wiki/Q215712


## Adding the Creation Dates of Wikipedia and Wikidata pages

In [109]:
# Shared helper: timestamp of the oldest revision of a page on a MediaWiki site
def _first_revision_timestamp(api_url, title):
    '''Return the timestamp of the oldest revision of ``title`` as a datetime, or None if the page has no revisions.'''
    response = requests.get(
        api_url,
        params={
            'action': 'query',
            'prop': 'revisions',
            'rvlimit': 1,
            'rvdir': 'newer',  # oldest revision first
            'titles': title,
            'format': 'json',
        },
        timeout=30,
    )
    pages = response.json().get('query', {}).get('pages', {})
    # A single title was requested, so only the first page entry is inspected.
    page = next(iter(pages.values()), {})
    revisions = page.get('revisions')
    if not revisions:
        return None
    return datetime.strptime(revisions[0]['timestamp'], '%Y-%m-%dT%H:%M:%SZ')


# Function to get Wikipedia article creation date
def get_wikipedia_creation_date(page_url):
    '''
    Return the date of the first revision of an English Wikipedia article.

    :param page_url: URL of the article

    :returns: creation date, or None when the URL is missing/empty or the page has no revisions
    :rtype: datetime or None
    '''
    if pd.isna(page_url) or not page_url:
        return None
    # Everything after '/wiki/' is the title, so titles containing '/' stay whole.
    if '/wiki/' in page_url:
        page_title = page_url.split('/wiki/', 1)[-1]
    else:
        page_title = page_url.split('/')[-1]
    # Percent-decode: requests encodes the query parameters itself.
    return _first_revision_timestamp('https://en.wikipedia.org/w/api.php', unquote(page_title))


# Function to get Wikidata item creation date
def get_wikidata_creation_date(wikidata_url):
    '''
    Return the date of the first revision of a Wikidata item.

    :param wikidata_url: URL of the item (its last path segment is used as the entity id)

    :returns: creation date, or None when the URL is missing/empty or the item has no revisions
    :rtype: datetime or None
    '''
    if pd.isna(wikidata_url) or not wikidata_url:
        return None
    entity_id = wikidata_url.split('/')[-1]
    if not entity_id:
        return None
    return _first_revision_timestamp('https://www.wikidata.org/w/api.php', f'Item:{entity_id}')

# New frame: df_WD plus the creation date of each Wikipedia page and of its Wikidata item
df_DOB = df_WD.assign(**{
    'Wikipedia Creation Date': df_WD['page url'].apply(get_wikipedia_creation_date),
    'Wikidata Creation Date': df_WD['wikidata_url'].apply(get_wikidata_creation_date),
})

df_DOB

Unnamed: 0,Name only,page url,text,wikidata_url,Wikipedia Creation Date,Wikidata Creation Date
0,Urban heat island,https://en.wikipedia.org/wiki/Urban_heat_island,{{Short description|Situation where cities are...,https://www.wikidata.org/wiki/Q215712,2001-12-06 02:06:31,2012-11-30 10:10:43


## Adding the first level of WD properties 

In [115]:
# Use the correct column name for Wikidata URLs
wikidata_url_column = 'wikidata_url'  # Update this if the column name is different


def _first_claim_target(claims, property_id):
    '''Return the item id targeted by the first statement of ``property_id``, or '' if there is none.'''
    statements = claims.get(property_id) or [{}]
    value = statements[0].get('mainsnak', {}).get('datavalue', {}).get('value', {})
    return value.get('id', '') if isinstance(value, dict) else ''


# Function to fetch Wikidata properties
def fetch_wikidata_properties(wikidata_url):
    '''
    Fetch the first "instance of" (P31), "part of" (P361) and "subclass of" (P279) value of a Wikidata item.

    :param wikidata_url: URL of the item (``https://www.wikidata.org/wiki/<entity id>``)

    :returns: (instance_of, part_of, subclass_of) as item ids; each is '' when the
        statement is absent, and all three are '' when the URL is missing/empty or
        the item cannot be fetched
    :rtype: tuple of str
    '''
    # Always hand back strings: the values are later used as dictionary keys for
    # the label lookup, which a dict placeholder would break.
    nothing = ('', '', '')
    if pd.isna(wikidata_url) or not wikidata_url:
        return nothing

    entity_id = wikidata_url.split('/wiki/')[-1]
    url = f'https://www.wikidata.org/wiki/Special:EntityData/{entity_id}.json'
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return nothing

    entities = response.json().get('entities', {})
    entity = entities.get(entity_id)
    if entity is None and len(entities) == 1:
        # The payload is keyed by another id than the one requested
        # (presumably a merged/redirected item — TODO confirm); use its single entity.
        entity = next(iter(entities.values()))
    claims = (entity or {}).get('claims', {})

    return tuple(_first_claim_target(claims, pid) for pid in ('P31', 'P361', 'P279'))

# NOTE(review): the saved output of this cell shows a 'wikidata_id' column, which df_WD only
# gets in a later cell — presumably the cells were run out of order; confirm with Restart & Run All.
df_first_level = df_WD.copy()

# Fetch properties for each Wikidata item and add them to the DataFrame:
# each returned 3-tuple is expanded into the three new columns, in this order
df_first_level[['instance_of', 'part_of', 'subclass_of']] = df_first_level[wikidata_url_column].apply(
    lambda url: pd.Series(fetch_wikidata_properties(url)))

df_first_level

Unnamed: 0,Name only,page url,text,wikidata_url,wikidata_id,instance_of,part_of,subclass_of
0,Urban heat island,https://en.wikipedia.org/wiki/Urban_heat_island,{{Short description|Situation where cities are...,https://www.wikidata.org/wiki/Q215712,Q215712,Q483247,,Q702492


In [None]:
## Retrieving the whole hierarchy of "subclass of"

In [113]:
# Pull the item ID (first "Q<digits>" match) out of each Wikidata URL; None when the URL holds no ID
df_WD['wikidata_url'] = df_WD['wikidata_url'].astype(str)
df_WD['wikidata_id'] = df_WD['wikidata_url'].apply(lambda x: (re.findall(r'Q\d+', x) or [None])[0])


# Function to get the "subclass of" hierarchy for a given Wikidata item ID (iterative approach)
def get_subclass_of_hierarchy(item_id, property_id='P279'):
    '''
    Collect every item reachable from ``item_id`` by following a Wikidata property upwards.

    The graph is walked iteratively, depth first, with one ``wbgetentities`` request per visited item.

    :param item_id: Wikidata id of the starting item (e.g. "Q215712")
    :param property_id: property to follow; defaults to P279 ("subclass of")

    :returns: ids of the ancestors in order of discovery, without duplicates;
        the starting item itself is never listed, even if the graph loops back to it
    :rtype: list
    '''
    hierarchy = []
    visited = {item_id}  # set membership is O(1); seeding it with the start item guards against cycles
    stack = [item_id]  # Using a stack for iterative depth-first search
    while stack:
        current_id = stack.pop()
        response = requests.get(
            'https://www.wikidata.org/w/api.php',
            params={'action': 'wbgetentities', 'ids': current_id, 'format': 'json', 'props': 'claims'},
            timeout=30,
        ).json()
        claims = response.get('entities', {}).get(current_id, {}).get('claims', {})
        for claim in claims.get(property_id, []):
            # Statements without a datavalue carry no target item and are skipped.
            value = claim.get('mainsnak', {}).get('datavalue', {}).get('value', {})
            parent_id = value.get('id') if isinstance(value, dict) else None
            if parent_id and parent_id not in visited:
                visited.add(parent_id)
                hierarchy.append(parent_id)
                stack.append(parent_id)
    return hierarchy

# One "subclass of" hierarchy per row; rows without a Wikidata ID get an empty list
results = []
for wikidata_id in tqdm(df_WD['wikidata_id'], desc='Processing', unit='item'):
    hierarchy = get_subclass_of_hierarchy(wikidata_id) if wikidata_id else []
    results.append(hierarchy)

# New frame: df_DOB plus the hierarchy column
df_subclass_of = df_DOB.assign(subclass_of_hierarchy=results)

df_subclass_of

Processing: 100%|███████████████████████████████| 1/1 [00:23<00:00, 23.79s/item]


Unnamed: 0,Name only,page url,text,wikidata_url,Wikipedia Creation Date,Wikidata Creation Date,subclass_of_hierarchy
0,Urban heat island,https://en.wikipedia.org/wiki/Urban_heat_island,{{Short description|Situation where cities are...,https://www.wikidata.org/wiki/Q215712,2001-12-06 02:06:31,2012-11-30 10:10:43,"[Q702492, Q82794, Q486972, Q618123, Q123964505..."


## Retrieving the whole hierarchy of "part of"

In [116]:
# Function to get the "part of" hierarchy for a given Wikidata item ID (iterative approach)
def get_part_of_hierarchy(item_id, property_id='P361'):
    '''
    Collect every item reachable from ``item_id`` by following a Wikidata property upwards.

    The graph is walked iteratively, depth first, with one ``wbgetentities`` request per visited item.

    :param item_id: Wikidata id of the starting item (e.g. "Q215712")
    :param property_id: property to follow; defaults to P361 ("part of")

    :returns: ids of the items found in order of discovery, without duplicates;
        the starting item itself is never listed, even if the graph loops back to it
    :rtype: list
    '''
    hierarchy = []
    visited = {item_id}  # set membership is O(1); seeding it with the start item guards against cycles
    stack = [item_id]  # Using a stack for iterative depth-first search
    while stack:
        current_id = stack.pop()
        response = requests.get(
            'https://www.wikidata.org/w/api.php',
            params={'action': 'wbgetentities', 'ids': current_id, 'format': 'json', 'props': 'claims'},
            timeout=30,
        ).json()
        claims = response.get('entities', {}).get(current_id, {}).get('claims', {})
        for claim in claims.get(property_id, []):
            # Statements without a datavalue carry no target item and are skipped.
            value = claim.get('mainsnak', {}).get('datavalue', {}).get('value', {})
            whole_id = value.get('id') if isinstance(value, dict) else None
            if whole_id and whole_id not in visited:
                visited.add(whole_id)
                hierarchy.append(whole_id)
                stack.append(whole_id)
    return hierarchy

# One "part of" hierarchy per row; rows without a Wikidata ID get an empty list
results_part_of = []
for wikidata_id in tqdm(df_WD['wikidata_id'], desc='Processing', unit='item'):
    hierarchy = get_part_of_hierarchy(wikidata_id) if wikidata_id else []
    results_part_of.append(hierarchy)

# New frame: df_DOB plus the hierarchy column
df_part_of = df_DOB.assign(part_of_hierarchy=results_part_of)

df_part_of

Processing: 100%|███████████████████████████████| 1/1 [00:00<00:00,  1.97item/s]


Unnamed: 0,Name only,page url,text,wikidata_url,Wikipedia Creation Date,Wikidata Creation Date,part_of_hierarchy
0,Urban heat island,https://en.wikipedia.org/wiki/Urban_heat_island,{{Short description|Situation where cities are...,https://www.wikidata.org/wiki/Q215712,2001-12-06 02:06:31,2012-11-30 10:10:43,[]


## Retrieving the whole hierarchy of "instance of"

In [117]:
# Function to get the "instance of" hierarchy for a given Wikidata item ID (iterative approach)
def get_instance_of_hierarchy(item_id, property_id='P31'):
    '''
    Collect every item reachable from ``item_id`` by following a Wikidata property upwards.

    The graph is walked iteratively, depth first, with one ``wbgetentities`` request per visited item.

    :param item_id: Wikidata id of the starting item (e.g. "Q215712")
    :param property_id: property to follow; defaults to P31 ("instance of")

    :returns: ids of the items found in order of discovery, without duplicates;
        the starting item itself is never listed, even if the graph loops back to it
    :rtype: list
    '''
    hierarchy = []
    visited = {item_id}  # set membership is O(1); seeding it with the start item guards against cycles
    stack = [item_id]  # Using a stack for iterative depth-first search
    while stack:
        current_id = stack.pop()
        response = requests.get(
            'https://www.wikidata.org/w/api.php',
            params={'action': 'wbgetentities', 'ids': current_id, 'format': 'json', 'props': 'claims'},
            timeout=30,
        ).json()
        claims = response.get('entities', {}).get(current_id, {}).get('claims', {})
        for claim in claims.get(property_id, []):
            # Statements without a datavalue carry no target item and are skipped.
            value = claim.get('mainsnak', {}).get('datavalue', {}).get('value', {})
            class_id = value.get('id') if isinstance(value, dict) else None
            if class_id and class_id not in visited:
                visited.add(class_id)
                hierarchy.append(class_id)
                stack.append(class_id)
    return hierarchy

# One "instance of" hierarchy per row; rows without a Wikidata ID get an empty list
results_instance_of = []
for wikidata_id in tqdm(df_WD['wikidata_id'], desc='Processing', unit='item'):
    hierarchy = get_instance_of_hierarchy(wikidata_id) if wikidata_id else []
    results_instance_of.append(hierarchy)

# New frame: df_WD plus the hierarchy column
df_instance_of = df_WD.assign(instance_of_hierarchy=results_instance_of)
df_instance_of

Processing: 100%|███████████████████████████████| 1/1 [00:00<00:00,  1.05item/s]


Unnamed: 0,Name only,page url,text,wikidata_url,wikidata_id,instance_of_hierarchy
0,Urban heat island,https://en.wikipedia.org/wiki/Urban_heat_island,{{Short description|Situation where cities are...,https://www.wikidata.org/wiki/Q215712,Q215712,[Q483247]


## Retrieving labels of the Wikidata articles

In [134]:
# Function to get a label for a Wikidata ID
def get_wikidata_label(wikidata_id):
    '''
    Return the English label of a Wikidata item.

    :param wikidata_id: id of the item (e.g. "Q215712")

    :returns: the English label, or None when the id is empty or not a string,
        the request does not succeed, or the item has no English label
    :rtype: str or None
    '''
    # Empty placeholders (the '' used for a missing property) are not worth a request.
    if not isinstance(wikidata_id, str) or not wikidata_id:
        return None
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return None
    entities = response.json().get('entities', {})
    entity = entities.get(wikidata_id)
    if entity is None and len(entities) == 1:
        # The payload is keyed by another id than the one requested
        # (presumably a merged/redirected item — TODO confirm); use its single entity.
        entity = next(iter(entities.values()))
    try:
        return entity['labels']['en']['value']
    except (KeyError, TypeError):
        return None

# Columns holding the Wikidata IDs to translate (change to the necessary columns)
columns_to_process = df_first_level[['instance_of', 'part_of', 'subclass_of']]

# Extract the unique Wikidata IDs from the selected columns, in order of appearance.
# Only non-empty strings are kept: missing values and the '' placeholder have no label to look up.
unique_wikidata_ids = list(dict.fromkeys(
    value for value in columns_to_process.values.ravel('K')
    if isinstance(value, str) and value
))

# Get labels for all unique Wikidata IDs
wikidata_labels = {wid: get_wikidata_label(wid) for wid in unique_wikidata_ids}

# Replace Wikidata IDs with their labels in the selected columns
df_labeled = df_first_level.copy()

for column in columns_to_process:
    # Keep the original value when no label was found (a failed lookup is stored as None),
    # so the ID is not lost.
    df_labeled[column] = df_labeled[column].apply(
        lambda wid: (wikidata_labels.get(wid) or wid) if isinstance(wid, str) else wid
    )

df_labeled

Unnamed: 0,Name only,page url,text,wikidata_url,wikidata_id,instance_of,part_of,subclass_of
0,Urban heat island,https://en.wikipedia.org/wiki/Urban_heat_island,{{Short description|Situation where cities are...,https://www.wikidata.org/wiki/Q215712,Q215712,phenomenon,,urban area
