### Loading libs

In [1]:
import pandas as pd
import wikipediaapi as wk

In [2]:
wiki = wk.Wikipedia('en', extract_format=wk.ExtractFormat.WIKI)

In [3]:
# List of decades, centuries, millennia
query_string = "List of decades, centuries, and millennia"
lofdcm = wiki.page(query_string)

In [4]:
lofdcm.sections

[Section: Notes (1):
 
 Subsections (0):,
 Section: See also (1):
 List of years
 Timelines of world history
 List of timelines
 Chronology
 See calendar and list of calendars for other groupings of years.
 See history, history by period, and periodization for different organizations of historical events.
 For earlier time periods, see Timeline of the Big Bang, Geologic time scale, Timeline of evolution, and Logarithmic timeline.
 Subsections (0):]

### Exploring Wikipedia Obj

In [5]:
print(lofdcm.text)

This is a list of decades, centuries, and millennia from 10,000 BC to 10,000 AD, including links to corresponding articles with more information about them.

Notes
See also
List of years
Timelines of world history
List of timelines
Chronology
See calendar and list of calendars for other groupings of years.
See history, history by period, and periodization for different organizations of historical events.
For earlier time periods, see Timeline of the Big Bang, Geologic time scale, Timeline of evolution, and Logarithmic timeline.


In [6]:
# help(lofdcm)
lofdcm.backlinks

{'List of centuries and millennia': List of centuries and millennia (id: ??, ns: 0),
 'Century': Century (id: ??, ns: 0),
 'List of days of the year': List of days of the year (id: ??, ns: 0),
 'Library of Congress Classification': Library of Congress Classification (id: ??, ns: 0),
 'Wikipedia:Contents/Outlines': Wikipedia:Contents/Outlines (id: ??, ns: 4),
 '1960s': 1960s (id: ??, ns: 0),
 '2000': 2000 (id: ??, ns: 0),
 '1977': 1977 (id: ??, ns: 0),
 '1964': 1964 (id: ??, ns: 0),
 '2001': 2001 (id: ??, ns: 0),
 '1999': 1999 (id: ??, ns: 0),
 '1970s': 1970s (id: ??, ns: 0),
 '1990s': 1990s (id: ??, ns: 0),
 '1980s': 1980s (id: ??, ns: 0),
 '1040': 1040 (id: ??, ns: 0),
 '1950s': 1950s (id: ??, ns: 0),
 '1870s': 1870s (id: ??, ns: 0),
 '1952': 1952 (id: ??, ns: 0),
 '1984': 1984 (id: ??, ns: 0),
 '2000s': 2000s (id: ??, ns: 0),
 'AD 26': AD 26 (id: ??, ns: 0),
 '1940s': 1940s (id: ??, ns: 0),
 '1947': 1947 (id: ??, ns: 0),
 '1624': 1624 (id: ??, ns: 0),
 '1626': 1626 (id: ??, ns: 0),
 

In [7]:
import re
patt = r"(19\d0s|20\d0s.*)"

In [8]:
backlinks = {k:v for k ,v in lofdcm.backlinks.items() if re.search(patt, k)}

In [9]:
links= {k:v for k ,v in lofdcm.links.items() if re.search(patt, k)}

In [10]:
links.keys()

dict_keys(['1900s (decade)', '1910s', '1920s', '1930s', '1940s', '1950s', '1960s', '1970s', '1980s', '1990s', '2000s (decade)', '2010s', '2020s'])

In [11]:
print(type(links["1910s"]), end="\n\n")

<class 'wikipediaapi.WikipediaPage'>



In [12]:
# dir(links["1910s"])
subsection = links["1910s"].section_by_title("Boxing")
# sect_map = links["1910s"]._section_mapping
# sect_map.keys()
subsection

Section: Boxing (3):
Jack Dempsey
Jess Willard
Subsections (0):

## Core sections extraction

Function below extractions the key sections of interest with their subsections.

The plan is to:

>    1. `Call the function on each decade`
>    2. `Subset the main ._section_mapping dict to extract key texts OR`
>   3. `Use the .section_by_title() method to extract text using returned dict from core_sections`

In [21]:
def load_sections(decade)-> tuple:
    """
    Loads the sections of wiki page
    """
    all_section_dict = links[decade]._section_mapping
    main_sections = links[decade].sections
    
    return all_section_dict, main_sections

In [20]:
def core_sections_extractor(decade:str) -> dict:
    
    """
    Params: decade
    Returns dict of sections_title of key interest that will later be used to extract a sections content
        main_section_title : list of subsections
    """
    all_section_dict, main_sections = load_sections(decade)
    
    # Sections to ignore. See also is not included because it will act as a boundary
    notinclude = {'Pronunciation varieties','Further reading','References','External links'}
    
    # All sections. Main, Subsections and Sections to ignore
    all_section_titles = list(all_section_dict.keys())
    
    # Only the core sections
    core_section_titles = [s.title for s in main_sections if s.title not in notinclude]
    
    # Storing the core section indices according to their position in the all_section_title list
    core_indices = {k: all_section_titles.index(k) for k in  core_section_titles}
    
    # Convinience variable 
    indices_lst = list(core_indices.keys())
    
    
    # Store the core title with a list of its subsections
    #  core_indices = {"People": 14, "See Also": 16} - # People is on index 14 on all_section_index with possible 2 subsections
    # index_lst = ["People", "See Also"] - People is at index 0 of core_indices.keys()
    # main_with_sub = {"People":["World Leaders", "Business Leaders"]}
    core_sections_with_subs = {indices_lst[i]:all_section_titles[core_indices[indices_lst[i]]+1: core_indices[indices_lst[i+1]]] for i in range(len(indices_lst)-1)}
    
    return core_sections_with_subs

In [18]:
tst_decade = "1910s"

In [91]:
core_dict = core_sections_extractor(tst_decade)

In [92]:
core_dict

{'Politics and wars': ['Wars',
  'Internal conflicts',
  'Major political changes',
  'Decolonization and independence',
  'Prominent political events'],
 'Assassinations and attempts': [],
 'Disasters': [],
 'Other significant international events': [],
 'Science and technology': ['Technology', 'Science'],
 'Economics': [],
 'Popular culture': ['Sports',
  'Literature and arts',
  'Visual Arts',
  'Art movements',
  'Cubism and related movements',
  'Expressionism and related movements',
  'Geometric abstraction and related movements',
  'Other movements and techniques',
  'Influential artists'],
 'People': ['World leaders',
  'Politics',
  'Business',
  'Inventors',
  'Authors',
  'Entertainers',
  'Sports figures',
  'Baseball',
  'Olympics',
  'Boxing']}

### Core Text extraction

In [24]:
tens19 = links[tst_decade]

In [125]:
# dir(tens19.section_by_title("Wars"))
print(tens19.section_by_title("Wars").full_text(), end="\n\n")
print(tens19.section_by_title("Wars").text, end="\n\n")

Wars
World War I (1914–1918)
Assassination of Archduke Franz Ferdinand of Austria-Hungary in Sarajevo leads to the outbreak of the First World War
Germany signs the Treaty of Versailles after losing the first world war.
Armenian Genocide during and just after World War I. It was characterised by the use of massacres and deportations involving forced marches under conditions designed to lead to the death of the deportees, with the total number of Armenian deaths generally held to have been between one and one-and-a-half million.
Wadai War (1909–1911)
Italo-Turkish War (1911–1912)
First Balkan Wars (1912–1913) – two wars that took place in South-eastern Europe in 1912 and 1913.
Saudi-Ottoman War (1913)
Latvian War of Independence (1918-1920) – a military conflict in Latvia between the Republic of Latvia and the Russian SFSR.



World War I (1914–1918)
Assassination of Archduke Franz Ferdinand of Austria-Hungary in Sarajevo leads to the outbreak of the First World War
Germany signs the Tr

In [37]:
tens19.summary

'The 1910s (pronounced "nineteen-teens", commonly abbreviated as the "Teens") was a decade of the Gregorian calendar that began on January 1, 1910, and ended on December 31, 1919. It was the second decade of the 20th century\nThe 1910s represented the culmination of European militarism which had its beginnings during the second half of the 19th century. The conservative lifestyles during the first half of the decade, as well as the legacy of military alliances, was forever changed by the assassination, on June 28, 1914, of Archduke Franz Ferdinand, the heir presumptive to the Austro-Hungarian throne. The murder triggered a chain of events in which, within 33 days, World War I broke out in Europe on August 1, 1914. The conflict dragged on until a truce was declared on November 11, 1918, leading to the controversial, one-sided Treaty of Versailles, which was signed on June 28, 1919.\nThe war\'s end triggered the abdication of various monarchies and the collapse of five of the last modern

In [49]:
tens19_dict = tens19._section_mapping
all_section_df = pd.DataFrame.from_dict(tens19_dict, orient="index", columns=["text"]).reset_index().rename(columns = {"index": "sections"})

In [93]:
all_section_df["decade"] = "1990s"
all_section_df

Unnamed: 0,sections,text,decade
0,Politics and wars,Section: Politics and wars (1):\n\nSubsections...,1990s
1,Wars,Section: Wars (2):\nWorld War I (1914–1918)\nA...,1990s
2,Internal conflicts,Section: Internal conflicts (2):\nOctober Revo...,1990s
3,Major political changes,Section: Major political changes (2):\nPortuga...,1990s
4,Decolonization and independence,Section: Decolonization and independence (2):\...,1990s
5,Prominent political events,Section: Prominent political events (2):\n\nSu...,1990s
6,Assassinations and attempts,Section: Assassinations and attempts (1):\nPro...,1990s
7,Disasters,"Section: Disasters (1):\nThe RMS Titanic, a Br...",1990s
8,Other significant international events,Section: Other significant international event...,1990s
9,Science and technology,Section: Science and technology (1):\n\nSubsec...,1990s


In [126]:
core_titles = core_dict.keys()
core_tens19 = dict(zip(core_titles, map(tens19_dict.get, core_titles)))
core_sections_df = pd.DataFrame.from_dict(core_tens19, orient="index", columns=["text"]).reset_index().rename(columns = {"index": "sections"})
core_sections_df

Unnamed: 0,sections,text
0,Politics and wars,Section: Politics and wars (1):\n\nSubsections...
1,Assassinations and attempts,Section: Assassinations and attempts (1):\nPro...
2,Disasters,"Section: Disasters (1):\nThe RMS Titanic, a Br..."
3,Other significant international events,Section: Other significant international event...
4,Science and technology,Section: Science and technology (1):\n\nSubsec...
5,Economics,Section: Economics (1):\nIn the years 1910 and...
6,Popular culture,Section: Popular culture (1):\nFlying Squadron...
7,People,Section: People (1):\n\nSubsections (7):\nSect...


In [127]:
def get_substitle(val):
    res = val
    if core_dict[val]:
        res = core_dict[val]
    return res

def get_subtext(val):
    return tens19.section_by_title(val).full_text()

In [128]:
core_sections_df["sub_section"] =  core_sections_df["sections"].apply(get_subs)

In [136]:
df_1910s = core_sections_df.explode("sub_section", ignore_index=True)

In [137]:
df_1910s["text"] = core_df["sub_section"].apply(get_subtext)

In [138]:
df_1910s

Unnamed: 0,sections,text,sub_section
0,Politics and wars,Wars\nWorld War I (1914–1918)\nAssassination o...,Wars
1,Politics and wars,Internal conflicts\nOctober Revolution in Russ...,Internal conflicts
2,Politics and wars,Major political changes\nPortugal becomes the ...,Major political changes
3,Politics and wars,Decolonization and independence\nEaster Rising...,Decolonization and independence
4,Politics and wars,Prominent political events\n,Prominent political events
5,Assassinations and attempts,Assassinations and attempts\nProminent assassi...,Assassinations and attempts
6,Disasters,"Disasters\nThe RMS Titanic, a British ocean li...",Disasters
7,Other significant international events,Other significant international events\nThe Pa...,Other significant international events
8,Science and technology,Technology\nGideon Sundback patented the first...,Technology
9,Science and technology,"Science\nIn 1916, Albert Einstein's theory of ...",Science


The above format will allow us to drop rows where the text does not contain a year

## Sample csv

In [139]:
df_1990s.to_csv("1910s.csv", index=False)