# Welsh Summarizer
### Install Required Packages

In [181]:
# !pip install wikipedia-api
# !pip install wikipedia
# !pip install ftfy==3.0
# !pip install --default-timeout=1000 future

### Import modules

In [182]:
import os
import wikipedia
import wikipediaapi
import ftfy
import shutil

### Define functions

In [183]:
# Get a wikipedia pages given a set of keywords
def get_wiki_pages(keywords, results=10):
    wikipedia.set_lang("cy")
    page_names = []
    for word in keywords: 
        page_names.extend(wikipedia.search(word, results=results))
    return page_names

# Check that page exists
def wiki_page_exist(page_title, lang="cy"):
    wiki_lang = wikipediaapi.Wikipedia(lang)
    title = ftfy.fix_text(page_title)
    return wiki_lang.page(page_title).exists()

# return lines of text in a file
def readfile(filename):
    with open(filename, "r", encoding="utf8") as fileObj:
        fileLines = [keyword.strip() for keyword in fileObj.readlines()
    return fileLines

# check if min_token_count is met
def count_tokens(text, min_token_count = 500):
    return len(text.split())>=min_token_count

### Read and combine file contents

In [212]:
# read the wiki pages file
wiki_pages = readfile("welsh_wiki_pages.txt")

# read most edited wiki pages
most_edited_list = [line.split(',')[0] for line in readfile("20210623 Most edited.csv")]

# get 100 wiki pages from available keywords
keyword_pages = get_wiki_pages(readfile("welsh_keywords.txt")[1:], 100)

# combine all page without repeat
all_wiki_pages = set(keyword_pages + wiki_pages + most_edited_list)

### Check existing and missing pages

In [213]:
# get existing pages on Wikipedia
existing_wiki_pages = []
for i, page in enumerate(all_wiki_pages):
    if wiki_page_exist(page):
        existing_wiki_pages.append(page)
    print(end=f"\r{i:4d} of {len(all_wiki_pages)} pages checked")

# also missing pages
missing_wiki_pages = list(all_wiki_pages.difference(existing_wiki_pages))

# create a new file for the existing working list
wiki_pages_file = "working_wiki_pages.txt"
with open(wiki_pages_file, "w", encoding="utf8") as working_wiki:
    for page in existing_wiki_pages:
        working_wiki.write(f"{page}\n")

2521 of 2522 pages checked

### Print details

In [214]:
print(f"keyword_pages = {len(keyword_pages)}")
print(f"wiki_pages = {len(wiki_pages)}")
print(f"most_edited_list = {len(most_edited_list)}")
print(f"all_wiki_pages = {len(all_wiki_pages)}")
print(f"existing_wiki_pages = {len(existing_wiki_pages)}")
print(f"missing_wiki_pages = {len(missing_wiki_pages)}")

keyword_pages = 1800
wiki_pages = 806
most_edited_list = 100
all_wiki_pages = 2522
existing_wiki_pages = 2513
missing_wiki_pages = 9


In [224]:
missing_wiki_pages

['User:V(g)',
 '"Huey',
 'GlÃ¶yn_byw',
 '"Etholiad_Senedd_Cymru',
 'BibliothÃ¨que_nationale_de_France',
 '"Metrolink',
 'CÃ¢n_i_Gymru',
 '"Etholiad_Senedd_Ewrop',
 'CÃ¢n_i_Gymru_2021']

 ### 2. Define the text extractor function

In [223]:
def extract_text(titles, data_dir="data", lang="cy" ):
    # Extract Wiki text
    wiki = wikipediaapi.Wikipedia(language=lang,
            extract_format=wikipediaapi.ExtractFormat.WIKI)

    html = wikipediaapi.Wikipedia(language=lang,
            extract_format=wikipediaapi.ExtractFormat.HTML)
    
    # Create directory for text files
    text_dir = os.path.join(".", data_dir, "text")
    if not os.path.exists(text_dir): os.makedirs(text_dir)
    
    # Create directory for summary files
    summary_dir = os.path.join(".", data_dir, "summary")
    if not os.path.exists(summary_dir): os.makedirs(summary_dir)
    
    # Create directory for html files
    html_dir = os.path.join(".", data_dir, "html")
    if not os.path.exists(html_dir): os.makedirs(html_dir)                              

    file_id = 0
    for i, name in enumerate(titles):
        print(end=f"\r processing file '{i+1:4d} of {len(titles)} pages")
        wiki_page = wiki.page(name)
        html_page = html.page(name)
        
        if count_tokens(wiki_page.text):
            try:
                file_id +=1
                wiki_title = f"<title>\n{wiki_page.title}\n</title>"
                wiki_text = f"<text>\n{wiki_page.text}\n</text>"
                wiki_summary = f"<summary>\n{wiki_page.summary}\n</summary>"
                wiki_categories = f"<categories>\n{wiki_page.categories}\n</categories>"
                
                text_file_name = os.path.join(text_dir,f"{file_id}_{name.split('_')[0]}.txt")
                text_file_name = text_file_name.replace(":","_")
                text_file_name = text_file_name.replace("/","_")
                with open(text_file_name, 'w', encoding="utf8") as f:
                    f.write(f"{wiki_title}\n\n{wiki_text}\n\n{wiki_categories}")
                
                summary_file_name = os.path.join(summary_dir,f"{file_id}_{name.split('_')[0]}_summary.txt")
                summary_file_name = summary_file_name.replace(":","_")
                summary_file_name = summary_file_name.replace("/","_")
                with open(summary_file_name, 'w', encoding="utf8") as f:
                    f.write(f"{wiki_title}\n\n{wiki_summary}") 

                html_title = f"<h1>begin title</h1>\n{html_page.title}\n<h1>end title</h1>"
                html_text = f"<h1>begin text</h1>\n{html_page.text}\n<h1>end text</h1>"
                html_summary = f"<h1>begin summary</h1>\n{html_page.summary}\n<h1>end summary</h1>"
                html_categories = f"<h1>begin categories</h1>\n{html_page.categories}\n<h1>end categories</h1>"

                html_file_name = os.path.join(html_dir,f"{file_id}_{name.split('_')[0]}.html")
                html_file_name = html_file_name.replace(":","_")
                html_file_name = html_file_name.replace("/","_")
                with open(html_file_name, 'w', encoding="utf8") as f:
                    f.write(f"{html_title}\n\n{html_text}\n\n{html_summary}\n\n{html_categories}")
            except OSError:
                print(f"Could not write text in {text_file_name}.")

### 3. Extract and zip text and summary files

In [222]:
# Extract text and summary files
extract_text(existing_wiki_pages[2198:]) # read this from the file "working_wiki_pages.txt"

# Create a zip archive of the data directory
shutil.make_archive("data", 'zip', "data")

# Remove the data folder
shutil.rmtree("data")

 processing file ' 315 of 315