## Scrape Wikipedia

In [2]:
!pip install wikipedia

Collecting wikipedia
  Downloading https://files.pythonhosted.org/packages/67/35/25e68fbc99e672127cc6fbb14b8ec1ba3dfef035bf1e4c90f78f24a80b7d/wikipedia-1.4.0.tar.gz
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-cp37-none-any.whl size=11697 sha256=e5bcc8009e1477e8bc5d8a1aee8ff0e536b0d89bdec43ab74a72d60ef33629b6
  Stored in directory: /root/.cache/pip/wheels/87/2a/18/4e471fd96d12114d16fe4a446d00c3b38fb9efcb744bd31f4a
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [3]:
import wikipedia
import re
import pandas as pd

In [4]:
with open ("./data/misc/10000_articles_wikipedia_must_have.txt", encoding="utf-8") as f:
    wikipedia_urls = f.read().splitlines()

# remove possible duplicates
wikipedia_urls = list(dict.fromkeys(wikipedia_urls))
print(len(wikipedia_urls))

10858


In [5]:
def articles_scrape(wikipedia_raw):
    article_content = []
    
    # remove the last sections with references etc     
    seperator = '== See also =='
    stripped_article = wikipedia_raw.split(seperator, 1)[0]

    # Delete all section titles      
    stripped_article = re.sub(r'==.*?==+', '', stripped_article)
    
    # remove content inside parentheses (usually unecessary information for our cause)     
    stripped_article = re.sub(r'\([^)]*\)', '', stripped_article)
    
    # remove urls     
    stripped_article = re.sub(r'^https?:\/\/.*[\r\n]*', '', stripped_article, flags=re.MULTILINE)  

    # remove '\n'
    stripped_article = stripped_article.replace('\n', ' ') 

    # remove big spaces
    stripped_article = re.sub('\s{2,}', " ", stripped_article)

    for line in stripped_article.splitlines():
        stripped_line = line.strip()        
        article_content.append(stripped_line)

    return '\n'.join(article_content) + '\n'

In [6]:
scraped_articles = []

# set the simple wikipedia version to be searched for
wikipedia.set_lang("simple")

# scrape wikipedia and retrieve the content of each article for each url
for url in wikipedia_urls:
    try:
        query = wikipedia.search(url)[0]
        page = wikipedia.page(query)
    except Exception:
        pass    
    scraped_pages = articles_scrape(page.content)
    scraped_articles.append(scraped_pages)



  lis = BeautifulSoup(html).find_all('li')


## Preprocess the scraped articles

In [7]:
import pickle
import statistics
import nltk
import string
import pandas as pd
from nltk.tokenize import RegexpTokenizer

In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [43]:
# this function is to avoid having mathematic-specific sentences, by controling on the number of allowed numbers that are contained
def delete_num_sentences(text):
    numbers = []
    tokenized_text = text.split()

    for word in tokenized_text:
        # some numbers might be in the form x.x or x,x
        if '.' in word or ',' in word:
            word = word.replace('.','').replace(',','')

        if word.isdigit():
            numbers.append(word)
  
    # accept until 2 numbers
    if len(numbers)<2:
        return text
    else:
        return False

In [44]:
# examine if words with non english chatacters exist
def check_all_english(text):
    english_texts = []
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_text = tokenizer.tokenize(text)

    result = all(word.encode().isalpha() for word in tokenized_text if not word.isdigit())

    if result:
        return text
    else:
        return False

In [45]:
def extra_preproces(sentences):
    cleaned_sentences = []

    for sentence in sentences:
        # find the sentences that start with a punctuation mark and skip them
        if (sentence.strip()[0] not in string.punctuation):

            # skip sentences containing digits
            sentence = delete_num_sentences(sentence)
            if (sentence):

                # skip sentences containing non-english words
                sentence = check_all_english(sentence)
                if (sentence):
          
                    # clean surrounding whitespace
                    sentence = sentence.strip()
                    # remove _
                    sentence = sentence.replace('_', ' ')
                    # remove `
                    sentence = sentence.replace('`', ' ')
                    # remove )
                    sentence = sentence.replace(')', ' ')
                    # remove [
                    sentence = sentence.replace('[', ' ')

                    # further process of some incosistencies
                    if (", ,") in sentence:
                        sentence = sentence.replace(", ,",",") 

                    # if a sentence starts with empty characters, delete them
                    if sentence[0] == ' ':
                        sentence = sentence[1:].strip()

                    cleaned_sentences.append(sentence)

  # remove empty elements of a list
  cleaned_sentences = list(filter(None, cleaned_sentences))
  
  return cleaned_sentences

In [46]:
# cleaning to retain only actual useful sentences
def filter_sentences(sentences):
    filtered_sentences = []
    length_sentences = []

    for sentence in sentences:

    # in some cases a punctuation mark would be classified as sentence 
    if sentence not in '".,;!-':
        token_sentence = nltk.word_tokenize(sentence)

        # clean rows indicating the ISBN (which was a recursive pattern in the corpus with no use)
        if ('ISBN') not in token_sentence:
            length_sentences.append(len(token_sentence))
            filtered_sentences.append(sentence)

    # returns a tuple of the filtered sentences and their lengths
    # so that we can have an insight on the average length of sentences  
    return (length_sentences, filtered_sentences)

In [47]:
corpus_list_tupled = []

# split the scraped articles in sentences
for wiki_article in scraped_articles:
    wikis_sentences = nltk.sent_tokenize(wiki_article)
    filter_wikis_sentences = filter_sentences(wikis_sentences)

    corpus_list_tupled.append(filter_wikis_sentences)

In [48]:
corpus_list_lengths = [elem[0] for elem in corpus_list_tupled]
lengths_flat_list = [item for sublist in corpus_list_lengths for item in sublist]

corpus_list_texts = [elem[1] for elem in corpus_list_tupled]  
texts_flat_list = [item for sublist in corpus_list_texts for item in sublist]

In [49]:
# calculate the average length
sentence_average_len = statistics.mean(lengths_flat_list)
print(sentence_average_len)

18.78013478422159


In [50]:
## remove possible duplicates
final_corpus = list(dict.fromkeys(texts_flat_list))

In [51]:
print(len(texts_flat_list))
print(len(final_corpus))

323480
185265


In [52]:
# expand abbreviations based on a predefined dictionary
abbr_dict={"what's":"what is", "what're":"what are", "who's":"who is", "who're":"who are", "where's":"where is", "where're":"where are", "when's":"when is", 
           "when're":"when are", "how's":"how is", "how're":"how are", "i'm":"i am", "we're":"we are", "you're":"you are", "they're":"they are", "it's":"it is",
           "he's":"he is", "she's":"she is", "that's":"that is", "there's":"there is", "there're":"there are", "i've":"i have", "we've":"we have", "you've":"you have",
    "they've":"they have", "who've":"who have", "would've":"would have", "not've":"not have", "i'll":"i will", "we'll":"we will", "you'll":"you will", "he'll":"he will",
    "she'll":"she will", "it'll":"it will", "they'll":"they will", "isn't":"is not", "wasn't":"was not", "aren't":"are not", "weren't":"were not", "can't":"can not",
    "couldn't":"could not", "don't":"do not", "didn't":"did not", "shouldn't":"should not", "wouldn't":"would not", "doesn't":"does not", "haven't":"have not",
    "hasn't":"has not", "hadn't":"had not", "won't":"will not", "what' s":"what is", "what' re":"what are", "who' s":"who is", "who' re":"who are", "where' s":"where is",
    "where' re":"where are", "when' s":"when is", "when' re":"when are", "how' s":"how is", "how' re":"how are", "i' m":"i am", "we' re":"we are", "you' re":"you are",
    "they' re":"they are", "it' s":"it is", "he' s":"he is", "she' s":"she is", "that' s":"that is", "there' s":"there is", "there' re":"there are", "i' ve":"i have",
    "we' ve":"we have", "you' ve":"you have", "they' ve":"they have", "who' ve":"who have", "would' ve":"would have", "not' ve":"not have", "i' ll":"i will", "we' ll":"we will",
    "you' ll":"you will", "he' ll":"he will", "she' ll":"she will", "it' ll":"it will", "they' ll":"they will", "isn' t":"is not", "wasn' t":"was not", "aren' t":"are not",
    "weren' t":"were not", "can' t":"can not", "couldn' t":"could not", "don' t":"do not", "didn' t":"did not", "shouldn' t":"should not", "wouldn' t":"would not",
    "doesn' t":"does not", "haven' t":"have not", "hasn' t":"has not", "hadn' t":"had not", "won' t":"will not"}

# this is just to be sure that the quotes will be those we have used in our abbreviation lexicon
quote_list = "‘’‛’❜'’`‘’"

abbr_corpus = []
for elem in final_corpus:

    # make all the single quotes, the one we have used in the abbreviation lexicon
    expand_abbr = ["'" if e in quote_list else e for e in elem]
    expand_abbr_string = ''.join(expand_abbr)
    abbr_corpus.append(expand_abbr_string)

final_corpus_df = pd.DataFrame(abbr_corpus, columns=['Sentences']) 
final_corpus_df.replace(abbr_dict,regex=True,inplace=True)
final_corpus = final_corpus_df.Sentences.tolist()

In [54]:
# further preprocessing of each sentence
final_clean_corpus = extra_preproces(final_corpus)

In [55]:
final_clean_corpus

['History is the study of past events.',
 'People know what happened in the past by looking at things from the past including sources and artifacts Libraries, archives, and museums collect and keep these things for people to study history.',
 'A person who studies history is called a historian.',
 'A person who studies pre-history and history through things left behind by ancient cultures is called an archaeologist.',
 'A person who studies mankind and society is called an anthropologist.',
 'The study of the sources and methods used to study and write history is called historiography.',
 'People can learn about the past by talking to people who remember things that happened at some point in the past.',
 'This is called oral history.',
 'For example, when people who had been slaves and American Civil War survivors got old, some historians recorded them talking about their lives, so that history would not be lost.In old times people in different parts of the world kept separate historie

In [56]:
# save the sentences of the corpus in a pickle format
with open('./data/misc/wikipedia_sentences_unparsed.pkl', 'wb') as f:
    pickle.dump(final_clean_corpus, f)