# Cleaning in Code

In [15]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

In [5]:
# Note: Assume all words are seperated by spaces.
def ngrams(input_text, n):
    input_text = input_text.split(' ')
    output = []
    for i in range(len(input_text) - n + 1):
        output.append(input_text[i:i+n])
    return output

In [12]:
html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs_obj = BeautifulSoup(html)
content = bs_obj.find('div', {'id':'mw-content-text'}).get_text()
n_grams = ngrams(content, 2)
print(n_grams[:30]) # show an example
print('2-grams count is ' + str(len(n_grams)))

[['For', 'other'], ['other', 'uses,'], ['uses,', 'see'], ['see', 'Python.\nGeneral-purpose,'], ['Python.\nGeneral-purpose,', 'high-level'], ['high-level', 'programming'], ['programming', 'language\n\n\nPythonParadigmMulti-paradigm:'], ['language\n\n\nPythonParadigmMulti-paradigm:', 'functional,'], ['functional,', 'imperative,'], ['imperative,', 'object-oriented,'], ['object-oriented,', 'reflectiveDesigned\xa0byGuido'], ['reflectiveDesigned\xa0byGuido', 'van'], ['van', 'RossumDeveloperPython'], ['RossumDeveloperPython', 'Software'], ['Software', 'FoundationFirst\xa0appeared1990;'], ['FoundationFirst\xa0appeared1990;', '29\xa0years'], ['29\xa0years', 'ago\xa0(1990)[1]Stable'], ['ago\xa0(1990)[1]Stable', 'release3.8.0'], ['release3.8.0', '/'], ['/', '14\xa0October'], ['14\xa0October', '2019;'], ['2019;', '2'], ['2', 'months'], ['months', 'ago\xa0(2019-10-14)[2]2.7.17'], ['ago\xa0(2019-10-14)[2]2.7.17', '/'], ['/', '19\xa0October'], ['19\xa0October', '2019;'], ['2019;', '59'], ['59', 'days

As you can see, sometimes it works, but sometimes it doesn't. It does not seperated two words splitted by the special character '\n'. We can fix this issue by using some regular expressions. We can also removing any Unicode characters by using filtering.

In [31]:
def ngrams2(content, n):
    content = re.sub('\n+', ' ', content)
    content = re.sub(' +', ' ', content)
    content = bytes(content, 'UTF-8')
    content = content.decode('ascii', 'ignore')
    #print(content)
    content = content.split(' ')
    output = []
    for i in range(len(content) - n + 1):
        output.append(content[i:i+n])
    return output

In [33]:
html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs_obj = BeautifulSoup(html)
content = bs_obj.find('div', {'id':'mw-content-text'}).get_text()
n_grams = ngrams2(content, 2)
print(n_grams[:30]) # show an example
print('2-grams count is ' + str(len(n_grams)))

[['For', 'other'], ['other', 'uses,'], ['uses,', 'see'], ['see', 'Python.'], ['Python.', 'General-purpose,'], ['General-purpose,', 'high-level'], ['high-level', 'programming'], ['programming', 'language'], ['language', 'PythonParadigmMulti-paradigm:'], ['PythonParadigmMulti-paradigm:', 'functional,'], ['functional,', 'imperative,'], ['imperative,', 'object-oriented,'], ['object-oriented,', 'reflectiveDesignedbyGuido'], ['reflectiveDesignedbyGuido', 'van'], ['van', 'RossumDeveloperPython'], ['RossumDeveloperPython', 'Software'], ['Software', 'FoundationFirstappeared1990;'], ['FoundationFirstappeared1990;', '29years'], ['29years', 'ago(1990)[1]Stable'], ['ago(1990)[1]Stable', 'release3.8.0'], ['release3.8.0', '/'], ['/', '14October'], ['14October', '2019;'], ['2019;', '2'], ['2', 'months'], ['months', 'ago(2019-10-14)[2]2.7.17'], ['ago(2019-10-14)[2]2.7.17', '/'], ['/', '19October'], ['19October', '2019;'], ['2019;', '59']]
2-grams count is 10044


It is better, but there are still some more rules we can add to get closer to ideal data.
1. Single character words, e.g. 'I', 'a', should be discarded
2. Wikipedia citation marks should be discarded
3. Punctuation marks should be discarded (for simplification, just now)

In [34]:
import string

In [41]:
def clean_input(input_text):
    input_text = re.sub('\n+', ' ', input_text)
    input_text = re.sub('\[[0-9]*\]', ' ', input_text)
    input_text = re.sub(' +', ' ', input_text)
    input_text = bytes(input_text, 'UTF-8')
    input_text = input_text.decode('ascii', 'ignore')
    clean_input = []
    input_text = input_text.split(' ')
    
    for item in input_text:
        item = item.strip(string.punctuation)
        if len(item) > 1 or (item.lower() == 'a' or item.lower() == 'i'):
            clean_input.append(item)
    return clean_input

def ngrams3(input_text, n):
    input_text = clean_input(input_text)
    output = {}
    for i in range(len(input_text) - n + 1):
        ngram_temp = ' '.join(input_text[i:i+n])
        if ngram_temp not in output:
            output[ngram_temp] = 0
        output[ngram_temp] += 1
    return output

In [44]:
html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs_obj = BeautifulSoup(html)
content = bs_obj.find('div', {'id':'mw-content-text'}).get_text()
n_grams = ngrams3(content, 2)
#print(n_grams) # show an example
print('2-grams count is ' + str(len(n_grams)))

2-grams count is 7350


# Data Normalization

Data Normalization is the process of ensuring that strings that are linguistically or logically equivalent to each other, shuch as the phone numbers (+66)812345678 and +66812345678, are displayed as equivalent.

In [37]:
from collections import OrderedDict

In [45]:
n_grams = ngrams3(content,2)
n_grams = OrderedDict(sorted(n_grams.items()
                            , key = lambda t: t[1], reverse = True))
print(n_grams)

OrderedDict([('Software Foundation', 40), ('Python Software', 38), ('of the', 36), ('Foundation Retrieved', 32), ('in Python', 25), ('of Python', 24), ('in the', 23), ('from the', 23), ('van Rossum', 21), ('to the', 20), ('such as', 19), ('Retrieved 24', 17), ('as a', 16), ('February 2012', 16), ('the Python', 15), ('is a', 15), ('Python Enhancement', 15), ('Enhancement Proposals', 14), ('Proposals Python', 14), ('Python is', 13), ('can be', 13), ('Archived from', 13), ('the original', 13), ('to Python', 12), ('be used', 12), ('for Python', 11), ('Python Python', 11), ('of a', 11), ('original on', 11), ('Rossum Guido', 11), ('programming language', 10), ('Guido van', 10), ('standard library', 10), ('for the', 10), ('used to', 10), ('Retrieved 19', 10), ('and a', 9), ('the language', 9), ('Python and', 9), ('with the', 9), ('to be', 9), ('statement which', 9), ('In Python', 9), ('other languages', 8), ('written in', 8), ('for example', 8), ('Python has', 8), ('on the', 8), ('November 20