In [85]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).find_all('p')
content = [p.get_text() for p in content]


In [86]:
content = ''.join(content)

In [87]:
print(content)



Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation via the off-side rule.[34]
Python is dynamically typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a "batteries included" language due to its comprehensive standard library.[35][36]
Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0.[37] Python 2.0 was released in 2000. Python 3.0, released in 2008, was a major revision not completely backward-compatible with earlier versions. Python 2.7.18, released in 2020, was the last release of Python 2.[38]
Python consistently ranks as one of the most popular programming languages.[39][40][41][42]
Python was conceived in the late 1980s[43] by Guido van Rossum

In [102]:
import re
import string 
import unicodedata

# Must be called before split_sentences
def replace_newlines(text):
    return text.replace('\n', ' ')

def make_lowercase(text):
    return text.lower()

CITATION_REGEX = re.compile('\[[0-9]*\]')
def strip_citations(text):
    return re.sub(CITATION_REGEX, '', text)


def split_sentences(text):
    return [s.strip() for s in text.split('. ')]

PARENS_REGEX = re.compile('\([a-z A-Z \+\.,\-]{0,100}\)')
def remove_parentheses(text):
    return re.sub(PARENS_REGEX, '', text)


DESCRIPTION_REGEX = re.compile('\n[a-z A-Z]*:')
def remove_descriptions(text):
    return re.sub(DESCRIPTION_REGEX, '', text)


puncts = [re.escape(c) for c in string.punctuation]
PUNCTUATION_REGEX = re.compile('|'.join(puncts))
def remove_punctuation(text):
    return re.sub(PUNCTUATION_REGEX, '', text)

def normalize(text):
    return unicodedata.normalize('NFKD', text)


In [103]:

text_operations = [
    strip_citations,
    remove_parentheses,
    remove_descriptions,
    replace_newlines,
    split_sentences,
    make_lowercase,
    remove_punctuation,
    normalize
]

cleaned = content
for op in text_operations:
    if type(cleaned) == list:
        cleaned = [op(c) for c in cleaned]
    else:
        cleaned = op(cleaned)
        
print(cleaned)

['python is a highlevel generalpurpose programming language', 'its design philosophy emphasizes code readability with the use of significant indentation via the offside rule', 'python is dynamically typed and garbagecollected', 'it supports multiple programming paradigms including structured  objectoriented and functional programming', 'it is often described as a batteries included language due to its comprehensive standard library', 'guido van rossum began working on python in the late 1980s as a successor to the abc programming language and first released it in 1991 as python 090', 'python 20 was released in 2000', 'python 30 released in 2008 was a major revision not completely backwardcompatible with earlier versions', 'python 2718 released in 2020 was the last release of python 2', 'python consistently ranks as one of the most popular programming languages', 'python was conceived in the late 1980s by guido van rossum at centrum wiskunde  informatica  in the netherlands as a success

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

def getNgrams(text, n):
    text = text.split(' ')
    return [text[i:i+n] for i in range(len(text)-n+1)]

getNgrams('web scraping with python', 2)

[('web', 'scraping'), ('scraping', 'with'), ('with', 'python')]

In [109]:
from collections import Counter

def getNgrams(text, n):
    text = text.split(' ')
    return [' '.join(text[i:i+n]) for i in range(len(text)-n+1)]

def countNGramsFromSentences(sentences, n):
    counts = Counter()
    for sentence in sentences:
        counts.update(getNgrams(sentence, n))
    return counts

counts = countNGramsFromSentences(cleaned, 2)
print(counts.most_common())

[('in the', 19), ('of the', 19), ('such as', 18), ('as a', 14), ('in python', 12), ('python is', 9), ('of python', 9), ('the python', 9), ('is a', 8), ('to the', 8), ('standard library', 7), ('to python', 7), ('with a', 7), ('programming language', 6), ('programming languages', 6), ('to be', 6), ('written in', 6), ('can be', 6), ('with the', 5), ('it is', 5), ('van rossum', 5), ('for the', 5), ('as the', 5), ('of a', 5), ('the language', 5), ('for example', 5), ('python as', 5), ('be used', 5), ('use of', 4), ('released in', 4), (' in', 4), ('of its', 4), ('python 3', 4), ('to a', 4), ('as of', 4), ('python uses', 4), ('it has', 4), ('contrast to', 4), ('languages such', 4), ('other languages', 4), (' 1', 4), ('operator ', 4), (' ', 4), (' is', 4), ('are not', 4), ('and the', 4), ('in a', 4), ('example the', 4), ('scripting language', 4), ('used in', 4), ('the use', 3), ('functional programming', 3), ('due to', 3), ('guido van', 3), ('python in', 3), ('was released', 3), ('one of', 3),

In [110]:
len(counts)

2814

In [2]:
import re

def getNgrams(content, n):
    content = re.sub('\n|[[\d+\]]', ' ', content)
    content = bytes(content, 'UTF-8')
    content = content.decode('ascii', 'ignore')
    content = content.split(' ')
    content = [word for word in content if word != '']
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return output

In [3]:
html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
ngrams = getNgrams(content, 2)
print(ngrams)
print('2-grams count is: '+str(len(ngrams)))

[['General-purpose', 'programming'], ['programming', 'language'], ['language', 'PythonParadigmMulti-paradigm:'], ['PythonParadigmMulti-paradigm:', 'object-oriented,'], ['object-oriented,', 'procedural'], ['procedural', '(imperative),'], ['(imperative),', 'functional,'], ['functional,', 'structured,'], ['structured,', 'reflectiveDesignedbyGuido'], ['reflectiveDesignedbyGuido', 'van'], ['van', 'RossumDeveloperPython'], ['RossumDeveloperPython', 'Software'], ['Software', 'FoundationFirstappeared'], ['FoundationFirstappeared', 'February'], ['February', ';'], [';', 'years'], ['years', 'ago('], ['ago(', '-'], ['-', '-'], ['-', ')'], [')', 'Stable'], ['Stable', 'release'], ['release', '.'], ['.', '.'], ['.', '/'], ['/', 'February'], ['February', ';'], [';', 'days'], ['days', 'ago('], ['ago(', 'February'], ['February', ')Preview'], [')Preview', 'release'], ['release', '.'], ['.', '.'], ['.', 'a'], ['a', '/'], ['/', 'February'], ['February', ';'], [';', 'days'], ['days', 'ago('], ['ago(', 'Febr

  content = re.sub('\n|[[\d+\]]', ' ', content)


In [4]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string

def cleanSentence(sentence):
    sentence = sentence.split(' ')
    sentence = [word.strip(string.punctuation+string.whitespace) for word in sentence]
    sentence = [word for word in sentence if len(word) > 1 or (word.lower() == 'a' or word.lower() == 'i')]
    return sentence

def cleanInput(content):
    content = content.upper()
    content = re.sub('\n|[[\d+\]]', ' ', content)
    content = bytes(content, "UTF-8")
    content = content.decode("ascii", "ignore")
    sentences = content.split('. ')
    return [cleanSentence(sentence) for sentence in sentences]

def getNgramsFromSentence(content, n):
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return output

def getNgrams(content, n):
    content = cleanInput(content)
    ngrams = []
    for sentence in content:
        ngrams.extend(getNgramsFromSentence(sentence, n))
    return(ngrams)
        


In [5]:
html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
print(len(getNgrams(content, 2)))

9414


In [6]:
from collections import Counter

def getNgrams(content, n):
    content = cleanInput(content)
    ngrams = Counter()
    ngrams_list = []
    for sentence in content:
        newNgrams = [' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n)]
        ngrams_list.extend(newNgrams)
        ngrams.update(newNgrams)
    return(ngrams)

In [7]:
print(getNgrams(content, 2))

Counter({'FROM THE': 209, 'THE ORIGINAL': 199, 'ORIGINAL ON': 197, 'ARCHIVED FROM': 195, 'ON JUNE': 59, 'OF THE': 39, 'SOFTWARE FOUNDATION': 38, 'PYTHON SOFTWARE': 38, 'IN PYTHON': 36, 'OF PYTHON': 34, 'RETRIEVED FEBRUARY': 31, 'IN THE': 26, 'RETRIEVED MARCH': 25, 'THE PYTHON': 23, 'RETRIEVED JANUARY': 23, 'AS A': 21, 'VAN ROSSUM': 21, 'SUCH AS': 21, 'ON MAY': 20, 'RETRIEVED MAY': 19, 'RETRIEVED NOVEMBER': 19, 'IS A': 18, 'RETRIEVED APRIL': 18, 'PROGRAMMING LANGUAGE': 17, 'ON OCTOBER': 17, 'ON DECEMBER': 17, 'RETRIEVED JULY': 16, 'ON APRIL': 16, 'PROGRAMMING LANGUAGES': 15, 'RETRIEVED JUNE': 15, 'TO THE': 14, 'TO BE': 14, 'CAN BE': 14, 'BE USED': 14, 'FOR PYTHON': 14, 'PYTHON ENHANCEMENT': 14, 'RETRIEVED SEPTEMBER': 14, 'PYTHON IS': 13, 'ON MARCH': 13, 'ENHANCEMENT PROPOSALS': 13, 'RETRIEVED DECEMBER': 13, 'ON FEBRUARY': 13, 'WITH THE': 12, 'STANDARD LIBRARY': 12, 'TO PYTHON': 12, 'ON AUGUST': 12, 'ROSSUM GUIDO': 12, 'ON JANUARY': 12, 'IT IS': 11, 'OF A': 11, 'STATEMENT WHICH': 11, 'US