In [11]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

def getNgrams(content, n):
  content = content.split(' ')
  output = []
  for i in range(len(content)-n+1):
    output.append(content[i:i+n])
  return output

html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
ngrams = getNgrams(content, 2)
print(ngrams)
print('2-grams count is: '+str(len(ngrams)))

[['\nPython\n\n\n\n\nParadigm\nObject-oriented,', 'imperative,'], ['imperative,', 'functional,'], ['functional,', 'procedural,'], ['procedural,', 'reflective\n\n\nDesigned\xa0by\nGuido'], ['reflective\n\n\nDesigned\xa0by\nGuido', 'van'], ['van', 'Rossum\n\n\nDeveloper\nPython'], ['Rossum\n\n\nDeveloper\nPython', 'Software'], ['Software', 'Foundation\n\n\nFirst\xa0appeared\n20\xa0February'], ['Foundation\n\n\nFirst\xa0appeared\n20\xa0February', '1991;'], ['1991;', '26'], ['26', 'years'], ['years', 'ago\xa0(1991-02-20)[1]\n\n\n\n\n\nStable'], ['ago\xa0(1991-02-20)[1]\n\n\n\n\n\nStable', 'release\n\n3.6.4'], ['release\n\n3.6.4', '/'], ['/', '19\xa0December'], ['19\xa0December', '2017;'], ['2017;', '18'], ['18', 'days'], ['days', 'ago\xa0(2017-12-19)[2]\n2.7.14'], ['ago\xa0(2017-12-19)[2]\n2.7.14', '/'], ['/', '16\xa0September'], ['16\xa0September', '2017;'], ['2017;', '3'], ['3', 'months'], ['months', 'ago\xa0(2017-09-16)[3]\n\n\n\n\n\n\nTyping'], ['ago\xa0(2017-09-16)[3]\n\n\n\n\n\n\nTyp

In [18]:
import re

def getNgrams(content, n):
    content = re.sub('\n|[[\d+\]]', ' ', content)
    content = bytes(content, 'UTF-8')
    content = content.decode('ascii', 'ignore')
    content = content.split(' ')
    content = [word for word in content if word != '']
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return output

In [19]:
html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
ngrams = getNgrams(content, 2)
print(ngrams)
print('2-grams count is: '+str(len(ngrams)))

[['Python', 'Paradigm'], ['Paradigm', 'Object-oriented,'], ['Object-oriented,', 'imperative,'], ['imperative,', 'functional,'], ['functional,', 'procedural,'], ['procedural,', 'reflective'], ['reflective', 'Designedby'], ['Designedby', 'Guido'], ['Guido', 'van'], ['van', 'Rossum'], ['Rossum', 'Developer'], ['Developer', 'Python'], ['Python', 'Software'], ['Software', 'Foundation'], ['Foundation', 'Firstappeared'], ['Firstappeared', 'February'], ['February', ';'], [';', 'years'], ['years', 'ago('], ['ago(', '-'], ['-', '-'], ['-', ')'], [')', 'Stable'], ['Stable', 'release'], ['release', '.'], ['.', '.'], ['.', '/'], ['/', 'December'], ['December', ';'], [';', 'days'], ['days', 'ago('], ['ago(', '-'], ['-', '-'], ['-', ')'], [')', '.'], ['.', '.'], ['.', '/'], ['/', 'September'], ['September', ';'], [';', 'months'], ['months', 'ago('], ['ago(', '-'], ['-', '-'], ['-', ')'], [')', 'Typing'], ['Typing', 'discipline'], ['discipline', 'Duck,'], ['Duck,', 'dynamic,'], ['dynamic,', 'strong'],

In [70]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string

def cleanSentence(sentence):
    sentence = sentence.split(' ')
    sentence = [word.strip(string.punctuation+string.whitespace) for word in sentence]
    sentence = [word for word in sentence if len(word) > 1 or (word.lower() == 'a' or word.lower() == 'i')]
    return sentence

def cleanInput(content):
    content = content.upper()
    content = re.sub('\n|[[\d+\]]', ' ', content)
    content = bytes(content, "UTF-8")
    content = content.decode("ascii", "ignore")
    sentences = content.split('. ')
    return [cleanSentence(sentence) for sentence in sentences]

def getNgramsFromSentence(content, n):
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return output

def getNgrams(content, n):
    content = cleanInput(content)
    ngrams = []
    for sentence in content:
        ngrams.extend(getNgramsFromSentence(sentence, n))
    return(ngrams)
        


In [71]:
html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
print(len(getNgrams(content, 2)))

7275


In [68]:
from collections import Counter

def getNgrams(content, n):
    content = cleanInput(content)
    ngrams = Counter()
    ngrams_list = []
    for sentence in content:
        newNgrams = [' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n)]
        ngrams_list.extend(newNgrams)
        ngrams.update(newNgrams)
    return(ngrams)

In [69]:
print(getNgrams(content, 2))

5479
7275
Counter({'PYTHON SOFTWARE': 40, 'SOFTWARE FOUNDATION': 37, 'OF THE': 34, 'IN PYTHON': 31, 'OF PYTHON': 28, 'IN THE': 24, 'THE PYTHON': 24, 'VAN ROSSUM': 23, 'TO THE': 20, 'SUCH AS': 19, 'RETRIEVED FEBRUARY': 19, 'IS A': 17, 'FROM THE': 16, 'PYTHON ENHANCEMENT': 15, 'PYTHON IS': 14, 'AS A': 14, 'ENHANCEMENT PROPOSALS': 14, 'IT IS': 13, 'RETRIEVED MARCH': 13, 'PROGRAMMING LANGUAGE': 12, 'RETRIEVED DECEMBER': 12, 'ROSSUM GUIDO': 12, 'FOR EXAMPLE': 11, 'CAN BE': 11, 'BE USED': 11, 'RETRIEVED SEPTEMBER': 11, 'RETRIEVED NOVEMBER': 11, 'RETRIEVED JANUARY': 11, 'PYTHON HAS': 10, 'STANDARD LIBRARY': 10, 'ON THE': 10, 'FOR THE': 10, 'PROGRAMMING LANGUAGES': 10, 'FROM PYTHON': 10, 'RETRIEVED AUGUST': 10, 'GUIDO VAN': 9, 'AND A': 9, 'BY THE': 9, 'OTHER LANGUAGES': 9, 'TO BE': 9, 'PYTHON AND': 9, 'STATEMENT WHICH': 9, 'RETRIEVED JUNE': 9, 'OF ITS': 8, 'THE LANGUAGE': 8, 'TO PYTHON': 8, 'A PYTHON': 8, 'OF A': 8, 'USED TO': 8, 'COMPARISON OF': 8, 'FOR PYTHON': 8, 'ARCHIVED FROM': 8, 'THE OR