In [7]:
import re
import string 
from collections import Counter, defaultdict

from urllib.request import urlopen

In [17]:
def replace_newlines(text):
    """Return ``text`` with every newline character replaced by one space."""
    # Splitting on '\n' and re-joining with ' ' swaps each newline for a space.
    return ' '.join(text.split('\n'))

def make_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def split_sentences(text):
    """Split ``text`` on the literal separator '. ' and strip each piece.

    Returns a list of strings. The separator itself is dropped, so only the
    last piece can still carry a trailing period.
    """
    sentences = []
    for piece in text.split('. '):
        sentences.append(piece.strip())
    return sentences

# Regex-escape each character of string.punctuation so it is matched literally.
puncts = list(map(re.escape, string.punctuation))
# One alternation over all escaped punctuation characters, compiled once.
PUNCTUATION_REGEX = re.compile('|'.join(puncts))
def remove_punctuation(text):
    """Return ``text`` with every character from string.punctuation deleted."""
    return PUNCTUATION_REGEX.sub('', text)


In [18]:
# Download the speech. The context manager closes the HTTP response as soon as
# the body has been read instead of leaving it open.
with urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt') as response:
    content = str(response.read(), 'utf-8')

# Cleaning steps, applied in this order. split_sentences turns the single
# string into a list, so every later step is applied to each sentence.
text_operations = [
    replace_newlines,
    split_sentences,
    make_lowercase,
    remove_punctuation
]

cleaned = content
for op in text_operations:
    # Once the text has been split, map the operation over the sentences.
    if isinstance(cleaned, list):
        cleaned = [op(c) for c in cleaned]
    else:
        cleaned = op(cleaned)

print(cleaned)



In [28]:
def getNgrams(text, n):
    """Return all n-grams of ``text`` as space-joined strings, in order.

    Args:
        text: The string to tokenize on whitespace.
        n: Number of consecutive words per n-gram.

    Returns:
        A list of strings, each made of ``n`` consecutive words joined by a
        single space. The list is empty when ``text`` has fewer than ``n``
        words.
    """
    # split() with no argument collapses runs of whitespace. Splitting on a
    # single ' ' produced empty "words" wherever two spaces met (for example
    # after a punctuation mark between two spaces was removed), which in turn
    # produced bogus n-grams such as 'word ' or ' word'.
    words = text.split()
    return [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]

def countNGramsFromSentences(sentences, n):
    """Tally the n-grams of every sentence into a single Counter.

    N-grams are produced per sentence by getNgrams, so none spans a sentence
    boundary.
    """
    return Counter(
        ngram
        for sentence in sentences
        for ngram in getNgrams(sentence, n)
    )

# Count every 2-gram across the cleaned sentences and print all of them,
# most frequent first.
counts = countNGramsFromSentences(cleaned, 2)
print(counts.most_common())



In [30]:
# Words treated as too common to be interesting; isCommon() checks n-gram words
# against this list. 'that', 'as' and 'more' each appear twice, which is
# harmless for membership tests.
COMMON_WORDS = ['the', 'be', 'and', 'of', 'a', 'in', 'to', 'have', 'it', 'i', 'that', 'for', 'you', 'he', 'with', 'on', 'do', 'say', 'this', 'they', 'is', 'an', 'at', 'but', 'we', 'his', 'from', 'that', 'not', 'by', 'she', 'or', 'as', 'what', 'go', 'their', 'can', 'who', 'get', 'if', 'would', 'her', 'all', 'my', 'make', 'about', 'know', 'will', 'as', 'up', 'one', 'time', 'has', 'been', 'there', 'year', 'so', 'think', 'when', 'which', 'them', 'some', 'me', 'people', 'take', 'out', 'into', 'just', 'see', 'him', 'your', 'come', 'could', 'now', 'than', 'like', 'other', 'how', 'then', 'its', 'our', 'two', 'more', 'these', 'want', 'way', 'look', 'first', 'also', 'new', 'because', 'day', 'more', 'use', 'no', 'man', 'find', 'here', 'thing', 'give', 'many', 'well']

def isCommon(ngram):
    """Return True when any space-separated word of ``ngram`` is in COMMON_WORDS."""
    for word in ngram.split(' '):
        if word in COMMON_WORDS:
            return True
    return False

def filterCommon(counts):
    """Return a new Counter holding only the n-grams that isCommon() rejects.

    The counts of the surviving n-grams are copied unchanged; ``counts``
    itself is not modified.
    """
    kept = Counter()
    for ngram, frequency in counts.items():
        if isCommon(ngram):
            continue
        kept[ngram] = frequency
    return kept

# Display the n-grams that contain no common word, most frequent first.
filterCommon(counts).most_common()


[('united states', 10),
 ('executive department', 4),
 ('general government', 4),
 ('called upon', 3),
 ('chief magistrate', 3),
 ('legislative body', 3),
 ('same causes', 3),
 ('government should', 3),
 ('whole country', 3),
 ('was observable', 2),
 ('express grant', 2),
 ('several departments', 2),
 ('american citizen', 2),
 ('was intended', 2),
 ('are attributable', 2),
 ('upon another', 2),
 ('reserved rights', 2),
 ('federal government', 2),
 ('increase itself', 2),
 ('were made', 2),
 ('heretofore given', 2),
 ('pristine health', 2),
 ('second term', 2),
 ('observed however', 2),
 ('foreign relations', 2),
 ('executive power', 2),
 ('negative upon', 2),
 ('declare void', 2),
 ('both houses', 2),
 ('used only', 2),
 ('state governments', 2),
 ('immediate representatives', 2),
 ('veto power', 2),
 ('high office', 2),
 ('disputed points', 2),
 ('respectively claim', 2),
 ('reserved powers', 2),
 ('state authorities', 2),
 ('great increase', 2),
 ('elective franchise', 2),
 ('strange

In [4]:
def getFirstSentenceContaining(ngram, content):
    """Return the first sentence of ``content`` that contains ``ngram``.

    The search is case-insensitive: both ``content`` and ``ngram`` are
    lower-cased before comparison. Sentences are obtained by splitting the
    lower-cased content on the literal separator ". ".

    Args:
        ngram: The phrase to look for.
        content: The full text to search.

    Returns:
        The matching sentence in lower case followed by a newline, or an
        empty string when no sentence contains the phrase.
    """
    # The content is lower-cased, so the needle must be too; otherwise an
    # n-gram containing any capital letter could never match.
    needle = ngram.lower()
    for sentence in content.lower().split(". "):
        if needle in sentence:
            return sentence + '\n'
    return ""

# Fetch the speech; the context manager closes the HTTP response after reading.
with urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt') as response:
    content = str(response.read(), 'utf-8')

# Print the first sentence containing each of the five most frequent
# uncommon bigrams.
for ngram in (
    'united states',
    'executive department',
    'general government',
    'called upon',
    'chief magistrate',
):
    print(getFirstSentenceContaining(ngram, content))

the constitution of the united states is the instrument containing this grant of power to the several departments composing the government

such a one was afforded by the executive department constituted by the constitution

the general government has seized upon none of the reserved rights of the states

called from a retirement which i had supposed was to continue for the residue of my life to fill the chief executive office of this great and free nation, i appear before you, fellow-citizens, to take the oaths which the constitution prescribes as a necessary qualification for the performance of its duties; and in obedience to a custom coeval with our government and what i believe to be your expectations i proceed to present to you a summary of the principles which will govern me in the discharge of the duties which i shall be called upon to perform.

it was the remark of a roman consul in an early period of that celebrated republic that a most striking contrast was observable in the 

In [14]:
from urllib.request import urlopen
from random import randint
from collections import defaultdict


def retrieveRandomWord(wordList):
    """Pick one key of ``wordList`` at random, weighted by its count.

    Args:
        wordList: Mapping of word -> occurrence count.

    Returns:
        One of the keys; a key with a larger count is proportionally more
        likely to be chosen.

    Raises:
        ValueError: If ``wordList`` is empty or its counts do not sum to a
            positive number, since there is nothing to choose from.
    """
    total = sum(wordList.values())
    if total <= 0:
        # randint(1, 0) would raise an opaque "empty range" ValueError here;
        # raise the same exception type with a message naming the real cause.
        raise ValueError('wordList has no words to choose from')
    randIndex = randint(1, total)
    # Walk the cumulative counts until the drawn index is used up.
    for word, value in wordList.items():
        randIndex -= value
        if randIndex <= 0:
            return word

def cleanAndSplitText(text):
    """Turn raw text into a list of tokens for the Markov chain.

    Newlines become spaces and double quotes are dropped. The marks
    , . ; : are padded with spaces so that each becomes a token of its own.
    Empty tokens are discarded.
    """
    flattened = text.replace('\n', ' ').replace('"', '')

    # Pad each punctuation mark so that splitting on spaces isolates it.
    for mark in (',', '.', ';', ':'):
        flattened = flattened.replace(mark, f' {mark} ')

    # filter(None, ...) drops the empty strings left by consecutive spaces.
    return list(filter(None, flattened.split(' ')))
     
def buildWordDict(text):
    """Build the Markov transition table for ``text``.

    Returns a defaultdict mapping each token to a dict of
    {following token: number of times it followed}.
    """
    words = cleanAndSplitText(text)
    wordDict = defaultdict(dict)
    # Pair each token with the one that comes right after it.
    for previous, following in zip(words, words[1:]):
        followers = wordDict[previous]
        followers[following] = followers.get(following, 0) + 1
    return wordDict

# Fetch the speech; the context manager closes the HTTP response after reading.
with urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt') as response:
    text = str(response.read(), 'utf-8')
wordDict = buildWordDict(text)

# Generate a Markov chain of up to 100 words after the seed word
length = 100
chain = ['I']
for i in range(0, length):
    # .get() looks the word up without adding an empty entry to the defaultdict.
    successors = wordDict.get(chain[-1])
    if not successors:
        # Dead end: this word was never followed by another one in the text
        # (for example the final token), so there is nothing to draw from.
        break
    newWord = retrieveRandomWord(successors)
    chain.append(newWord)

print(' '.join(chain))

I conceive , for a privilege which has never their own discretion and of a misconstruction of the care of its departments composing it immediately checked . The influence it would usurp the States may receive . I am not only to encourage them would be used only result to be effected public revenues , and , and prejudices , living with his considering such nation , of men are as well understand the operations of the Executive . He claims them , possess a thought could then , however , supported by his power to the instrument could have placed


In [2]:
import pymysql

# Connection settings can be overridden through environment variables so real
# credentials never have to be saved in the notebook. The fallbacks are the
# original local-development values, so behavior is unchanged when the
# variables are not set.
import os

conn = pymysql.connect(
    host=os.environ.get('MYSQL_HOST', '127.0.0.1'),
    unix_socket=os.environ.get('MYSQL_SOCKET', '/tmp/mysql.sock'),
    user=os.environ.get('MYSQL_USER', 'root'),
    passwd=os.environ.get('MYSQL_PASSWORD', 'root'),
    db='mysql',
    charset='utf8')
cur = conn.cursor()
# Switch to the database that holds the `pages` and `links` tables queried below.
cur.execute('USE wikipedia')

def getUrl(pageId):
    """Return the ``url`` column of the ``pages`` row whose id is ``pageId``.

    Args:
        pageId: Page identifier; converted with int() before querying.

    Returns:
        The URL value, or None when no row has that id.
    """
    # The trailing comma makes a real one-element tuple of query parameters;
    # `(int(pageId))` is only a parenthesised int.
    cur.execute('SELECT url FROM pages WHERE id = %s', (int(pageId),))
    row = cur.fetchone()
    # fetchone() returns None when the query matched nothing; indexing it
    # directly would raise TypeError.
    return row[0] if row is not None else None

def getLinks(fromPageId):
    """Return the ids of all pages that ``fromPageId`` links to.

    Args:
        fromPageId: Source page identifier; converted with int() before
            querying.

    Returns:
        A list of ``toPageId`` values from the ``links`` table, empty when
        the page has no outgoing links.
    """
    # The trailing comma makes a real one-element tuple of query parameters.
    cur.execute('SELECT toPageId FROM links WHERE fromPageId = %s', (int(fromPageId),))
    # fetchall() yields no rows for an empty result, so the comprehension
    # already produces []; no separate rowcount check is needed.
    return [row[0] for row in cur.fetchall()]

def searchBreadth(targetPageId, paths=None):
    """Breadth-first search for a chain of links leading to ``targetPageId``.

    Args:
        targetPageId: Id of the page to reach.
        paths: List of starting paths, each a list of page ids; the search
            continues from the last id of each path. Defaults to ``[[1]]``,
            i.e. a single path starting at page id 1.

    Returns:
        A list of page ids: one starting path extended link by link until it
        ends at ``targetPageId``. No other path reaches the target in fewer
        links. Returns an empty list when the target cannot be reached.
    """
    # Build the default per call rather than sharing one mutable default list.
    if paths is None:
        paths = [[1]]

    # Pages already queued for expansion. Without this set, cycles make the
    # frontier grow without bound, and an unreachable target made the
    # original recursion continue until it hit the recursion limit.
    visited = {path[-1] for path in paths}

    # Iterate level by level instead of recursing once per level.
    while paths:
        newPaths = []
        for path in paths:
            for link in getLinks(path[-1]):
                if link == targetPageId:
                    return path + [link]
                if link not in visited:
                    visited.add(link)
                    newPaths.append(path + [link])
        paths = newPaths

    # Every reachable page has been expanded without finding the target.
    return []
                
# NOTE(review): `nodes` is not used anywhere else in the visible notebook —
# confirm before removing this extra query.
nodes = getLinks(1)
targetPageId = 28624
# Search from the default start page (id 1) for a chain of links to the target.
pageIds = searchBreadth(targetPageId)
# Print the URL of each page along the path found, in order.
for pageId in pageIds:
    print(getUrl(pageId))

/wiki/Kevin_Bacon
/wiki/Primetime_Emmy_Award_for_Outstanding_Lead_Actor_in_a_Miniseries_or_a_Movie
/wiki/Gary_Gilmore
/wiki/Eric_Idle


## Natural Language Toolkit

In [1]:
import nltk

In [28]:
# If you run into an error about NLTK not finding "punkt", try this
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/RSpecht/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
from nltk import word_tokenize
from nltk import Text

# Tokenize a sample sentence and wrap the resulting tokens in an NLTK Text object.
tokens = word_tokenize('Here is some not very interesting text')
text = Text(tokens)

In [11]:
# Download the 'book' collection of NLTK data packages
# (presumably required by the `nltk.book` import below — confirm).
nltk.download('book')

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/RSpecht/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /Users/RSpecht/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /Users/RSpecht/nltk_data...
[nltk_data]    |   Unzipping corpora/chat80.zip.
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/RSpecht/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /Users/RSpecht/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2000.zip.
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /Users/RSpecht/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2002.zip.
[nltk_data]    | Downloading package dependency_treebank to
[nlt

True

In [12]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [14]:
# Total number of tokens divided by the number of distinct tokens in text6,
# i.e. how many times each distinct token occurs on average.
len(text6)/len(set(text6))

7.833333333333333

In [17]:
from nltk import FreqDist
# Frequency distribution over the tokens of text6; show the ten most frequent.
fdist = FreqDist(text6)
fdist.most_common(10)


[(':', 1197),
 ('.', 816),
 ('!', 801),
 (',', 731),
 ("'", 421),
 ('[', 319),
 (']', 312),
 ('the', 299),
 ('I', 255),
 ('ARTHUR', 225)]

In [18]:
from nltk import bigrams

# Give the result its own name: rebinding `bigrams` would shadow the imported
# nltk `bigrams` function for every cell that runs afterwards.
text6_bigrams = bigrams(text6)
bigramsDist = FreqDist(text6_bigrams)
# Number of times the token pair ('Sir', 'Robin') occurs in text6.
bigramsDist[('Sir', 'Robin')]

18

In [19]:
from nltk import ngrams

# Build all 4-grams of text6 and count how often each one occurs.
fourgrams = ngrams(text6, 4)
fourgramsDist = FreqDist(fourgrams)
# Look up the count of one specific 4-gram.
fourgramsDist[('father', 'smelt', 'of', 'elderberries')]

1

In [42]:
from nltk.book import *
from nltk import ngrams

# Rebuild the 4-grams of text6.
fourgrams = ngrams(text6, 4)

# Keep only the 4-grams whose first token is 'coconut'.
[f for f in fourgrams if f[0] == 'coconut']


[('coconut', 'and', 'you', "'"),
 ('coconut', "'", 's', 'tropical'),
 ('coconut', '?', 'ARTHUR', ':'),
 ('coconut', '.', 'ARTHUR', ':'),
 ('coconut', 'back', 'anyway', '...'),
 ('coconut', 'on', 'a', 'line')]

In [22]:
from nltk.book import *
from nltk import word_tokenize
from nltk import pos_tag

# Tokenize the sentence, then label each token with a part-of-speech tag.
text = word_tokenize('Strange women lying in ponds distributing swords is no basis for a system of government.')

pos_tag(text)

[('Strange', 'JJ'),
 ('women', 'NNS'),
 ('lying', 'VBG'),
 ('in', 'IN'),
 ('ponds', 'NNS'),
 ('distributing', 'VBG'),
 ('swords', 'NNS'),
 ('is', 'VBZ'),
 ('no', 'DT'),
 ('basis', 'NN'),
 ('for', 'IN'),
 ('a', 'DT'),
 ('system', 'NN'),
 ('of', 'IN'),
 ('government', 'NN'),
 ('.', '.')]

In [32]:
# The same word can receive different tags depending on context: compare the
# two occurrences of 'dust' in the output.
text = word_tokenize('The dust was thick so he had to dust')
pos_tag(text)

[('The', 'DT'),
 ('dust', 'NN'),
 ('was', 'VBD'),
 ('thick', 'RB'),
 ('so', 'RB'),
 ('he', 'PRP'),
 ('had', 'VBD'),
 ('to', 'TO'),
 ('dust', 'VB')]

In [31]:
# NOTE(review): 'punkt' is already downloaded by an earlier cell, so this
# repeated call looks redundant — confirm before removing.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/RSpecht/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [33]:
# Tokenize one sentence and tag each token with its part of speech.
sentence = 'Google is one of the best companies in the world.'
tokenized = word_tokenize(sentence)
pos_tag(tokenized)

[('Google', 'NNP'),
 ('is', 'VBZ'),
 ('one', 'CD'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('best', 'JJS'),
 ('companies', 'NNS'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('world', 'NN'),
 ('.', '.')]

In [39]:
from nltk import word_tokenize, sent_tokenize, pos_tag
sentences = [
    'Google is one of the best companies in the world.',
    ' I constantly google myself to see what I\'m up to.'
]
# Part-of-speech tags that are treated as nouns here.
nouns = ['NN', 'NNS', 'NNP', 'NNPS']

# Print each sentence in which 'google' (any casing) is tagged as a noun.
# any() stops at the first match, so a sentence is printed at most once even
# if the word occurs in it as a noun several times.
for sentence in sentences:
    tagged = pos_tag(word_tokenize(sentence))
    if any(word.lower() == 'google' and tag in nouns for word, tag in tagged):
        print(sentence)


Google is one of the best companies in the world.
