# Processing Raw Text #

## Accessing Text from the Web and from Disk##

Text number 2554 is an English translation of Crime and Punishment, and we can access it as follows.

In [4]:
from __future__ import division  # Python 2 users only
import nltk, re, pprint
from nltk import word_tokenize

In [1]:
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')
type(raw)

str

In [2]:
len(raw)

1176967

In [3]:
raw[:75]

'\ufeffThe Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky\r'

##  tokenization ##

In [5]:
tokens = word_tokenize(raw)
type(tokens)

list

In [6]:
len(tokens)

257727

In [7]:
tokens[:10]

['\ufeffThe',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'Crime',
 'and',
 'Punishment',
 ',',
 'by']

In [9]:
text = nltk.Text(tokens)
text

<Text: ﻿The Project Gutenberg EBook of Crime and Punishment...>

In [10]:
text[1024:1062]

['an',
 'exceptionally',
 'hot',
 'evening',
 'early',
 'in',
 'July',
 'a',
 'young',
 'man',
 'came',
 'out',
 'of',
 'the',
 'garret',
 'in',
 'which',
 'he',
 'lodged',
 'in',
 'S.',
 'Place',
 'and',
 'walked',
 'slowly',
 ',',
 'as',
 'though',
 'in',
 'hesitation',
 ',',
 'towards',
 'K.',
 'bridge',
 '.',
 'He',
 'had',
 'successfully']

to discover unique strings that mark the beginning and the end, before trimming raw to be just the content and nothing else:

In [12]:
raw.find("PART I")

5336

In [21]:
raw.find("end of Project Gutenberg's Crime")
#when it finds the right value

-1

In [14]:
raw = raw[5338:1157743]
#choose the index between part 1 and end

In [15]:
raw.find("PART I")
#part I should be found in the beginning as 0 

195769

## Dealing with HTML ##

BBC News story called Blondes to die out in 200 years

In [22]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = request.urlopen(url).read().decode('utf8')
html[:60]

'<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN'

In [None]:
print(html)

In [None]:
#To get text out of HTML we will use a Python library called BeautifulSoup#
from bs4 import BeautifulSoup
raw = BeautifulSoup(html, 'html.parser').get_text()
tokens = word_tokenize(raw)
tokens

In [25]:
tokens = tokens[110:390]
text = nltk.Text(tokens)
text.concordance('gene')

Displaying 5 of 5 matches:
hey say too few people now carry the gene for blondes to last beyond the next 
blonde hair is caused by a recessive gene . In order for a child to have blond
 have blonde hair , it must have the gene on both sides of the family in the g
ere is a disadvantage of having that gene or by chance . They do n't disappear
des would disappear is if having the gene was a disadvantage and I do not thin


## Processing RSS Feeds ##

In [None]:
import feedparser
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
llog['feed']['title']
#'Language Log'
len(llog.entries)
#15
post = llog.entries[2]
post.title
#"He's My BF"
content = post.content[0].value
content[:70]
#'<p>Today I was chatting with three of our visiting graduate students f'
raw = BeautifulSoup(content, 'html.parser').get_text()
word_tokenize(raw)
#['Today', 'I', 'was', 'chatting', 'with', 'three', 'of', 'our', 'visiting',
#'graduate', 'students', 'from', 'the', 'PRC', '.', 'Thinking', 'that', 'I',
#'was', 'being', 'au', 'courant', ',', 'I', 'mentioned', 'the', 'expression',
#'DUI4XIANG4', '\u5c0d\u8c61', '("', 'boy', '/', 'girl', 'friend', '"', ...]

## Reading Local Files ##

In [None]:
import os
os.listdir('.')
f = open('document.txt')
raw = f.read()
for line in f:
    print(line.strip())

NLTK's corpus files can also be accessed using these methods. We simply have to use nltk.data.find() to get the filename for any corpus item. Then we can open and read it in the way we just demonstrated above:

In [None]:
path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
raw = open(path, 'rU').read()

## Extracting encoded text from files##

In [None]:
f = open(path, encoding='latin2')
for line in f:
    line = line.strip()
    print(line.encode('unicode_escape'))
#b'Pruska Biblioteka Pa\\u0144stwowa. Jej dawne zbiory znane pod nazw\\u0105'
#b'"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez'
#b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y'
#b'odnalezione po 1945 r. na terytorium Polski. Trafi\\u0142y do Biblioteki'
#b'Jagiello\\u0144skiej w Krakowie, obejmuj\\u0105 ponad 500 tys. zabytkowych'
#b'archiwali\\xf3w, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.'

In [28]:
nacute = '\u0144'
nacute

'ń'

In [29]:
nacute.encode('utf8')

b'\xc5\x84'

## Regular Expressions for Detecting Word Patterns ##

In [31]:
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]

### Using Basic Meta-Characters ###

In [None]:
[w for w in wordlist if re.search('ed$', w)]
#['abaissed', 'abandoned', 'abased', 'abashed', 'abatised', 'abed', 'aborted', ...]

The . wildcard symbol matches any single character. Suppose we have room in a crossword puzzle for an 8-letter word with j as its third letter and t as its sixth letter. In place of each blank cell we use a period:

In [None]:
[w for w in wordlist if re.search('^..j..t..$', w)]
#['abjectly', 'adjuster', 'dejected', 'dejectly', 'injector', 'majestic', ...]