# Flow Control, Data Types and IO

## Python special types 

* ```dict()```
* ```dir()```
* ```len()```
* ```list()```
* ```range()```
* ```reversed()```
* ```set()```
* ```slice()```
* ```sorted()```
* ```tuple()```

## Types and type conversion 
* ```ascii()```
* ```bin()```
* ```bytearray()```
* ```bytes()```
* ```chr()```
* ```complex()```
* ```float()```
* ```format()```
* ```hex()```
* ```int()```
* ```oct()```
* ```ord()```
* ```str()```

## Who knows? 
* ```enumerate()```
* ```getattr()```
* ```globals()```
* ```hasattr()```
* ```iter()```
* ```locals()```
* ```next()```
* ```open()```
* ```print()```
* ```repr()```
* ```setattr()```
* ```type()```
* ```zip()```


## Strings

In [None]:
x = 'Stephen Mildenhall'

In [None]:
x

In [None]:
x[0:7]

In [None]:
x[:7]

In [None]:
x[8:]

In [None]:
x[8:-4]

In [None]:
x[slice(8,-4)]

In [None]:
x[:]

In [None]:
x[::2]

In [None]:
x[::-1]

In [None]:
type(x)

In [None]:
dir(x)

In [None]:
x

In [None]:
y = x + ", St. John's University"
y

In [None]:
y.upper()

## Lists

In [None]:
s = x.split()
s

In [None]:
type(s)

In [None]:
s[0]

In [None]:
s[1]

In [None]:
s2 = list(x)
print(s2)

In [None]:
m = [123]

In [None]:
m

In [None]:
m + [345]

In [None]:
m

In [None]:
m + 345

In [None]:
m.append(346)

In [None]:
m

In [None]:
m = m + [234]
m

In [None]:
m = m + [435, 67]

In [None]:
m

In [None]:
m[4] = 'some'
m

In [None]:
m.append('thing')

In [None]:
m

In [None]:
m.pop()

In [None]:
m.pop(2)

In [None]:
m  # alters m in place 

In [None]:
# iterate over a list 
for i in m:
    print(i)

In [None]:
dir(m)

## List Comprehensions

In [None]:
m = [i*i for i in range(10)]

In [None]:
m

In [None]:
m = [i*i for i in range(10) if i % 3 == 0]

In [None]:
m

In [None]:
m = [[i*j for i in range(5)] for j in range(5)]

In [None]:
m

## Dictionaries 

In [None]:
d = {'first': 'Stephen'}

In [None]:
d

In [None]:
d['first']

In [None]:
d['last'] = 'Mildenhall'
d

In [None]:
d = dict(first='Stephen', last='Mildenhall')
d

In [None]:
d['age'] = 55
d['hair'] = 'brown'

In [None]:
d

In [None]:
for k, v in d.items():
    print(k, v)

In [None]:
d = {i: i*i for i in range(10)}
d

In [None]:
d[5]

## Tuples and Sets and Functions

In [None]:
t = (2, 4)

In [None]:
t

In [None]:
t[1]

In [None]:
t[1] = 5

In [None]:
dir(t)

In [None]:
print([i for i in dir(t) if i[0] != '_'])

In [None]:
def wdid(ob):
    # "what does it do?" -- print the names from dir(ob) that do not
    # start with an underscore
    public_names = []
    for attr_name in dir(ob):
        if attr_name[0] == '_':
            continue
        public_names.append(attr_name)
    print(public_names)

In [None]:
wdid(list)

In [None]:
def wdid(ob):
    '''
    wdid(ob)

    What does it do? Prints the "normal" attribute names of an object,
    i.e. the entries of dir(ob) that do not start with an underscore.

    Arguments:
    ob:    object to query

    Nothing is returned; the list of names is printed.
    '''
    visible = filter(lambda name: name[0] != '_', dir(ob))
    print(list(visible))

In [None]:
?wdid

In [None]:
wdid(tuple)

In [None]:
wdid(dict)

In [None]:
wdid(wdid)

In [None]:
dir(wdid)

In [None]:
wdid.attribute = 'asdf'

In [None]:
wdid(wdid)

In [None]:
wdid.f = wdid

In [None]:
wdid.f(dict)

In [None]:
wdid(wdid)

In [None]:
s = list('Stephen Mildenhall')
s

In [None]:
set(s)

In [None]:
# count characters in a string with a dictionary 
# d maps each distinct element of s to the number of times it has been seen so far
d = dict()
for c in s:
    if c in d:
        # seen before: bump the running count
        d[c] += 1
    else:
        # first occurrence: start the count at 1
        d[c] = 1
# last expression in the cell, so the notebook displays the finished dictionary
d

In [None]:
def counter(s):
    '''
    Count the elements of iterable s.

    Returns a dict mapping each distinct element to the number of times
    it occurs, with keys in order of first appearance.
    '''
    tally = {}
    for item in s:
        # .get supplies 0 the first time an element is seen
        tally[item] = tally.get(item, 0) + 1
    return tally

In [None]:
?counter

In [None]:
s = 'count characters in a string with a dictionary'
counter(s)

In [None]:
s.split()

In [None]:
# same function counts words 
counter(s.split())

# Let's do something interesting... 

## Word count for web pages
* Retrieve web page
* Extract text
* Break into words
* Count 

In [None]:
# need some dark arts...
import requests
import bs4

In [None]:
wdid(requests)

In [None]:
# optional pause for something more advanced... 
# for every public name in the requests module: print the name, underline
# it with '=' characters, then print that attribute's docstring.
# getattr() is the idiomatic way to look an attribute up by name, and the
# loop variable is called `name` so it does not clobber the list `m` above.
for name in [i for i in dir(requests) if i[0] != '_']:
    print(f'\n\n{name}\n{"="*len(name)}\n')
    print(getattr(requests, name).__doc__)

In [None]:
# if the imports above fail: pip install requests beautifulsoup4 lxml

In [None]:
url = 'https://en.wikipedia.org/wiki/Actuary'

In [None]:
r = requests.get(url)

In [None]:
wdid(r)

In [None]:
?r.content

In [None]:
?r.text

In [None]:
r.encoding

In [None]:
txt = r.text

In [None]:
len(txt)

In [None]:
len(txt.split())

In [None]:
counter(txt.split())

In [None]:
# need to tidy up and just get text 
soup = bs4.BeautifulSoup(txt, 'lxml')

In [None]:
ctxt = soup.text
ctxt[:1000]

In [None]:
text[:1000]

In [None]:
counter(text.split())

In [None]:
def get_text_req(r):
    '''
    Tidy up a URL response using Beautiful Soup.

    r is a response object whose .text attribute holds the page HTML.
    Returns the text of the <body> element, extracted with a newline as
    the separator, or None when the parsed document has no <body>.
    '''
    soup = bs4.BeautifulSoup(r.text, 'lxml')
    page_body = soup.body

    if page_body is not None:
        # the two biggest causes of mess are script and style elements,
        # so delete them from the tree before extracting the text
        for selector in ('script', 'style'):
            for element in page_body.select(selector):
                element.decompose()
        return page_body.get_text(separator='\n')

    return None

In [None]:
text = get_text_req(r)

In [None]:
text[:1000]

In [None]:
print(text[:1000])

In [None]:
# more common words from a comprehension 
[(k, v) for k, v in counter(text.split()).items() if v > 5]

In [None]:
# strip out garbage
wdid(str)

In [None]:
# what does isalpha give us? 
''.join(sorted([i  for i in set(text.lower()) if i.isalpha()]))

In [None]:
# what is it omitting? 
''.join(sorted(set([i for i in text.lower() if not i.isalpha()])))

In [None]:
''.join(set([i if i==' ' or i.isalpha() else ' '  for i in text.lower()]))

In [None]:
def super_counter(str_in, min_length=4):
    '''
    super_counter:
        split str_in into words and count them
        only count words with at least min_length characters
        case insensitive (the input is lower-cased first)
        every character that is not alphabetic (per str.isalpha) is
        replaced by a space, so punctuation, digits and newlines all
        act as word separators

    Returns a dict mapping word -> count, keys in order of first appearance.
    '''
    # normalise: lower case, then blank out anything that is not a letter
    # (a space stays a space either way)
    cleaned = ''.join(ch if ch.isalpha() else ' ' for ch in str_in.lower())

    # count the words that are long enough
    counts = {}
    for word in cleaned.split(' '):
        if len(word) < min_length:
            continue
        counts[word] = counts.get(word, 0) + 1

    return counts

In [None]:
d = super_counter(text)

In [None]:
# list of the most frequent words (more than 10 occurrences); not sorted yet - see below
fw = [(k, v) for k, v in d.items() if v > 10]
fw

In [None]:
sorted(fw)

In [None]:
?sorted

In [None]:
# lambda functions: on-the-fly functions 
f = lambda x : x * x
f(3)

In [None]:
# quiz: what does this function do when applied to an integer? 
f = lambda x : x * f(x-1) if x else 1
f(6)

In [None]:
# BTW
f(100)

In [None]:
# want to sort on the second element of each tuple: 
sorted(fw, key=lambda x : x[1], reverse=True )

In [None]:
?fw.sort

In [None]:
# enhance original function 
def super_counter(str_in, min_length=4, top=10):
    '''
    super_counter:
        split str_in into words and count them
        only count words with at least min_length characters
        return the top words in descending order of frequency
        case insensitive (the input is lower-cased first)
        every character that is not alphabetic (per str.isalpha) is
        replaced by a space, so punctuation, digits and newlines all
        act as word separators

    Returns a list of (word, count) tuples, at most top entries long.
    Words with equal counts stay in order of first appearance.
    '''
    # normalise: lower case, then blank out anything that is not a letter
    # (a space stays a space either way)
    lowered = str_in.lower()
    cleaned = ''.join(ch if ch.isalpha() else ' ' for ch in lowered)

    # count the words that are long enough
    counts = {}
    for word in cleaned.split(' '):
        if len(word) >= min_length:
            counts[word] = counts.get(word, 0) + 1

    # most frequent first; the sort is stable, so ties keep insertion order
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

    return ranked[:top]

In [None]:
super_counter(text, top=50)

In [None]:
# uber function 
def word_count_from_url(url, min_length=4, top=50):
    '''
    Fetch a web page and return its most frequent words.

    Arguments:
    url:         address of the page to download
    min_length:  passed to super_counter as its min_length argument
    top:         passed to super_counter as its top argument

    Returns whatever super_counter returns for the page text, or an empty
    list when get_text_req returns None (the response has no <body>).
    '''
    r = requests.get(url)
    text = get_text_req(r)
    if text is None:
        # no <body> to extract text from, so there is nothing to count;
        # without this guard super_counter would fail on None.lower()
        return []
    return super_counter(text, min_length, top)

In [None]:
word_count_from_url('https://en.wikipedia.org/wiki/New_York_City')

In [None]:
# uber function 
def wiki_word_count(page_name, min_length=4, top=50):
    '''
    Word count for an English Wikipedia article.

    Arguments:
    page_name:   article name as it appears at the end of the article URL,
                 for example Probability or New_York_City
    min_length:  passed through to word_count_from_url
    top:         passed through to word_count_from_url

    Returns the result of word_count_from_url for the article's URL.
    '''
    # build the article URL and reuse the general-purpose function instead
    # of repeating its fetch / extract / count steps here
    url = 'https://en.wikipedia.org/wiki/' + page_name
    return word_count_from_url(url, min_length, top)

In [None]:
wiki_word_count('Probability')