# Flow Control, Data Types and IO

## Python special types 

* ```dict()```
* ```dir()```
* ```len()```
* ```list()```
* ```range()```
* ```reversed()```
* ```set()```
* ```slice()```
* ```sorted()```
* ```tuple()```

## Types and type conversion 
* ```ascii()```
* ```bin()```
* ```bytearray()```
* ```bytes()```
* ```chr()```
* ```complex()```
* ```float()```
* ```format()```
* ```hex()```
* ```int()```
* ```oct()```
* ```ord()```
* ```str()```

## Other useful built-ins
* ```enumerate()```
* ```getattr()```
* ```globals()```
* ```hasattr()```
* ```iter()```
* ```locals()```
* ```next()```
* ```open()```
* ```print()```
* ```repr()```
* ```setattr()```
* ```type()```
* ```zip()```


## Strings

In [1]:
x = 'Stephen Mildenhall'

In [2]:
x

'Stephen Mildenhall'

In [4]:
x[0:7]

'Stephen'

In [5]:
x[:7]

'Stephen'

In [7]:
x[8:]

'Mildenhall'

In [8]:
x[8:-4]

'Milden'

In [9]:
x[slice(8,-4)]

'Milden'

In [10]:
x[:]

'Stephen Mildenhall'

In [14]:
x[::2]

'SehnMlehl'

In [15]:
x[::-1]

'llahnedliM nehpetS'

In [16]:
type(x)

str

In [17]:
dir(x)

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmod__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'capitalize',
 'casefold',
 'center',
 'count',
 'encode',
 'endswith',
 'expandtabs',
 'find',
 'format',
 'format_map',
 'index',
 'isalnum',
 'isalpha',
 'isdecimal',
 'isdigit',
 'isidentifier',
 'islower',
 'isnumeric',
 'isprintable',
 'isspace',
 'istitle',
 'isupper',
 'join',
 'ljust',
 'lower',
 'lstrip',
 'maketrans',
 'partition',
 'replace',
 'rfind',
 'rindex',
 'rjust',
 'rpartition',
 'rsplit',
 'rstrip',
 'split',
 'splitlines',
 'startswith',
 'strip',
 'swapcase',
 'title',
 'translate',
 'upper',
 'zfill']

In [192]:
x

'Stephen Mildenhall'

In [195]:
y = x + ", St. John's University"
y

"Stephen Mildenhall, St. John's University"

In [197]:
y.upper()

"STEPHEN MILDENHALL, ST. JOHN'S UNIVERSITY"

## Lists

In [19]:
s = x.split()
s

['Stephen', 'Mildenhall']

In [20]:
type(s)

list

In [21]:
s[0]

'Stephen'

In [22]:
s[1]

'Mildenhall'

In [34]:
s2 = list(x)
print(s2)

['S', 't', 'e', 'p', 'h', 'e', 'n', ' ', 'M', 'i', 'l', 'd', 'e', 'n', 'h', 'a', 'l', 'l']


"['S', 't', 'e', 'p', 'h', 'e', 'n', ' ', 'M', 'i', 'l', 'd', 'e', 'n', 'h', 'a', 'l', 'l']"

In [53]:
m = [123]

In [54]:
m

[123]

In [55]:
m + [345]

[123, 345]

In [56]:
m = m + [234]
m

[123, 234]

In [57]:
m = m + [435, 67]

In [58]:
m

[123, 234, 435, 67]

In [59]:
m[4] = 'some'

IndexError: list assignment index out of range

In [60]:
m.append('some')

In [61]:
m

[123, 234, 435, 67, 'some']

In [62]:
m.pop()

'some'

In [63]:
m.pop(2)

435

In [64]:
m  # alters m in place 

[123, 234, 67]

In [66]:
# iterate over a list 
for i in m:
    print(i)

123
234
67


In [67]:
dir(m)

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']

## List Comprehensions

In [68]:
m = [i*i for i in range(10)]

In [69]:
m

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [71]:
m = [i*i for i in range(10) if i % 3 == 0]

In [72]:
m

[0, 9, 36, 81]

In [73]:
m = [[i*j for i in range(5)] for j in range(5)]

In [74]:
m

[[0, 0, 0, 0, 0],
 [0, 1, 2, 3, 4],
 [0, 2, 4, 6, 8],
 [0, 3, 6, 9, 12],
 [0, 4, 8, 12, 16]]

## Dictionaries 

In [75]:
d = {'first': 'Stephen'}

In [76]:
d

{'first': 'Stephen'}

In [79]:
d['first']

'Stephen'

In [77]:
d = dict(first='Stephen', last='Mildenhall')
d

{'first': 'Stephen', 'last': 'Mildenhall'}

In [80]:
d['age'] = 55
d['hair'] = 'brown'

In [81]:
d

{'first': 'Stephen', 'last': 'Mildenhall', 'age': 55, 'hair': 'brown'}

In [83]:
for k, v in d.items():
    print(k, v)

first Stephen
last Mildenhall
age 55
hair brown


In [84]:
d = {i: i*i for i in range(10)}
d

{0: 0, 1: 1, 2: 4, 3: 9, 4: 16, 5: 25, 6: 36, 7: 49, 8: 64, 9: 81}

In [85]:
d[5]

25

## Tuples and Sets and Functions

In [86]:
t = (2, 4)

In [87]:
t

(2, 4)

In [88]:
t[1]

4

In [89]:
t[1] = 5

TypeError: 'tuple' object does not support item assignment

In [90]:
dir(t)

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'count',
 'index']

In [91]:
print([i for i in dir(t) if i[0] != '_'])

['count', 'index']


In [113]:
def wdid(ob):
    # collect every name reported by dir() that does not start with '_'
    public = []
    for name in dir(ob):
        if name[0] == '_':
            continue
        public.append(name)
    print(public)

In [114]:
wdid(list)

['append', 'clear', 'copy', 'count', 'extend', 'index', 'insert', 'pop', 'remove', 'reverse', 'sort']


In [115]:
def wdid(ob):
    '''
    wdid(ob)

    What does it do? Prints the "normal" attributes of an object: every
    name returned by dir(ob) that does not start with an underscore.
    Arguments:
    ob:    object to query
    '''
    names = dir(ob)
    normal = [name for name in names if name[0] != '_']
    print(normal)

In [116]:
?wdid

Signature: wdid(ob)
Docstring:
wdid(ob)

What does it do? Prints the"normal" methods of an object.
Arguments:
ob:    object to query 
File:      c:\s\telos\python\temp\paw_rpm\notebooks\<ipython-input-115-e90194209ea3>
Type:      function


In [117]:
wdid(tuple)

['count', 'index']


In [118]:
wdid(dict)

['clear', 'copy', 'fromkeys', 'get', 'items', 'keys', 'pop', 'popitem', 'setdefault', 'update', 'values']


In [123]:
wdid(wdid)

[]


In [124]:
dir(wdid)

['__annotations__',
 '__call__',
 '__class__',
 '__closure__',
 '__code__',
 '__defaults__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__get__',
 '__getattribute__',
 '__globals__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__kwdefaults__',
 '__le__',
 '__lt__',
 '__module__',
 '__name__',
 '__ne__',
 '__new__',
 '__qualname__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']

In [125]:
wdid.attribute = 'asdf'

In [129]:
wdid(wdid)

['attribute', 'f']


In [127]:
wdid.f = wdid

In [128]:
wdid.f(dict)

['clear', 'copy', 'fromkeys', 'get', 'items', 'keys', 'pop', 'popitem', 'setdefault', 'update', 'values']


In [132]:
wdid(wdid)

['attribute', 'f']


In [97]:
s = list('Stephen Mildenhall')
s

['S',
 't',
 'e',
 'p',
 'h',
 'e',
 'n',
 ' ',
 'M',
 'i',
 'l',
 'd',
 'e',
 'n',
 'h',
 'a',
 'l',
 'l']

In [98]:
set(s)

{' ', 'M', 'S', 'a', 'd', 'e', 'h', 'i', 'l', 'n', 'p', 't'}

In [100]:
# count characters in a string with a dictionary 
d = dict()
for c in s:
    if c in d:
        d[c] += 1
    else:
        d[c] = 1
d

{'S': 1,
 't': 1,
 'e': 3,
 'p': 1,
 'h': 2,
 'n': 2,
 ' ': 1,
 'M': 1,
 'i': 1,
 'l': 3,
 'd': 1,
 'a': 1}

In [121]:
def counter(s):
    '''
    count how many times each element of the iterable s occurs;
    returns a dict mapping element -> number of occurrences
    '''
    tally = {}
    for item in s:
        # a missing element starts from zero
        tally[item] = tally.get(item, 0) + 1
    return tally

In [122]:
?counter

Signature: counter(s)
Docstring: count elements of iteratble s
File:      c:\s\telos\python\temp\paw_rpm\notebooks\<ipython-input-121-c545cc6a2585>
Type:      function


In [105]:
s = 'count characters in a string with a dictionary'
counter(s)

{'c': 4,
 'o': 2,
 'u': 1,
 'n': 4,
 't': 5,
 ' ': 8,
 'h': 2,
 'a': 5,
 'r': 4,
 'e': 1,
 's': 2,
 'i': 5,
 'g': 1,
 'w': 1,
 'd': 1,
 'y': 1}

In [106]:
s.split()

['count', 'characters', 'in', 'a', 'string', 'with', 'a', 'dictionary']

In [107]:
# same function counts words 
counter(s.split())

{'count': 1,
 'characters': 1,
 'in': 1,
 'a': 2,
 'string': 1,
 'with': 1,
 'dictionary': 1}

# Let's do something interesting... 

## Word count for web pages
* Retrieve web page
* Extract text
* Break into words
* Count 

In [133]:
def list_of_words(str_in, min_length=4):
    '''
    Parse str_in into list of unique words of length >= min_length

    Words are the whitespace-separated pieces of str_in, compared exactly as
    written (case sensitive, punctuation kept). Each word is listed once, in
    the order it is first seen.

    :param str_in: text to split into words
    :param min_length: shortest word, in characters, that is kept
    :return: list of the distinct words with len(word) >= min_length
    '''
    long_enough = (w for w in str_in.split() if len(w) >= min_length)
    # dict keys are unique and keep insertion order, so this removes
    # duplicates while preserving first-appearance order
    return list(dict.fromkeys(long_enough))


In [None]:
def save_words_from_url(url, fn):
    '''
    Download URL, save its text to file fn and return word frequencies

    The page is fetched with requests, decoded as UTF-8 and reduced to text
    with BeautifulSoup (lxml parser). That text is written to fn, then split
    on whitespace; words of at least 4 characters are counted.

    :param url: address of the page to download
    :param fn: name of the file the extracted text is written to (UTF-8)
    :return: pandas DataFrame indexed by word with one column, 'Freq',
             sorted from most to least frequent
    '''
    # imported locally: the visible part of this notebook imports neither name
    from collections import Counter
    import pandas as pd

    data = requests.get(url).content
    code = data.decode('utf-8')
    text = bs4.BeautifulSoup(code, 'lxml').text
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(text)

    # count the words directly: a frequency table needs every occurrence,
    # not a list of unique words
    min_length = 4  # same threshold as the default of list_of_words
    word_dict = Counter(w for w in text.split() if len(w) >= min_length)

    # explicit index/values also gives a valid (empty) frame with a 'Freq'
    # column when no word qualifies
    D = pd.DataFrame({'Freq': list(word_dict.values())},
                     index=list(word_dict.keys()))
    return D.sort_values('Freq', ascending=False)

In [163]:
import requests
import bs4
import lxml

In [141]:
# if an import above fails, install the missing package with pip (requests, beautifulsoup4, lxml)

In [142]:
url = 'https://en.wikipedia.org/wiki/Actuary'

In [143]:
r = requests.get(url)

In [144]:
wdid(r)

['apparent_encoding', 'close', 'connection', 'content', 'cookies', 'elapsed', 'encoding', 'headers', 'history', 'is_permanent_redirect', 'is_redirect', 'iter_content', 'iter_lines', 'json', 'links', 'next', 'ok', 'raise_for_status', 'raw', 'reason', 'request', 'status_code', 'text', 'url']


In [150]:
?r.content

Type:        property
String form: <property object at 0x0000029A70CBE638>
Docstring:   Content of the response, in bytes.


In [151]:
?r.text

Type:        property
String form: <property object at 0x0000029A70CBE688>
Docstring:
Content of the response, in unicode.

If Response.encoding is None, encoding will be guessed using
``chardet``.

The encoding of the response content is determined based solely on HTTP
headers, following RFC 2616 to the letter. If you can take advantage of
non-HTTP knowledge to make a better guess at the encoding, you should
set ``r.encoding`` appropriately before accessing this property.


In [152]:
r.encoding

'UTF-8'

In [153]:
txt = r.text

In [154]:
len(txt)

172592

In [155]:
len(txt.split())

9705

In [None]:
counter(txt.split())

In [157]:
# need to tidy up and just get text 
soup = bs4.BeautifulSoup(txt, 'lxml')

In [158]:
ctxt = soup.text
ctxt[:1000]

'\n\n\nActuary - Wikipedia\ndocument.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );\n(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Actuary","wgTitle":"Actuary","wgCurRevisionId":883988464,"wgRevisionId":883988464,"wgArticleId":43405,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Wikipedia indefinitely move-protected pages","Use mdy dates from January 2014","Use Harvard referencing from August 2014","Articles containing potentially dated statements from 2016","All articles containing potentially dated statements","Articles containing potentially dated statements from 2014","CS1 maint: BOT: original-url status unknown","Featured articles","Actuarial science","Mathematical science occupations","Financial servi

In [160]:
text[:1000]

'\n\n\nActuary - Wikipedia\ndocument.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );\n(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Actuary","wgTitle":"Actuary","wgCurRevisionId":883988464,"wgRevisionId":883988464,"wgArticleId":43405,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Wikipedia indefinitely move-protected pages","Use mdy dates from January 2014","Use Harvard referencing from August 2014","Articles containing potentially dated statements from 2016","All articles containing potentially dated statements","Articles containing potentially dated statements from 2014","CS1 maint: BOT: original-url status unknown","Featured articles","Actuarial science","Mathematical science occupations","Financial servi

In [None]:
counter(text.split())

In [167]:
def get_text_req(r):
    '''
    Extract the text of the <body> of a downloaded page

    Parses r.text with BeautifulSoup (lxml parser), removes every <script>
    and <style> element found inside <body>, then collects the remaining
    text using a newline as separator.

    :param r: response object; only its .text attribute is read
    :return: text of the page body, or None if the page has no <body>
    '''
    body = bs4.BeautifulSoup(r.text, 'lxml').body
    if body is not None:
        # scripts first, then styles: neither is readable page content
        for selector in ('script', 'style'):
            for tag in body.select(selector):
                tag.decompose()
        return body.get_text(separator='\n')
    return None

In [169]:
text = get_text_req(r)

In [170]:
text[:1000]

' \n\n\n\n\n\n\n\n\n\n\n\n\n\n\nActuary\n \n\n\nFrom Wikipedia, the free encyclopedia\n \n\n\n \nJump to navigation\n\n\nJump to search\n\n\nBusiness professional who deals with the financial impact of risk and uncertainty\n\n\n\n\n\n\nActuary\nDamage from \nHurricane Katrina\n in 2005. Actuaries need to estimate long-term levels of such damage in order to accurately price property insurance, set appropriate \nreserves\n, and design appropriate \nreinsurance\n and capital management strategies.\nOccupation\nNames\nActuary\nOccupation type\nProfession\nActivity sectors\nInsurance\n, \nReinsurance\n, \nPension plans\n, \nSocial welfare programs\nDescription\nCompetencies\nMathematics\n, \nfinance\n, analytical skills, business knowledge\nEducation required\nSee \nCredentialing and exams\nFields of\nemployment\nInsurance companies, superannuation funds, consulting firms and government\nRelated jobs\nUnderwriter\n\n\nAn \nactuary\n is a business professional who deals with the measurement 

In [None]:
[(k, v) for k, v in counter(text.split()).items() if v > 5]

In [173]:
def super_counter(str_in, min_length=4):
    '''
    super_counter:
        split str_in into words and count
        only count words >= min_length
        case insensitive
        strip out unicode characters

    A word is a run of the letters a-z after lower-casing; every other
    character (digits, punctuation, whitespace, accented or other non-ASCII
    characters) acts as a separator.

    :param str_in: text to analyse
    :param min_length: shortest word, in characters, that is counted
    :return: dict mapping each word to its number of occurrences
    '''
    # lower case, then blank out everything that is not a letter a-z;
    # the range test alone already rejects every non-ASCII character
    cleaned = ''.join(ch if 'a' <= ch <= 'z' else ' ' for ch in str_in.lower())

    # split() without an argument drops the empty strings that runs of
    # blanks would produce, so '' is never counted even if min_length <= 0
    low = [w for w in cleaned.split() if len(w) >= min_length]

    # count, as before
    dow = dict()
    for w in low:
        dow[w] = dow.get(w, 0) + 1

    return dow

In [177]:
d = super_counter(text)

In [179]:
# sorted list of most frequent words
fw = [(k, v) for k, v in d.items() if v > 20]
fw

[('actuary', 55),
 ('from', 31),
 ('financial', 26),
 ('risk', 25),
 ('actuaries', 81),
 ('insurance', 43),
 ('actuarial', 53),
 ('their', 33),
 ('that', 31),
 ('life', 25),
 ('society', 29),
 ('retrieved', 68),
 ('april', 21)]

In [180]:
sorted(fw)

[('actuarial', 53),
 ('actuaries', 81),
 ('actuary', 55),
 ('april', 21),
 ('financial', 26),
 ('from', 31),
 ('insurance', 43),
 ('life', 25),
 ('retrieved', 68),
 ('risk', 25),
 ('society', 29),
 ('that', 31),
 ('their', 33)]

In [181]:
?sorted

Signature: sorted(iterable, /, *, key=None, reverse=False)
Docstring:
Return a new list containing all items from the iterable in ascending order.

A custom key function can be supplied to customize the sort order, and the
reverse flag can be set to request the result in descending order.
Type:      builtin_function_or_method


In [182]:
sorted(fw, key=lambda x : x[1], reverse=True )

[('actuaries', 81),
 ('retrieved', 68),
 ('actuary', 55),
 ('actuarial', 53),
 ('insurance', 43),
 ('their', 33),
 ('from', 31),
 ('that', 31),
 ('society', 29),
 ('financial', 26),
 ('risk', 25),
 ('life', 25),
 ('april', 21)]

In [183]:
?fw.sort

Docstring: L.sort(key=None, reverse=False) -> None -- stable sort *IN PLACE*
Type:      builtin_function_or_method


In [186]:
# enhance original function 
def super_counter(str_in, min_length=4, top=10):
    '''
    super_counter:
        split str_in into words and count
        only count words >= min_length
        return top words in desc order of frequency
        case insensitive
        strip out unicode characters

    A word is a run of the letters a-z after lower-casing; every other
    character (digits, punctuation, whitespace, accented or other non-ASCII
    characters) acts as a separator.

    :param str_in: text to analyse
    :param min_length: shortest word, in characters, that is counted
    :param top: number of (word, count) pairs to return
    :return: list of (word, count) tuples, most frequent first; words with
             equal counts stay in order of first appearance
    '''
    # lower case, then blank out everything that is not a letter a-z;
    # the range test alone already rejects every non-ASCII character
    cleaned = ''.join(ch if 'a' <= ch <= 'z' else ' ' for ch in str_in.lower())

    # split() without an argument drops the empty strings that runs of
    # blanks would produce, so '' is never counted even if min_length <= 0
    low = [w for w in cleaned.split() if len(w) >= min_length]

    # count, as before
    dow = dict()
    for w in low:
        dow[w] = dow.get(w, 0) + 1

    # sorted() is stable, so ties keep the dict's first-appearance order
    ranked = sorted(dow.items(), key=lambda item: item[1], reverse=True)

    return ranked[:top]

In [187]:
super_counter(text)

[('actuaries', 81),
 ('retrieved', 68),
 ('actuary', 55),
 ('actuarial', 53),
 ('insurance', 43),
 ('their', 33),
 ('from', 31),
 ('that', 31),
 ('society', 29),
 ('financial', 26)]

In [188]:
# uber function 
def word_count_from_url(url, min_length=4, top=50):
    '''
    Download a web page and return its most frequent words

    :param url: address of the page to download
    :param min_length: shortest word that is counted (passed to super_counter)
    :param top: how many words to return (passed to super_counter)
    :return: the result of super_counter for the page text, or an empty
             list when get_text_req finds no <body> in the page
    '''
    r = requests.get(url)
    text = get_text_req(r)
    if text is None:
        # get_text_req returns None for a page without a <body>; there is
        # nothing to count and super_counter would fail calling .lower()
        return []
    return super_counter(text, min_length=min_length, top=top)

In [190]:
word_count_from_url('https://en.wikipedia.org/wiki/New_York_City')

[('york', 950),
 ('city', 722),
 ('retrieved', 453),
 ('from', 227),
 ('world', 186),
 ('united', 168),
 ('states', 155),
 ('with', 149),
 ('manhattan', 148),
 ('july', 131),
 ('park', 129),
 ('largest', 121),
 ('original', 119),
 ('island', 117),
 ('archived', 116),
 ('population', 102),
 ('times', 100),
 ('most', 98),
 ('february', 98),
 ('october', 92),
 ('august', 90),
 ('center', 89),
 ('american', 89),
 ('september', 87),
 ('brooklyn', 82),
 ('state', 80),
 ('area', 78),
 ('june', 75),
 ('march', 75),
 ('national', 72),
 ('cities', 68),
 ('queens', 66),
 ('census', 64),
 ('april', 64),
 ('which', 63),
 ('also', 62),
 ('than', 61),
 ('county', 60),
 ('north', 59),
 ('that', 57),
 ('university', 56),
 ('bridge', 55),
 ('metropolitan', 55),
 ('public', 55),
 ('river', 55),
 ('square', 54),
 ('bronx', 53),
 ('january', 53),
 ('department', 53),
 ('central', 52)]