# Networking
- 'socket' module has low level networking
  - unpleasant to use
  - you will probably never need it, as almost everything is HTTP
- 'urllib' will snarf HTTP content
- 'urllib.request.urlopen' (Python 3; it was 'urllib.urlopen' in Python 2) returns an object similar to a file, with a few extra methods, so you can iterate over the lines, or grab the entire page in one read

In [1]:
import urllib.request

cu = 'http://columbia.edu'

# urlopen returns a response object that represents the connection -
# it behaves like an open file object (it can be read and iterated over)
# and also carries the response headers
cur = urllib.request.urlopen(cu)

In [2]:
# headers from the server

list(cur.headers.items())

[('Date', 'Fri, 30 Sep 2016 18:53:32 GMT'),
 ('Server', 'Apache'),
 ('Accept-Ranges', 'bytes'),
 ('Vary', 'Accept-Encoding,User-Agent'),
 ('Transfer-Encoding', 'chunked'),
 ('Content-Type', 'text/html'),
 ('Connection', 'close'),
 ('Set-Cookie',
  'BIGipServer~CUIT~www.columbia.edu-80-pool=1311259520.20480.0000; expires=Sat, 01-Oct-2016 00:53:32 GMT; path=/')]

In [3]:
# headers is a dictionary-like object (an http.client.HTTPMessage) -
# index it by header name

cur.headers['Server']

'Apache'

In [6]:
# similar to a file object -
# the network connection itself is an iterator

cur is iter(cur)

True

In [7]:
# usual iteration protocol reads one line at a time
# note the lines coming back are bytes objects (b'...'), not strings
# urllib doesn't know or try to guess what encoding is being used
# by the server

[next(cur), next(cur)]

[b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n',
 b'<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">\n']

In [8]:
# grab the rest of the lines with 'list'
# the two lines already consumed by next() above are not included,
# so lines[0] is the third line of the page

lines = list(cur)
lines[10:20]

[b'<meta name="viewport" content="maximum-scale=1.0, user-scalable=yes" />\n',
 b'<link type="text/css" rel="stylesheet" media="all" href="modules/node/node.css" />\n',
 b'<link type="text/css" rel="stylesheet" media="all" href="modules/system/defaults.css" />\n',
 b'<link type="text/css" rel="stylesheet" media="all" href="modules/system/system.css" />\n',
 b'<link type="text/css" rel="stylesheet" media="all" href="modules/system/system-menus.css" />\n',
 b'<link type="text/css" rel="stylesheet" media="all" href="modules/user/user.css" />\n',
 b'<link type="text/css" rel="stylesheet" media="all" href="sites/all/modules/contrib/cck/theme/content-module.css" />\n',
 b'<link type="text/css" rel="stylesheet" media="all" href="sites/all/modules/contrib/ckeditor/ckeditor.css" />\n',
 b'<link type="text/css" rel="stylesheet" media="all" href="sites/all/modules/contrib/ctools/css/ctools.css" />\n',
 b'<link type="text/css" rel="stylesheet" media="all" href="sites/all/modules/contrib/date/date.

In [None]:
# or read lines with a for loop
# the 'with' block closes the connection when the loop finishes,
# even if an error interrupts it part way through

with urllib.request.urlopen(cu) as cur:
    for line in cur:
        print(line)


In [10]:
# ...to get python unicode strings, must decode the bytes
# web sites written in English mostly use utf-8 because it is efficient
# utf-8 is a guess here - the Content-Type header above declares no charset

lines[10].decode('utf-8')

'<meta name="viewport" content="maximum-scale=1.0, user-scalable=yes" />\n'

# requests
- alternative to urllib
- may be easier for complex tasks

In [11]:
import requests

# fetch the page with a single GET request
r = requests.get('http://columbia.edu')
# HTTP status code - 200 means the request succeeded
print(r.status_code)
# response headers sent by the server
print(r.headers)
# encoding requests uses to decode the body into r.text -
# when the Content-Type header names no charset, requests
# defaults to ISO-8859-1 for text content
print(r.encoding)
# r.text is one string - split it into lines
lines = r.text.split('\n')
lines[10:20]


200
{'Accept-Ranges': 'bytes', 'Server': 'Apache', 'Content-Encoding': 'gzip', 'Connection': 'Keep-Alive', 'Vary': 'Accept-Encoding,User-Agent', 'Content-Type': 'text/html', 'Content-Length': '7360', 'Set-Cookie': 'BIGipServer~CUIT~www.columbia.edu-80-pool=1730689920.20480.0000; expires=Sat, 01-Oct-2016 00:56:48 GMT; path=/', 'Date': 'Fri, 30 Sep 2016 18:56:48 GMT', 'Keep-Alive': 'timeout=15, max=42'}
ISO-8859-1


['<link rel="shortcut icon" href="sites/all/themes/base/columbia2/images/favicon-crown.png" type="image/x-icon" />',
 '<script type="text/javascript" src="sites/all/modules/ias/mdetect/mdetect.js"></script>',
 '<meta name="viewport" content="maximum-scale=1.0, user-scalable=yes" />',
 '<link type="text/css" rel="stylesheet" media="all" href="modules/node/node.css" />',
 '<link type="text/css" rel="stylesheet" media="all" href="modules/system/defaults.css" />',
 '<link type="text/css" rel="stylesheet" media="all" href="modules/system/system.css" />',
 '<link type="text/css" rel="stylesheet" media="all" href="modules/system/system-menus.css" />',
 '<link type="text/css" rel="stylesheet" media="all" href="modules/user/user.css" />',
 '<link type="text/css" rel="stylesheet" media="all" href="sites/all/modules/contrib/cck/theme/content-module.css" />',
 '<link type="text/css" rel="stylesheet" media="all" href="sites/all/modules/contrib/ckeditor/ckeditor.css" />']

In [12]:
type(r.text)

str

# Easy to make a simple web server

In [None]:
# will serve files in the current directory

import http.server
import socketserver

PORT = 8001

Handler = http.server.SimpleHTTPRequestHandler

# "" binds to all network interfaces on this machine
# the 'with' block closes the listening socket when the server stops
# (e.g. on an interrupt), so the port is released for the next run
with socketserver.TCPServer(("", PORT), Handler) as httpd:
    print("serving at port", PORT)
    httpd.serve_forever()

# 'Real' python web servers
- two main ones are Flask and Django
- Django [doc](https://www.djangoproject.com)
- Flask [doc](http://flask.pocoo.org)