In [4]:
# %load pandas_geoplotter_examples.py
#############
# Let's play with Pandas a little
#############

# Pandas lets you do pretty much anything you want with data sets.
# Typically, the most useful operations are selecting and merging:
# selecting lets you pick particular rows out of a data set;
# merging (sometimes called joining) combines two data sets into one.
# Let's read in a data set.

import pandas

In [5]:
# Data file is from http://introcs.cs.princeton.edu/java/data/
# and originally from the US Census bureau...
# https://www.census.gov/genealogy/www/data/2000surnames/
df = pandas.read_csv('surnames.csv')
# Pandas can also read many other kinds of data formats, e.g. excel

In [7]:
print df

           surname  percent  cumulative percent   rank
0            SMITH    1.006               1.006      1
1          JOHNSON    0.810               1.816      2
2         WILLIAMS    0.699               2.515      3
3            JONES    0.621               3.136      4
4            BROWN    0.621               3.757      5
5            DAVIS    0.480               4.237      6
6           MILLER    0.424               4.660      7
7           WILSON    0.339               5.000      8
8            MOORE    0.312               5.312      9
9           TAYLOR    0.311               5.623     10
10        ANDERSON    0.311               5.934     11
11          THOMAS    0.311               6.245     12
12         JACKSON    0.310               6.554     13
13           WHITE    0.279               6.834     14
14          HARRIS    0.275               7.109     15
15          MARTIN    0.273               7.382     16
16        THOMPSON    0.269               7.651     17
17        

In [6]:
# Lets see what columns the data set has

print df.columns

Index([u'surname', u'percent', u'cumulative percent', u'rank'], dtype='object')


In [9]:
df.surname == 'DIMITROV'

0        False
1        False
2        False
3        False
4        False
5        False
6        False
7        False
8        False
9        False
10       False
11       False
12       False
13       False
14       False
15       False
16       False
17       False
18       False
19       False
20       False
21       False
22       False
23       False
24       False
25       False
26       False
27       False
28       False
29       False
         ...  
88769    False
88770    False
88771    False
88772    False
88773    False
88774    False
88775    False
88776    False
88777    False
88778    False
88779    False
88780    False
88781    False
88782    False
88783    False
88784    False
88785    False
88786    False
88787    False
88788    False
88789    False
88790    False
88791    False
88792    False
88793    False
88794    False
88795    False
88796    False
88797    False
88798    False
Name: surname, dtype: bool

In [10]:
# Lets see how popular of a name DIMITROV is, this is an example of selecting

df[ df.surname == 'DIMITROV' ]

Unnamed: 0,surname,percent,cumulative percent,rank
64556,DIMITROV,0.0,87.968,64557


In [11]:
# What about DIMITROFF?

df[ df.surname == 'DIMITROFF' ]

Unnamed: 0,surname,percent,cumulative percent,rank
57597,DIMITROFF,0.0,87.063,57598


In [15]:
import re
# Example words to test the regular expression against
lstrings = ['hello', 'fred', 'a', 'apple', 'joe', 'microsoft']

# '.*.e..*' matches a word that has at least one character before an 'e'
# and at least one character after it.  re.match returns a match object
# (truthy) on success and None otherwise, so it can be used as a filter.
matches = [w for w in lstrings if re.match('.*.e..*', w)]
for w in matches:
    # Parenthesised single-argument print behaves the same on Python 2 and 3
    print(w)

hello
fred


In [20]:
ans = re.search('(.*)(.)e(.)(.*)', 'fadsfajred')
print ans.groups()

('fadsfaj', 'r', 'd', '')


In [23]:
# Get all the people whose surname starts with D

# Step 1: drop the rows whose surname is null
has_surname = df.surname.notnull()
df = df[has_surname]

# Step 2: keep only the surnames matching the regular expression 'D.*'
# (regular expression reference: https://docs.python.org/2/library/re.html)
starts_with_d = df.surname.str.match('D.*')
df[starts_with_d]

Unnamed: 0,surname,percent,cumulative percent,rank
5,DAVIS,0.480,4.237,6
98,DIAZ,0.084,18.742,99
138,DIXON,0.066,21.715,139
149,DANIELS,0.062,22.425,150
159,DUNN,0.058,23.022,160
176,DUNCAN,0.055,23.976,177
235,DEAN,0.045,26.925,236
244,DAY,0.043,27.322,245
256,DOUGLAS,0.041,27.827,257
261,DAVIDSON,0.041,28.032,262


In [31]:
# Ok, so DIMITROV is more popular... by the way, how do I pull out the cumulative percent?

# Select by boolean row mask and column label, then take the first matching
# value.  .loc is used because the older .ix indexer is deprecated and was
# removed in pandas 1.0.
df.loc[ df.surname == 'DIMITROV', 'cumulative percent'].values[0]

87.967999999999989

In [32]:
# It's weird that the column name has a space.  Let's change that.

# Grab the current column labels and show them before the change
cols = df.columns
print cols
# Replace spaces with underscores in every string label; non-string
# labels are passed through untouched
cols = cols.map(lambda x: x.replace(' ', '_') if isinstance(x, (str, unicode)) else x)
print cols
# Assign the cleaned labels back to the DataFrame and show the result
df.columns = cols
print df.columns

Index([u'surname', u'percent', u'cumulative percent', u'rank'], dtype='object')
['surname' 'percent' 'cumulative_percent' 'rank']
Index([u'surname', u'percent', u'cumulative_percent', u'rank'], dtype='object')


In [33]:
df.cumulative_percent 

0         1.006
1         1.816
2         2.515
3         3.136
4         3.757
5         4.237
6         4.660
7         5.000
8         5.312
9         5.623
10        5.934
11        6.245
12        6.554
13        6.834
14        7.109
15        7.382
16        7.651
17        7.905
18        8.140
19        8.372
20        8.603
21        8.832
22        9.058
23        9.278
24        9.497
25        9.698
26        9.897
27       10.090
28       10.282
29       10.472
          ...  
88769    90.480
88770    90.480
88771    90.480
88772    90.481
88773    90.481
88774    90.481
88775    90.481
88776    90.481
88777    90.481
88778    90.481
88779    90.481
88780    90.481
88781    90.481
88782    90.481
88783    90.482
88784    90.482
88785    90.482
88786    90.482
88787    90.482
88788    90.482
88789    90.482
88790    90.482
88791    90.482
88792    90.482
88793    90.483
88794    90.483
88795    90.483
88796    90.483
88797    90.483
88798    90.483
Name: cumulative_percent

In [34]:
# Now we can do:

df[ df.surname == 'DIMITROV' ].cumulative_percent.values[0]

87.967999999999989

In [35]:
# Ok, lets do some merging
df2 = pandas.read_csv('coolness.csv', skipinitialspace=True)
df2

Unnamed: 0,name,cool
0,DIMITROV,True
1,DIMITROFF,False


In [36]:
# We all know DIMITROV is cooler than DIMITROFF
# Now lets merge this with the original set

df.merge(df2, left_on=['surname'], right_on=['name'])

Unnamed: 0,surname,percent,cumulative_percent,rank,name,cool
0,DIMITROFF,0.0,87.063,57598,DIMITROFF,False
1,DIMITROV,0.0,87.968,64557,DIMITROV,True


In [37]:
# Where did all the other entries go?  Lets get them back...

merged = df.merge(df2, left_on=['surname'], right_on=['name'], how='left')
merged

Unnamed: 0,surname,percent,cumulative_percent,rank,name,cool
0,SMITH,1.006,1.006,1,,
1,JOHNSON,0.810,1.816,2,,
2,WILLIAMS,0.699,2.515,3,,
3,JONES,0.621,3.136,4,,
4,BROWN,0.621,3.757,5,,
5,DAVIS,0.480,4.237,6,,
6,MILLER,0.424,4.660,7,,
7,WILSON,0.339,5.000,8,,
8,MOORE,0.312,5.312,9,,
9,TAYLOR,0.311,5.623,10,,


In [38]:
merged.columns

# Ok, now let's set some value to all the other names...

# Have to be careful about Pandas copying data
# Doesn't work, gives us a warning: merged[mask] may return a copy, so the
# ['cool'] assignment can land on that temporary copy instead of on merged
merged[merged.cool.isnull()]['cool'] = False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [39]:
# Does work
merged.loc[ merged.cool.isnull(), 'cool'] = False

In [40]:
# What are the cool names?
merged[ merged.cool ]

Unnamed: 0,surname,percent,cumulative_percent,rank,name,cool
64554,DIMITROV,0.0,87.968,64557,DIMITROV,True


In [43]:
import pylab

In [51]:
# Great example :)

# Pandas does a ton more!  Just look online for what you need it to do
# For us, selecting is the main thing... merging only sometimes

###########################
# Plotting geographic data
###########################

import geoplotter

# Create a geoplotter object and draw the world
g = geoplotter.GeoPlotter()
g.clear()
g.drawWorld()

In [53]:
# Read shapefile data; this shapefile is from the US Census Bureau:
# ftp://ftp2.census.gov/geo/pvs/tiger2010st/48_Texas/48/tl_2010_48_county10.zip
# I then unzipped it in the current working directory
g.readShapefile('tl_2010_48_county10', 'txCounty')

In [48]:
# Check out some of the info associated with a shape
print g.m.txCounty_info[0], g.m.txCounty[0]


{'NAME10': 'Dallas', 'METDIVFP10': '19124', 'CLASSFP10': 'H1', 'COUNTYNS10': '01383842', 'AWATER10': 96696353, 'RINGNUM': 1, 'ALAND10': 2256602704, 'INTPTLAT10': '+32.7669866', 'LSAD10': '06', 'SHAPENUM': 1, 'FUNCSTAT10': 'A', 'NAMELSAD10': 'Dallas County', 'CSAFP10': '206', 'COUNTYFP10': '113', 'CBSAFP10': '19100', 'STATEFP10': '48', 'MTFCC10': 'G4020', 'GEOID10': '48113', 'INTPTLON10': '-096.7784238'} [(-96.52998699999999, 32.545282), (-96.53329599999999, 32.545299), (-96.54674399999999, 32.545353999999996), (-96.553062, 32.545386), (-96.560311, 32.545443), (-96.56652199999999, 32.545525999999995), (-96.566716, 32.545529), (-96.573951, 32.545547), (-96.578103, 32.545595), (-96.582882, 32.545564), (-96.58729699999999, 32.545637), (-96.59192999999999, 32.545657999999996), (-96.597456, 32.545682), (-96.606854, 32.545685999999996), (-96.60714, 32.545687), (-96.607462, 32.545688999999996), (-96.607902, 32.545688), (-96.60830399999999, 32.545685), (-96.612856, 32.545704), (-96.613085, 32.5

In [None]:
# Draw the shapes, the second argument just says "draw all of them"
# What if we want to just draw Travis county in orange?
import numpy
import scipy
# numpy.arange replaces scipy.arange, which was only a deprecated alias of
# the same NumPy function and is no longer provided by recent SciPy releases
all_shapes = numpy.arange(len( g.m.txCounty_info ))
g.drawShapes('txCounty', all_shapes, edgecolor='black', facecolor='white', lw=3)
pylab.show()

In [None]:
# If you look at the code for drawShapes, it uses matplotlib.patches.Polygon to draw the shapes
# So you can get help on that type of object to see what options you have for drawing

# Redraw the world
g.redraw()

# You might have to write some code to zoom down to the area you want.  Perhaps read geoplotter.setZoom?

# Keep in mind that if you want to "draw the U.S" the country might be made of multiple polygons.  
# So, you need to draw all those polygons in the way you'd like to see them.

####################################
# Python debugger and profiler
####################################

# The python debugger is extremely useful to step through your code and read variables
# Use it with %run -d <python_script> from ipython

# %run is an ipython "magic" command.  We've already seen cpaste; it's also an ipython command
# Read about the others with %magic

# prun lets you run the python profiler on statements, to see where/why your code is slow.


# the python debugger and profiler are also objects in the standard library, you can use them from there!

# finally, post-mortem debugging is really useful in ipython!
# e.g. divide by zero in a function, then drop into the debugger to see why

In [2]:
%magic 