## Exploring `Series` and `DataFrame` Objects
### Working with pandas
*Curtis Miller*

Let's create some `Series`.

In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

In [2]:
# Two small Series that rely on the default integer index:
# one of integers and one of single-character strings.
ser1 = Series(data=[1, 2, 3, 4])
ser2 = Series(data=list("abc"))
print(ser1)

0    1
1    2
2    3
3    4
dtype: int64


In [3]:
# Show the second Series (the one holding strings) with its default integer index
print(ser2)

0    a
1    b
2    c
dtype: object


In [4]:
# Build a pandas Index of city names; it is used below to label Series entries
idx = pd.Index(data=[
    "New York", "Los Angeles", "Chicago",
    "Houston", "Philadelphia", "Phoenix",
    "San Antonio", "San Diego", "Dallas",
])
print(idx)

Index(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Philadelphia',
       'Phoenix', 'San Antonio', 'San Diego', 'Dallas'],
      dtype='object')


In [5]:
# Population figures labelled by the city Index; the np.nan entry leaves
# Phoenix missing, which is why the Series is stored as float64.
pops = Series(
    data=[8550, 3972, 2721, 2296, 1567, np.nan, 1470, 1395, 1300],
    index=idx,
    name="Population",
)
print(pops)

New York        8550.0
Los Angeles     3972.0
Chicago         2721.0
Houston         2296.0
Philadelphia    1567.0
Phoenix            NaN
San Antonio     1470.0
San Diego       1395.0
Dallas          1300.0
Name: Population, dtype: float64


In [6]:
# Built from a dict: the keys become the index labels, the values the data.
# Only some of the cities are included here.
state = Series(
    data={
        "New York": "New York",
        "Los Angeles": "California",
        "Phoenix": "Arizona",
        "San Antonio": "Texas",
        "San Diego": "California",
        "Dallas": "Texas",
    },
    name="State",
)
print(state)

Dallas              Texas
Los Angeles    California
New York         New York
Phoenix           Arizona
San Antonio         Texas
San Diego      California
Name: State, dtype: object


In [7]:
# Areas for a different subset of cities; Austin appears here but not in pops or state
area = Series(
    data={
        "New York": 302.6,
        "Los Angeles": 468.7,
        "Philadelphia": 134.1,
        "Phoenix": 516.7,
        "Austin": 322.48,
    },
    name="Area",
)
print(area)

Austin          322.48
Los Angeles     468.70
New York        302.60
Philadelphia    134.10
Phoenix         516.70
Name: Area, dtype: float64


Let's see some of the ways we can create `DataFrame`s, first without indices.

In [8]:
# Start from a NumPy array: the integers 0..8 laid out as a 3x3 matrix
mat = np.arange(9).reshape((3, 3))
print(mat)

[[0 1 2]
 [3 4 5]
 [6 7 8]]


In [9]:
# Wrap the array in a DataFrame; with no labels given, rows and columns are numbered from 0
print(DataFrame(mat))

   0  1  2
0  0  1  2
1  3  4  5
2  6  7  8


In [10]:
# Same data, now with explicit row labels (index) and column labels
print(DataFrame(
    data=mat,
    index=list("abc"),
    columns=["alpha", "beta", "gamma"],
))

   alpha  beta  gamma
a      0     1      2
b      3     4      5
c      6     7      8


In [11]:
# A list of tuples behaves like a 2D array: every tuple is one row
arr = list(zip([1, 2, 3], "abc"))
print(arr)

[(1, 'a'), (2, 'b'), (3, 'c')]


In [12]:
# Each tuple in arr becomes one row; the columns argument supplies the column labels
print(DataFrame(data=arr, columns=["Numbers", "Letters"]))

   Numbers Letters
0        1       a
1        2       b
2        3       c


In [13]:
# Creating from a dict: each key is a column label and each list holds that column's values
print(DataFrame(data={"Numbers": [1, 2, 3], "Letters": list("abc")}))

  Letters  Numbers
0       a        1
1       b        2
2       c        3


In [14]:
# What if not all lists are the same length?
# We get an error: pandas refuses to build the frame and raises a ValueError.
# The exception is caught and printed so the message is still shown, but running
# the whole notebook top to bottom does not stop at this cell.
try:
    print(DataFrame({"Numbers": [1, 2, 3, 4], "Letters": ['a', 'b', 'c']}))
except ValueError as err:
    print("ValueError:", err)

ValueError: arrays must all be same length

In [15]:
# Do we get an error when the dict values are Series of different lengths?
# No: nan fills in the "missing" information (Series not of same length)
DataFrame({"Numbers": ser1, "Letters": ser2})

Unnamed: 0,Letters,Numbers
0,a,1
1,b,2
2,c,3
3,,4


Let's now create a DataFrame containing information about cities.

In [16]:
# When passed as a list, each Series is treated as a row (labelled with the Series' name)
# Notice that these Series are not the same length, nor do they all have the same entries;
# NaN is generated wherever a Series has no value for a column
print(DataFrame([pops, state, area]))

            Austin  Chicago Dallas  Houston Los Angeles  New York  \
Population     NaN   2721.0   1300   2296.0        3972      8550   
State          NaN      NaN  Texas      NaN  California  New York   
Area        322.48      NaN    NaN      NaN       468.7     302.6   

            Philadelphia  Phoenix San Antonio   San Diego  
Population        1567.0      NaN        1470        1395  
State                NaN  Arizona       Texas  California  
Area               134.1    516.7         NaN         NaN  


In [17]:
# Passed as a dict, each Series becomes a column named by its key;
# the rows are the union of the city labels found in the three Series
print(DataFrame({
    "Population": pops,
    "State": state,
    "Area": area,
}))

                Area  Population       State
Austin        322.48         NaN         NaN
Chicago          NaN      2721.0         NaN
Dallas           NaN      1300.0       Texas
Houston          NaN      2296.0         NaN
Los Angeles   468.70      3972.0  California
New York      302.60      8550.0    New York
Philadelphia  134.10      1567.0         NaN
Phoenix       516.70         NaN     Arizona
San Antonio      NaN      1470.0       Texas
San Diego        NaN      1395.0  California


In [18]:
# Or, we could build the row-wise frame and flip it with DataFrame's T (transpose) attribute
print(DataFrame([pops, state, area]).T)

             Population       State    Area
Austin              NaN         NaN  322.48
Chicago            2721         NaN     NaN
Dallas             1300       Texas     NaN
Houston            2296         NaN     NaN
Los Angeles        3972  California   468.7
New York           8550    New York   302.6
Philadelphia       1567         NaN   134.1
Phoenix             NaN     Arizona   516.7
San Antonio        1470       Texas     NaN
San Diego          1395  California     NaN


How can we add new data to `Series` or `DataFrame`s?

In [19]:
# Let's add new entries to the population Series.
# Series.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported replacement. It returns a new Series and leaves pops untouched
# (not done in place).
pd.concat([pops, Series({"Seattle": 684, "Denver": 683})])

New York        8550.0
Los Angeles     3972.0
Chicago         2721.0
Houston         2296.0
Philadelphia    1567.0
Phoenix            NaN
San Antonio     1470.0
San Diego       1395.0
Dallas          1300.0
Denver           683.0
Seattle          684.0
dtype: float64

In [20]:
# Combine the three Series into one table with one row per city
df = DataFrame([pops, state, area]).T

# Rows for two further cities, with the same three columns as df
new_cities = DataFrame({"Population": Series({"Seattle": 684, "Denver": 683}),
                        "State": Series({"Seattle": "Washington", "Denver": "Colorado"}),
                        "Area": Series({"Seattle": np.nan, "Denver": np.nan})})

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows instead.
# It returns a new DataFrame and leaves df unchanged.
pd.concat([df, new_cities])

Unnamed: 0,Area,Population,State
Austin,322.48,,
Chicago,,2721.0,
Dallas,,1300.0,Texas
Houston,,2296.0,
Los Angeles,468.7,3972.0,California
New York,302.6,8550.0,New York
Philadelphia,134.1,1567.0,
Phoenix,516.7,,Arizona
San Antonio,,1470.0,Texas
San Diego,,1395.0,California


In [21]:
# Add columns rather than rows: build a frame holding the two new columns,
# labelled with the cities from pops, then join it to df side by side (axis=1).
# Rows are matched on their labels, so a city absent from pops gets NaN.
extra_columns = DataFrame({
    "Numbers": Series(np.arange(9), index=pops.index),
    "Letters": Series(['a', 'c', 'd', 'h', 'l', 'n', 'p', 'p', 's'], index=pops.index),
})
pd.concat([df, extra_columns], axis=1)

Unnamed: 0,Population,State,Area,Letters,Numbers
Austin,,,322.48,,
Chicago,2721.0,,,d,2.0
Dallas,1300.0,Texas,,s,8.0
Houston,2296.0,,,h,3.0
Los Angeles,3972.0,California,468.7,c,1.0
New York,8550.0,New York,302.6,a,0.0
Philadelphia,1567.0,,134.1,l,4.0
Phoenix,,Arizona,516.7,n,5.0
San Antonio,1470.0,Texas,,p,6.0
San Diego,1395.0,California,,p,7.0


Finally, we save the data to a CSV file for later use.

In [22]:
# Rebuild the city table (one row per city) from the three Series ...
df = DataFrame([pops, state, area]).transpose()
# ... and write it out to a csv file
df.to_csv("cities.csv")