In [2]:
import numpy as np
import pandas as pd

In [3]:
#A DataFrame is a collection of Series objects
#Series object from a Python dictionary
population_dict = {'California': 38332521,'Texas': 26448193,'New York': 19651127,
                    'Florida': 19552860,'Illinois': 12882135}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [4]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
                'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [5]:
#DF from Series
data = pd.DataFrame({'population': population,
'area': area})
data

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [6]:
#Series attribues
print(states.size, states.shape, states.ndim)

NameError: name 'states' is not defined

In [None]:
#Indexing and Selection
data['area'] #data.area

In [None]:
data.values

In [7]:
data.iloc[:3, :2]

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297


In [8]:
data.loc[:'Illinois', :'pop']

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [9]:
#Detecting null values
data = pd.Series([1, np.nan, 'hello', None])

In [10]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [11]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [12]:
#Dropping null values
data.dropna()

0        1
2    hello
dtype: object

In [13]:
#Nulls in DataFrame
df = pd.DataFrame([[1, np.nan, 2],
                    [2, 3, 5],
                    [np.nan, 4, 6]])

In [14]:
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [15]:
#By default, dropna() will drop all rows in which any null value is present in DF
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [16]:
#Alternatively, you can drop NA values along a different axis; axis=1 drops all columns
#containing a null value
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [17]:
#the how or thresh parameters
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [18]:
df.dropna(axis='columns', how='all') #will only drop rows/columns that are all null values


Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [19]:
#the thresh parameter lets you specify a minimum number
#of non-null values for the row/column to be kept
df.dropna(axis='rows', thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


In [20]:
#Filling null values
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [21]:
#We can fill NA entries with a single value, such as zero:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [22]:
#We can specify a forward-fill to propagate the previous value forward
data.fillna(method='ffill')

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [23]:
#we can specify a back-fill to propagate the next values backward
data.fillna(method='bfill')

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [24]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [25]:
#Filling Missing Data DataFrames in

df.fillna(method='ffill', axis=1) #exis 1=rows 0=columns

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


In [26]:
#Pandas MultiIndex
index = [('California', 2000), ('California', 2010),
('New York', 2000), ('New York', 2010),
('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
18976457, 19378102,
20851820, 25145561]

In [27]:
index

[('California', 2000),
 ('California', 2010),
 ('New York', 2000),
 ('New York', 2010),
 ('Texas', 2000),
 ('Texas', 2010)]

In [28]:
populations

[33871648, 37253956, 18976457, 19378102, 20851820, 25145561]

In [29]:
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [30]:
pop[('California',2010):('Texas', 2000)] #the BAD WAY

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [31]:
#the pandas way

index = pd.MultiIndex.from_tuples(index)#create a multi-index from the tuples
index

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [32]:
pop = pop.reindex(index) #reindex our series with MultiIndex
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [33]:
#MultiIndex as extra dimension
#The unstack() method will quickly convert a multiplyindexed
#Series into a conventionally indexed DataFrame
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [34]:
#the stack() method provides the opposite operation
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [35]:
#Indexing and Slicing a MultiIndex
pop['California']

2000    33871648
2010    37253956
dtype: int64

In [36]:
pop.loc['California':'New York']

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [37]:
pop[pop > 22000000]

California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [38]:
#Stacking and unstacking indices
pop.unstack(level=0)


Unnamed: 0,California,New York,Texas
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561
