In [1]:
#Purpose of the project is to create a repo for using the basics of pandas dataframes 

## Dataframes

In [2]:
import pandas as pd 
import numpy as np

### How can we create dataframes?

In [3]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)

In [4]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [5]:
pd.DataFrame(data, columns= ['year', 'state', 'pop']) #select the order of the columns appearing in the dataframe

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [6]:
frame2 = pd.DataFrame(data, columns = ['year', 'state', 'pop'], index = [1,2,3,4,5,6]) # we can also assign the indices to suit our needs

In [7]:
frame2

Unnamed: 0,year,state,pop
1,2000,Ohio,1.5
2,2001,Ohio,1.7
3,2002,Ohio,3.6
4,2001,Nevada,2.4
5,2002,Nevada,2.9
6,2003,Nevada,3.2


In [8]:
frame2.year  # we can print the relevant elements within a dataframe 

1    2000
2    2001
3    2002
4    2001
5    2002
6    2003
Name: year, dtype: int64

In [9]:
frame2.index

Int64Index([1, 2, 3, 4, 5, 6], dtype='int64')

In [10]:
frame2.columns

Index(['year', 'state', 'pop'], dtype='object')

In [11]:
frame2['year'] #dot or [] syntax can be used to print columns

1    2000
2    2001
3    2002
4    2001
5    2002
6    2003
Name: year, dtype: int64

In [12]:
#for finding particular elements within a data frame there are 3 functions we can use: 
#iloc, loc, ix

In [13]:
frame2.loc[1]

year     2000
state    Ohio
pop       1.5
Name: 1, dtype: object

In [14]:
frame2['debt'] = np.arange(6.)

In [15]:
val2  = pd.Series([-1.2, -1.5, -1.7], index = [2,4,5]) #we can insert a pd series as a colomn in a dataframe

In [16]:
frame2['debt'] = val2

In [17]:
frame2 #this will give us nan values where we have incompletely assigned the indices

Unnamed: 0,year,state,pop,debt
1,2000,Ohio,1.5,
2,2001,Ohio,1.7,-1.2
3,2002,Ohio,3.6,
4,2001,Nevada,2.4,-1.5
5,2002,Nevada,2.9,-1.7
6,2003,Nevada,3.2,


In [18]:
frame2['eastern'] = frame2.state == 'Ohio'
frame2                                      #boolean values can also be inserted
                                            #New columns cannot be created with the frame2.eastern syntax.

Unnamed: 0,year,state,pop,debt,eastern
1,2000,Ohio,1.5,,True
2,2001,Ohio,1.7,-1.2,True
3,2002,Ohio,3.6,,True
4,2001,Nevada,2.4,-1.5,False
5,2002,Nevada,2.9,-1.7,False
6,2003,Nevada,3.2,,False


In [19]:
del frame2['eastern']

In [20]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}  #nested dictionaries can also be used, outer keys act as columns and inner keys as indices

In [21]:
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [22]:
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}  #we can revert back by adding the data inside a new dictionary and performing indexing in the first dataframe

In [23]:
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [24]:
frame3.index.name = "year"
frame3.columns.name = "state"
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [25]:
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [26]:
#Index Objects: these are responsible for holding the axis labels and other metadata (e.g. axis names)
#Note that indices are imutable objects 

In [27]:
labels = pd.Index(np.arange(3))

In [28]:
obj2 = pd.Series([-1.2, -1.5, -1.7], index = labels)

In [29]:
obj2.index is labels # indices can have duplicate values

True

In [30]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
        index=['a', 'c', 'd'],
        columns=['Ohio', 'Texas', 'California']) #we can also reindex a dataframe

In [31]:
states = ['Utah', 'Ohio', 'Texas']

In [32]:
#we can 'drop' data by using the drop() method: name_of_dataframe.drop([list of elements], axis = 0 |1, inplace = Trues|False )

In [33]:
#The stansard way to find elements in a datframe is via loc(labels) and iloc(integers), let's have a look at some examples 


In [34]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
       index=['Ohio', 'Colorado', 'Utah', 'New York'],
       columns=['one', 'two', 'three', 'four'])

In [35]:
data.loc['Ohio', ['one', 'two']]

one    0
two    1
Name: Ohio, dtype: int32

In [36]:
data.loc[['Ohio', 'Colorado'], ['one','two']]

Unnamed: 0,one,two
Ohio,0,1
Colorado,4,5


In [37]:
data.iloc[0, [0,1]]

one    0
two    1
Name: Ohio, dtype: int32

In [38]:
data.loc['Ohio':'Colorado', 'one':] #slicing works with both loc and iloc

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [39]:
data.iloc[0:2, 0:] #not inclusive slicing in iloc

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [40]:
data.iloc[0:, 0:] [data.two >0]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [41]:
# Indexing, slicing and filtering in Dataframes and Series

In [42]:
data[['one', 'two', 'three']]

Unnamed: 0,one,two,three
Ohio,0,1,2
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [43]:
#Indexing like the above comes with special cases

In [44]:
data[0:2] # there is no indexing version for the rows  -> keyerror

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [45]:
data[data['one'] >5]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [46]:
data < 5 #This will result in a boolean array 

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [47]:
#for series what changes from the above notation is that slicing is inclusive 
#also remember that slicing is usually a  view while indexing creates a copy of the initial array-like object

### Integer Indexes 

In [48]:
ser = pd.Series(np.arange(3.))
ser
ser[-1]  # -> error keytype

KeyError: -1

In [49]:
#to be consistent in this case we have integer index values it preferable to use loc or iloc

### Apply and mapping 

In [50]:
frame = pd.DataFrame(np.random.randn(4,3), index = list('abcd'), columns = ["Utah", "Ohio", "Texas"])
frame 

Unnamed: 0,Utah,Ohio,Texas
a,-0.212626,1.543201,-0.419882
b,0.086709,0.169106,-0.943229
c,0.398068,1.543238,0.762741
d,-0.854104,-0.03435,-1.070579


In [51]:
f = lambda x: x.max() - x.min() 

In [52]:
frame.apply(f) # applies f per column 

Utah     1.252172
Ohio     1.577588
Texas    1.833320
dtype: float64

In [53]:
frame.apply(f, axis = 1) #same as calling axis = "columns"

a    1.963083
b    1.112334
c    1.145170
d    1.036229
dtype: float64

In [54]:
#we can also return apart a scalar value a one dimensional array i.e. a Series

In [55]:
def f(x):
    return pd.Series([x.max(), x.min(), x.sum()], index = ['max', 'min', 'sum'])

In [56]:
frame.apply(f)

Unnamed: 0,Utah,Ohio,Texas
max,0.398068,1.543238,0.762741
min,-0.854104,-0.03435,-1.070579
sum,-0.581952,3.221195,-1.670949


In [57]:
#we can parse through each element performing a predefined operation by using the applymap() method

In [58]:
round_to_two = lambda x: "%.2f" %x 

In [59]:
frame.applymap(round_to_two)

Unnamed: 0,Utah,Ohio,Texas
a,-0.21,1.54,-0.42
b,0.09,0.17,-0.94
c,0.4,1.54,0.76
d,-0.85,-0.03,-1.07


In [60]:
#for perfrorming the same operations on a Series we use the map() method

In [61]:
frame["Utah"].map(round_to_two)

a    -0.21
b     0.09
c     0.40
d    -0.85
Name: Utah, dtype: object

### Sorting and Ranking

In [62]:
obj = pd.Series(range(4), index = [ 'b', 'a','c', 'd'])
obj 

b    0
a    1
c    2
d    3
dtype: int64

In [63]:
obj.sort_index() # we can also have the option to sort by values: sort_values()

a    1
b    0
c    2
d    3
dtype: int64

In [64]:
frame = pd.DataFrame(np.arange(8).reshape(2,4), index = ['a','b'], columns = ["Utah", "Ohio", "Texas" , "Oregon"] )
frame

Unnamed: 0,Utah,Ohio,Texas,Oregon
a,0,1,2,3
b,4,5,6,7


In [65]:
frame.sort_index(axis=1)

Unnamed: 0,Ohio,Oregon,Texas,Utah
a,1,3,2,0
b,5,7,6,4


In [66]:
frame.sort_index(axis = 1, ascending  = False)

Unnamed: 0,Utah,Texas,Oregon,Ohio
a,0,2,3,1
b,4,6,7,5


In [67]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [68]:
frame.sort_values(by= ['b','a']) # we can sort a dataframe by the values of one or more than one of its columns

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


### Descriptive statistics 

options for reductions methods (sum, mean, etc): 
  - axis (0, 1)
  - skipna (False, True)
  - level 

idmax(), idmin() can be used to find the maximum and minimum values in each column of the dataframe 

 cumsum() can perform an accumulation over each of the columns 


describe() give us a general overview of our data within our dataframe (mean, deviation, percentiles, count)

corr() and cov() can be used in Series as well as a Dataframe

<h2> Grouping data </h2>

<h3> Group by Mechanics </h3>

In [69]:
import psycopg2
conn  = psycopg2.connect(dbname = 'postgres', user = 'postgres', password = 'durham17')
conn.close()

In [70]:
import pandas as pd 
import numpy as np

df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
     'key2' : ['one', 'two', 'one', 'two', 'one'],
     'data1' : np.random.randn(5),
     'data2' : np.random.randn(5)})

In [71]:
grouped = df['data1'].groupby(df['key1']) #identify the column on which we will perform the grouping 
#                                            this will usuallly be a colummn within the same dataframe
#                                            therefore, a simple 'key1' also makes up for it

In [72]:
grouped #nothing yet calculated, the object is simply now a Grouped object
        #allowing us to perform any operation on the grouped data like sum, avg etc on the group 

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000025AFB75A1C8>

In [73]:
grouped.sum()

key1
a   -1.565171
b   -0.013702
Name: data1, dtype: float64

In [74]:
new_grouped = df['data1'].groupby([df['key1'], df['key2']]).sum()
new_grouped #usually you want to perform the grouping in terms of a column 
            #alrady existing in the dataframe, hence you just pass the labels as an argument 
            #in the group by function

key1  key2
a     one    -1.136650
      two    -0.428520
b     one     0.554506
      two    -0.568208
Name: data1, dtype: float64

In [75]:
new_grouped.unstack() #since we have multiple layers of indices we can play around by unstacking 

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-1.13665,-0.42852
b,0.554506,-0.568208


In [76]:
df.groupby(['key1', 'key2']).size() #count the elements within each group

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [77]:
for (k1,k2), group in df.groupby(['key1', 'key2']):
    print(k1,k2)
    print(group)
#groupby object is iterable, therefore we can parse it with a for 
#getting the each separate key and its chunk of data

a one
  key1 key2     data1     data2
0    a  one -1.017201  1.112984
4    a  one -0.119450 -2.358246
a two
  key1 key2    data1     data2
1    a  two -0.42852  1.179647
b one
  key1 key2     data1     data2
2    b  one  0.554506  0.364482
b two
  key1 key2     data1     data2
3    b  two -0.568208 -1.178512


In [78]:
pieces = list(df.groupby('key1')) #since groupby is iterable we cast the object into list like 
                                  #objects i.e. lists, tuples, dicts 

In [79]:
pieces

[('a',
    key1 key2     data1     data2
  0    a  one -1.017201  1.112984
  1    a  two -0.428520  1.179647
  4    a  one -0.119450 -2.358246),
 ('b',
    key1 key2     data1     data2
  2    b  one  0.554506  0.364482
  3    b  two -0.568208 -1.178512)]

In [80]:
pieces = dict(pieces)

In [81]:
pieces

{'a':   key1 key2     data1     data2
 0    a  one -1.017201  1.112984
 1    a  two -0.428520  1.179647
 4    a  one -0.119450 -2.358246,
 'b':   key1 key2     data1     data2
 2    b  one  0.554506  0.364482
 3    b  two -0.568208 -1.178512}

In [82]:
grouped = df.groupby(df.dtypes, axis = 1) #remember that groupby groups by default on 0 axis 
                                          #that said, along the rows

for data_types, data in grouped: #group data by their data types
    print (data_types)
    print(data)

float64
      data1     data2
0 -1.017201  1.112984
1 -0.428520  1.179647
2  0.554506  0.364482
3 -0.568208 -1.178512
4 -0.119450 -2.358246
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


In [83]:
for data_types, data in grouped: #group data by their data types
    print (data_types)
    print(data)

float64
      data1     data2
0 -1.017201  1.112984
1 -0.428520  1.179647
2  0.554506  0.364482
3 -0.568208 -1.178512
4 -0.119450 -2.358246
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


In [84]:
df.groupby('key1').sum()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-1.565171,-0.065614
b,-0.013702,-0.81403


In [85]:
df.groupby('key1')['data1'].sum() #brings out a series

key1
a   -1.565171
b   -0.013702
Name: data1, dtype: float64

In [86]:
    df.groupby('key1')[['data1']].sum() #brings out a dataframe

Unnamed: 0_level_0,data1
key1,Unnamed: 1_level_1
a,-1.565171
b,-0.013702


<h3> Data Aggregation </h3>

<p>  Data aggregation functions are simply the functions that act on multidimensional 
    arrays to return a scalar value. Functions like these inclde sum(), average(), count(), 
    min(), max(), median(), var(), std(), etc...
   

<p> One can define his own aggregation function by defining the function itself and then passing its process to the data via the .agg() method e.g.:
    
    def peak_to_peak(arr):
        return arr.max() - arr.min()
    grouped.agg(peak_to_peak) 
    
in case the function to be used is already a built-in Python function then we can pass it through a 
string e.g.:
    
    grouped.agg('sum')


<p> Once we group by a dataframe by two or more column element (this will result in high level layered
    index). We can aggregate by particular functions (built-in or user defined). Just pass the    
    functions as list like object within .agg().
    
<p> Here we can also give our own defined names for each of these functions, these will then appear as 
    the names of the columns in the new dataframe. Syntax is implemented as 2-tuple e.g.:
    
        grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])

<p> In case we want to return back the dataframe with a reset index after performing the grouping 
    we can add an optional argument the as_index = False (or call the dataframe.reset_index in the 
    next line)

<h3> General split - apply -combine  </h3>

In [87]:
frame = pd.DataFrame({'data1': np.random.randn(1000),
        'data2': np.random.randn(1000)}) 

In [88]:
frame

Unnamed: 0,data1,data2
0,0.652819,-0.058960
1,0.670352,1.015731
2,1.329599,0.572558
3,0.303782,-1.916177
4,0.571155,1.046531
...,...,...
995,-0.833331,0.238533
996,1.009190,0.979217
997,1.101489,-0.759161
998,-0.024239,0.984395


In [89]:
quartiles = pd.cut(frame.data1, 4) #let's break the data into 4 buckets of equal 
                                   #length 

In [90]:
quartiles[:10]

0     (-0.206, 1.457]
1     (-0.206, 1.457]
2     (-0.206, 1.457]
3     (-0.206, 1.457]
4     (-0.206, 1.457]
5     (-0.206, 1.457]
6     (-0.206, 1.457]
7     (-0.206, 1.457]
8    (-1.869, -0.206]
9     (-0.206, 1.457]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.539, -1.869] < (-1.869, -0.206] < (-0.206, 1.457] < (1.457, 3.12]]

<p> Therefore, cut returns a categorical object that can in turn be passed in a groupby method
    to come up with an elementary analysis.

In [91]:
def get_stats(group):
       return {'min': group.min(), 'max': group.max(),
       'count': group.count(), 'mean': group.mean()} #create a dict that will return a frame with 
                                                     #included values/results of functions 

In [92]:
grouped = frame.data2.groupby(quartiles).apply(get_stats)

In [93]:
grouped #we would like to unstack this so that it will look closer to a dataframe

data1                  
(-3.539, -1.869]  min       -1.637290
                  max        2.070750
                  count     34.000000
                  mean      -0.087577
(-1.869, -0.206]  min       -2.822903
                  max        2.805152
                  count    401.000000
                  mean      -0.021822
(-0.206, 1.457]   min       -3.262447
                  max        2.818179
                  count    505.000000
                  mean      -0.002780
(1.457, 3.12]     min       -2.025643
                  max        1.788671
                  count     60.000000
                  mean      -0.032764
Name: data2, dtype: float64

In [94]:
grouped.unstack() #this will add a higher column layer 

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.539, -1.869]",-1.63729,2.07075,34.0,-0.087577
"(-1.869, -0.206]",-2.822903,2.805152,401.0,-0.021822
"(-0.206, 1.457]",-3.262447,2.818179,505.0,-0.00278
"(1.457, 3.12]",-2.025643,1.788671,60.0,-0.032764


In [95]:
grouping = pd.qcut(frame.data1, 10, labels = False) #take out the labels to just show the quartiles

In [96]:
grouped = frame.data1.groupby(grouping).apply(get_stats)

In [97]:
grouped.unstack() #unstack the Series like object and voila

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-3.531863,-1.325141,100.0,-1.809408
1,-1.324112,-0.886514,100.0,-1.098356
2,-0.886363,-0.565956,100.0,-0.717074
3,-0.564046,-0.301776,100.0,-0.449731
4,-0.299696,-0.043003,100.0,-0.169488
5,-0.040838,0.180067,100.0,0.065581
6,0.181969,0.44871,100.0,0.310862
7,0.450843,0.743548,100.0,0.602451
8,0.76054,1.212365,100.0,0.988331
9,1.212925,3.120216,100.0,1.69716


<p> Let's come back to the context of missing values, in general we will either drop these values 
    (usually this is the case where our analysis is not affected by these values) or fill these 
    missing data with values coming from the frame (usually the mean, median or most frequent value 
    in case of a categorical variable).

In [98]:
s = pd.Series(np.random.randn(6))
s[::2] = np.nan #generate missing values in the Series

In [99]:
s

0         NaN
1   -0.247343
2         NaN
3   -0.950699
4         NaN
5    0.883996
dtype: float64

In [100]:
s.fillna(s.mean())

0   -0.104682
1   -0.247343
2   -0.104682
3   -0.950699
4   -0.104682
5    0.883996
dtype: float64

In [101]:
states = ['Ohio', 'New York', 'Vermont', 'Florida',
        'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4

In [102]:
group_key

['East', 'East', 'East', 'East', 'West', 'West', 'West', 'West']

In [103]:
data = pd.Series(np.random.randn(8), index=states)
data

Ohio          0.155675
New York     -0.594626
Vermont      -0.434221
Florida      -0.057665
Oregon        0.288092
Nevada       -0.719385
California   -0.721154
Idaho         0.312496
dtype: float64

In [104]:
data[['Vermont', 'Nevada', 'Idaho']] = np.nan

In [105]:
data

Ohio          0.155675
New York     -0.594626
Vermont            NaN
Florida      -0.057665
Oregon        0.288092
Nevada             NaN
California   -0.721154
Idaho              NaN
dtype: float64

In [106]:
fill_mean = lambda g: g.fillna(g.mean())

In [107]:
data.groupby(group_key).apply(fill_mean) #we just replace the missing value for west/east city 
                                         #with the mean from for west/east

Ohio          0.155675
New York     -0.594626
Vermont      -0.165538
Florida      -0.057665
Oregon        0.288092
Nevada       -0.216531
California   -0.721154
Idaho        -0.216531
dtype: float64

In [108]:
data.groupby(group_key).mean()

East   -0.165538
West   -0.216531
dtype: float64

In [109]:
fill_values = {'East': 0.5, 'West': -1}
fill_func = lambda g: g.fillna(fill_values[g.name]) #take advantage of the internal
                                                    #attribute g.name of a groupby object?
data.groupby(group_key).apply(fill_func)

Ohio          0.155675
New York     -0.594626
Vermont       0.500000
Florida      -0.057665
Oregon        0.288092
Nevada       -1.000000
California   -0.721154
Idaho        -1.000000
dtype: float64

 <h2> Missing Values</h2>

<p> The usual notation for missing values is NaN. </p>

<p> Functions we can use include<p\>
    <ul>
     <li> dropna() select the data that we would like to drop
     <li> fillna() fill missing values with a desired values like a mean or the most frequent value
     <li> isnull() check the entries of the data in the dataframe where we have missing values
    <\ul>

<h3> Droping data </h3>

In [110]:
from numpy import nan as NA 
import pandas as pd
import numpy as np 

In [111]:
data = pd.Series([1, NA, 3.5, NA, 7])  

In [112]:
data[data.isnull()] #returns values and indices of the Series where we have missing values 

1   NaN
3   NaN
dtype: float64

In [113]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
     [NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [114]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [115]:
data.dropna(how  = "all") #drops the rows constituting only of missing data 

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [116]:
data[4] = NA

In [117]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [118]:
data.dropna(how = "all", axis = 1)
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [119]:
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,-0.400114,0.548594,0.085126
1,-0.032214,0.149998,1.318953
2,0.57522,0.679983,0.854839
3,0.235571,0.288855,-0.491459
4,0.268334,-1.396964,-0.799102
5,-1.425641,-0.253893,-0.656505
6,-0.678152,-0.525448,-0.134563


In [120]:
df.iloc[:4,1] = NA
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.400114,,
1,-0.032214,,
2,0.57522,,0.854839
3,0.235571,,-0.491459
4,0.268334,-1.396964,-0.799102
5,-1.425641,-0.253893,-0.656505
6,-0.678152,-0.525448,-0.134563


In [121]:
df.dropna(thresh = 2) #select rows with a certain number of observations i.e. columns with two non missing values

Unnamed: 0,0,1,2
2,0.57522,,0.854839
3,0.235571,,-0.491459
4,0.268334,-1.396964,-0.799102
5,-1.425641,-0.253893,-0.656505
6,-0.678152,-0.525448,-0.134563


<h3> Filling data </h3>

In [122]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.400114,0.0,0.0
1,-0.032214,0.0,0.0
2,0.57522,0.0,0.854839
3,0.235571,0.0,-0.491459
4,0.268334,-1.396964,-0.799102
5,-1.425641,-0.253893,-0.656505
6,-0.678152,-0.525448,-0.134563


In [123]:
df.fillna({1:0.5, 2:0.7}) #note that fillna returns a new object by default

Unnamed: 0,0,1,2
0,-0.400114,0.5,0.7
1,-0.032214,0.5,0.7
2,0.57522,0.5,0.854839
3,0.235571,0.5,-0.491459
4,0.268334,-1.396964,-0.799102
5,-1.425641,-0.253893,-0.656505
6,-0.678152,-0.525448,-0.134563


In [124]:
df.fillna(0, inplace = True)
df

Unnamed: 0,0,1,2
0,-0.400114,0.0,0.0
1,-0.032214,0.0,0.0
2,0.57522,0.0,0.854839
3,0.235571,0.0,-0.491459
4,0.268334,-1.396964,-0.799102
5,-1.425641,-0.253893,-0.656505
6,-0.678152,-0.525448,-0.134563


In [125]:
df = pd.DataFrame(np.random.randn(6, 3))

In [126]:
df.iloc[4:, 1] =NA
df.iloc[2:,2] = NA
df

Unnamed: 0,0,1,2
0,1.158679,2.186836,0.148319
1,-0.51514,0.308383,0.186593
2,-0.751292,-0.472079,
3,1.266766,-0.234265,
4,0.873312,,
5,-1.059295,,


In [127]:
df.fillna(method = "ffill")
df

Unnamed: 0,0,1,2
0,1.158679,2.186836,0.148319
1,-0.51514,0.308383,0.186593
2,-0.751292,-0.472079,
3,1.266766,-0.234265,
4,0.873312,,
5,-1.059295,,


In [128]:
df.fillna(method = "ffill", limit = 2)

Unnamed: 0,0,1,2
0,1.158679,2.186836,0.148319
1,-0.51514,0.308383,0.186593
2,-0.751292,-0.472079,0.186593
3,1.266766,-0.234265,0.186593
4,0.873312,-0.234265,
5,-1.059295,-0.234265,


In [129]:
data = pd.Series([1,NA,4,NA, 3.5])
data

0    1.0
1    NaN
2    4.0
3    NaN
4    3.5
dtype: float64

In [130]:
data.fillna(data.mean()) #replace the missing values with the mean value, another candidate is the median

0    1.000000
1    2.833333
2    4.000000
3    2.833333
4    3.500000
dtype: float64

<h3> Ways to fill missing values </h3>
            

 <p>
    <ul>
        <li> fill missing values with a desired number, usually the mean or median 
        <li> method back filling bfill or forward filling ffill 
        <li> limit, use that in interpolation with the ffill or bfill method
        <li> remember that the fillna method (as well as the dropna) returns a new object. Inplace = True operates on the initial dataframe
    <ul\>
 <p\>

<h3> Duplicated Values

In [131]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
       'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [132]:
data.duplicated()  #checks if a row is duplicate value of another row

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [133]:
data.drop_duplicates() # simply drop the duplicated rows

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [134]:
data['v1'] = range(7)

In [135]:
data.drop_duplicates(['k1']) #select the column on which you  will drop the duplicates
data                         #again notice that the drop_duplicates() creates a new object

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [136]:
# drop_duplicates() returns the first observed values and cuts down the subsequent duplicates
# we can select to cut down the first occurence of the sequence and return the last values

In [137]:
data.drop_duplicates(['k1', 'k2'], keep = 'last') # this will cut down the index 5 but will return the index 6 


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


<h3> Transforming Data with a function or a mapping <\h3>

In [138]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
      'Pastrami', 'corned beef', 'Bacon',
      'pastrami', 'honey ham', 'nova lox'],
      'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [139]:
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
} #consider that we would like to change via mapping the food to the its raw material 

In [140]:
lower_string = data['food'].str.lower() #make the strings from mixed lowercased/uppercased to lowercased, 
                                        #notice that str.lower() is a Series method

In [141]:
lower_string

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [142]:
data['animal']  = lower_string.map(meat_to_animal) # a Series can accept through mapping a dict for changing its elements to the values of the dict, elements and keys should be 1-1.

In [143]:
data['animal'] = data['food'].map(lambda x: meat_to_animal[x.lower()])
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


<p> Therefore, we see that map() can transform a subset of data to a format of our desire, however there are
    easier and more flexible ways go do so, one of them being the replace() function.
<p\>

In [144]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [145]:
data.replace(-999, NA)
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [146]:
data.replace([-999,-1000], NA) #replace a list of elements with one value

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [147]:
data.replace([-999, -1000], [NA, 0]) #replac the elements of a list with another list by 1-1

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [148]:
data.replace({-999: 0, -1000: NA}) # replacement can also occur via dict
                                  #notice that the method replace() is different from the str.replace() 
                                  # the last one is  a string substitution element-wise

0    1.0
1    0.0
2    2.0
3    0.0
4    NaN
5    3.0
dtype: float64

In [149]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
       index=['Ohio', 'Colorado', 'New York'],
       columns=['one', 'two', 'three', 'four'])

In [150]:
transform  = lambda x: x[:4].upper()

In [151]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [152]:
data.rename(index = str.title, columns = str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [153]:
data.rename(index={'Ohio': 'Indiana'},
            columns={'three': 'peekaboo'}, #inplace = True/False
           )

Unnamed: 0,one,two,peekaboo,four
Indiana,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


<h3> Discretization and Binning </h3>

In [154]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
group_names = ['millenia', 'young_adult', 'middle-age', 'senior']
cats = pd.cut(ages, bins, labels = group_names, right = False, #precision = number [numerical data]
             )
cats

[millenia, millenia, young_adult, young_adult, millenia, ..., young_adult, senior, middle-age, middle-age, young_adult]
Length: 12
Categories (4, object): [millenia < young_adult < middle-age < senior]

In [155]:
cats.categories

Index(['millenia', 'young_adult', 'middle-age', 'senior'], dtype='object')

In [156]:
cats.codes

array([0, 0, 1, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [157]:
pd.value_counts(cats)

young_adult    4
millenia       4
middle-age     3
senior         1
dtype: int64

In [158]:
#instead of bins you can cut into quartiles i.e. 
#pd.cut(data, 4)
#pd.cat(data, [0.2,0.4,0.6,0.8,1.]) create your own percentiles 

In [159]:
data = pd.DataFrame(np.random.randn(1000, 4))
outliers = data[(np.abs(data) >3).any(1)]  #find the outliers in the entire dataframe 
outliers

Unnamed: 0,0,1,2,3
17,0.241411,-0.667785,-0.941209,-3.322142
197,0.204609,0.624405,-0.334895,-3.430673
413,-0.649874,-0.303591,-3.118357,-2.462393
428,-0.363687,0.167898,1.887344,3.102743
454,3.169389,1.173901,0.375108,0.46096
573,-0.805843,-3.356144,-2.329955,-0.950667
577,-3.213599,0.406201,1.349426,1.047511
617,-1.180678,0.327316,-3.045372,-0.61971
821,-0.907708,0.591981,1.562688,3.204768
836,-0.297227,-3.260303,-1.372317,1.386226


<h3> Permutation and Sampling </h3>

In [160]:
df = pd.DataFrame(np.arange(20).reshape((5,4)))

In [161]:
sampler = np.random.permutation(5)
sampler

array([2, 1, 4, 3, 0])

In [162]:
df.take(sampler)

Unnamed: 0,0,1,2,3
2,8,9,10,11
1,4,5,6,7
4,16,17,18,19
3,12,13,14,15
0,0,1,2,3


In [163]:
df.sample(n=10 ,replace=True) #replace allows for repetitions

Unnamed: 0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
0,0,1,2,3
3,12,13,14,15
4,16,17,18,19
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
2,8,9,10,11
4,16,17,18,19


<h3> Dummy Variables </h3>

In [170]:
np.random.seed(12345)
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [171]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]


In [172]:
pd.get_dummies(pd.cut(values, bins)) # this will introduce dummy variables in each cateogorization
                                     #1 means existence of the value and 0 means otherwise

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


<h3> String Manupulation</h3>

In [173]:
#split() and strip() can be used together to put strings in a list and then trim any whitespaces 

In [174]:
val = "a, b, guide" 

In [175]:
list_val = val.split(",")
list_val

['a', ' b', ' guide']

In [176]:
no_space_ListVal = [x.strip() for x in list_val] # strip will trim any white spaces 
no_space_ListVal

['a', 'b', 'guide']

In [177]:
#check  the directory of functions for string manipulation in mckinney 

<h3> <i>Regexp </i></h3>

In [178]:
#regexp can be used for pattern matching, substitution and splitting 

<h2> Hierarchical Indexing </h2>

<p> Hierarchical indexing is useful for reshaping data and manipulating group based operations (like pivot tables)
 <p\>

In [179]:
import pandas as pd 
import numpy as np

<h3> Inner indexing  </h3>

In [180]:
data = pd.Series(np.random.randn(9),
...: index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
...: [1, 2, 3, 1, 3, 1, 2, 2, 3]])

data

a  1    1.007189
   2   -1.296221
   3    0.274992
b  1    0.228913
   3    1.352917
c  1    0.886429
   2   -2.001637
d  2   -0.371843
   3    1.669025
dtype: float64

In [181]:
data.loc[:, 2] #inner indexing 

a   -1.296221
c   -2.001637
d   -0.371843
dtype: float64

In [182]:
data.unstack() # reshape my Series to a Dataframe

Unnamed: 0,1,2,3
a,1.007189,-1.296221,0.274992
b,0.228913,,1.352917
c,0.886429,-2.001637,
d,,-0.371843,1.669025


In [183]:
data.unstack().stack() #turn back to the initial Series

a  1    1.007189
   2   -1.296221
   3    0.274992
b  1    0.228913
   3    1.352917
c  1    0.886429
   2   -2.001637
d  2   -0.371843
   3    1.669025
dtype: float64

In [184]:
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
....: index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
....: columns=[['Ohio', 'Ohio', 'Colorado'],
....: ['Green', 'Red', 'Green']])

frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [185]:
frame.index.names = ['key1', 'key2'] # set the names for the multi-index
frame.columns.names = ['State', 'Color'] # set the names for the column 

frame

Unnamed: 0_level_0,State,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


<h3> Aggregating values at a certain level </h3>

In [186]:
frame.sum(level = 'key1')

State,Ohio,Ohio,Colorado
Color,Green,Red,Green
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [187]:
frame.sum(level= 'State', axis = 1)

Unnamed: 0_level_0,State,Ohio,Colorado
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,2
a,2,7,5
b,1,13,8
b,2,19,11


In [188]:
frame.swaplevel(0,1).sort_index(0) 
#could also be written as frame.swaplevel('key1', 'key2').sort_index(0)

Unnamed: 0_level_0,State,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


<h3> Deeper indexing using the dataframe's columns </h3>

In [189]:
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
....: 'c': ['one', 'one', 'one', 'two', 'two',
....: 'two', 'two'],
....: 'd': [0, 1, 2, 0, 1, 2, 3]})

In [190]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [191]:
frame2 = frame.set_index(['c','d'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [192]:
frame.set_index(['c','d'], drop = False) 
#hold the initial columns of the dataframe
frame2.index.names = ['first' ,'second']
#we can now rename the index names so that we do not 
# mess up with the column names
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [193]:
frame2.reset_index() 
#return back to the initial object

Unnamed: 0,first,second,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


<h3> Combining and merging datasets </h3>

<ul>
    <li> pandas.merge() merges rows of different dataframes based on index values 
    <li> pandas.concat() simply 'stacks' together objects along an axis (0,1) 
    <li> combine_first fills values of one object with values from another object 

In [194]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
....: 'data1': range(7)})

df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [195]:
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
....: 'data2': range(3)})

df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [196]:
pd.merge(df1,df1)  #many to one join

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [197]:
pd.merge(df1, df2, on = "key") #good practice to specify on which column we will merge 

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [198]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
....: 'data1': range(7)})
df3

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [199]:
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
....: 'data2': range(3)})

df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [200]:
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey') #different keys so we specify each of them 

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


<p> Notice that until now merge() does by default an inner join, we need to specify excplicitly 
    an outer join.

In [201]:
pd.merge(df1, df2, how = "outer") #outer join, this obviously gives us some NaN values
                                  #in the non overlaping data

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


<ul>
    <li> merge() can have non intuitive results on a many-to-many relationship between two 
        dataframes, usually in this occasion if three rows appear with the same values in two rows of 
        second dataframe then the fibal output will be 3x2 = rows.
    <li> merge() can also be used between overlaping indexes or a mix of index and column values 
         in this occasion syntax can go like pd.merge(df1, df2, lkey = "...", right_index = "..." how =          ['inner' |'outer'| 'left'])  
    <li> the .join can be applied by default on indices and in more than two dataframes.
        Syntax goes like df1.join(df2, how = "...", on = " "). Where the on attribute is used in case
        we want to join the index of the passed dataframe with the column of the called dataframe.
       

<h3> Concat dataframes and series </h3>

<p> concat() works by default along 0 axis.

In [202]:
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

In [203]:
pd.concat([s1,s2,s3] )

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [204]:
pd.concat([s1,s2,s3], axis=1 )

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [205]:
s4 = pd.concat([s1, s3])
s4

a    0
b    1
f    5
g    6
dtype: int64

In [206]:
pd.concat([s1,s4], axis = 1, join= 'inner') #take the intersectio of concatenation 

Unnamed: 0,0,1
a,0,0
b,1,1


In [207]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
....: columns=['one', 'two'])

df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
....: columns=['three', 'four'])

In [208]:
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'], join= 'inner')

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5,6
c,4,5,7,8


In [209]:
# pd.concat({'level1': df1, 'level2': df2}, axis=1) 
# same as
# pd.concat([df1, df2], keys = ['level1, level2'], axis = 1)

In [210]:
 df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])

In [211]:
pd.concat([df1,df2], ignore_index= True) #reformating the index

Unnamed: 0,a,b,c,d
0,-0.43857,-0.539741,0.476985,3.248944
1,-1.021228,-0.577087,0.124121,0.302614
2,0.523772,0.00094,1.34381,-0.713544
3,-1.860761,-0.831154,,-2.370232
4,-1.265934,-0.860757,,0.560145


<h3> Combining data with overlap </h3>

<p> combine_first() in the case of a dataframe 'patches' missing data column by column in the calling object with data from the object you pass.
    


<h3> Reshaping and pivoting  </h3>

In [212]:
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
.....: index=pd.Index(['Ohio', 'Colorado'], name='state'),
.....: columns=pd.Index(['one', 'two', 'three'],
.....: name='number'))

In [213]:
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [214]:
result = data.stack()
result    #stack rotates the columns into rows resulting in hierarchical intexed Series 

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [215]:
result.unstack() #by default the innnermost level will remain unstack

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [216]:
result.unstack(0) #select which level I would like to unstack

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


<p> Unstacking can introduce missing values  

In [217]:
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])

data2 = pd.concat([s1,s2], keys = ['key1', 'key2'])
data2

key1  a    0
      b    1
      c    2
      d    3
key2  c    4
      d    5
      e    6
dtype: int64

In [218]:
data2.unstack() #innermost level would get the same results as the number of the values 
                #within the innermost level are not the same. 

Unnamed: 0,a,b,c,d,e
key1,0.0,1.0,2.0,3.0,
key2,,,4.0,5.0,6.0


In [219]:
data2.unstack().stack() #from here we see that the stack method by default drops values 

key1  a    0.0
      b    1.0
      c    2.0
      d    3.0
key2  c    4.0
      d    5.0
      e    6.0
dtype: float64

In [220]:
data2.unstack().stack(dropna = False) 

key1  a    0.0
      b    1.0
      c    2.0
      d    3.0
      e    NaN
key2  a    NaN
      b    NaN
      c    4.0
      d    5.0
      e    6.0
dtype: float64

In [221]:
df = pd.DataFrame({'left': result, 'right': result + 5},
.....: columns=pd.Index(['left', 'right'], name='side'))

In [222]:
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [223]:
df.unstack(0)

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [224]:
df.unstack(0).stack(0) #unstacking and stacking at the same level can bring different results

Unnamed: 0_level_0,state,Colorado,Ohio
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7
