In [2]:
"""
Concatenating data in multiple pandas objects
Merging data in multiple pandas objects
How to control the type of join used in a merge
Pivoting data to and from values and indexes
Stacking and unstacking data
Melting data to and from wide and long format
"""


In [3]:
import numpy as np
import pandas as pd

# used for dates
import datetime
from datetime import datetime, date

# Set some pandas options controlling output format
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 60)

# bring in matplotlib for graphics
import matplotlib.pyplot as plt

# concatenation

In [5]:
#concat 2 series
s1 = pd.Series (np.arange(0,3))
print(s1)
s2 = pd.Series(np.arange(5,8))
print(s2)
pd.concat([s1, s2])

0    0
1    1
2    2
dtype: int32
0    5
1    6
2    7
dtype: int32


0    0
1    1
2    2
0    5
1    6
2    7
dtype: int32

In [9]:
#concat 2 DataFrame
df1 = pd.DataFrame(np.arange(9).reshape(3,3),
                   columns=['a','b','c'])
print(df1)
df2 = pd.DataFrame(np.arange(9,18).reshape(3,3),
                   columns=['a','b','c'])
print(df2)
pd.concat([df1, df2]) #default function: rows being appended in order

   a  b  c
0  0  1  2
1  3  4  5
2  6  7  8
    a   b   c
0   9  10  11
1  12  13  14
2  15  16  17


    a   b   c
0   0   1   2
1   3   4   5
2   6   7   8
0   9  10  11
1  12  13  14
2  15  16  17

In [23]:
# Pandas will insert NaN values if a column in the result doesnot exist 
# in the DataFrame object currently being processed
df1 = pd.DataFrame(np.arange(9).reshape(3,3),
                   columns=['a','b','c'])
print(df1)
df2 = pd.DataFrame(np.arange(9,18).reshape(3,3),
                   columns=['a','b','d'])
print(df2)
pd.concat([df1, df2]) #NaN's will be filled in for the d column for df1 and c column for df2

   a  b  c
0  0  1  2
1  3  4  5
2  6  7  8
    a   b   d
0   9  10  11
1  12  13  14
2  15  16  17


    a   b    c     d
0   0   1  2.0   NaN
1   3   4  5.0   NaN
2   6   7  8.0   NaN
0   9  10  NaN  11.0
1  12  13  NaN  14.0
2  15  16  NaN  17.0

In [12]:
print(df1)
print('\n')
print(df2)
print('\n')
c = pd.concat([df1, df2], keys=['df1','df2'])
print(c)
print('\n')
print(c.loc['df2']) # extract the data originating from the fist ('df1') or second ('df2')
print('\n')
print(c.loc['df2'].loc[2]) #access index 'df2' then access index '2' (hierarchical index)


   a  b  c
0  0  1  2
1  3  4  5
2  6  7  8


    a   b   c
0   9  10  11
1  12  13  14
2  15  16  17


        a   b   c
df1 0   0   1   2
    1   3   4   5
    2   6   7   8
df2 0   9  10  11
    1  12  13  14
    2  15  16  17


    a   b   c
0   9  10  11
1  12  13  14
2  15  16  17


a    15
b    16
c    17
Name: 2, dtype: int32


In [14]:
# Switching axes of alignment
    # concat to row not column as above 
    # => can cause duplicate columns: (two column 'a' and two column 'c')
pd.concat([df1, df2], axis = 1) 
df3 = pd.DataFrame(np.arange(20, 26).reshape(3,2),
                   columns=['a','b'],
                   index=[2, 3, 4])
    # concat them. Alignment is along row labels
    # columns first from df1 and then df3, with duplicates.
    # NaN filled in where those columns do not exist in the source
pd.concat([df1, df3], axis=1)

     a    b    c     a     b
0  0.0  1.0  2.0   NaN   NaN
1  3.0  4.0  5.0   NaN   NaN
2  6.0  7.0  8.0  20.0  21.0
3  NaN  NaN  NaN  22.0  23.0
4  NaN  NaN  NaN  24.0  25.0

In [16]:
"""
A default concatenation actually performs an outer join operation along the index labels on
the axis opposite of the concatenation (the rows index). This makes the resulting set of
labels similar to having performed a union of those labels.

The type of join can be changed to an inner join by specifying "join" = "inner" as a
parameter. The inner join then logically performs an intersection of labels instead of a
union. The following demonstrates this and results in a single row, because  is the only
row index label in common:
"""

#do an inner joint instead of outer join as default, result in one row
print(df1)
print('\n')
print(df3)
pd.concat([df1, df3], axis=1, join='inner') #only row index '2' is in common, remove all NaN

   a  b  c
0  0  1  2
1  3  4  5
2  6  7  8


    a   b
2  20  21
3  22  23
4  24  25


   a  b  c   a   b
2  6  7  8  20  21

In [21]:
# label group of data along the columns using keys parameter
df = pd.concat([df1, df2],
               axis=1,
               keys=['df1','df2']) # 'df1' and 'df2' are column name/ key hirerache
df

  df1       df2        
    a  b  c   a   b   c
0   0  1  2   9  10  11
1   3  4  5  12  13  14
2   6  7  8  15  16  17

In [22]:
 #Chon tat ca cac hang; cot co key 'df2'
df.loc[:, 'df2']

    a   b   c
0   9  10  11
1  12  13  14
2  15  16  17

# Appending versus concatenation

In [24]:
#append does a concatenate along axis = 0; 
#duplicate row index labels can results
print(df1)
print(df2)
df1.append(df2)

   a  b  c
0  0  1  2
1  3  4  5
2  6  7  8
    a   b   d
0   9  10  11
1  12  13  14
2  15  16  17


    a   b    c     d
0   0   1  2.0   NaN
1   3   4  5.0   NaN
2   6   7  8.0   NaN
0   9  10  NaN  11.0
1  12  13  NaN  14.0
2  15  16  NaN  17.0

# Ignoring the index labels

In [25]:
df1.append(df2, ignore_index=True)

    a   b    c     d
0   0   1  2.0   NaN
1   3   4  5.0   NaN
2   6   7  8.0   NaN
3   9  10  NaN  11.0
4  12  13  NaN  14.0
5  15  16  NaN  17.0

# Merging and joining data

In [26]:
# MERGE

In [27]:
# these are our customers
customers = {'CustomerID': [10, 11],
             'Name': ['Mike', 'Marcia'],
             'Address': ['Address for Mike',
                         'Address for Marcia']}
customers

{'CustomerID': [10, 11],
 'Name': ['Mike', 'Marcia'],
 'Address': ['Address for Mike', 'Address for Marcia']}

In [28]:
customers = pd.DataFrame(customers)
customers

   CustomerID    Name             Address
0          10    Mike    Address for Mike
1          11  Marcia  Address for Marcia

In [29]:
 # and these are the orders made by our customers; they are related to customers by CustomerID
orders = {'CustomerID': [10, 11, 10],
          'OrderDate': [date(2014, 12, 1),
                        date(2014, 12, 1),
                        date(2014, 12, 1)]}
orders = pd.DataFrame(orders)
orders

   CustomerID   OrderDate
0          10  2014-12-01
1          11  2014-12-01
2          10  2014-12-01

In [30]:
# merge customers and orders so we can ship the items
customers.merge(orders)

   CustomerID    Name             Address   OrderDate
0          10    Mike    Address for Mike  2014-12-01
1          10    Mike    Address for Mike  2014-12-01
2          11  Marcia  Address for Marcia  2014-12-01

In [31]:
""" Under the hood of merge:
1. It determines the columns in both DVTUPNFST and PSEFST with common labels. 
These columns are treated as the keys to perform the join.
2. It creates a new %BUB'SBNF, whose columns are the labels from the keys
identified in step 1, followed by all the non-key labels from both the objects.
3. It matches values in the key columns of both %BUB'SBNF objects.
4. It then creates a row in the result for each set of matching labels.
5. It then copies the data from those matching rows from each source object into
that respective row and columns of the result.
6. It assigns a new *OU*OEFY to the result.
"""

In [32]:
#Merging and joining data _ JOIN
# data to be used in the remainder of this section's examples
left_data = {'key1': ['a', 'b', 'c'],
            'key2': ['x', 'y', 'z'],
            'lval1': [ 0, 1, 2]}
right_data = {'key1': ['a', 'b', 'c'],
              'key2': ['x', 'a', 'z'],
              'rval1': [ 6, 7, 8 ]}
left = pd.DataFrame(left_data, index=[0, 1, 2])
right = pd.DataFrame(right_data, index=[1, 2, 3])
print(left)
print(right)

  key1 key2  lval1
0    a    x      0
1    b    y      1
2    c    z      2
  key1 key2  rval1
1    a    x      6
2    b    a      7
3    c    z      8


In [33]:
# demonstrate merge without specifying columns to merge; 
    #this will implicitly merge on all common columns
#left and right have the same a - x and c - z row. 
    #so, merge will combine the row have same a - x and c - z in two DataFramesematics
left.merge(right) 

  key1 key2  lval1  rval1
0    a    x      0      6
1    c    z      2      8

In [34]:
# demonstrate merge using an explicit column; 
    # on needs the value to be in both DataFrame objects: column of 'key1'
left.merge(right, on='key1') #in here key1 will be the 'key point' to merge

  key1 key2_x  lval1 key2_y  rval1
0    a      x      0      x      6
1    b      y      1      a      7
2    c      z      2      z      8

In [35]:
 # merge explicitly using two columns
    # #the same element in key1 and key2 are the key point to merge
left.merge(right, on=['key1', 'key2']) 

  key1 key2  lval1  rval1
0    a    x      0      6
1    c    z      2      8

In [36]:
# join on the row indices of both matrices
    # left and right have the same index 1, 2 so they are key point to merge
pd.merge(left, right, left_index=True, right_index=True) 

  key1_x key2_x  lval1 key1_y key2_y  rval1
1      b      y      1      a      x      6
2      c      z      2      b      a      7

# Specifying the join semantics (ngữ nghĩa học) of a merge operation

In [None]:
"""
The default type of join performed by pd.merge() is an inner join. 
To use another join method, specify the join type using the how parameter of the pd.merge()
function (or the .merge() method). The valid options are:
    inner: This is the intersection of keys from both DataFrame objects
    outer: This is the union of keys from both DataFrame objects
    left: This only uses keys from the left DataFrame
    right: This only uses keys from the right DataFrame
"""

In [37]:

    # NaN filled in the unmatched  portion. The following code demonstrates an outer join:
    # outer join, merges all matched data of right to left, 
        # and fills unmatched items with NaN (ko tinh index)
left.merge(right, how='outer') 

  key1 key2  lval1  rval1
0    a    x    0.0    6.0
1    b    y    1.0    NaN
2    c    z    2.0    8.0
3    b    a    NaN    7.0

# Pivoting data to and from value and indexes

In [40]:

""" lua chon data tu value trong mot cot; vi du 1 cot co value X, Y, Z. dung .pivot() de xep lai cac value; pivot a level of column labels to row index (bien ca gia tri trong cot interval thanh row index ) """

sensor_readings = pd.read_csv("accel.csv")
sensor_readings

    interval axis  reading
0          0    X      0.0
1          0    Y      0.5
2          0    Z      1.0
3          1    X      0.1
4          1    Y      0.4
..       ...  ...      ...
7          2    Y      0.3
8          2    Z      0.8
9          3    X      0.3
10         3    Y      0.2
11         3    Z      0.7

[12 rows x 3 columns]

In [None]:
"""
what if you want to know the values for all axes at a given time and not
just the Y axis. To do this, you can perform a selection for each value of the axis, but that is
repetitive code and does not handle the scenario of new axis values being inserted into
DataFrame without a change to the code.
"""

In [41]:
# To convert to this form, use the DataFrame objects' pivot function:
sensor_readings.pivot(index='interval', 
        #dua cac gia tri trong interval thanh index (0, 1, 2...)
                      columns='axis', 
        #lay cac gia tri (X, Y, Z, X, Y, Z,...) trong cot 'axis' thanh 3 colum label (X, Y, Z)
                      values='reading',) 
        #doc cac gia tri trong cot 'reading' thanh value cho 3 cot (X, Y, Z)

axis        X    Y    Z
interval               
0         0.0  0.5  1.0
1         0.1  0.4  0.9
2         0.2  0.3  0.8
3         0.3  0.2  0.7

# Stacking and unstacking

In [None]:
"""
 The process of stacking pivots (quay, xoay truc) a level of column labels to the row index. 
 Unstacking performs the opposite, that is, 
 pivoting a level of the row index into the column index.
"""

In [None]:
"""
One of the differences between stacking/unstacking and performing a pivot is that unlike
pivots, the stack and unstack functions are able to pivot specific levels of a hierarchical
index

where a pivot retains the same number of levels on an index, a stack and
unstack always increases the levels on the index of one of the axes (columns for unstacking
and rows for stacking) and decrease the levels on the other axis
"""

In [42]:
#stacking using non-hierarchical indexes
df = pd.DataFrame({'a': [1,2],
                   'b' : [3, 4]},
                  index={'one', 'two'})
df

     a  b
two  1  3
one  2  4

In [43]:
stacked1 = df.stack() 
#push the column (a) to another level of index; the result is a Series where 
    #value are looked up through a multi-index
stacked1

two  a    1
     b    3
one  a    2
     b    4
dtype: int64

In [44]:
stacked1[('one','a')] #to access mulitple-index we have to pass a tuple
stacked1[('one','b')]

4

In [None]:
#unstacking using hierarchical indexes
"""
Unstacking will perform a similar operation in the opposite direction, by moving a level of
the row index into a level of the column's axis
"""

In [47]:
# make two copies of the sensor data, one for each user
user1 = sensor_readings.copy()
print(user1)
user2 = sensor_readings.copy()
print(user2)

    interval axis  reading
0          0    X      0.0
1          0    Y      0.5
2          0    Z      1.0
3          1    X      0.1
4          1    Y      0.4
..       ...  ...      ...
7          2    Y      0.3
8          2    Z      0.8
9          3    X      0.3
10         3    Y      0.2
11         3    Z      0.7

[12 rows x 3 columns]
    interval axis  reading
0          0    X      0.0
1          0    Y      0.5
2          0    Z      1.0
3          1    X      0.1
4          1    Y      0.4
..       ...  ...      ...
7          2    Y      0.3
8          2    Z      0.8
9          3    X      0.3
10         3    Y      0.2
11         3    Z      0.7

[12 rows x 3 columns]


In [54]:
# add names to the two copies
user1['who'] = 'Mickey'
user2['who'] = 'TOM'
print(user1)
print(user2)

    interval axis  reading     who
0          0    X      0.0  Mickey
1          0    Y      0.5  Mickey
2          0    Z      1.0  Mickey
3          1    X      0.1  Mickey
4          1    Y      0.4  Mickey
..       ...  ...      ...     ...
7          2    Y      0.3  Mickey
8          2    Z      0.8  Mickey
9          3    X      0.3  Mickey
10         3    Y      0.2  Mickey
11         3    Z      0.7  Mickey

[12 rows x 4 columns]
    interval axis  reading  who
0          0    X      0.0  TOM
1          0    Y     50.0  TOM
2          0    Z    100.0  TOM
3          1    X     10.0  TOM
4          1    Y     40.0  TOM
..       ...  ...      ...  ...
7          2    Y     30.0  TOM
8          2    Z     80.0  TOM
9          3    X     30.0  TOM
10         3    Y     20.0  TOM
11         3    Z     70.0  TOM

[12 rows x 4 columns]


In [55]:
# for demonstration, lets scale user2's readings
user2['reading'] *= 100

In [57]:
 # and reorganize this to have a hierarchical row index
multi_user_sensor_data = pd.concat([user1, user2]).set_index(['who', 'interval', 'axis'])
multi_user_sensor_data

                      reading
who    interval axis         
Mickey 0        X         0.0
                Y         0.5
                Z         1.0
       1        X         0.1
                Y         0.4
...                       ...
TOM    2        Y      3000.0
                Z      8000.0
       3        X      3000.0
                Y      2000.0
                Z      7000.0

[24 rows x 1 columns]

In [60]:
# lookup user data for Mike using just the index
multi_user_sensor_data.loc['Mickey']

               reading
interval axis         
0        X         0.0
         Y         0.5
         Z         1.0
1        X         0.1
         Y         0.4
...                ...
2        Y         0.3
         Z         0.8
3        X         0.3
         Y         0.2
         Z         0.7

[12 rows x 1 columns]

In [61]:
# readings for all users and axes at interval 1
multi_user_sensor_data.xs(1, level='interval')

             reading
who    axis         
Mickey X         0.1
       Y         0.4
       Z         0.9
TOM    X      1000.0
       Y      4000.0
       Z      9000.0

In [63]:
"""Unstacking will move the last level of the row index into a new level of the column index,
resulting in columns having MultiIndex. The following demonstrates the last level of this
unstacking (the axis level of the index)"""

In [64]:
# unstack the axis level (latest level of the index)
multi_user_sensor_data.unstack()

                reading                 
axis                  X       Y        Z
who    interval                         
Mickey 0            0.0     0.5      1.0
       1            0.1     0.4      0.9
       2            0.2     0.3      0.8
       3            0.3     0.2      0.7
TOM    0            0.0  5000.0  10000.0
       1         1000.0  4000.0   9000.0
       2         2000.0  3000.0   8000.0
       3         3000.0  2000.0   7000.0

In [65]:
# To unstack a different level, use the level parameter.
    # The following code unstacks the first level (level = 0)
multi_user_sensor_data.unstack(level=0) # column 'who': level 0 become row

              reading         
who            Mickey      TOM
interval axis                 
0        X        0.0      0.0
         Y        0.5   5000.0
         Z        1.0  10000.0
1        X        0.1   1000.0
         Y        0.4   4000.0
...               ...      ...
2        Y        0.3   3000.0
         Z        0.8   8000.0
3        X        0.3   3000.0
         Y        0.2   2000.0
         Z        0.7   7000.0

[12 rows x 2 columns]

In [67]:
# Multiple levels can be unstacked simultaneously by passing a list of the levels to .unstack()
    # Unstack level 0 'who' and level 2 'axis'
unstacked = multi_user_sensor_data.unstack(['who', 'axis']) 
unstacked

         reading                                   
who       Mickey               TOM                 
axis           X    Y    Z       X       Y        Z
interval                                           
0            0.0  0.5  1.0     0.0  5000.0  10000.0
1            0.1  0.4  0.9  1000.0  4000.0   9000.0
2            0.2  0.3  0.8  2000.0  3000.0   8000.0
3            0.3  0.2  0.7  3000.0  2000.0   7000.0

In [68]:
#  we can restack this data. 
#The following code will stack the who level of the column back into the row index:
unstacked.stack(level='who') #we can of course stack what we have unstacked this re-stacks who

                reading                 
axis                  X       Y        Z
interval who                            
0        Mickey     0.0     0.5      1.0
         TOM        0.0  5000.0  10000.0
1        Mickey     0.1     0.4      0.9
         TOM     1000.0  4000.0   9000.0
2        Mickey     0.2     0.3      0.8
         TOM     2000.0  3000.0   8000.0
3        Mickey     0.3     0.2      0.7
         TOM     3000.0  2000.0   7000.0

In [None]:
"""
Note:
    stacking and unstacking always move the levels into the last levels of the other index 
    => unstack may don't make the result come back the same as the origin
    pivit, stack, unstack doesn't change data, it just is reorganized
"""

# Melting data to and from long and wide format

In [None]:
"""
melting is the process of reshaping a DataFrame object into a format
where two or more columns, referred to as variable and value, 
are created by unpivoting column labels in the variable column, and then moving the data from these
columns into the appropriate location in the value column

changing a DataFrame object
from wide format to long format
"""

In [69]:
# we will demonstrate melting with this DataFrame
data = pd.DataFrame({'Name' : ['Mike', 'Mikael'],
                     'Height' : [6.1, 6.0],
                     'Weight' : [220, 185]})
data

     Name  Height  Weight
0    Mike     6.1     220
1  Mikael     6.0     185

In [70]:
# melt it, use Name as the id's,
    # Height and Weight columns as the variables
pd.melt(data,
        # Using Name column as identifier column
        id_vars=['Name'],
        # Height and Weight columns as measured variables; 
        # Height and Weight column un-pivoted into the variable column
        value_vars=['Height', 'Weight'])   


     Name variable  value
0    Mike   Height    6.1
1  Mikael   Height    6.0
2    Mike   Weight  220.0
3  Mikael   Weight  185.0

# Performance benefits of stacked data

In [None]:
"""It can be shown that stacked
data is more efficient than using lookup through a single level index and then a column
lookup, or even compared to an JMPD lookup that specifies the row and column by
location"""
