## Combining Pandas Objects

In [1]:
import pandas as pd
import numpy as np
pd.set_option('max_columns', 7,'display.expand_frame_repr', True, # 'max_rows', 10, 
    'max_colwidth', 9, 'max_rows', 10, #'precision', 2
)#, 'width', 45)
pd.set_option('display.width', 65)

## Introduction

## Appending new rows to DataFrames

### How to do it...

In [2]:
names = pd.read_csv('data/names.csv')
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2


In [3]:
new_data_list = ['Aria', 1]
names.loc[4] = new_data_list
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1


In [4]:
names.loc['five'] = ['Zach', 3]
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1
five,Zach,3


In [5]:
names.loc[len(names)] = {'Name':'Zayd', 'Age':2}
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1
five,Zach,3
6,Zayd,2


In [6]:
names.loc[len(names)] = pd.Series({'Age':32, 'Name':'Dean'})
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1
five,Zach,3
6,Zayd,2
7,Dean,32


In [7]:
names = pd.read_csv('data/names.csv')
names.append({'Name':'Aria', 'Age':1})

TypeError: Can only append a Series if ignore_index=True or if the Series has a name

In [8]:
names.append({'Name':'Aria', 'Age':1}, ignore_index=True)

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1


In [9]:
names.index = ['Canada', 'Canada', 'USA', 'USA']
names

Unnamed: 0,Name,Age
Canada,Cornelia,70
Canada,Abbas,69
USA,Penelope,4
USA,Niko,2


In [10]:
s = pd.Series({'Name': 'Zach', 'Age': 3}, name=len(names))
s

Name    Zach
Age        3
Name: 4, dtype: object

In [11]:
names.append(s)

Unnamed: 0,Name,Age
Canada,Cornelia,70
Canada,Abbas,69
USA,Penelope,4
USA,Niko,2
4,Zach,3


In [12]:
s1 = pd.Series({'Name': 'Zach', 'Age': 3}, name=len(names))
s2 = pd.Series({'Name': 'Zayd', 'Age': 2}, name='USA')
names.append([s1, s2])

Unnamed: 0,Name,Age
Canada,Cornelia,70
Canada,Abbas,69
USA,Penelope,4
USA,Niko,2
4,Zach,3
USA,Zayd,2


In [13]:
bball_16 = pd.read_csv('data/baseball16.csv')
bball_16

Unnamed: 0,playerID,yearID,stint,...,SH,SF,GIDP
0,altuv...,2016,1,...,3.0,7.0,15.0
1,bregm...,2016,1,...,0.0,1.0,1.0
2,castr...,2016,1,...,1.0,0.0,9.0
3,corre...,2016,1,...,0.0,3.0,12.0
4,gatti...,2016,1,...,0.0,5.0,12.0
...,...,...,...,...,...,...,...
11,reedaj01,2016,1,...,0.0,1.0,1.0
12,sprin...,2016,1,...,0.0,1.0,12.0
13,tucke...,2016,1,...,0.0,0.0,2.0
14,valbu...,2016,1,...,3.0,2.0,5.0


In [14]:
data_dict = bball_16.iloc[0].to_dict()
data_dict

{'playerID': 'altuvjo01',
 'yearID': 2016,
 'stint': 1,
 'teamID': 'HOU',
 'lgID': 'AL',
 'G': 161,
 'AB': 640,
 'R': 108,
 'H': 216,
 '2B': 42,
 '3B': 5,
 'HR': 24,
 'RBI': 96.0,
 'SB': 30.0,
 'CS': 10.0,
 'BB': 60,
 'SO': 70.0,
 'IBB': 11.0,
 'HBP': 7.0,
 'SH': 3.0,
 'SF': 7.0,
 'GIDP': 15.0}

In [15]:
new_data_dict = {k: '' if isinstance(v, str) else
    np.nan for k, v in data_dict.items()}
new_data_dict

{'playerID': '',
 'yearID': nan,
 'stint': nan,
 'teamID': '',
 'lgID': '',
 'G': nan,
 'AB': nan,
 'R': nan,
 'H': nan,
 '2B': nan,
 '3B': nan,
 'HR': nan,
 'RBI': nan,
 'SB': nan,
 'CS': nan,
 'BB': nan,
 'SO': nan,
 'IBB': nan,
 'HBP': nan,
 'SH': nan,
 'SF': nan,
 'GIDP': nan}

### How it works...

### There's more...

In [16]:
random_data = []
for i in range(1000):   # doctest: +SKIP
    d = dict()
    for k, v in data_dict.items():
        if isinstance(v, str):
            d[k] = np.random.choice(list('abcde'))
        else:
            d[k] = np.random.randint(10)
    random_data.append(pd.Series(d, name=i + len(bball_16)))
random_data[0]

playerID    a
yearID      9
stint       0
teamID      d
lgID        a
           ..
IBB         4
HBP         8
SH          2
SF          8
GIDP        5
Name: 16, Length: 22, dtype: object

## Concatenating multiple DataFrames together

### How to do it...

In [17]:
stocks_2016 = pd.read_csv('data/stocks_2016.csv',
    index_col='Symbol')
stocks_2017 = pd.read_csv('data/stocks_2017.csv',
    index_col='Symbol')

In [18]:
stocks_2016

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,80,95,110
TSLA,50,80,130
WMT,40,55,70


In [19]:
stocks_2017

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,50,120,140
GE,100,30,40
IBM,87,75,95
SLB,20,55,85
TXN,500,15,23
TSLA,100,100,300


In [20]:
s_list = [stocks_2016, stocks_2017]
pd.concat(s_list)

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,80,95,110
TSLA,50,80,130
WMT,40,55,70
AAPL,50,120,140
GE,100,30,40
IBM,87,75,95
SLB,20,55,85
TXN,500,15,23
TSLA,100,100,300


In [21]:
pd.concat(s_list, keys=['2016', '2017'],
   names=['Year', 'Symbol'])  

Unnamed: 0_level_0,Unnamed: 1_level_0,Shares,Low,High
Year,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,AAPL,80,95,110
2016,TSLA,50,80,130
2016,WMT,40,55,70
2017,AAPL,50,120,140
2017,GE,100,30,40
2017,IBM,87,75,95
2017,SLB,20,55,85
2017,TXN,500,15,23
2017,TSLA,100,100,300


In [22]:
pd.concat(s_list, keys=['2016', '2017'],
    axis='columns', names=['Year', None])    

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Year,2016,2016,2016,2017,2017,2017
Unnamed: 0_level_1,Shares,Low,High,Shares,Low,High
AAPL,80.0,95.0,110.0,50.0,120.0,140.0
GE,,,,100.0,30.0,40.0
IBM,,,,87.0,75.0,95.0
SLB,,,,20.0,55.0,85.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0
TXN,,,,500.0,15.0,23.0
WMT,40.0,55.0,70.0,,,


In [23]:
pd.concat(s_list, join='inner', keys=['2016', '2017'],
    axis='columns', names=['Year', None])

Year,2016,2016,2016,2017,2017,2017
Unnamed: 0_level_1,Shares,Low,High,Shares,Low,High
Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
AAPL,80,95,110,50,120,140
TSLA,50,80,130,100,100,300


### How it works...

### There's more...

In [24]:
stocks_2016.append(stocks_2017)

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,80,95,110
TSLA,50,80,130
WMT,40,55,70
AAPL,50,120,140
GE,100,30,40
IBM,87,75,95
SLB,20,55,85
TXN,500,15,23
TSLA,100,100,300


## Understanding the differences between concat, join, and merge

### How to do it...

In [25]:
from IPython.display import display_html
years = 2016, 2017, 2018
stock_tables = [pd.read_csv(
    'data/stocks_{}.csv'.format(year), index_col='Symbol')
    for year in years]
stocks_2016, stocks_2017, stocks_2018 = stock_tables
stocks_2016

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,80,95,110
TSLA,50,80,130
WMT,40,55,70


In [26]:
stocks_2017

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,50,120,140
GE,100,30,40
IBM,87,75,95
SLB,20,55,85
TXN,500,15,23
TSLA,100,100,300


In [27]:
stocks_2018

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,40,135,170
AMZN,8,900,1125
TSLA,50,220,400


In [28]:
pd.concat(stock_tables, keys=[2016, 2017, 2018])

Unnamed: 0_level_0,Unnamed: 1_level_0,Shares,Low,High
Unnamed: 0_level_1,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,AAPL,80,95,110
2016,TSLA,50,80,130
2016,WMT,40,55,70
2017,AAPL,50,120,140
2017,GE,100,30,40
2017,...,...,...,...
2017,TXN,500,15,23
2017,TSLA,100,100,300
2018,AAPL,40,135,170
2018,AMZN,8,900,1125


In [29]:
pd.concat(dict(zip(years, stock_tables)), axis='columns')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,2016,2016,2016,...,2018,2018,2018
Unnamed: 0_level_1,Shares,Low,High,...,Shares,Low,High
AAPL,80.0,95.0,110.0,...,40.0,135.0,170.0
AMZN,,,,...,8.0,900.0,1125.0
GE,,,,...,,,
IBM,,,,...,,,
SLB,,,,...,,,
TSLA,50.0,80.0,130.0,...,50.0,220.0,400.0
TXN,,,,...,,,
WMT,40.0,55.0,70.0,...,,,


In [30]:
stocks_2016.join(stocks_2017, lsuffix='_2016',
    rsuffix='_2017', how='outer')

Unnamed: 0_level_0,Shares_2016,Low_2016,High_2016,Shares_2017,Low_2017,High_2017
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,80.0,95.0,110.0,50.0,120.0,140.0
GE,,,,100.0,30.0,40.0
IBM,,,,87.0,75.0,95.0
SLB,,,,20.0,55.0,85.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0
TXN,,,,500.0,15.0,23.0
WMT,40.0,55.0,70.0,,,


In [31]:
other = [stocks_2017.add_suffix('_2017'),
    stocks_2018.add_suffix('_2018')]
stocks_2016.add_suffix('_2016').join(other, how='outer')

Unnamed: 0,Shares_2016,Low_2016,High_2016,...,Shares_2018,Low_2018,High_2018
AAPL,80.0,95.0,110.0,...,40.0,135.0,170.0
TSLA,50.0,80.0,130.0,...,50.0,220.0,400.0
WMT,40.0,55.0,70.0,...,,,
GE,,,,...,,,
IBM,,,,...,,,
SLB,,,,...,,,
TXN,,,,...,,,
AMZN,,,,...,8.0,900.0,1125.0


In [32]:
stock_join = stocks_2016.add_suffix('_2016').join(other,
    how='outer')
stock_concat = pd.concat(dict(zip(years,stock_tables)),
    axis='columns')
level_1 = stock_concat.columns.get_level_values(1)
level_0 = stock_concat.columns.get_level_values(0).astype(str)
stock_concat.columns = level_1 + '_' + level_0
stock_join.equals(stock_concat)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


False

In [33]:
stocks_2016.merge(stocks_2017, left_index=True,
    right_index=True)

Unnamed: 0_level_0,Shares_x,Low_x,High_x,Shares_y,Low_y,High_y
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,80,95,110,50,120,140
TSLA,50,80,130,100,100,300


In [34]:
step1 = stocks_2016.merge(stocks_2017, left_index=True,
    right_index=True, how='outer',
    suffixes=('_2016', '_2017'))
stock_merge = step1.merge(stocks_2018.add_suffix('_2018'),
    left_index=True, right_index=True,
    how='outer')
stock_concat.equals(stock_merge)

True

In [35]:
names = ['prices', 'transactions']
food_tables = [pd.read_csv('data/food_{}.csv'.format(name))
    for name in names]
food_prices, food_transactions = food_tables
food_prices

Unnamed: 0,item,store,price,Date
0,pear,A,0.99,2017
1,pear,B,1.99,2017
2,peach,A,2.99,2017
3,peach,B,3.49,2017
4,banana,A,0.39,2017
5,banana,B,0.49,2017
6,steak,A,5.99,2017
7,steak,B,6.99,2017
8,steak,B,4.99,2015


In [36]:
food_transactions

Unnamed: 0,custid,item,store,quantity
0,1,pear,A,5
1,1,banana,A,10
2,2,steak,B,3
3,2,pear,B,1
4,2,peach,B,2
5,2,steak,B,1
6,2,coconut,B,4


In [37]:
food_transactions.merge(food_prices, on=['item', 'store'])    

Unnamed: 0,custid,item,store,quantity,price,Date
0,1,pear,A,5,0.99,2017
1,1,banana,A,10,0.39,2017
2,2,steak,B,3,6.99,2017
3,2,steak,B,3,4.99,2015
4,2,steak,B,1,6.99,2017
5,2,steak,B,1,4.99,2015
6,2,pear,B,1,1.99,2017
7,2,peach,B,2,3.49,2017


In [38]:
food_transactions.merge(food_prices.query('Date == 2017'),
    how='left')

Unnamed: 0,custid,item,store,quantity,price,Date
0,1,pear,A,5,0.99,2017.0
1,1,banana,A,10,0.39,2017.0
2,2,steak,B,3,6.99,2017.0
3,2,pear,B,1,1.99,2017.0
4,2,peach,B,2,3.49,2017.0
5,2,steak,B,1,6.99,2017.0
6,2,coconut,B,4,,


In [39]:
food_prices_join = food_prices.query('Date == 2017') \
   .set_index(['item', 'store'])
food_prices_join    

Unnamed: 0_level_0,Unnamed: 1_level_0,price,Date
item,store,Unnamed: 2_level_1,Unnamed: 3_level_1
pear,A,0.99,2017
pear,B,1.99,2017
peach,A,2.99,2017
peach,B,3.49,2017
banana,A,0.39,2017
banana,B,0.49,2017
steak,A,5.99,2017
steak,B,6.99,2017


In [40]:
food_transactions.join(food_prices_join, on=['item', 'store'])

Unnamed: 0,custid,item,store,quantity,price,Date
0,1,pear,A,5,0.99,2017.0
1,1,banana,A,10,0.39,2017.0
2,2,steak,B,3,6.99,2017.0
3,2,pear,B,1,1.99,2017.0
4,2,peach,B,2,3.49,2017.0
5,2,steak,B,1,6.99,2017.0
6,2,coconut,B,4,,


In [41]:
pd.concat([food_transactions.set_index(['item', 'store']),
           food_prices.set_index(['item', 'store'])],
          axis='columns')

ValueError: cannot handle a non-unique multi-index!

### How it works...

### There's more...

In [42]:
import glob
df_list = []
for filename in glob.glob('data/gas prices/*.csv'):
    df_list.append(pd.read_csv(filename, index_col='Week',
    parse_dates=['Week']))
gas = pd.concat(df_list, axis='columns')
gas

Unnamed: 0_level_0,Midgrade,Premium,Diesel,All Grades,Regular
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-25,2.859,3.105,2.788,2.701,2.583
2017-09-18,2.906,3.151,2.791,2.750,2.634
2017-09-11,2.953,3.197,2.802,2.800,2.685
2017-09-04,2.946,3.191,2.758,2.794,2.679
2017-08-28,2.668,2.901,2.605,2.513,2.399
...,...,...,...,...,...
2007-01-29,2.277,2.381,2.413,2.213,2.165
2007-01-22,2.285,2.391,2.430,2.216,2.165
2007-01-15,2.347,2.453,2.463,2.280,2.229
2007-01-08,2.418,2.523,2.537,2.354,2.306


## Connecting to SQL databases

### How to do it...

In [43]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:///data/chinook.db')

In [44]:
tracks = pd.read_sql_table('tracks', engine)
tracks

Unnamed: 0,TrackId,Name,AlbumId,...,Milliseconds,Bytes,UnitPrice
0,1,For T...,1,...,343719,11170334,0.99
1,2,Balls...,2,...,342562,5510424,0.99
2,3,Fast ...,3,...,230619,3990994,0.99
3,4,Restl...,3,...,252051,4331779,0.99
4,5,Princ...,3,...,375418,6290521,0.99
...,...,...,...,...,...,...,...
3498,3499,Pini ...,343,...,286741,4718950,0.99
3499,3500,Strin...,344,...,139200,2283131,0.99
3500,3501,L'orf...,345,...,66639,1189062,0.99
3501,3502,Quint...,346,...,221331,3665114,0.99


In [45]:
(pd.read_sql_table('genres', engine)
     .merge(tracks[['GenreId', 'Milliseconds']],
            on='GenreId', how='left') 
     .drop('GenreId', axis='columns')
)

Unnamed: 0,Name,Milliseconds
0,Rock,343719
1,Rock,342562
2,Rock,230619
3,Rock,252051
4,Rock,375418
...,...,...
3498,Class...,286741
3499,Class...,139200
3500,Class...,66639
3501,Class...,221331


In [46]:
(pd.read_sql_table('genres', engine)
     .merge(tracks[['GenreId', 'Milliseconds']],
            on='GenreId', how='left') 
     .drop('GenreId', axis='columns')
     .groupby('Name')
     ['Milliseconds']
     .mean()
     .pipe(lambda s_: pd.to_timedelta(s_, unit='ms'))
     .dt.floor('s')
     .sort_values()
)

Name
Rock And Roll      00:02:14
Opera              00:02:54
Hip Hop/Rap        00:02:58
Easy Listening     00:03:09
Bossa Nova         00:03:39
                     ...   
Comedy             00:26:25
TV Shows           00:35:45
Drama              00:42:55
Science Fiction    00:43:45
Sci Fi & Fantasy   00:48:31
Name: Milliseconds, Length: 25, dtype: timedelta64[ns]

In [47]:
cust = pd.read_sql_table('customers', engine,
    columns=['CustomerId','FirstName',
    'LastName'])
invoice = pd.read_sql_table('invoices', engine,
    columns=['InvoiceId','CustomerId'])
ii = pd.read_sql_table('invoice_items', engine,
    columns=['InvoiceId', 'UnitPrice', 'Quantity'])
(cust
    .merge(invoice, on='CustomerId') 
    .merge(ii, on='InvoiceId')
)

Unnamed: 0,CustomerId,FirstName,LastName,InvoiceId,UnitPrice,Quantity
0,1,Luís,Gonça...,98,1.99,1
1,1,Luís,Gonça...,98,1.99,1
2,1,Luís,Gonça...,121,0.99,1
3,1,Luís,Gonça...,121,0.99,1
4,1,Luís,Gonça...,121,0.99,1
...,...,...,...,...,...,...
2235,59,Puja,Sriva...,284,0.99,1
2236,59,Puja,Sriva...,284,0.99,1
2237,59,Puja,Sriva...,284,0.99,1
2238,59,Puja,Sriva...,284,0.99,1


In [48]:
(cust
    .merge(invoice, on='CustomerId') 
    .merge(ii, on='InvoiceId')
    .assign(Total=lambda df_:df_.Quantity * df_.UnitPrice)
    .groupby(['CustomerId', 'FirstName', 'LastName'])
    ['Total']
    .sum()
    .sort_values(ascending=False) 
)

CustomerId  FirstName  LastName  
6           Helena     Holý          49.62
26          Richard    Cunningham    47.62
57          Luis       Rojas         46.62
46          Hugh       O'Reilly      45.62
45          Ladislav   Kovács        45.62
                                     ...  
32          Aaron      Mitchell      37.62
31          Martha     Silk          37.62
29          Robert     Brown         37.62
27          Patrick    Gray          37.62
59          Puja       Srivastava    36.64
Name: Total, Length: 59, dtype: float64

### How it works...

### There's more...

In [49]:
sql_string1 = '''
SELECT
    Name,
    time(avg(Milliseconds) / 1000, 'unixepoch') as avg_time
FROM (
      SELECT
          g.Name,
          t.Milliseconds
      FROM
          genres as g
      JOIN
          tracks as t on
          g.genreid == t.genreid
     )
GROUP BY Name
ORDER BY avg_time'''
pd.read_sql_query(sql_string1, engine)

Unnamed: 0,Name,avg_time
0,Rock ...,00:02:14
1,Opera,00:02:54
2,Hip H...,00:02:58
3,Easy ...,00:03:09
4,Bossa...,00:03:39
...,...,...
20,Comedy,00:26:25
21,TV Shows,00:35:45
22,Drama,00:42:55
23,Scien...,00:43:45


In [50]:
sql_string2 = '''
   SELECT
         c.customerid,
         c.FirstName,
         c.LastName,
         sum(ii.quantity * ii.unitprice) as Total
   FROM
        customers as c
   JOIN
        invoices as i
        on c.customerid = i.customerid
   JOIN
       invoice_items as ii
       on i.invoiceid = ii.invoiceid
   GROUP BY
       c.customerid, c.FirstName, c.LastName
   ORDER BY
       Total desc'''

In [51]:
pd.read_sql_query(sql_string2, engine)

Unnamed: 0,CustomerId,FirstName,LastName,Total
0,6,Helena,Holý,49.62
1,26,Richard,Cunni...,47.62
2,57,Luis,Rojas,46.62
3,45,Ladislav,Kovács,45.62
4,46,Hugh,O'Reilly,45.62
...,...,...,...,...
54,53,Phil,Hughes,37.62
55,54,Steve,Murray,37.62
56,55,Mark,Taylor,37.62
57,56,Diego,Gutié...,37.62
