# Concatenating DataFrames

There are times where we will want to concatenate dataframes that share common row labels and/or common column names.

In [5]:
import os
from glob import glob

import numpy as np
import pandas as pd

# Load both sales tables, using each file's 'month' column as the row index.
sales1, sales2 = (
    pd.read_csv(csv_path, index_col='month')
    for csv_path in ('./data/sales.csv', './data/sales4.csv')
)

# Stack the frames on top of each other (row-wise). Columns present in only
# one frame are filled with NaN for the rows that came from the other frame.
pd.concat(
    [sales1, sales2],
    sort=True,
)

Unnamed: 0_level_0,bread,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,,47,12.0,17
Feb,,110,50.0,31
Mar,,221,89.0,72
Apr,,77,87.0,20
May,,132,,52
Jun,,205,60.0,55
Jan,23.0,47,12.0,17
Feb,54.0,110,50.0,31
Apr,11.0,221,89.0,72
Aug,32.0,77,87.0,20


This results in index labels being duplicated. In these cases we can create a hierarchical (multilevel) index using the `keys` argument, with the labels from `keys` being used as the outermost index level.

In [10]:
# Tag the rows of each frame with a year so the result carries a two-level
# (year, month) row index instead of repeated month labels.
pd.concat(
    [sales1, sales2],
    keys=[2017, 2018],
    sort=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,bread,eggs,salt,spam
Unnamed: 0_level_1,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017,Jan,,47,12.0,17
2017,Feb,,110,50.0,31
2017,Mar,,221,89.0,72
2017,Apr,,77,87.0,20
2017,May,,132,,52
2017,Jun,,205,60.0,55
2018,Jan,23.0,47,12.0,17
2018,Feb,54.0,110,50.0,31
2018,Apr,11.0,221,89.0,72
2018,Aug,32.0,77,87.0,20


In [7]:
# Join the two frames side by side instead: rows are aligned on their month
# labels, and a month missing from one frame shows NaN on that frame's side.
pd.concat(
    [sales1, sales2],
    axis='columns',
    sort=True,
)

Unnamed: 0,eggs,salt,spam,eggs.1,salt.1,spam.1,bread
Apr,77.0,87.0,20.0,221.0,89.0,72.0,11.0
Aug,,,,77.0,87.0,20.0,32.0
Dec,,,,205.0,60.0,55.0,11.0
Feb,110.0,50.0,31.0,110.0,50.0,31.0,54.0
Jan,47.0,12.0,17.0,47.0,12.0,17.0,23.0
Jun,205.0,60.0,55.0,,,,
Mar,221.0,89.0,72.0,,,,
May,132.0,,52.0,,,,
Sep,,,,132.0,,52.0,55.0


Again, we can use the `keys` argument to create a multilevel index, this time on the columns.

In [8]:
# Tag each frame's columns with a year, which yields a two-level column index
# with the year on the outer level.
pd.concat(
    [sales1, sales2],
    keys=[2017, 2018],
    axis='columns',
    sort=True,
)

Unnamed: 0_level_0,2017,2017,2017,2018,2018,2018,2018
Unnamed: 0_level_1,eggs,salt,spam,eggs,salt,spam,bread
Apr,77.0,87.0,20.0,221.0,89.0,72.0,11.0
Aug,,,,77.0,87.0,20.0,32.0
Dec,,,,205.0,60.0,55.0,11.0
Feb,110.0,50.0,31.0,110.0,50.0,31.0,54.0
Jan,47.0,12.0,17.0,47.0,12.0,17.0,23.0
Jun,205.0,60.0,55.0,,,,
Mar,221.0,89.0,72.0,,,,
May,132.0,,52.0,,,,
Sep,,,,132.0,,52.0,55.0


An alternative is to pass a dict to `pd.concat`; the dict keys are then used as the outer level labels.

In [9]:
# Map each year label to its frame; concat uses the mapping's keys as the
# outer column level, so no separate `keys` argument is needed.
d_obj = dict(zip((2017, 2018), (sales1, sales2)))
pd.concat(
    d_obj,
    axis='columns',
    sort=True,
)

Unnamed: 0_level_0,2017,2017,2017,2018,2018,2018,2018
Unnamed: 0_level_1,eggs,salt,spam,eggs,salt,spam,bread
Apr,77.0,87.0,20.0,221.0,89.0,72.0,11.0
Aug,,,,77.0,87.0,20.0,32.0
Dec,,,,205.0,60.0,55.0,11.0
Feb,110.0,50.0,31.0,110.0,50.0,31.0,54.0
Jan,47.0,12.0,17.0,47.0,12.0,17.0,23.0
Jun,205.0,60.0,55.0,,,,
Mar,221.0,89.0,72.0,,,,
May,132.0,,52.0,,,,
Sep,,,,132.0,,52.0,55.0


Concatenate the medals tables into a multilevel index dataframe.

In [11]:
# Collect the path of every "*_top5.csv" medal table in the medals folder.
medal_pattern = './data/Summer Olympic medals/*_top5.csv'
file_names = glob(medal_pattern)
file_names

['./data/Summer Olympic medals/silver_top5.csv',
 './data/Summer Olympic medals/gold_top5.csv',
 './data/Summer Olympic medals/bronze_top5.csv']

In [19]:
# Derive each key from its file name ('silver_top5.csv' -> 'silver') rather than
# hard-coding a list: glob() does not promise any particular ordering, so a
# fixed key list could silently attach the wrong medal label to a table.
medal_keys = [os.path.basename(f).split('_')[0] for f in file_names]

# One frame per medal file, indexed by country.
medal_dfs = [pd.read_csv(f, index_col='Country') for f in file_names]

# Place the tables side by side under their medal label, then put the outer
# column level in bronze / silver / gold order for display.
medals = pd.concat(medal_dfs, keys=medal_keys, axis='columns', sort=True)
medals = medals[['bronze', 'silver', 'gold']]
medals

Unnamed: 0_level_0,bronze,silver,gold
Unnamed: 0_level_1,Total,Total,Total
France,475.0,461.0,
Germany,454.0,,407.0
Italy,,394.0,460.0
Soviet Union,584.0,627.0,838.0
United Kingdom,505.0,591.0,498.0
United States,1052.0,1195.0,2088.0


In [29]:
# Stack the medal tables row-wise. The keys are taken from the file names
# ('silver_top5.csv' -> 'silver') so that each outer index label matches its
# table no matter what order glob() returned the files in.
medal_keys = [os.path.basename(f).split('_')[0] for f in file_names]
medals = pd.concat(medal_dfs, keys=medal_keys, sort=True)

# Show the result ordered by the outer (medal) level of the row index.
medals.sort_index(level=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
Unnamed: 0_level_1,Country,Unnamed: 2_level_1
bronze,France,475.0
bronze,Germany,454.0
bronze,Soviet Union,584.0
bronze,United Kingdom,505.0
bronze,United States,1052.0
gold,Germany,407.0
gold,Italy,460.0
gold,Soviet Union,838.0
gold,United Kingdom,498.0
gold,United States,2088.0


**Alternatively**

In [28]:
medal_types = ['bronze', 'silver', 'gold']

# Read each medal table into a DataFrame indexed by country, in medal_types order
medal_frames = [
    pd.read_csv("./data/Summer Olympic medals/%s_top5.csv" % medal, index_col='Country')
    for medal in medal_types
]

# Concatenate the tables: medals. Reusing medal_types for the keys (instead of a
# second hand-typed list) guarantees every outer index label matches the file
# its rows were read from.
medals = pd.concat(medal_frames, keys=medal_types)

# Print medals in entirety
print(medals)

                        Total
       Country               
bronze United States   1052.0
       Soviet Union     584.0
       United Kingdom   505.0
       France           475.0
       Germany          454.0
silver United States   1195.0
       Soviet Union     627.0
       United Kingdom   591.0
       France           461.0
       Italy            394.0
gold   United States   2088.0
       Soviet Union     838.0
       United Kingdom   498.0
       Italy            460.0
       Germany          407.0


To ensure the index is sorted, call `sort_index()` on the dataframe.

In [30]:
# Order the rows by the outer (medal) level of the index and keep the result
# under a new name so the unsorted frame stays available.
medals_sorted = medals.sort_index(
    level=0,
)
medals_sorted

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
Unnamed: 0_level_1,Country,Unnamed: 2_level_1
bronze,France,475.0
bronze,Germany,454.0
bronze,Soviet Union,584.0
bronze,United Kingdom,505.0
bronze,United States,1052.0
gold,Germany,407.0
gold,Italy,460.0
gold,Soviet Union,838.0
gold,United Kingdom,498.0
gold,United States,2088.0


In [32]:
# Look up one row by its full (medal, country) key and print it:
# the number of bronze medals won by Germany.
germany_bronze = medals_sorted.loc[('bronze', 'Germany')]
print(germany_bronze)

Total    454.0
Name: (bronze, Germany), dtype: float64


In [33]:
# Selecting on the outer level alone returns every country under that medal;
# print the silver medal table.
silver_medals = medals_sorted.loc['silver']
print(silver_medals)

                 Total
Country               
France           461.0
Italy            394.0
Soviet Union     627.0
United Kingdom   591.0
United States   1195.0


A slicer object, `pd.IndexSlice`, is needed when slicing on the inner level of a MultiIndex dataframe.

In [35]:
# Show all the data on medals won by the United Kingdom: take every value of
# the outer (medal) level and only 'United Kingdom' on the inner level.
uk_rows = pd.IndexSlice[:, 'United Kingdom']
medals_sorted.loc[uk_rows, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
Unnamed: 0_level_1,Country,Unnamed: 2_level_1
bronze,United Kingdom,505.0
gold,United Kingdom,498.0
silver,United Kingdom,591.0


**Concatenating horizontally to get a multilevel index**

In [36]:
# Collect the paths of the February sales files (one per department).
feb_pattern = './data/Sales/feb-*.csv'
file_names = glob(feb_pattern)
file_names

['./data/Sales/feb-sales-Hardware.csv',
 './data/Sales/feb-sales-Software.csv',
 './data/Sales/feb-sales-Service.csv']

In [39]:
# Take each department label from its file name ('feb-sales-Hardware.csv' ->
# 'Hardware') instead of hard-coding the list: glob() does not promise any
# particular ordering, so fixed keys could label a table with the wrong department.
departments = [
    os.path.splitext(os.path.basename(f))[0].split('-')[-1]
    for f in file_names
]

# One frame per department file, indexed by the parsed 'Date' column.
sales = [pd.read_csv(f, index_col='Date', parse_dates=True) for f in file_names]

# Place the frames side by side; the department becomes the outer column level.
february = pd.concat(sales, axis=1, keys=departments)
february

Unnamed: 0_level_0,Hardware,Hardware,Hardware,Software,Software,Software,Service,Service,Service
Unnamed: 0_level_1,Company,Product,Units,Company,Product,Units,Company,Product,Units
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2015-02-02 08:33:01,,,,Hooli,Software,3.0,,,
2015-02-02 20:54:49,Mediacore,Hardware,9.0,,,,,,
2015-02-03 14:14:18,,,,Initech,Software,13.0,,,
2015-02-04 15:36:29,,,,Streeplex,Software,13.0,,,
2015-02-04 21:52:45,Acme Coporation,Hardware,14.0,,,,,,
2015-02-05 01:53:06,,,,Acme Coporation,Software,19.0,,,
2015-02-05 22:05:03,,,,,,,Hooli,Service,10.0
2015-02-07 22:58:10,Acme Coporation,Hardware,1.0,,,,,,
2015-02-09 08:57:30,,,,,,,Streeplex,Service,19.0
2015-02-09 13:09:55,,,,Mediacore,Software,7.0,,,


In [41]:
# Summarise february's index, columns, non-null counts and dtypes.
# DataFrame.info() prints the report itself and returns None, so wrapping it
# in print() only adds a stray "None" line after the report.
february.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 20 entries, 2015-02-02 08:33:01 to 2015-02-26 08:58:51
Data columns (total 9 columns):
(Hardware, Company)    5 non-null object
(Hardware, Product)    5 non-null object
(Hardware, Units)      5 non-null float64
(Software, Company)    9 non-null object
(Software, Product)    9 non-null object
(Software, Units)      9 non-null float64
(Service, Company)     6 non-null object
(Service, Product)     6 non-null object
(Service, Units)       6 non-null float64
dtypes: float64(3), object(6)
memory usage: 2.2+ KB
None


In [42]:
# Assign pd.IndexSlice: idx
idx = pd.IndexSlice

# Extract a slice called slice_2_8 from february (using .loc[] & idx) that comprises
# rows from Feb. 2, 2015 through Feb. 8, 2015 and, under every outer column
# label, only the inner-level 'Company' column.
slice_2_8 = february.loc['2015-02-02':'2015-02-08', idx[:, 'Company']]
slice_2_8

Unnamed: 0_level_0,Hardware,Software,Service
Unnamed: 0_level_1,Company,Company,Company
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2015-02-02 08:33:01,,Hooli,
2015-02-02 20:54:49,Mediacore,,
2015-02-03 14:14:18,,Initech,
2015-02-04 15:36:29,,Streeplex,
2015-02-04 21:52:45,Acme Coporation,,
2015-02-05 01:53:06,,Acme Coporation,
2015-02-05 22:05:03,,,Hooli
2015-02-07 22:58:10,Acme Coporation,,


**Aggregate the sum of all sales over the 'Company' column into a single DataFrame**.

In [51]:
jan = pd.read_csv('./data/Sales/sales-jan-2015.csv', index_col='Date', parse_dates=True)
feb = pd.read_csv('./data/Sales/sales-feb-2015.csv', index_col='Date', parse_dates=True)
mar = pd.read_csv('./data/Sales/sales-mar-2015.csv', index_col='Date', parse_dates=True)

In [52]:
# Make the list of (month label, DataFrame) tuples: month_list
month_list = [('january', jan), ('february', feb), ('march', mar)]

# Group each month's data by company and total it: month_dict.
# numeric_only=True restricts the sum to numeric columns; older pandas dropped
# text columns from a grouped sum silently, newer releases no longer do, so
# being explicit keeps the result to the numeric totals on any version.
month_dict = {
    month_name: month_data.groupby('Company').sum(numeric_only=True)
    for month_name, month_data in month_list
}

# Concatenate the per-month totals; the dict keys become the outer index level
pd.concat(month_dict)

Unnamed: 0_level_0,Unnamed: 1_level_0,Units
Unnamed: 0_level_1,Company,Unnamed: 2_level_1
february,Acme Coporation,34
february,Hooli,30
february,Initech,30
february,Mediacore,45
february,Streeplex,37
january,Acme Coporation,76
january,Hooli,70
january,Initech,37
january,Mediacore,15
january,Streeplex,50
