# Grouping your data


In [1]:
import warnings
warnings.simplefilter('ignore', FutureWarning)

import matplotlib
matplotlib.rcParams['axes.grid'] = True # show gridlines by default
%matplotlib inline

import pandas as pd
import pandas_datareader as pdr

In [2]:
if pd.__version__.startswith('0.23'):
    # This solves an incompatibility between pandas 0.23 and datareader 0.6
    # by re-exposing `is_list_like` under `pandas.core.common`.
    # Taken from https://stackoverflow.com/questions/50394873/
    # The attributes must be reached through `pd`: bare `core` / `api` are
    # not defined in this notebook and would raise NameError.
    # NOTE(review): the first cell already runs `import pandas_datareader`;
    # on an affected install this patch may need to run before that import
    # -- confirm.
    pd.core.common.is_list_like = pd.api.types.is_list_like

from pandas_datareader.wb import download

In [3]:
# Show the help text for `download` (IPython `?` syntax, not plain Python).
?download

In [4]:
# Last year of the request; the request starts five years earlier.
YEAR = 2013
# Indicator code passed to the World Bank reader (`pandas_datareader.wb`).
GDP_INDICATOR = 'NY.GDP.MKTP.CD'

# Fetch the indicator for the two countries, then move the index levels
# into ordinary columns so they can be used as grouping keys.
gdp = download(
    indicator=GDP_INDICATOR,
    country=['GB', 'CN'],
    start=YEAR - 5,
    end=YEAR,
).reset_index()
gdp

Unnamed: 0,country,year,NY.GDP.MKTP.CD
0,China,2013,9570406000000.0
1,China,2012,8532230000000.0
2,China,2011,7551500000000.0
3,China,2010,6087164000000.0
4,China,2009,5101703000000.0
5,China,2008,4594307000000.0
6,United Kingdom,2013,2783251000000.0
7,United Kingdom,2012,2704017000000.0
8,United Kingdom,2011,2659882000000.0
9,United Kingdom,2010,2481580000000.0


In [5]:
# Total of the indicator per country across all downloaded years.
# The aggregation is named by string: passing the builtin `sum` to
# aggregate() is deprecated in recent pandas releases.
gdp.groupby('country')[GDP_INDICATOR].aggregate('sum')

country
China             4.143731e+13
United Kingdom    1.596255e+13
Name: NY.GDP.MKTP.CD, dtype: float64

In [6]:
# Total of the indicator per year across both countries.
# The aggregation is named by string: passing the builtin `sum` to
# aggregate() is deprecated in recent pandas releases.
gdp.groupby('year')[GDP_INDICATOR].aggregate('sum')

year
2008    7.515739e+12
2009    7.514093e+12
2010    8.568743e+12
2011    1.021138e+13
2012    1.123625e+13
2013    1.235366e+13
Name: NY.GDP.MKTP.CD, dtype: float64

In [7]:
# Local CSV file that the milk trade data is read from.
LOCATION = 'comtrade_milk_uk_monthly_14.csv'

In [8]:
# Alternative source: uncomment to read directly from this UN Comtrade API
# URL instead of the local file (requires network access).
# LOCATION = 'http://comtrade.un.org/api/get?max=5000&type=C&freq=M&px=HS&ps=2014&r=826&p=all&rg=1%2C2&cc=0401%2C0402&fmt=csv'

In [9]:
# Read the code columns as text so that values such as '0401' keep their
# leading zero.
milk = pd.read_csv(
    LOCATION,
    dtype={'Commodity Code': str, 'Reporter Code': str},
)
milk.head(3)

Unnamed: 0,Classification,Year,Period,Period Desc.,Aggregate Level,Is Leaf Code,Trade Flow Code,Trade Flow,Reporter Code,Reporter,...,Qty,Alt Qty Unit Code,Alt Qty Unit,Alt Qty,Netweight (kg),Gross weight (kg),Trade Value (US$),CIF Trade Value (US$),FOB Trade Value (US$),Flag
0,HS,2014,201401,January 2014,4,0,1,Imports,826,United Kingdom,...,,,,,22404316,,21950747,,,0
1,HS,2014,201401,January 2014,4,0,2,Exports,826,United Kingdom,...,,,,,60497363,,46923551,,,0
2,HS,2014,201401,January 2014,4,0,2,Exports,826,United Kingdom,...,,,,,2520,,3410,,,0


In [10]:
# Columns kept for the rest of the analysis; everything else is dropped.
COLUMNS = [
    'Year',
    'Period',
    'Trade Flow',
    'Reporter',
    'Partner',
    'Commodity',
    'Commodity Code',
    'Trade Value (US$)',
]
milk = milk[COLUMNS]

In [11]:
# Separate the rows whose partner is 'World' from the per-country rows.
milk_world = milk[milk['Partner'].eq('World')]
milk_countries = milk[milk['Partner'].ne('World')]

In [12]:
# Save the per-country rows without writing the row index as a column.
milk_countries.to_csv(path_or_buf='countrymilk.csv', index=False)

In [13]:
# Read the saved file back to check that it loads as expected.
load_test = pd.read_csv(
    'countrymilk.csv',
    dtype={'Commodity Code': str, 'Reporter Code': str},
)
load_test.head(2)

Unnamed: 0,Year,Period,Trade Flow,Reporter,Partner,Commodity,Commodity Code,Trade Value (US$)
0,2014,201401,Exports,United Kingdom,Afghanistan,"Milk and cream, neither concentrated nor sweet...",401,3410
1,2014,201401,Exports,United Kingdom,Austria,"Milk and cream, neither concentrated nor sweet...",401,316


In [14]:
# Import-only views of the full, per-country and 'World' tables.
milk_imports = milk[milk['Trade Flow'].eq('Imports')]
milk_countries_imports = milk_countries[milk_countries['Trade Flow'].eq('Imports')]
milk_world_imports = milk_world[milk_world['Trade Flow'].eq('Imports')]

In [15]:
# Per-country imports for period 201401, ten largest trade values first.
milkImportsInJanuary2014 = milk_countries_imports[
    milk_countries_imports['Period'].eq(201401)
]
milkImportsInJanuary2014.sort_values(by='Trade Value (US$)', ascending=False).head(10)

Unnamed: 0,Year,Period,Trade Flow,Reporter,Partner,Commodity,Commodity Code,Trade Value (US$)
23,2014,201401,Imports,United Kingdom,Ireland,"Milk and cream, neither concentrated nor sweet...",401,10676138
626,2014,201401,Imports,United Kingdom,France,"Milk and cream, concentrated or sweetened",402,8020014
637,2014,201401,Imports,United Kingdom,Ireland,"Milk and cream, concentrated or sweetened",402,5966962
650,2014,201401,Imports,United Kingdom,Netherlands,"Milk and cream, concentrated or sweetened",402,4650774
629,2014,201401,Imports,United Kingdom,Germany,"Milk and cream, concentrated or sweetened",402,4545873
4,2014,201401,Imports,United Kingdom,Belgium,"Milk and cream, neither concentrated nor sweet...",401,4472349
612,2014,201401,Imports,United Kingdom,Belgium,"Milk and cream, concentrated or sweetened",402,3584038
10,2014,201401,Imports,United Kingdom,Denmark,"Milk and cream, neither concentrated nor sweet...",401,2233438
667,2014,201401,Imports,United Kingdom,Spain,"Milk and cream, concentrated or sweetened",402,1850097
15,2014,201401,Imports,United Kingdom,France,"Milk and cream, neither concentrated nor sweet...",401,1522872


# Make sure you run all the cells above!

## Grouping data

On many occasions, a dataframe may be organised as groups of rows where the group membership is identified based on cell values within one or more 'key' columns. **Grouping** refers to the process whereby rows associated with a particular group are collated so that you can work with just those rows as distinct subsets of the whole dataset.

The number of groups the dataframe will be split into is based on the number of unique values identified within a single key column, or the number of unique combinations of values for two or more key columns.

The `groupby()` method runs down each row in a data frame, splitting the rows into separate groups based on the unique values associated with the key column or columns.

The following is an example of the steps and code needed to split a dataframe. 

### Grouping the data

Split the data into two different subsets of data (imports and exports), by grouping on trade flow.

In [16]:
# One group per distinct value in the 'Trade Flow' column.
groups = milk_countries.groupby(by='Trade Flow')

Inspect the first few rows associated with a particular group:

In [17]:
# Preview the first five rows of the 'Imports' group.
groups.get_group('Imports').head(n=5)

Unnamed: 0,Year,Period,Trade Flow,Reporter,Partner,Commodity,Commodity Code,Trade Value (US$)
4,2014,201401,Imports,United Kingdom,Belgium,"Milk and cream, neither concentrated nor sweet...",401,4472349
10,2014,201401,Imports,United Kingdom,Denmark,"Milk and cream, neither concentrated nor sweet...",401,2233438
15,2014,201401,Imports,United Kingdom,France,"Milk and cream, neither concentrated nor sweet...",401,1522872
17,2014,201401,Imports,United Kingdom,Germany,"Milk and cream, neither concentrated nor sweet...",401,1028700
23,2014,201401,Imports,United Kingdom,Ireland,"Milk and cream, neither concentrated nor sweet...",401,10676138


As well as grouping on a single term, you can create groups based on multiple columns by passing in several column names as a list. For example, generate groups based on commodity code *and* trade flow, and then preview the keys used to define the groups.

In [18]:
# Key columns: one group per (commodity code, trade flow) combination.
GROUPING_COMMFLOW = ['Commodity Code', 'Trade Flow']

groups = milk_countries.groupby(by=GROUPING_COMMFLOW)
# The group keys are tuples with one value per key column.
groups.groups.keys()

dict_keys([('0401', 'Exports'), ('0401', 'Imports'), ('0402', 'Exports'), ('0402', 'Imports')])

Retrieve a group based on multiple group levels by passing in a tuple that specifies a value for each index column. For example, if a grouping is based on the `'Partner'` and `'Trade Flow'` columns, the argument of `get_group` has to be a partner/flow pair, like `('France', 'Imports')`, to get all rows associated with imports from France.

In [19]:
# Key columns, and the (partner, flow) pair identifying the group to fetch.
GROUPING_PARTNERFLOW = ['Partner', 'Trade Flow']
GROUP_PARTNERFLOW = ('France', 'Imports')

groups = milk_countries.groupby(by=GROUPING_PARTNERFLOW)
groups.get_group(GROUP_PARTNERFLOW)

Unnamed: 0,Year,Period,Trade Flow,Reporter,Partner,Commodity,Commodity Code,Trade Value (US$)
15,2014,201401,Imports,United Kingdom,France,"Milk and cream, neither concentrated nor sweet...",401,1522872
68,2014,201402,Imports,United Kingdom,France,"Milk and cream, neither concentrated nor sweet...",401,1444455
120,2014,201403,Imports,United Kingdom,France,"Milk and cream, neither concentrated nor sweet...",401,1414291
171,2014,201404,Imports,United Kingdom,France,"Milk and cream, neither concentrated nor sweet...",401,1912257
223,2014,201405,Imports,United Kingdom,France,"Milk and cream, neither concentrated nor sweet...",401,1638838
273,2014,201406,Imports,United Kingdom,France,"Milk and cream, neither concentrated nor sweet...",401,1449614
327,2014,201407,Imports,United Kingdom,France,"Milk and cream, neither concentrated nor sweet...",401,2096771
370,2014,201408,Imports,United Kingdom,France,"Milk and cream, neither concentrated nor sweet...",401,1474883
416,2014,201409,Imports,United Kingdom,France,"Milk and cream, neither concentrated nor sweet...",401,1259777
466,2014,201410,Imports,United Kingdom,France,"Milk and cream, neither concentrated nor sweet...",401,1483422


To find the leading partner for a particular commodity, group by commodity, get the desired group, and then sort the result.

In [20]:
# Group on the column name itself rather than a one-element list: recent
# pandas releases deprecate passing a bare string to get_group() after
# grouping by a list of length one.
groups = milk_countries.groupby('Commodity Code')
# Rows for commodity '0402', largest trade value first.
groups.get_group('0402').sort_values('Trade Value (US$)', ascending=False).head()

Unnamed: 0,Year,Period,Trade Flow,Reporter,Partner,Commodity,Commodity Code,Trade Value (US$)
954,2014,201406,Exports,United Kingdom,Algeria,"Milk and cream, concentrated or sweetened",402,22411564
880,2014,201405,Exports,United Kingdom,Algeria,"Milk and cream, concentrated or sweetened",402,19656679
811,2014,201404,Exports,United Kingdom,Algeria,"Milk and cream, concentrated or sweetened",402,14875816
841,2014,201404,Exports,United Kingdom,Ireland,"Milk and cream, concentrated or sweetened",402,11712344
773,2014,201403,Exports,United Kingdom,Ireland,"Milk and cream, concentrated or sweetened",402,11015471


### Task

Using your own data set from Exercise 1, try to group the data in a variety of ways, finding the most significant trade partner in each case:

- by commodity, or commodity code
- by trade flow, commodity and year.

In [21]:
# Local CSV file that the flower trade data is read from.
LOCATION = 'comtrade_flowers_kenya_monthly_20.csv'

In [22]:
# Alternative source: uncomment to read directly from this UN Comtrade API
# URL instead of the local file (requires network access).
# NOTE(review): the query also requests commodity code 0402 and has a doubled
# slash before `api` -- confirm whether both are intended.
# LOCATION = 'http://comtrade.un.org//api/get?max=5000&type=C&freq=M&px=HS&ps=2020&r=404&p=all&rg=1%2C2&cc=0603%2C0402&fmt=csv'

In [23]:
# Read the code columns as text so that values such as '0603' keep their
# leading zero.
flowers = pd.read_csv(
    LOCATION,
    dtype={'Commodity Code': str, 'Reporter Code': str},
)
flowers.head(3)

Unnamed: 0,Classification,Year,Period,Period Desc.,Aggregate Level,Is Leaf Code,Trade Flow Code,Trade Flow,Reporter Code,Reporter,...,Qty,Alt Qty Unit Code,Alt Qty Unit,Alt Qty,Netweight (kg),Gross weight (kg),Trade Value (US$),CIF Trade Value (US$),FOB Trade Value (US$),Flag
0,HS,2020,202008,August 2020,4,0,1,Imports,404,Kenya,...,,,,,22,,69,,,0
1,HS,2020,202006,June 2020,4,0,2,Exports,404,Kenya,...,,,,,8386230,,33686647,,,0
2,HS,2020,202006,June 2020,4,0,2,Exports,404,Kenya,...,,,,,412,,936,,,0


In [24]:
# Columns kept for the rest of the analysis; everything else is dropped.
COLUMNS = [
    'Year',
    'Period',
    'Trade Flow',
    'Reporter',
    'Partner',
    'Commodity',
    'Commodity Code',
    'Trade Value (US$)',
]
flowers = flowers[COLUMNS]

In [25]:
# Separate the rows whose partner is 'World' from the per-country rows.
flowers_world = flowers[flowers['Partner'].eq('World')]
flowers_countries = flowers[flowers['Partner'].ne('World')]

In [26]:
# Save the per-country rows without writing the row index as a column.
flowers_countries.to_csv(path_or_buf='countryflowers.csv', index=False)

In [27]:
# Read the saved file back to check that it loads as expected.
load_test = pd.read_csv(
    'countryflowers.csv',
    dtype={'Commodity Code': str, 'Reporter Code': str},
)
load_test.head(5)

Unnamed: 0,Year,Period,Trade Flow,Reporter,Partner,Commodity,Commodity Code,Trade Value (US$)
0,2020,202008,Imports,Kenya,Zambia,Flowers; cut flowers and flower buds of a kind...,603,69
1,2020,202006,Exports,Kenya,Rwanda,Flowers; cut flowers and flower buds of a kind...,603,936
2,2020,202006,Exports,Kenya,United States of America,Flowers; cut flowers and flower buds of a kind...,603,64186
3,2020,202006,Exports,Kenya,Switzerland,Flowers; cut flowers and flower buds of a kind...,603,811991
4,2020,202006,Exports,Kenya,France,Flowers; cut flowers and flower buds of a kind...,603,390820


In [28]:
# Read the same file again, this time decoding it explicitly as ISO-8859-1.
load_test = pd.read_csv(
    'countryflowers.csv',
    dtype={'Commodity Code': str},
    encoding="ISO-8859-1",
)
load_test.head()

Unnamed: 0,Year,Period,Trade Flow,Reporter,Partner,Commodity,Commodity Code,Trade Value (US$)
0,2020,202008,Imports,Kenya,Zambia,Flowers; cut flowers and flower buds of a kind...,603,69
1,2020,202006,Exports,Kenya,Rwanda,Flowers; cut flowers and flower buds of a kind...,603,936
2,2020,202006,Exports,Kenya,United States of America,Flowers; cut flowers and flower buds of a kind...,603,64186
3,2020,202006,Exports,Kenya,Switzerland,Flowers; cut flowers and flower buds of a kind...,603,811991
4,2020,202006,Exports,Kenya,France,Flowers; cut flowers and flower buds of a kind...,603,390820


In [29]:
# Import-only views of the full, per-country and 'World' tables.
flowers_imports = flowers[flowers['Trade Flow'].eq('Imports')]
flowers_countries_imports = flowers_countries[flowers_countries['Trade Flow'].eq('Imports')]
flowers_world_imports = flowers_world[flowers_world['Trade Flow'].eq('Imports')]

In [30]:
# Per-country imports for period 202001, ten largest trade values first.
flowersImportsInJanuary2020 = flowers_countries_imports[
    flowers_countries_imports['Period'].eq(202001)
]
flowersImportsInJanuary2020.sort_values(by='Trade Value (US$)', ascending=False).head(10)

Unnamed: 0,Year,Period,Trade Flow,Reporter,Partner,Commodity,Commodity Code,Trade Value (US$)
893,2020,202001,Imports,Kenya,Netherlands,Flowers; cut flowers and flower buds of a kind...,603,2202


In [31]:
# Export-only views of the full, per-country and 'World' tables.
flowers_exports = flowers[flowers['Trade Flow'].eq('Exports')]
flowers_countries_exports = flowers_countries[flowers_countries['Trade Flow'].eq('Exports')]
flowers_world_exports = flowers_world[flowers_world['Trade Flow'].eq('Exports')]

In [32]:
# Per-country exports for period 202002, ten largest trade values first.
flowersExportsInFebruary2020 = flowers_countries_exports[
    flowers_countries_exports['Period'].eq(202002)
]
flowersExportsInFebruary2020.sort_values(by='Trade Value (US$)', ascending=False).head(10)

Unnamed: 0,Year,Period,Trade Flow,Reporter,Partner,Commodity,Commodity Code,Trade Value (US$)
737,2020,202002,Exports,Kenya,Netherlands,Flowers; cut flowers and flower buds of a kind...,603,30593127
774,2020,202002,Exports,Kenya,United Kingdom,Flowers; cut flowers and flower buds of a kind...,603,13994026
729,2020,202002,Exports,Kenya,Russian Federation,Flowers; cut flowers and flower buds of a kind...,603,3150555
741,2020,202002,Exports,Kenya,Germany,Flowers; cut flowers and flower buds of a kind...,603,3131723
692,2020,202002,Exports,Kenya,Norway,Flowers; cut flowers and flower buds of a kind...,603,2627742
702,2020,202002,Exports,Kenya,Australia,Flowers; cut flowers and flower buds of a kind...,603,2108650
769,2020,202002,Exports,Kenya,United Arab Emirates,Flowers; cut flowers and flower buds of a kind...,603,2005308
731,2020,202002,Exports,Kenya,Saudi Arabia,Flowers; cut flowers and flower buds of a kind...,603,1809338
690,2020,202002,Exports,Kenya,Switzerland,Flowers; cut flowers and flower buds of a kind...,603,987509
693,2020,202002,Exports,Kenya,France,Flowers; cut flowers and flower buds of a kind...,603,921073


In [33]:
# One group per distinct value in the 'Trade Flow' column.
groups = flowers_countries.groupby(by='Trade Flow')

In [34]:
# Preview the first five rows of the 'Exports' group.
groups.get_group('Exports').head(n=5)

Unnamed: 0,Year,Period,Trade Flow,Reporter,Partner,Commodity,Commodity Code,Trade Value (US$)
2,2020,202006,Exports,Kenya,Rwanda,Flowers; cut flowers and flower buds of a kind...,603,936
3,2020,202006,Exports,Kenya,United States of America,Flowers; cut flowers and flower buds of a kind...,603,64186
4,2020,202006,Exports,Kenya,Switzerland,Flowers; cut flowers and flower buds of a kind...,603,811991
5,2020,202006,Exports,Kenya,France,Flowers; cut flowers and flower buds of a kind...,603,390820
6,2020,202006,Exports,Kenya,"Other Asia, nes",Flowers; cut flowers and flower buds of a kind...,603,912


In [35]:
# Key columns: one group per (commodity code, trade flow) combination.
GROUPING_COMMFLOW = ['Commodity Code', 'Trade Flow']

groups = flowers_countries.groupby(by=GROUPING_COMMFLOW)
# The group keys are tuples with one value per key column.
groups.groups.keys()

dict_keys([('0603', 'Exports'), ('0603', 'Imports')])

In [36]:
# Key columns, and the (partner, flow) pair identifying the group to fetch.
GROUPING_PARTNERFLOW = ['Partner', 'Trade Flow']
GROUP_PARTNERFLOW = ('United Kingdom', 'Exports')

groups = flowers_countries.groupby(by=GROUPING_PARTNERFLOW)
groups.get_group(GROUP_PARTNERFLOW)

Unnamed: 0,Year,Period,Trade Flow,Reporter,Partner,Commodity,Commodity Code,Trade Value (US$)
44,2020,202006,Exports,Kenya,United Kingdom,Flowers; cut flowers and flower buds of a kind...,603,7431869
144,2020,202012,Exports,Kenya,United Kingdom,Flowers; cut flowers and flower buds of a kind...,603,12087907
249,2020,202001,Exports,Kenya,United Kingdom,Flowers; cut flowers and flower buds of a kind...,603,10442336
316,2020,202008,Exports,Kenya,United Kingdom,Flowers; cut flowers and flower buds of a kind...,603,10160604
329,2020,202005,Exports,Kenya,United Kingdom,Flowers; cut flowers and flower buds of a kind...,603,7548314
441,2020,202010,Exports,Kenya,United Kingdom,Flowers; cut flowers and flower buds of a kind...,603,9728942
500,2020,202011,Exports,Kenya,United Kingdom,Flowers; cut flowers and flower buds of a kind...,603,9835308
550,2020,202007,Exports,Kenya,United Kingdom,Flowers; cut flowers and flower buds of a kind...,603,9322490
682,2020,202004,Exports,Kenya,United Kingdom,Flowers; cut flowers and flower buds of a kind...,603,6160190
774,2020,202002,Exports,Kenya,United Kingdom,Flowers; cut flowers and flower buds of a kind...,603,13994026


In [37]:
# Group on the column name itself rather than a one-element list: recent
# pandas releases deprecate passing a bare string to get_group() after
# grouping by a list of length one.
groups = flowers_countries.groupby('Commodity Code')
# Rows for commodity '0603', largest trade value first.
groups.get_group('0603').sort_values('Trade Value (US$)', ascending=False).head()

Unnamed: 0,Year,Period,Trade Flow,Reporter,Partner,Commodity,Commodity Code,Trade Value (US$)
737,2020,202002,Exports,Kenya,Netherlands,Flowers; cut flowers and flower buds of a kind...,603,30593127
92,2020,202012,Exports,Kenya,Netherlands,Flowers; cut flowers and flower buds of a kind...,603,27239029
451,2020,202010,Exports,Kenya,Netherlands,Flowers; cut flowers and flower buds of a kind...,603,25991010
226,2020,202001,Exports,Kenya,Netherlands,Flowers; cut flowers and flower buds of a kind...,603,25588432
816,2020,202003,Exports,Kenya,Netherlands,Flowers; cut flowers and flower buds of a kind...,603,23799740


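The Netherlands is Kenya's most significant trade partner for cut flowers: it is the largest destination for Kenya's flower exports, and it was also the only country from which Kenya imported flowers in January 2020.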