# GroupBy

### 1. Intro to GroupBy Module

In [1]:
import pandas as pd

<b>NOTE:</b> We already have a Rank column, which can serve as a pretty effective index, so I'm going to use it to overwrite the default pandas index.

In [25]:
# Load the Fortune 1000 data, using the existing "Rank" column as the index
# instead of the default integer index.
fortune = pd.read_csv("../00_Datasers/fortune1000.csv", index_col = "Rank")
# Group the rows by their "Sector" value.
sectors = fortune.groupby("Sector")
fortune.head(3)

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Walmart,Retailing,General Merchandisers,"Bentonville, AR",482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,"Irving, TX",246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment","Cupertino, CA",233715,53394,110000


In [8]:
# The object returned by read_csv is a regular DataFrame.
type(fortune)

pandas.core.frame.DataFrame

In [9]:
# groupby() does not return a DataFrame; it returns a separate GroupBy object.
type(sectors)

pandas.core.groupby.DataFrameGroupBy

### 2. The .groupby() Method

<b>NOTE:</b> If we pass a DataFrame object to Python's built-in len function, we get the number of rows.

In [10]:
# Row count of the full DataFrame.
len(fortune)

1000

<b>NOTE:</b> If we pass a DataFrameGroupBy object to Python's built-in len function, we get the number of groups.

In [11]:
# Number of groups (distinct sectors), not the number of rows.
len(sectors)

21

In [12]:
# Cross-check: the count of distinct "Sector" values equals the number of groups.
fortune["Sector"].nunique()

21

<b>NOTE:</b> The size method shows how many rows fall into each group.

In [14]:
# Row count per sector, listed by sector name.
sectors.size()

Sector
Aerospace & Defense              20
Apparel                          15
Business Services                51
Chemicals                        30
Energy                          122
Engineering & Construction       26
Financials                      139
Food and Drug Stores             15
Food, Beverages & Tobacco        43
Health Care                      75
Hotels, Resturants & Leisure     25
Household Products               28
Industrials                      46
Materials                        43
Media                            25
Motor Vehicles & Parts           24
Retailing                        80
Technology                      102
Telecommunications               15
Transportation                   36
Wholesalers                      40
dtype: int64

In [15]:
# Same per-sector counts as sectors.size(), but ordered from the largest count
# to the smallest instead of by sector name.
fortune["Sector"].value_counts()

Financials                      139
Energy                          122
Technology                      102
Retailing                        80
Health Care                      75
Business Services                51
Industrials                      46
Materials                        43
Food, Beverages & Tobacco        43
Wholesalers                      40
Transportation                   36
Chemicals                        30
Household Products               28
Engineering & Construction       26
Media                            25
Hotels, Resturants & Leisure     25
Motor Vehicles & Parts           24
Aerospace & Defense              20
Apparel                          15
Telecommunications               15
Food and Drug Stores             15
Name: Sector, dtype: int64

In [16]:
# First entry of every column within each sector; the grouping column itself
# becomes the index of the result.
sectors.first()

Unnamed: 0_level_0,Company,Industry,Location,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aerospace & Defense,Boeing,Aerospace and Defense,"Chicago, IL",96114,5176,161400
Apparel,Nike,Apparel,"Beaverton, OR",30601,3273,62600
Business Services,ManpowerGroup,Temporary Help,"Milwaukee, WI",19330,419,27000
Chemicals,Dow Chemical,Chemicals,"Midland, MI",48778,7685,49495
Energy,Exxon Mobil,Petroleum Refining,"Irving, TX",246204,16150,75600
Engineering & Construction,Fluor,"Engineering, Construction","Irving, TX",18114,413,38758
Financials,Berkshire Hathaway,Insurance: Property and Casualty (Stock),"Omaha, NE",210821,24083,331000
Food and Drug Stores,CVS Health,Food and Drug Stores,"Woonsocket, RI",153290,5237,199000
"Food, Beverages & Tobacco",Archer Daniels Midland,Food Production,"Chicago, IL",67702,1849,32300
Health Care,McKesson,Wholesalers: Health Care,"San Francisco, CA",181241,1476,70400


In [26]:
# Mapping from each group key (sector name) to the index labels (Rank values)
# of the rows that belong to that group.
sectors.groups

{'Aerospace & Defense': Int64Index([ 24,  45,  60,  88, 118, 120, 209, 245, 282, 378, 389, 490, 560,
             605, 785, 788, 836, 903, 958, 987],
            dtype='int64', name='Rank'),
 'Apparel': Int64Index([91, 231, 340, 354, 448, 547, 575, 597, 683, 695, 726, 794, 877,
             882, 917],
            dtype='int64', name='Rank'),
 'Business Services': Int64Index([144, 186, 199, 204, 221, 248, 249, 294, 307, 312, 355, 392, 404,
             440, 467, 468, 481, 485, 492, 503, 545, 626, 635, 652, 677, 694,
             714, 729, 734, 735, 737, 744, 767, 776, 777, 783, 791, 792, 796,
             801, 803, 816, 819, 820, 869, 870, 886, 939, 951, 952, 993],
            dtype='int64', name='Rank'),
 'Chemicals': Int64Index([ 56, 101, 182, 189, 206, 253, 262, 277, 288, 296, 316, 538, 549,
             555, 566, 580, 613, 624, 654, 668, 717, 720, 724, 758, 761, 829,
             865, 898, 934, 949],
            dtype='int64', name='Rank'),
 'Energy': Int64Index([  2,  14,  30,  32,

In [27]:
# Look up a row by its index label: Rank 24 is the first label listed for
# 'Aerospace & Defense' in sectors.groups.
fortune.loc[24]

Company                     Boeing
Sector         Aerospace & Defense
Industry     Aerospace and Defense
Location               Chicago, IL
Revenue                      96114
Profits                       5176
Employees                   161400
Name: 24, dtype: object

### 3. Retrieve A Group with the .get_group() Method

In [28]:
# Re-load the data (indexed by "Rank") and rebuild the per-sector grouping;
# these are the same statements used in section 1.
fortune = pd.read_csv("../00_Datasers/fortune1000.csv", index_col = "Rank")
sectors = fortune.groupby("Sector")
fortune.head(3)

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Walmart,Retailing,General Merchandisers,"Bentonville, AR",482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,"Irving, TX",246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment","Cupertino, CA",233715,53394,110000


In [29]:
# Extract all rows of one group; the result keeps every original column,
# including "Sector".
sectors.get_group("Energy")

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,Exxon Mobil,Energy,Petroleum Refining,"Irving, TX",246204,16150,75600
14,Chevron,Energy,Petroleum Refining,"San Ramon, CA",131118,4587,61500
30,Phillips 66,Energy,Petroleum Refining,"Houston, TX",87169,4227,14000
32,Valero Energy,Energy,Petroleum Refining,"San Antonio, TX",81824,3990,10103
42,Marathon Petroleum,Energy,Petroleum Refining,"Findlay, OH",64566,2852,45440
65,Energy Transfer Equity,Energy,Pipelines,"Dallas, TX",42126,1189,30078
90,ConocoPhillips,Energy,"Mining, Crude-Oil Production","Houston, TX",30935,-4428,15900
95,Exelon,Energy,Utilities: Gas and Electric,"Chicago, IL",29447,2269,29762
98,Tesoro,Energy,Petroleum Refining,"San Antonio, TX",28150,1540,6016
104,Enterprise Products Partners,Energy,Pipelines,"Houston, TX",27028,2521,6800


In [30]:
# Same lookup for the "Technology" group.
sectors.get_group("Technology")

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,Apple,Technology,"Computers, Office Equipment","Cupertino, CA",233715,53394,110000
18,Amazon.com,Technology,Internet Services and Retailing,"Seattle, WA",107006,596,230800
20,HP,Technology,"Computers, Office Equipment","Palo Alto, CA",103355,4554,287000
25,Microsoft,Technology,Computer Software,"Redmond, WA",93580,12193,118000
31,IBM,Technology,Information Technology Services,"Armonk, NY",82461,13190,411798
36,Alphabet,Technology,Internet Services and Retailing,"Mountain View, CA",74989,16348,61814
51,Intel,Technology,Semiconductors and Other Electronic Components,"Santa Clara, CA",55355,11420,107300
54,Cisco Systems,Technology,Network and Other Communications Equipment,"San Jose, CA",49161,8981,71833
77,Oracle,Technology,Computer Software,"Redwood City, CA",38226,9938,132000
110,Qualcomm,Technology,Network and Other Communications Equipment,"San Diego, CA",25281,5271,33000


In [31]:
# Same lookup for the "Apparel" group.
sectors.get_group("Apparel")

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
91,Nike,Apparel,Apparel,"Beaverton, OR",30601,3273,62600
231,VF,Apparel,Apparel,"Greensboro, NC",12377,1232,64000
340,PVH,Apparel,Apparel,"New York, NY",8020,572,26200
354,Ralph Lauren,Apparel,Apparel,"New York, NY",7620,702,20000
448,Hanesbrands,Apparel,Apparel,"Winston-Salem, NC",5732,429,65300
547,Levi Strauss,Apparel,Apparel,"San Francisco, CA",4495,209,12500
575,Coach,Apparel,Apparel,"New York, NY",4192,402,12950
597,Under Armour,Apparel,Apparel,"Baltimore, MD",3963,233,9600
683,Fossil Group,Apparel,Apparel,"Richardson, TX",3229,221,15100
695,Skechers U.S.A.,Apparel,Apparel,"Manhattan Beach, CA",3159,232,6400


### 4. Methods on the Groupby Object and DataFrame Columns

In [34]:
# Only the last expression of a cell is rendered, so the max/min results are
# computed and then discarded; run them in their own cells to inspect them.
sectors.max()
sectors.min()
# Sum only the numeric columns (Revenue, Profits, Employees). Without
# numeric_only=True, pandas >= 2.0 also "sums" the text columns by
# concatenating their strings.
sectors.sum(numeric_only=True)

Unnamed: 0_level_0,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aerospace & Defense,357940,28742,968057
Apparel,95968,8236,346397
Business Services,272195,28227,1361050
Chemicals,243897,22628,463651
Energy,1517809,-73447,1188927
Engineering & Construction,153983,5304,406708
Financials,2217159,260209,3359948
Food and Drug Stores,483769,16759,1395398
"Food, Beverages & Tobacco",555967,51417,1211632
Health Care,1614707,106114,2678289


In [35]:
# Cross-check one cell of the grouped sum: total "Profits" of the Apparel group.
sectors.get_group("Apparel")["Profits"].sum()

8236

In [36]:
# Per-sector average of the numeric columns. numeric_only=True is required on
# pandas >= 2.0, where mean() otherwise raises a TypeError on the text columns
# (Company, Industry, Location).
sectors.mean(numeric_only=True)

Unnamed: 0_level_0,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aerospace & Defense,17897.0,1437.1,48402.85
Apparel,6397.866667,549.066667,23093.133333
Business Services,5337.156863,553.470588,26687.254902
Chemicals,8129.9,754.266667,15455.033333
Energy,12441.057377,-602.02459,9745.303279
Engineering & Construction,5922.423077,204.0,15642.615385
Financials,15950.784173,1872.007194,24172.28777
Food and Drug Stores,32251.266667,1117.266667,93026.533333
"Food, Beverages & Tobacco",12929.465116,1195.744186,28177.488372
Health Care,21529.426667,1414.853333,35710.52


In [40]:
# Selecting a single column from the GroupBy object restricts the aggregation
# to that column. Only the last expression (mean of "Employees") is rendered;
# the three results before it are computed and discarded.
sectors["Revenue"].sum()
sectors["Employees"].sum()
sectors["Profits"].max()
sectors["Employees"].mean()

Sector
Aerospace & Defense             48402.850000
Apparel                         23093.133333
Business Services               26687.254902
Chemicals                       15455.033333
Energy                           9745.303279
Engineering & Construction      15642.615385
Financials                      24172.287770
Food and Drug Stores            93026.533333
Food, Beverages & Tobacco       28177.488372
Health Care                     35710.520000
Hotels, Resturants & Leisure    99369.800000
Household Products              23072.785714
Industrials                     33591.934783
Materials                       14840.069767
Media                           22012.560000
Motor Vehicles & Parts          45106.666667
Retailing                       77845.362500
Technology                      35087.735294
Telecommunications              55497.866667
Transportation                  42688.694444
Wholesalers                     13139.925000
Name: Employees, dtype: float64

### 5. Grouping by Multiple Columns

In [42]:
# Re-load the data, indexed by "Rank".
fortune = pd.read_csv("../00_Datasers/fortune1000.csv", index_col = "Rank")
# Group by two columns: each distinct (Sector, Industry) pair becomes a group.
# Note that `sectors` now refers to this two-level grouping.
sectors = fortune.groupby(["Sector", "Industry"])
fortune.head(3)

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Walmart,Retailing,General Merchandisers,"Bentonville, AR",482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,"Irving, TX",246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment","Cupertino, CA",233715,53394,110000


In [46]:
# Aggregations on the (Sector, Industry) grouping; results are indexed by both
# keys. Only the last expression (mean of "Employees") is rendered; the three
# results before it are computed and discarded.
sectors.size()
sectors.sum()
sectors["Revenue"].sum()
sectors["Employees"].mean()

Sector                      Industry                                      
Aerospace & Defense         Aerospace and Defense                              48402.850000
Apparel                     Apparel                                            23093.133333
Business Services           Advertising, marketing                             62050.000000
                            Diversified Outsourcing Services                   50595.000000
                            Education                                          15585.000000
                            Financial Data Services                            13943.473684
                            Miscellaneous                                      12573.333333
                            Temporary Help                                     12004.000000
                            Waste Management                                   23839.800000
Chemicals                   Chemicals                                          15455.033333
Energ

### 6. The .agg() Method

In [47]:
# Re-load the data (indexed by "Rank") and go back to grouping by "Sector" only.
fortune = pd.read_csv("../00_Datasers/fortune1000.csv", index_col = "Rank")
sectors = fortune.groupby("Sector")
fortune.head(3)

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Walmart,Retailing,General Merchandisers,"Bentonville, AR",482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,"Irving, TX",246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment","Cupertino, CA",233715,53394,110000


In [48]:
# Map each column to the aggregation that should be applied to it:
# totals for Revenue and Profits, an average for Employees.
column_aggregations = {
    "Revenue": "sum",
    "Profits": "sum",
    "Employees": "mean",
}
sectors.agg(column_aggregations)

Unnamed: 0_level_0,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aerospace & Defense,357940,28742,48402.85
Apparel,95968,8236,23093.133333
Business Services,272195,28227,26687.254902
Chemicals,243897,22628,15455.033333
Energy,1517809,-73447,9745.303279
Engineering & Construction,153983,5304,15642.615385
Financials,2217159,260209,24172.28777
Food and Drug Stores,483769,16759,93026.533333
"Food, Beverages & Tobacco",555967,51417,28177.488372
Health Care,1614707,106114,35710.52


In [49]:
# Apply several aggregations to every numeric column at once; the result has
# two column levels (column name, aggregation name). The numeric columns are
# selected explicitly because pandas >= 2.0 no longer drops the text columns
# automatically and raises when asked for their "mean".
sectors[["Revenue", "Profits", "Employees"]].agg(["size", "sum", "mean"])

Unnamed: 0_level_0,Revenue,Revenue,Revenue,Profits,Profits,Profits,Employees,Employees,Employees
Unnamed: 0_level_1,size,sum,mean,size,sum,mean,size,sum,mean
Sector,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Aerospace & Defense,20,357940,17897.0,20,28742,1437.1,20,968057,48402.85
Apparel,15,95968,6397.866667,15,8236,549.066667,15,346397,23093.133333
Business Services,51,272195,5337.156863,51,28227,553.470588,51,1361050,26687.254902
Chemicals,30,243897,8129.9,30,22628,754.266667,30,463651,15455.033333
Energy,122,1517809,12441.057377,122,-73447,-602.02459,122,1188927,9745.303279
Engineering & Construction,26,153983,5922.423077,26,5304,204.0,26,406708,15642.615385
Financials,139,2217159,15950.784173,139,260209,1872.007194,139,3359948,24172.28777
Food and Drug Stores,15,483769,32251.266667,15,16759,1117.266667,15,1395398,93026.533333
"Food, Beverages & Tobacco",43,555967,12929.465116,43,51417,1195.744186,43,1211632,28177.488372
Health Care,75,1614707,21529.426667,75,106114,1414.853333,75,2678289,35710.52


### 7. Iterating through Groups

In [50]:
# Re-load the data (indexed by "Rank") and rebuild the per-sector grouping.
fortune = pd.read_csv("../00_Datasers/fortune1000.csv", index_col = "Rank")
sectors = fortune.groupby("Sector")
fortune.head(3)

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Walmart,Retailing,General Merchandisers,"Bentonville, AR",482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,"Irving, TX",246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment","Cupertino, CA",233715,53394,110000


In [53]:
# Start from an empty frame that has the same columns as `fortune`.
df = pd.DataFrame(columns = fortune.columns)

In [57]:
# Iterating over a GroupBy object yields (group key, group DataFrame) pairs.
# Collect the top-revenue row of every sector in a list and concatenate once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration. Rebuilding `df` from the list also makes the
# cell safe to re-run without duplicating rows.
top_rows_by_sector = []
for sector, data in sectors:
    highest_revenue_company_in_group = data.nlargest(1, "Revenue")
    top_rows_by_sector.append(highest_revenue_company_in_group)

df = pd.concat(top_rows_by_sector)

In [58]:
df

Unnamed: 0,Company,Sector,Industry,Location,Revenue,Profits,Employees
24,Boeing,Aerospace & Defense,Aerospace and Defense,"Chicago, IL",96114,5176,161400
91,Nike,Apparel,Apparel,"Beaverton, OR",30601,3273,62600
144,ManpowerGroup,Business Services,Temporary Help,"Milwaukee, WI",19330,419,27000
56,Dow Chemical,Chemicals,Chemicals,"Midland, MI",48778,7685,49495
2,Exxon Mobil,Energy,Petroleum Refining,"Irving, TX",246204,16150,75600
155,Fluor,Engineering & Construction,"Engineering, Construction","Irving, TX",18114,413,38758
4,Berkshire Hathaway,Financials,Insurance: Property and Casualty (Stock),"Omaha, NE",210821,24083,331000
7,CVS Health,Food and Drug Stores,Food and Drug Stores,"Woonsocket, RI",153290,5237,199000
41,Archer Daniels Midland,"Food, Beverages & Tobacco",Food Production,"Chicago, IL",67702,1849,32300
5,McKesson,Health Care,Wholesalers: Health Care,"San Francisco, CA",181241,1476,70400


In [59]:
# Group by headquarters location this time, and reset `df` to an empty frame
# with the same columns as `fortune`.
cities = fortune.groupby("Location")
df = pd.DataFrame(columns = fortune.columns)

In [60]:
# Same pattern per location: take the single highest-revenue row of each city,
# collect the rows in a list, and concatenate once. DataFrame.append was
# removed in pandas 2.0, and rebuilding `df` from the list keeps the cell safe
# to re-run without duplicating rows.
top_rows_by_city = []
for city, data in cities:
    highest_revenue_in_city = data.nlargest(1, "Revenue")
    top_rows_by_city.append(highest_revenue_in_city)

df = pd.concat(top_rows_by_city)

In [61]:
df

Unnamed: 0,Company,Sector,Industry,Location,Revenue,Profits,Employees
138,Abbott Laboratories,Health Care,Medical Products and Equipment,"Abbott Park, IL",20661,4423,74000
169,Goodyear Tire & Rubber,Motor Vehicles & Parts,Motor Vehicles and Parts,"Akron, OH",16443,307,66000
288,Air Products & Chemicals,Chemicals,Chemicals,"Allentown, PA",9895,1278,19550
830,Benchmark Electronics,Technology,Semiconductors and Other Electronic Components,"Angleton, TX",2541,95,10500
374,Casey’s General Stores,Retailing,Specialty Retailers: Other,"Ankeny, IA",7052,181,22408
915,Domino’s Pizza,"Hotels, Resturants & Leisure",Food Services,"Ann Arbor, MI",2217,193,11900
596,Colfax,Industrials,Industrial Machinery,"Annapolis Junction, MD",3967,168,17087
215,Land O’Lakes,"Food, Beverages & Tobacco",Food Consumer Products,"Arden Hills, MN",13161,308,10000
190,AES,Energy,Utilities: Gas and Electric,"Arlington, VA",14963,306,21000
31,IBM,Technology,Information Technology Services,"Armonk, NY",82461,13190,411798
