In [None]:
import pandas as pd

In [None]:
# Load the Fortune 1000 data set, using the "Rank" column as the row index.
fortune = pd.read_csv("data/fortune1000.csv", index_col="Rank")

# Group the rows by sector; this GroupBy object is reused by the cells below.
sectors = fortune.groupby("Sector")
# Preview the first rows of the loaded frame.
fortune.head()

## .groupby() method

In [None]:
# Number of groups in the GroupBy object and, as a cross-check, the number of
# distinct values in the "Sector" column. Both are returned as one tuple
# because a notebook cell only displays its last expression, so a bare
# `len(sectors)` in the middle of the cell would never be shown.
len(sectors), fortune["Sector"].nunique()

In [None]:
# Number of rows in each group, returned as a Series indexed by sector.
# groupby sorts the group keys by default, so sectors appear alphabetically.
sectors.size()

In [None]:
# Same per-sector counts as sectors.size(), but value_counts() orders the
# result by count (largest first) rather than by sector name.
fortune["Sector"].value_counts()

In [None]:
# First entry of every column within each group. Note: first() skips missing
# values column by column, so a result row matches the group's first row only
# when that row has no nulls.
sectors.first()

In [None]:
# Last entry of every column within each group. Note: last() skips missing
# values column by column, so a result row matches the group's last row only
# when that row has no nulls.
sectors.last()

In [None]:
# Dictionary mapping each group key (sector) to the index labels of the rows in
# that group; the labels are "Rank" values here because fortune is indexed by
# Rank when it is loaded.
sectors.groups

## Retrieve a group with the .get_group() method

In [None]:
# All rows belonging to the "Technology" group, returned as a DataFrame.
sectors.get_group("Technology")

## Methods on GroupBy Object and DataFrame Columns

In [None]:
# Maximum of every column within each group, computed column by column.
# String columns (e.g. Company) compare alphabetically, so the values in one
# result row generally come from different companies rather than a single row.
sectors.max()

In [None]:
# Minimum of every column within each group, evaluated independently per
# column (string columns compare alphabetically).
sectors.min()

In [None]:
# Sum the numeric columns within each group. numeric_only=True is needed on
# pandas >= 2.0, where the default would otherwise also "sum" (concatenate)
# the string columns.
sectors.sum(numeric_only=True)

In [None]:
# Mean of the numeric columns within each group. numeric_only=True keeps the
# string columns out of the calculation; pandas >= 2.0 raises a TypeError for
# them otherwise.
sectors.mean(numeric_only=True)

In [None]:
# Select one column from the GroupBy object, then aggregate: returns a Series
# holding the total revenue of each group.
sectors["Revenue"].sum()

In [None]:
# Average number of employees per group, returned as a Series.
sectors["Employees"].mean()

In [None]:
# Selecting a list of columns yields a DataFrame: per-group totals of
# Revenue and Profits.
sectors[["Revenue", "Profits"]].sum()

## Grouping by Multiple columns

In [None]:
# Group by two columns; the group keys are now (Sector, Industry) pairs.
# NOTE: this rebinds `sectors`, so the cells below operate on the two-level
# grouping until `sectors` is reassigned.
sectors = fortune.groupby(["Sector", "Industry"])

In [None]:
# Number of rows in each group of the current `sectors` grouping.
sectors.size()

In [None]:
# Sum the numeric columns within each group. numeric_only=True is needed on
# pandas >= 2.0, where the default would otherwise also "sum" (concatenate)
# the remaining string columns.
sectors.sum(numeric_only=True)

In [None]:
# Average number of employees in each group of the current `sectors` grouping.
sectors["Employees"].mean()

## .agg() method

In [None]:
# Rebind `sectors` to the single-column grouping by Sector.
sectors = fortune.groupby("Sector")
# Row count per sector.
sectors.size()

In [None]:
# Apply a different aggregation to each column: the dict maps a column name
# to the operation to run on it. Columns not listed are left out of the result.
sectors.agg({
    "Revenue": "sum",
    "Profits": "sum",
    "Employees": "mean"
})

In [None]:
# Apply several aggregations to several columns at once. The numeric columns
# are selected explicitly because "sum" and "mean" are not meaningful for the
# string columns, and pandas >= 2.0 raises a TypeError when asked to average
# them instead of silently dropping them.
sectors[["Revenue", "Profits", "Employees"]].agg(["size", "sum", "mean"])

In [None]:
# The two forms can be combined: a column may map to a list of operations.
# Here Revenue gets both its sum and its mean, while Profits and Employees
# get a single aggregation each.
sectors.agg({
    "Revenue": ["sum", "mean"],
    "Profits": "sum",
    "Employees": "mean"
})

## Iterating through groups

In [None]:
# Goal: iterate through the groups and build a DataFrame holding, for each
# group, the row with the maximum revenue (or profit).

# Empty DataFrame with the same columns as fortune; the next cell fills `df`
# with one row per group.
df = pd.DataFrame(columns = fortune.columns)
df

In [None]:
# Iterating a GroupBy object yields (group key, sub-DataFrame) pairs.
# Collect the top-revenue row of every sector and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop re-copies it on every iteration. Building `df` from scratch also makes
# this cell safe to re-run without duplicating rows.
top_rows = []
for sector, data in sectors:
    highest_revenue = data.nlargest(1, "Revenue")
    top_rows.append(highest_revenue)

df = pd.concat(top_rows)

In [None]:
# Display the collected rows: the highest-revenue company of each sector.
df

In [None]:
# Group the companies by their "Location" column.
cities = fortune.groupby("Location")
# Reset `df` to an empty frame with fortune's columns; the next cell fills it
# with one row per location.
df = pd.DataFrame(columns = fortune.columns)
df

In [None]:
# Iterating a GroupBy object yields (group key, sub-DataFrame) pairs.
# Collect the most profitable row of every location and concatenate once at
# the end: DataFrame.append was removed in pandas 2.0, and growing a frame
# inside the loop re-copies it on every iteration. Building `df` from scratch
# also makes this cell safe to re-run without duplicating rows.
top_rows = []
for city, data in cities:
    highest_profit = data.nlargest(1, "Profits")
    top_rows.append(highest_profit)

df = pd.concat(top_rows)

In [None]:
# Display the collected rows: the most profitable company of each location.
df