# Creating New Measures

Sometimes a dataset is missing key metrics or measures that would be useful for an analysis. Joining these measures in from other datasets is one option, but if the underlying data (the ingredients) is already in the existing dataset, we can derive the new measures with functions/formulas.

## The Dataset...

The data used in these examples is dummy data.

It was developed from a combination of Wikipedia pages and randomly generated numbers.

Wiki Pages:

#### Products

- https://en.wikipedia.org/wiki/List_of_culinary_fruits
- https://en.wikipedia.org/wiki/List_of_vegetables

#### Store Names (random suburbs in Sydney)

- https://en.wikipedia.org/wiki/List_of_Sydney_suburbs

In [1]:
# Import the dependencies

import pandas as pd
import numpy as np

import warnings
warnings.simplefilter('ignore')

In [2]:
# Read the Excel dataset into a DataFrame

sales_data = pd.read_excel(io=r'../Data/SalesDataset.xlsx')

# Preview the first rows of the data ("the head")
sales_data.head()

Unnamed: 0,Date,Campaign_ID,Customer_Group,Store_ID,Store_Name,Product_Category,Product_Group,Product,Product_ID,Units,Gross_Sales,Discount
0,31/12/2019,1000000.0,A Market That's Super,2001,Berowra Creek,Fruit,Tropical Fruit,Hydnora abyssinica,1100057,990,29.7,0.5
1,30/04/2020,,Super Super Market,1012,Bardia,Fruit,Tropical Fruit,Salak,1100094,630,0.0,0.498927
2,31/07/2020,,Market,3000,Blackett,Fruit,Tropical Fruit,Kola nut,1100062,671,1241.35,0.494303
3,31/10/2020,,A Market That's Super,2011,Bilgola Beach,Fruit,Tropical Fruit,Jackfruit,1100060,611,1283.1,0.493447
4,31/10/2020,,A Market That's Super,2006,Beverly Hills,Fruit,Tropical Fruit,Terap,1100107,684,1026.0,0.492293


## Derive Net Sales from Gross_Sales and Discount Fields

In [3]:
# Net Sales = Gross Sales x (1 - Discount), derived from 2 existing columns

sales_data["Net_Sales"] = sales_data["Gross_Sales"].mul(1 - sales_data["Discount"])

sales_data.head()

Unnamed: 0,Date,Campaign_ID,Customer_Group,Store_ID,Store_Name,Product_Category,Product_Group,Product,Product_ID,Units,Gross_Sales,Discount,Net_Sales
0,31/12/2019,1000000.0,A Market That's Super,2001,Berowra Creek,Fruit,Tropical Fruit,Hydnora abyssinica,1100057,990,29.7,0.5,14.85
1,30/04/2020,,Super Super Market,1012,Bardia,Fruit,Tropical Fruit,Salak,1100094,630,0.0,0.498927,0.0
2,31/07/2020,,Market,3000,Blackett,Fruit,Tropical Fruit,Kola nut,1100062,671,1241.35,0.494303,627.74688
3,31/10/2020,,A Market That's Super,2011,Bilgola Beach,Fruit,Tropical Fruit,Jackfruit,1100060,611,1283.1,0.493447,649.957952
4,31/10/2020,,A Market That's Super,2006,Beverly Hills,Fruit,Tropical Fruit,Terap,1100107,684,1026.0,0.492293,520.907713


In [4]:
# Gross Price per unit = Gross Sales / Units

sales_data["Gross_Price"] = sales_data["Gross_Sales"].div(sales_data["Units"])
sales_data.head()

Unnamed: 0,Date,Campaign_ID,Customer_Group,Store_ID,Store_Name,Product_Category,Product_Group,Product,Product_ID,Units,Gross_Sales,Discount,Net_Sales,Gross_Price
0,31/12/2019,1000000.0,A Market That's Super,2001,Berowra Creek,Fruit,Tropical Fruit,Hydnora abyssinica,1100057,990,29.7,0.5,14.85,0.03
1,30/04/2020,,Super Super Market,1012,Bardia,Fruit,Tropical Fruit,Salak,1100094,630,0.0,0.498927,0.0,0.0
2,31/07/2020,,Market,3000,Blackett,Fruit,Tropical Fruit,Kola nut,1100062,671,1241.35,0.494303,627.74688,1.85
3,31/10/2020,,A Market That's Super,2011,Bilgola Beach,Fruit,Tropical Fruit,Jackfruit,1100060,611,1283.1,0.493447,649.957952,2.1
4,31/10/2020,,A Market That's Super,2006,Beverly Hills,Fruit,Tropical Fruit,Terap,1100107,684,1026.0,0.492293,520.907713,1.5


In [5]:
# Tropical Fruit attracts a rebate that we have to pay (3%)

def tropical_fruit_rebate(Product_Group):
    """Return the rebate rate for a Product Group: 0.03 for "Tropical Fruit", otherwise 0."""
    return 0.03 if Product_Group == "Tropical Fruit" else 0
    
# Rebate amount = per-row rebate rate (looked up from Product_Group) x Gross Sales
sales_data["rebates"] = sales_data["Product_Group"].apply(tropical_fruit_rebate).mul(sales_data["Gross_Sales"])

sales_data.head()

Unnamed: 0,Date,Campaign_ID,Customer_Group,Store_ID,Store_Name,Product_Category,Product_Group,Product,Product_ID,Units,Gross_Sales,Discount,Net_Sales,Gross_Price,rebates
0,31/12/2019,1000000.0,A Market That's Super,2001,Berowra Creek,Fruit,Tropical Fruit,Hydnora abyssinica,1100057,990,29.7,0.5,14.85,0.03,0.891
1,30/04/2020,,Super Super Market,1012,Bardia,Fruit,Tropical Fruit,Salak,1100094,630,0.0,0.498927,0.0,0.0,0.0
2,31/07/2020,,Market,3000,Blackett,Fruit,Tropical Fruit,Kola nut,1100062,671,1241.35,0.494303,627.74688,1.85,37.2405
3,31/10/2020,,A Market That's Super,2011,Bilgola Beach,Fruit,Tropical Fruit,Jackfruit,1100060,611,1283.1,0.493447,649.957952,2.1,38.493
4,31/10/2020,,A Market That's Super,2006,Beverly Hills,Fruit,Tropical Fruit,Terap,1100107,684,1026.0,0.492293,520.907713,1.5,30.78


In [6]:
# Net Price per unit = Net Sales / Units

sales_data["Net_Price"] = sales_data["Net_Sales"].div(sales_data["Units"])
sales_data.head()

Unnamed: 0,Date,Campaign_ID,Customer_Group,Store_ID,Store_Name,Product_Category,Product_Group,Product,Product_ID,Units,Gross_Sales,Discount,Net_Sales,Gross_Price,rebates,Net_Price
0,31/12/2019,1000000.0,A Market That's Super,2001,Berowra Creek,Fruit,Tropical Fruit,Hydnora abyssinica,1100057,990,29.7,0.5,14.85,0.03,0.891,0.015
1,30/04/2020,,Super Super Market,1012,Bardia,Fruit,Tropical Fruit,Salak,1100094,630,0.0,0.498927,0.0,0.0,0.0,0.0
2,31/07/2020,,Market,3000,Blackett,Fruit,Tropical Fruit,Kola nut,1100062,671,1241.35,0.494303,627.74688,1.85,37.2405,0.935539
3,31/10/2020,,A Market That's Super,2011,Bilgola Beach,Fruit,Tropical Fruit,Jackfruit,1100060,611,1283.1,0.493447,649.957952,2.1,38.493,1.063761
4,31/10/2020,,A Market That's Super,2006,Beverly Hills,Fruit,Tropical Fruit,Terap,1100107,684,1026.0,0.492293,520.907713,1.5,30.78,0.761561


In [7]:
# Alternative: compute Net Price straight from the source columns,
# without going through the intermediate Net_Sales column

sales_data["Net_Price_2"] = sales_data["Gross_Sales"].mul(1 - sales_data["Discount"]).div(sales_data["Units"])
sales_data.head()

Unnamed: 0,Date,Campaign_ID,Customer_Group,Store_ID,Store_Name,Product_Category,Product_Group,Product,Product_ID,Units,Gross_Sales,Discount,Net_Sales,Gross_Price,rebates,Net_Price,Net_Price_2
0,31/12/2019,1000000.0,A Market That's Super,2001,Berowra Creek,Fruit,Tropical Fruit,Hydnora abyssinica,1100057,990,29.7,0.5,14.85,0.03,0.891,0.015,0.015
1,30/04/2020,,Super Super Market,1012,Bardia,Fruit,Tropical Fruit,Salak,1100094,630,0.0,0.498927,0.0,0.0,0.0,0.0,0.0
2,31/07/2020,,Market,3000,Blackett,Fruit,Tropical Fruit,Kola nut,1100062,671,1241.35,0.494303,627.74688,1.85,37.2405,0.935539,0.935539
3,31/10/2020,,A Market That's Super,2011,Bilgola Beach,Fruit,Tropical Fruit,Jackfruit,1100060,611,1283.1,0.493447,649.957952,2.1,38.493,1.063761,1.063761
4,31/10/2020,,A Market That's Super,2006,Beverly Hills,Fruit,Tropical Fruit,Terap,1100107,684,1026.0,0.492293,520.907713,1.5,30.78,0.761561,0.761561


## More advanced methods...

In [8]:
# Each Customer Group generates a different administrative cost for us

# List the distinct Customer Groups present in the data...

customer_groups = sales_data["Customer_Group"]
customer_groups.unique()



array(["A Market That's Super", 'Super Super Market', 'Market',
       'Not So Super Market'], dtype=object)

#### The business briefs us and says:

- Super Super Market = Assume 1.5% admin cost on Gross Sales
- Not So Super Market = Assume 1.7% admin cost on Gross Sales
- A Market That's Super = Assume 1.3% admin cost on Gross Sales
- Market = Assume 1.15% admin cost on Gross Sales

In [9]:
# Let's create the function

def find_admin_costs(Customer_Group):
    """Return the admin cost rate (a fraction of Gross Sales) for a Customer Group.

    Any group that is not one of the four listed below gets a rate of 0.
    """
    rates_by_group = (
        ("Super Super Market", 0.015),
        ("Not So Super Market", 0.017),
        ("A Market That's Super", 0.013),
        ("Market", 0.0115),
    )
    for group_name, rate in rates_by_group:
        if Customer_Group == group_name:
            return rate
    return 0
    
# Admin cost = per-row admin rate (looked up from Customer_Group) x Gross Sales
sales_data["Admin_Costs"] = sales_data["Customer_Group"].apply(find_admin_costs).mul(sales_data["Gross_Sales"])

sales_data.head(n=100)


Unnamed: 0,Date,Campaign_ID,Customer_Group,Store_ID,Store_Name,Product_Category,Product_Group,Product,Product_ID,Units,Gross_Sales,Discount,Net_Sales,Gross_Price,rebates,Net_Price,Net_Price_2,Admin_Costs
0,31/12/2019,1000000.0,A Market That's Super,2001,Berowra Creek,Fruit,Tropical Fruit,Hydnora abyssinica,1100057,990,29.70,0.500000,14.850000,0.03,0.8910,0.015000,0.015000,0.386100
1,30/04/2020,,Super Super Market,1012,Bardia,Fruit,Tropical Fruit,Salak,1100094,630,0.00,0.498927,0.000000,0.00,0.0000,0.000000,0.000000,0.000000
2,31/07/2020,,Market,3000,Blackett,Fruit,Tropical Fruit,Kola nut,1100062,671,1241.35,0.494303,627.746880,1.85,37.2405,0.935539,0.935539,14.275525
3,31/10/2020,,A Market That's Super,2011,Bilgola Beach,Fruit,Tropical Fruit,Jackfruit,1100060,611,1283.10,0.493447,649.957952,2.10,38.4930,1.063761,1.063761,16.680300
4,31/10/2020,,A Market That's Super,2006,Beverly Hills,Fruit,Tropical Fruit,Terap,1100107,684,1026.00,0.492293,520.907713,1.50,30.7800,0.761561,0.761561,13.338000
5,29/02/2020,,A Market That's Super,2003,Berowra Waters,Fruit,Tropical Fruit,Ooray,1100082,685,938.45,0.491922,476.805579,1.37,28.1535,0.696067,0.696067,12.199850
6,31/07/2020,,A Market That's Super,2000,Berowra,Fruit,Tropical Fruit,Passiflora platyloba,1100084,756,3054.24,0.490601,1555.828168,4.04,91.6272,2.057974,2.057974,39.705120
7,31/08/2020,,Super Super Market,1001,Balgowlah,Fruit,Tropical Fruit,Wild jack,1100112,691,608.08,0.489313,310.538493,0.88,18.2424,0.449404,0.449404,9.121200
8,31/12/2019,1000000.0,Super Super Market,1009,Bankstown Aerodrome,Fruit,Tropical Fruit,Ice-cream bean,1100058,998,0.00,0.500000,0.000000,0.00,0.0000,0.000000,0.000000,0.000000
9,30/11/2020,,A Market That's Super,2000,Berowra,Fruit,Tropical Fruit,Mammee,1100069,685,2664.65,0.485816,1370.119351,3.89,79.9395,2.000174,2.000174,34.640450


### Business Forgets to tell you about an exception to this logic...

Customer Group "Super Super Market" doesn't charge us admin fees for the stores "Bangor" and "Bankstown Aerodrome", because we negotiated a deal with them (this is fictional, remember?).

In [10]:
# Build a combined "<Customer Group> - <Store Name>" key column

sales_data["Group_Store_Concat"] = sales_data["Customer_Group"].add(" - ").add(sales_data["Store_Name"])
sales_data.head()

Unnamed: 0,Date,Campaign_ID,Customer_Group,Store_ID,Store_Name,Product_Category,Product_Group,Product,Product_ID,Units,Gross_Sales,Discount,Net_Sales,Gross_Price,rebates,Net_Price,Net_Price_2,Admin_Costs,Group_Store_Concat
0,31/12/2019,1000000.0,A Market That's Super,2001,Berowra Creek,Fruit,Tropical Fruit,Hydnora abyssinica,1100057,990,29.7,0.5,14.85,0.03,0.891,0.015,0.015,0.3861,A Market That's Super - Berowra Creek
1,30/04/2020,,Super Super Market,1012,Bardia,Fruit,Tropical Fruit,Salak,1100094,630,0.0,0.498927,0.0,0.0,0.0,0.0,0.0,0.0,Super Super Market - Bardia
2,31/07/2020,,Market,3000,Blackett,Fruit,Tropical Fruit,Kola nut,1100062,671,1241.35,0.494303,627.74688,1.85,37.2405,0.935539,0.935539,14.275525,Market - Blackett
3,31/10/2020,,A Market That's Super,2011,Bilgola Beach,Fruit,Tropical Fruit,Jackfruit,1100060,611,1283.1,0.493447,649.957952,2.1,38.493,1.063761,1.063761,16.6803,A Market That's Super - Bilgola Beach
4,31/10/2020,,A Market That's Super,2006,Beverly Hills,Fruit,Tropical Fruit,Terap,1100107,684,1026.0,0.492293,520.907713,1.5,30.78,0.761561,0.761561,13.338,A Market That's Super - Beverly Hills


In [11]:
def exceptions(Concat_Store_Name):
    """Return 0 for the two exempt "group - store" keys, otherwise 1.

    The result is meant to be used as a multiplier: 0 cancels a cost,
    1 leaves it unchanged.
    """
    exempt_keys = (
        "Super Super Market - Bangor",
        "Super Super Market - Bankstown Aerodrome",
    )
    return 0 if Concat_Store_Name in exempt_keys else 1


# Per-row multiplier: 0 where the group/store pair is exempt, 1 everywhere else.
# Held in a standalone Series so no temporary column has to be added and dropped.
admin_fee_multiplier = sales_data["Group_Store_Concat"].apply(exceptions)

# Zero out the admin cost on the exempt rows
sales_data["Admin_CostsV2"] = sales_data["Admin_Costs"].mul(admin_fee_multiplier)


sales_data.head(n=100)

Unnamed: 0,Date,Campaign_ID,Customer_Group,Store_ID,Store_Name,Product_Category,Product_Group,Product,Product_ID,Units,Gross_Sales,Discount,Net_Sales,Gross_Price,rebates,Net_Price,Net_Price_2,Admin_Costs,Group_Store_Concat,Admin_CostsV2
0,31/12/2019,1000000.0,A Market That's Super,2001,Berowra Creek,Fruit,Tropical Fruit,Hydnora abyssinica,1100057,990,29.70,0.500000,14.850000,0.03,0.8910,0.015000,0.015000,0.386100,A Market That's Super - Berowra Creek,0.386100
1,30/04/2020,,Super Super Market,1012,Bardia,Fruit,Tropical Fruit,Salak,1100094,630,0.00,0.498927,0.000000,0.00,0.0000,0.000000,0.000000,0.000000,Super Super Market - Bardia,0.000000
2,31/07/2020,,Market,3000,Blackett,Fruit,Tropical Fruit,Kola nut,1100062,671,1241.35,0.494303,627.746880,1.85,37.2405,0.935539,0.935539,14.275525,Market - Blackett,14.275525
3,31/10/2020,,A Market That's Super,2011,Bilgola Beach,Fruit,Tropical Fruit,Jackfruit,1100060,611,1283.10,0.493447,649.957952,2.10,38.4930,1.063761,1.063761,16.680300,A Market That's Super - Bilgola Beach,16.680300
4,31/10/2020,,A Market That's Super,2006,Beverly Hills,Fruit,Tropical Fruit,Terap,1100107,684,1026.00,0.492293,520.907713,1.50,30.7800,0.761561,0.761561,13.338000,A Market That's Super - Beverly Hills,13.338000
5,29/02/2020,,A Market That's Super,2003,Berowra Waters,Fruit,Tropical Fruit,Ooray,1100082,685,938.45,0.491922,476.805579,1.37,28.1535,0.696067,0.696067,12.199850,A Market That's Super - Berowra Waters,12.199850
6,31/07/2020,,A Market That's Super,2000,Berowra,Fruit,Tropical Fruit,Passiflora platyloba,1100084,756,3054.24,0.490601,1555.828168,4.04,91.6272,2.057974,2.057974,39.705120,A Market That's Super - Berowra,39.705120
7,31/08/2020,,Super Super Market,1001,Balgowlah,Fruit,Tropical Fruit,Wild jack,1100112,691,608.08,0.489313,310.538493,0.88,18.2424,0.449404,0.449404,9.121200,Super Super Market - Balgowlah,9.121200
8,31/12/2019,1000000.0,Super Super Market,1009,Bankstown Aerodrome,Fruit,Tropical Fruit,Ice-cream bean,1100058,998,0.00,0.500000,0.000000,0.00,0.0000,0.000000,0.000000,0.000000,Super Super Market - Bankstown Aerodrome,0.000000
9,30/11/2020,,A Market That's Super,2000,Berowra,Fruit,Tropical Fruit,Mammee,1100069,685,2664.65,0.485816,1370.119351,3.89,79.9395,2.000174,2.000174,34.640450,A Market That's Super - Berowra,34.640450


## Let's try the above in a more programmatic way...

In [12]:
def find_admin_costs(Customer_Group):
    """Return the admin cost rate (a fraction of Gross Sales) for a Customer Group.

    Any group that is not one of the four handled below gets a rate of 0.
    """
    if Customer_Group == "Super Super Market":
        return 0.015
    elif Customer_Group == "Not So Super Market":
        return 0.017
    elif Customer_Group == "A Market That's Super":
        return 0.013
    elif Customer_Group == "Market":
        return 0.0115
    else:
        return 0


def prog_admin_cost(col):
    """Return the admin cost for a single row of the sales data.

    ``col`` is one row (as passed by ``DataFrame.apply(..., axis=1)``) and must
    provide the keys "Customer_Group", "Store_Name" and "Gross_Sales".

    Rows for Customer Group "Super Super Market" at the stores "Bangor" or
    "Bankstown Aerodrome" are exempt and cost 0. Every other row costs its
    group's admin rate multiplied by Gross Sales.
    """
    # Both store names must be tied to the Customer Group check. `and` binds
    # tighter than `or`, so an ungrouped "group and store_a or store_b" would
    # exempt "Bankstown Aerodrome" for EVERY customer group.
    is_exempt = (
        col["Customer_Group"] == "Super Super Market"
        and col["Store_Name"] in ("Bangor", "Bankstown Aerodrome")
    )
    if is_exempt:
        return 0
    return find_admin_costs(col["Customer_Group"]) * col["Gross_Sales"]
    
# Apply the row-wise admin cost function to every row of the dataset
sales_data["Admin_CostsV3"] = sales_data.apply(prog_admin_cost, axis="columns")
sales_data.head(n=5)

Unnamed: 0,Date,Campaign_ID,Customer_Group,Store_ID,Store_Name,Product_Category,Product_Group,Product,Product_ID,Units,...,Discount,Net_Sales,Gross_Price,rebates,Net_Price,Net_Price_2,Admin_Costs,Group_Store_Concat,Admin_CostsV2,Admin_CostsV3
0,31/12/2019,1000000.0,A Market That's Super,2001,Berowra Creek,Fruit,Tropical Fruit,Hydnora abyssinica,1100057,990,...,0.5,14.85,0.03,0.891,0.015,0.015,0.3861,A Market That's Super - Berowra Creek,0.3861,0.3861
1,30/04/2020,,Super Super Market,1012,Bardia,Fruit,Tropical Fruit,Salak,1100094,630,...,0.498927,0.0,0.0,0.0,0.0,0.0,0.0,Super Super Market - Bardia,0.0,0.0
2,31/07/2020,,Market,3000,Blackett,Fruit,Tropical Fruit,Kola nut,1100062,671,...,0.494303,627.74688,1.85,37.2405,0.935539,0.935539,14.275525,Market - Blackett,14.275525,14.275525
3,31/10/2020,,A Market That's Super,2011,Bilgola Beach,Fruit,Tropical Fruit,Jackfruit,1100060,611,...,0.493447,649.957952,2.1,38.493,1.063761,1.063761,16.6803,A Market That's Super - Bilgola Beach,16.6803,16.6803
4,31/10/2020,,A Market That's Super,2006,Beverly Hills,Fruit,Tropical Fruit,Terap,1100107,684,...,0.492293,520.907713,1.5,30.78,0.761561,0.761561,13.338,A Market That's Super - Beverly Hills,13.338,13.338
