In [615]:
import pandas as pd
import calendar
import numpy as np

# Read in data

In [616]:
click_df = pd.read_csv("clicks_fact.csv")
click_df

Unnamed: 0,Date,site_id,brand_id,clicks,signups,ftds,cpa_income,rev_income
0,1/1/2021,1,4,71.00,77,40,898,395
1,1/2/2021,1,2,43.00,45,46,1220,1098
2,1/3/2021,1,2,37.00,69,40,1217,188
3,1/4/2021,1,1,64.00,76,16,162,413
4,1/5/2021,1,2,113.00,66,58,1108,1171
...,...,...,...,...,...,...,...,...
1539,7/8/2021,8,1,55.00,48,56,1591,1394
1540,7/9/2021,8,1,96.00,64,62,533,1502
1541,7/10/2021,8,2,72.00,79,12,1821,1711
1542,7/11/2021,8,1,92.00,59,28,1242,643


In [617]:
site_df = pd.read_csv("company_map.csv")
site_df

Unnamed: 0,site_name,site_id
0,195bet,1
1,adriaticmarine.no,2
2,antek.se,3
3,arisitum.org,4
4,bingobrickan.se,5
5,casinoistockholm.se,6
6,grattage.com,7
7,natcasino.nu,8


In [618]:
brand_df = pd.read_csv("brand_map.csv")
brand_df

Unnamed: 0,brand_name,brand_id
0,LeoVegas,1
1,Bet365,2
2,Betfair,3
3,Bertil,4


# Task 1: Create a new table with aggregated data per
- day
- site name
- brand name

## Do we have any duplicated dates? Yes Have 1351 duplicated dates, meaning that the clicks_df is un-aggregated
- Part two, group by the date, site_id, and brand_id, and create aggregated rows of the other data

In [619]:

click_df.Date.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1539     True
1540     True
1541     True
1542     True
1543     True
Name: Date, Length: 1544, dtype: bool

In [620]:
# Extract the duplicated rows
click_df.loc[click_df.Date.duplicated(), :]

Unnamed: 0,Date,site_id,brand_id,clicks,signups,ftds,cpa_income,rev_income
193,1/1/2021,2,1,45.00,71,23,1143,351
194,1/2/2021,2,1,102.00,87,58,1478,1492
195,1/3/2021,2,1,56.00,35,33,1588,1692
196,1/4/2021,2,4,99.00,39,64,590,1668
197,1/5/2021,2,2,91.00,55,16,1049,1668
...,...,...,...,...,...,...,...,...
1539,7/8/2021,8,1,55.00,48,56,1591,1394
1540,7/9/2021,8,1,96.00,64,62,533,1502
1541,7/10/2021,8,2,72.00,79,12,1821,1711
1542,7/11/2021,8,1,92.00,59,28,1242,643


In [621]:
click_df = click_df.fillna(0)

In [622]:
# per_day
daily_click_stats_df = click_df.groupby(['Date', "site_id", "brand_id"])["clicks", "signups", "ftds", 'cpa_income', 'rev_income'].apply(lambda x : x.astype(int).sum())
daily_click_stats_df

  daily_click_stats_df = click_df.groupby(['Date', "site_id", "brand_id"])["clicks", "signups", "ftds", 'cpa_income', 'rev_income'].apply(lambda x : x.astype(int).sum())


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,clicks,signups,ftds,cpa_income,rev_income
Date,site_id,brand_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1/1/2021,1,4,71,77,40,898,395
1/1/2021,2,1,45,71,23,1143,351
1/1/2021,3,1,100,42,39,1769,1057
1/1/2021,4,2,72,51,57,1395,641
1/1/2021,5,3,0,87,26,1876,964
...,...,...,...,...,...,...,...
7/9/2021,4,4,34,67,44,1926,400
7/9/2021,5,1,84,83,53,1144,116
7/9/2021,6,4,53,36,22,1413,1637
7/9/2021,7,1,106,61,23,888,1597


## Currently there is no data type for 'Date' because it is an index. The date needs to be converted to the data type "datetime" so we can use in in upcoming clensing. 
1.  Create a new index in the first colunn
2. Set the Date column to be a datetime data type.
3. Check that the data type changed


In [623]:
daily_click_stats_df = daily_click_stats_df.reset_index()
daily_click_stats_df

Unnamed: 0,Date,site_id,brand_id,clicks,signups,ftds,cpa_income,rev_income
0,1/1/2021,1,4,71,77,40,898,395
1,1/1/2021,2,1,45,71,23,1143,351
2,1/1/2021,3,1,100,42,39,1769,1057
3,1/1/2021,4,2,72,51,57,1395,641
4,1/1/2021,5,3,0,87,26,1876,964
...,...,...,...,...,...,...,...,...
1539,7/9/2021,4,4,34,67,44,1926,400
1540,7/9/2021,5,1,84,83,53,1144,116
1541,7/9/2021,6,4,53,36,22,1413,1637
1542,7/9/2021,7,1,106,61,23,888,1597


### This block of code shows the data type of Date is object

In [624]:
# Change date column to date data type
print(daily_click_stats_df.dtypes)

Date          object
site_id        int64
brand_id       int64
clicks         int64
signups        int64
ftds           int64
cpa_income     int64
rev_income     int64
dtype: object


In [625]:
daily_click_stats_df['Date'] = pd.to_datetime(daily_click_stats_df['Date'], format='%m/%d/%Y')
daily_click_stats_df

Unnamed: 0,Date,site_id,brand_id,clicks,signups,ftds,cpa_income,rev_income
0,2021-01-01,1,4,71,77,40,898,395
1,2021-01-01,2,1,45,71,23,1143,351
2,2021-01-01,3,1,100,42,39,1769,1057
3,2021-01-01,4,2,72,51,57,1395,641
4,2021-01-01,5,3,0,87,26,1876,964
...,...,...,...,...,...,...,...,...
1539,2021-07-09,4,4,34,67,44,1926,400
1540,2021-07-09,5,1,84,83,53,1144,116
1541,2021-07-09,6,4,53,36,22,1413,1637
1542,2021-07-09,7,1,106,61,23,888,1597


### Now the date has been converted to the correct date type, and the data is clensed enough to move onto the next section

In [626]:
print(daily_click_stats_df.dtypes)


Date          datetime64[ns]
site_id                int64
brand_id               int64
clicks                 int64
signups                int64
ftds                   int64
cpa_income             int64
rev_income             int64
dtype: object


# Step 2: Insert the following columns
1. Date (month)
2. site_name
3. brand_name
4. total_clicks (this has already been done, rename)
5. total_signups (this has already been done, rename)
6. total_ftds (this has already been done, rename)
7. total_cpa_income (this has already been done, rename)
8. total_revenue

In [627]:
# 1. Parse out month into a seperate column
daily_click_stats_df['month'] = pd.DatetimeIndex(daily_click_stats_df['Date']).month
#df['month'] = pd.DatetimeIndex(df['ArrivalDate']).month

daily_click_stats_df['month'] = daily_click_stats_df['month'].apply(lambda x: calendar.month_abbr[x])
daily_click_stats_df

Unnamed: 0,Date,site_id,brand_id,clicks,signups,ftds,cpa_income,rev_income,month
0,2021-01-01,1,4,71,77,40,898,395,Jan
1,2021-01-01,2,1,45,71,23,1143,351,Jan
2,2021-01-01,3,1,100,42,39,1769,1057,Jan
3,2021-01-01,4,2,72,51,57,1395,641,Jan
4,2021-01-01,5,3,0,87,26,1876,964,Jan
...,...,...,...,...,...,...,...,...,...
1539,2021-07-09,4,4,34,67,44,1926,400,Jul
1540,2021-07-09,5,1,84,83,53,1144,116,Jul
1541,2021-07-09,6,4,53,36,22,1413,1637,Jul
1542,2021-07-09,7,1,106,61,23,888,1597,Jul


In [628]:
#2 join site name
daily_click_stats_df2 = pd.merge(daily_click_stats_df, site_df, how="inner", on="site_id")
daily_click_stats_df2

Unnamed: 0,Date,site_id,brand_id,clicks,signups,ftds,cpa_income,rev_income,month,site_name
0,2021-01-01,1,4,71,77,40,898,395,Jan,195bet
1,2021-01-10,1,1,56,75,61,730,1709,Jan,195bet
2,2021-01-11,1,3,0,44,21,1782,809,Jan,195bet
3,2021-01-12,1,3,0,59,22,970,597,Jan,195bet
4,2021-01-13,1,4,67,67,32,1340,1528,Jan,195bet
...,...,...,...,...,...,...,...,...,...,...
1539,2021-07-05,8,1,47,35,16,1013,1248,Jul,natcasino.nu
1540,2021-07-06,8,4,111,73,62,181,1085,Jul,natcasino.nu
1541,2021-07-07,8,3,40,58,64,1603,1782,Jul,natcasino.nu
1542,2021-07-08,8,1,55,48,56,1591,1394,Jul,natcasino.nu


In [629]:
# 3 add brand name
daily_click_stats_df3 = pd.merge(daily_click_stats_df2, brand_df, how="inner", on="brand_id")
daily_click_stats_df3

Unnamed: 0,Date,site_id,brand_id,clicks,signups,ftds,cpa_income,rev_income,month,site_name,brand_name
0,2021-01-01,1,4,71,77,40,898,395,Jan,195bet,Bertil
1,2021-01-13,1,4,67,67,32,1340,1528,Jan,195bet,Bertil
2,2021-01-14,1,4,37,83,17,447,300,Jan,195bet,Bertil
3,2021-01-19,1,4,87,84,67,734,525,Jan,195bet,Bertil
4,2021-01-20,1,4,86,46,46,626,289,Jan,195bet,Bertil
...,...,...,...,...,...,...,...,...,...,...,...
1539,2021-06-13,8,2,44,60,18,1645,727,Jun,natcasino.nu,Bet365
1540,2021-06-14,8,2,80,55,65,104,937,Jun,natcasino.nu,Bet365
1541,2021-06-07,8,2,85,35,27,1393,1241,Jun,natcasino.nu,Bet365
1542,2021-07-10,8,2,72,79,12,1821,1711,Jul,natcasino.nu,Bet365


In [630]:
# 3 add total revenue column
daily_click_stats_df3["total_revenue"] = daily_click_stats_df3['cpa_income'] + daily_click_stats_df3['rev_income']
daily_click_stats_df3


Unnamed: 0,Date,site_id,brand_id,clicks,signups,ftds,cpa_income,rev_income,month,site_name,brand_name,total_revenue
0,2021-01-01,1,4,71,77,40,898,395,Jan,195bet,Bertil,1293
1,2021-01-13,1,4,67,67,32,1340,1528,Jan,195bet,Bertil,2868
2,2021-01-14,1,4,37,83,17,447,300,Jan,195bet,Bertil,747
3,2021-01-19,1,4,87,84,67,734,525,Jan,195bet,Bertil,1259
4,2021-01-20,1,4,86,46,46,626,289,Jan,195bet,Bertil,915
...,...,...,...,...,...,...,...,...,...,...,...,...
1539,2021-06-13,8,2,44,60,18,1645,727,Jun,natcasino.nu,Bet365,2372
1540,2021-06-14,8,2,80,55,65,104,937,Jun,natcasino.nu,Bet365,1041
1541,2021-06-07,8,2,85,35,27,1393,1241,Jun,natcasino.nu,Bet365,2634
1542,2021-07-10,8,2,72,79,12,1821,1711,Jul,natcasino.nu,Bet365,3532


In [631]:
# reorder columns
daily_click_stats_df3 = daily_click_stats_df3[["site_id", "brand_id", "Date", "month", "site_name", "brand_name", "clicks", "signups", "ftds", "cpa_income", "rev_income","total_revenue"]]
daily_click_stats_df3

Unnamed: 0,site_id,brand_id,Date,month,site_name,brand_name,clicks,signups,ftds,cpa_income,rev_income,total_revenue
0,1,4,2021-01-01,Jan,195bet,Bertil,71,77,40,898,395,1293
1,1,4,2021-01-13,Jan,195bet,Bertil,67,67,32,1340,1528,2868
2,1,4,2021-01-14,Jan,195bet,Bertil,37,83,17,447,300,747
3,1,4,2021-01-19,Jan,195bet,Bertil,87,84,67,734,525,1259
4,1,4,2021-01-20,Jan,195bet,Bertil,86,46,46,626,289,915
...,...,...,...,...,...,...,...,...,...,...,...,...
1539,8,2,2021-06-13,Jun,natcasino.nu,Bet365,44,60,18,1645,727,2372
1540,8,2,2021-06-14,Jun,natcasino.nu,Bet365,80,55,65,104,937,1041
1541,8,2,2021-06-07,Jun,natcasino.nu,Bet365,85,35,27,1393,1241,2634
1542,8,2,2021-07-10,Jul,natcasino.nu,Bet365,72,79,12,1821,1711,3532


# Rename colunmn
daily_click_stats_df3 = daily_click_stats_df3.rename(columns={'clicks': 'total_clicks', 'signups': 'total_signups', 'ftds':'total_ftds', 'cpa_income':'total_cpa_income', 'rev_income': 'total_rev_income'    })

daily_click_stats_df3

# Step 3: Create a table showing the bottom 4 sites in terms of Total Revenue
- Incluede 4 columns (Rank, Site Name, Total Revenue)

In [632]:
bottom_four_revenue_df = daily_click_stats_df3[['month', 'site_name', 'total_revenue']]
bottom_four_revenue_df

Unnamed: 0,month,site_name,total_revenue
0,Jan,195bet,1293
1,Jan,195bet,2868
2,Jan,195bet,747
3,Jan,195bet,1259
4,Jan,195bet,915
...,...,...,...
1539,Jun,natcasino.nu,2372
1540,Jun,natcasino.nu,1041
1541,Jun,natcasino.nu,2634
1542,Jul,natcasino.nu,3532


In [633]:
bottom_four_revenue_df2 = bottom_four_revenue_df.groupby(['month', "site_name"])['total_revenue'].apply(lambda x : x.astype(int).sum())
#daily_click_stats_df = click_df.groupby(['Date', "site_id", "brand_id"])["clicks", "signups", "ftds", 'cpa_income', 'rev_income'].apply(lambda x : x.astype(int).sum())
bottom_four_revenue_df2


month  site_name          
Apr    195bet                 60777
       adriaticmarine.no      59534
       antek.se               63318
       arisitum.org           55500
       bingobrickan.se        63118
       casinoistockholm.se    58321
       grattage.com           58170
       natcasino.nu           57864
Feb    195bet                 58865
       adriaticmarine.no      63341
       antek.se               55431
       arisitum.org           61635
       bingobrickan.se        57096
       casinoistockholm.se    60956
       grattage.com           58987
       natcasino.nu           54528
Jan    195bet                 57964
       adriaticmarine.no      59223
       antek.se               61001
       arisitum.org           61956
       bingobrickan.se        55977
       casinoistockholm.se    60583
       grattage.com           53658
       natcasino.nu           67331
Jul    195bet                 24484
       adriaticmarine.no      25958
       antek.se               21529
 

In [634]:
# now group by the company name and sum the total revenue to get the yearly sum for each company
bottom_four_revenue_df3 = bottom_four_revenue_df.groupby(["site_name"])['total_revenue'].apply(lambda x : x.astype(int).sum()).nsmallest(4).reset_index()
#
#daily_click_stats_df = click_df.groupby(['Date', "site_id", "brand_id"])["clicks", "signups", "ftds", 'cpa_income', 'rev_income'].apply(lambda x : x.astype(int).sum())
bottom_four_revenue_df3


Unnamed: 0,site_name,total_revenue
0,grattage.com,379945
1,adriaticmarine.no,385964
2,bingobrickan.se,387691
3,casinoistockholm.se,388929


In [635]:
bottom_four_revenue_df3 = bottom_four_revenue_df3.sort_values(by='total_revenue', ascending=False)
bottom_four_revenue_df3

Unnamed: 0,site_name,total_revenue
3,casinoistockholm.se,388929
2,bingobrickan.se,387691
1,adriaticmarine.no,385964
0,grattage.com,379945


]

In [636]:
# Add Rank Column
idx = 0
new_col = [5, 6, 7, 8]
bottom_four_revenue_df3.insert(loc=idx, column='Rank', value=new_col)

## I left the Euro symbol out, this can be added back in on the API

In [637]:
bottom_four_revenue_df3 = bottom_four_revenue_df3.rename(columns={'site_name': 'Site Name', 'total_revenue': 'Total Revenue'})
bottom_four_revenue_df3

Unnamed: 0,Rank,Site Name,Total Revenue
3,5,casinoistockholm.se,388929
2,6,bingobrickan.se,387691
1,7,adriaticmarine.no,385964
0,8,grattage.com,379945



# Question 3: 
Assuming the table from part 1 has grown for a couple of years and contains more than 100 million rows, how would you approach adding an additional column to the aggregated table and populate it with data from a new source?  
The new source also has site_id and brand_id available  

# Answer
I have provided two soloutions, each of the soloutions differs by locking the table in production so no transacactions can occur while we add the new column. This decision would need to be based on how much impact locking the table would have on the company as none of the users could make transactions during this time. Since RakeTech uses click and transaction data across multiple websites, I would lean towards soloution #2, which has no downtime for the users. 

## Soloution 1

1. Lock the table against concurent changes in share mode, but allow reads.
2. Copy the original table. The COPY command is optimized for loading large numbers of rows; it is less flexible than INSERT, but incurs significantly less overhead for large data loads. It's fastest to drop indexes in this step.
3. Populate data
4. Add indexes with ALTER TABLE as final step

#### This soloution requires that we lock the table until the new table can be populated. The original table must be locked in order to prevent transactions from taking place and impacting data validity. This could be an issue depending on how long it takes to create the table

## Soloution 2

Use the Percona Toolkit library
**pt-online-schema-change** emulates the way that MySQL alters tables internally, but it works on a copy of the table you wish to alter. This means that the original table is not locked, and clients may continue to read and change data in it.

**pt-online-schema-change** works by creating an empty copy of the table to alter, modifying it as desired, and then copying rows from the original table into the new table. When the copy is complete, it moves away the original table and replaces it with the new one. By default, it also drops the original table.

##### Code Example

```python
pt-online-schema-change --alter "ADD COLUMN c1 INT" D=daily_clicks,t=invalid_user
print s
```
 


# Question 4

If wanting to split the first table not only on brand and site, but also on country and visitor device (desktop/mobile) after it’s been added to raw source data, how would you approach it to let us break down the data also on these parameters? Can you think of other interesting ways to break down the data in different ways? 
As in the previous task assuming the table has grown for a couple of years, and also that the first table is already aggregated from multiple sources where the mentioned data is now available.


In [638]:
daily_click_stats_df

Unnamed: 0,Date,site_id,brand_id,clicks,signups,ftds,cpa_income,rev_income,month
0,2021-01-01,1,4,71,77,40,898,395,Jan
1,2021-01-01,2,1,45,71,23,1143,351,Jan
2,2021-01-01,3,1,100,42,39,1769,1057,Jan
3,2021-01-01,4,2,72,51,57,1395,641,Jan
4,2021-01-01,5,3,0,87,26,1876,964,Jan
...,...,...,...,...,...,...,...,...,...
1539,2021-07-09,4,4,34,67,44,1926,400,Jul
1540,2021-07-09,5,1,84,83,53,1144,116,Jul
1541,2021-07-09,6,4,53,36,22,1413,1637,Jul
1542,2021-07-09,7,1,106,61,23,888,1597,Jul


In [639]:
daily_click_stats_df_v2 = daily_click_stats_df.copy(deep=False)
daily_click_stats_df_v2

Unnamed: 0,Date,site_id,brand_id,clicks,signups,ftds,cpa_income,rev_income,month
0,2021-01-01,1,4,71,77,40,898,395,Jan
1,2021-01-01,2,1,45,71,23,1143,351,Jan
2,2021-01-01,3,1,100,42,39,1769,1057,Jan
3,2021-01-01,4,2,72,51,57,1395,641,Jan
4,2021-01-01,5,3,0,87,26,1876,964,Jan
...,...,...,...,...,...,...,...,...,...
1539,2021-07-09,4,4,34,67,44,1926,400,Jul
1540,2021-07-09,5,1,84,83,53,1144,116,Jul
1541,2021-07-09,6,4,53,36,22,1413,1637,Jul
1542,2021-07-09,7,1,106,61,23,888,1597,Jul


In [640]:
# add user ID
daily_click_stats_df_v2['user_id'] =  np.random.randint(0,5000, size=len(daily_click_stats_df_v2))
daily_click_stats_df_v2

Unnamed: 0,Date,site_id,brand_id,clicks,signups,ftds,cpa_income,rev_income,month,user_id
0,2021-01-01,1,4,71,77,40,898,395,Jan,217
1,2021-01-01,2,1,45,71,23,1143,351,Jan,1352
2,2021-01-01,3,1,100,42,39,1769,1057,Jan,1889
3,2021-01-01,4,2,72,51,57,1395,641,Jan,141
4,2021-01-01,5,3,0,87,26,1876,964,Jan,2567
...,...,...,...,...,...,...,...,...,...,...
1539,2021-07-09,4,4,34,67,44,1926,400,Jul,4360
1540,2021-07-09,5,1,84,83,53,1144,116,Jul,4687
1541,2021-07-09,6,4,53,36,22,1413,1637,Jul,2463
1542,2021-07-09,7,1,106,61,23,888,1597,Jul,3627


In [641]:
# Add Device_id
daily_click_stats_df_v2['device_id'] =  np.random.randint(2, size=len(daily_click_stats_df_v2))
daily_click_stats_df_v2


Unnamed: 0,Date,site_id,brand_id,clicks,signups,ftds,cpa_income,rev_income,month,user_id,device_id
0,2021-01-01,1,4,71,77,40,898,395,Jan,217,0
1,2021-01-01,2,1,45,71,23,1143,351,Jan,1352,1
2,2021-01-01,3,1,100,42,39,1769,1057,Jan,1889,1
3,2021-01-01,4,2,72,51,57,1395,641,Jan,141,1
4,2021-01-01,5,3,0,87,26,1876,964,Jan,2567,0
...,...,...,...,...,...,...,...,...,...,...,...
1539,2021-07-09,4,4,34,67,44,1926,400,Jul,4360,1
1540,2021-07-09,5,1,84,83,53,1144,116,Jul,4687,0
1541,2021-07-09,6,4,53,36,22,1413,1637,Jul,2463,1
1542,2021-07-09,7,1,106,61,23,888,1597,Jul,3627,0


In [642]:
daily_click_stats_df_v2['country_id'] =  np.random.randint(11, size=len(daily_click_stats_df_v2))
daily_click_stats_df_v2

Unnamed: 0,Date,site_id,brand_id,clicks,signups,ftds,cpa_income,rev_income,month,user_id,device_id,country_id
0,2021-01-01,1,4,71,77,40,898,395,Jan,217,0,0
1,2021-01-01,2,1,45,71,23,1143,351,Jan,1352,1,5
2,2021-01-01,3,1,100,42,39,1769,1057,Jan,1889,1,1
3,2021-01-01,4,2,72,51,57,1395,641,Jan,141,1,2
4,2021-01-01,5,3,0,87,26,1876,964,Jan,2567,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...
1539,2021-07-09,4,4,34,67,44,1926,400,Jul,4360,1,3
1540,2021-07-09,5,1,84,83,53,1144,116,Jul,4687,0,8
1541,2021-07-09,6,4,53,36,22,1413,1637,Jul,2463,1,9
1542,2021-07-09,7,1,106,61,23,888,1597,Jul,3627,0,1


In [643]:
device_df = pd.Series({"laptop": [0], "desktop": [1]})
device_df

laptop     [0]
desktop    [1]
dtype: object

In [644]:
device_df = pd.DataFrame({"device": ["laptop", "desktop"], "device_id": [0, 1]})
device_df

Unnamed: 0,device,device_id
0,laptop,0
1,desktop,1


In [645]:
country_df =  pd.DataFrame({"country": ["Malta", "Spain", "France", "Poland", "Switzerland", "Sweeden", "Portugal", "Italy", "Romania", "Norway"],
     "country_id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})

country_df


Unnamed: 0,country,country_id
0,Malta,1
1,Spain,2
2,France,3
3,Poland,4
4,Switzerland,5
5,Sweeden,6
6,Portugal,7
7,Italy,8
8,Romania,9
9,Norway,10


In [646]:
daily_click_stats_df_v3 = pd.merge(daily_click_stats_df_v2, country_df, how="inner", on="country_id")
daily_click_stats_df_v3

Unnamed: 0,Date,site_id,brand_id,clicks,signups,ftds,cpa_income,rev_income,month,user_id,device_id,country_id,country
0,2021-01-01,2,1,45,71,23,1143,351,Jan,1352,1,5,Switzerland
1,2021-01-01,7,4,37,56,21,638,564,Jan,4744,1,5,Switzerland
2,2021-01-01,8,1,35,68,36,912,1155,Jan,3696,1,5,Switzerland
3,2021-01-10,1,1,56,75,61,730,1709,Jan,1816,0,5,Switzerland
4,2021-01-10,6,4,88,82,16,1642,894,Jan,3266,0,5,Switzerland
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,2021-07-05,4,3,79,63,53,1825,1694,Jul,4858,0,8,Italy
1369,2021-07-06,3,4,86,41,51,210,255,Jul,553,1,8,Italy
1370,2021-07-06,8,4,111,73,62,181,1085,Jul,3686,0,8,Italy
1371,2021-07-09,2,2,58,72,31,1267,1371,Jul,1383,0,8,Italy


In [647]:
daily_click_stats_df_v3 = pd.merge(daily_click_stats_df_v3, device_df, how="inner", on="device_id")
daily_click_stats_df_v3

Unnamed: 0,Date,site_id,brand_id,clicks,signups,ftds,cpa_income,rev_income,month,user_id,device_id,country_id,country,device
0,2021-01-01,2,1,45,71,23,1143,351,Jan,1352,1,5,Switzerland,desktop
1,2021-01-01,7,4,37,56,21,638,564,Jan,4744,1,5,Switzerland,desktop
2,2021-01-01,8,1,35,68,36,912,1155,Jan,3696,1,5,Switzerland,desktop
3,2021-01-11,7,4,89,40,27,950,1435,Jan,930,1,5,Switzerland,desktop
4,2021-01-12,3,3,0,81,65,1813,1377,Jan,231,1,5,Switzerland,desktop
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,2021-07-04,7,3,53,78,37,1729,277,Jul,2742,0,8,Italy,laptop
1369,2021-07-05,4,3,79,63,53,1825,1694,Jul,4858,0,8,Italy,laptop
1370,2021-07-06,8,4,111,73,62,181,1085,Jul,3686,0,8,Italy,laptop
1371,2021-07-09,2,2,58,72,31,1267,1371,Jul,1383,0,8,Italy,laptop


In [652]:
# Which country has the most sign ups?
daily_click_stats_df_v4 = daily_click_stats_df_v3[["month", "country", "signups", "cpa_income", "rev_income"]]
daily_click_stats_df_v4

Unnamed: 0,month,country,signups,cpa_income,rev_income
0,Jan,Switzerland,71,1143,351
1,Jan,Switzerland,56,638,564
2,Jan,Switzerland,68,912,1155
3,Jan,Switzerland,40,950,1435
4,Jan,Switzerland,81,1813,1377
...,...,...,...,...,...
1368,Jul,Italy,78,1729,277
1369,Jul,Italy,63,1825,1694
1370,Jul,Italy,73,181,1085
1371,Jul,Italy,72,1267,1371


In [654]:
country_stats_df = daily_click_stats_df_v4.groupby(['month', "country"])['signups', 'cpa_income', 'rev_income'].apply(lambda x : x.astype(int).sum())
country_stats_df

  country_stats_df = daily_click_stats_df_v4.groupby(['month', "country"])['signups', 'cpa_income', 'rev_income'].apply(lambda x : x.astype(int).sum())


Unnamed: 0_level_0,Unnamed: 1_level_0,signups,cpa_income,rev_income
month,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Apr,France,1011,20957,18202
Apr,Italy,1205,14636,20720
Apr,Malta,1147,16869,14582
Apr,Norway,1698,27756,22632
Apr,Poland,1075,15919,16686
...,...,...,...,...
May,Portugal,1071,18958,16798
May,Romania,1327,21811,26017
May,Spain,1259,19354,17863
May,Sweeden,999,18207,15529


In [656]:
country_stats_df = country_stats_df.reset_index()
country_stats_df

Unnamed: 0,month,country,signups,cpa_income,rev_income
0,Apr,France,1011,20957,18202
1,Apr,Italy,1205,14636,20720
2,Apr,Malta,1147,16869,14582
3,Apr,Norway,1698,27756,22632
4,Apr,Poland,1075,15919,16686
...,...,...,...,...,...
65,May,Portugal,1071,18958,16798
66,May,Romania,1327,21811,26017
67,May,Spain,1259,19354,17863
68,May,Sweeden,999,18207,15529


In [657]:
# how much is earned in each country by sign up?
country_stats_df["total_revenue"] = country_stats_df['cpa_income'] + country_stats_df['rev_income']

country_stats_df

Unnamed: 0,month,country,signups,cpa_income,rev_income,total_revenue
0,Apr,France,1011,20957,18202,39159
1,Apr,Italy,1205,14636,20720,35356
2,Apr,Malta,1147,16869,14582,31451
3,Apr,Norway,1698,27756,22632,50388
4,Apr,Poland,1075,15919,16686,32605
...,...,...,...,...,...,...
65,May,Portugal,1071,18958,16798,35756
66,May,Romania,1327,21811,26017,47828
67,May,Spain,1259,19354,17863,37217
68,May,Sweeden,999,18207,15529,33736


In [659]:
country_stats_df2 = country_stats_df[["month", "country", "signups", "total_revenue"]]
country_stats_df2

Unnamed: 0,month,country,signups,total_revenue
0,Apr,France,1011,39159
1,Apr,Italy,1205,35356
2,Apr,Malta,1147,31451
3,Apr,Norway,1698,50388
4,Apr,Poland,1075,32605
...,...,...,...,...
65,May,Portugal,1071,35756
66,May,Romania,1327,47828
67,May,Spain,1259,37217
68,May,Sweeden,999,33736


In [661]:
country_stats_df2["revenue per signup"] = country_stats_df2["total_revenue"] / country_stats_df2["signups"]
country_stats_df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country_stats_df2["revenue per signup"] = country_stats_df2["total_revenue"] / country_stats_df2["signups"]


Unnamed: 0,month,country,signups,total_revenue,revenue per signup
0,Apr,France,1011,39159,38.73
1,Apr,Italy,1205,35356,29.34
2,Apr,Malta,1147,31451,27.42
3,Apr,Norway,1698,50388,29.67
4,Apr,Poland,1075,32605,30.33
...,...,...,...,...,...
65,May,Portugal,1071,35756,33.39
66,May,Romania,1327,47828,36.04
67,May,Spain,1259,37217,29.56
68,May,Sweeden,999,33736,33.77



# Countries that spent the most per signup

This table shows the top earners for each sign-up. This table can tell us which countries spend the most, which is probably a good place to run ad-campaigns. Additionally, we can see who spent the most in which month. For instance, consumers in Spain may spend more in the winter months, while those in Switzerland are buys skiing, and spend very little. Since I used a random number generator, these results will vary.

In [672]:
#For each month, list the top country for revenue per signup
country_stats_df2.nlargest(10, 'revenue per signup', keep='last')

Unnamed: 0,month,country,signups,total_revenue,revenue per signup
39,Jul,Switzerland,324,13155,40.6
30,Jul,France,648,26172,40.39
54,Mar,Poland,979,39345,40.19
5,Apr,Portugal,1135,44524,39.23
7,Apr,Spain,1361,52805,38.8
0,Apr,France,1011,39159,38.73
18,Feb,Sweeden,1024,39337,38.42
50,Mar,France,806,30453,37.78
17,Feb,Spain,957,35498,37.09
35,Jul,Portugal,475,17583,37.02
