In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import os
from pandasql import sqldf

from scipy.stats import pearsonr
from datetime import datetime

makeQuery = lambda q: sqldf(q, globals())

In [2]:
hcps = pd.read_csv('./data/data_files/hcps.csv')
activity = pd.read_csv('./data/data_files/activity.csv')
salesTrain = pd.read_csv('./data/data_files/sales_train.csv')
rtes = pd.read_csv('./data/data_files/rtes_cleaned.csv', index_col=0)

### Time based rte analysis

In [3]:
rtes.head()

Unnamed: 0,hcp,specialty,region,brand,email_type,content_id,no. openings,no. clicks,time_sent,time_last_opened,time_last_clicked
0,58555,General practicioner,region_0,brand_1,general,a3D6f0000019jMvEAI,1,0,2021-03-02 08:42:00,2021-03-02 08:45:00,
1,819,General practicioner,region_1,brand_2,product_related,a3D6f0000019gA2EAI,0,0,2020-07-16 11:58:00,,
2,819,General practicioner,region_1,brand_2,general,a3D6f0000019jSnEAI,1,0,2020-08-17 14:47:00,2020-08-17 23:16:00,
3,819,General practicioner,region_1,brand_1,product_related,a3D6f000000MtZ3EAK,0,0,2020-09-03 16:08:00,,
4,819,General practicioner,region_1,brand_1,general,a3D6f000000N0nhEAC,1,0,2020-10-16 13:35:00,2020-10-16 22:39:00,


In [4]:
for timeColumn in ['time_sent', 'time_last_opened', 'time_last_clicked']: 
    rtes[timeColumn] = pd.to_datetime(rtes[timeColumn])

In [5]:
rtes['month_sent'] = rtes['time_sent'].apply(lambda x: '{}-{}'.format(x.year, '0'+str(x.month) if x.month < 10 else x.month))
rtes['month_opened'] = rtes['time_last_opened'].apply(lambda x: '{}-{}'.format(x.year, '0'+str(x.month) if x.month < 10 else x.month))
rtes['month_clicked'] = rtes['time_last_clicked'].apply(lambda x: '{}-{}'.format(x.year, '0'+str(x.month) if x.month < 10 else x.month))
rtes.head()

Unnamed: 0,hcp,specialty,region,brand,email_type,content_id,no. openings,no. clicks,time_sent,time_last_opened,time_last_clicked,month_sent,month_opened,month_clicked
0,58555,General practicioner,region_0,brand_1,general,a3D6f0000019jMvEAI,1,0,2021-03-02 08:42:00,2021-03-02 08:45:00,NaT,2021-03,2021-03,nan-nan
1,819,General practicioner,region_1,brand_2,product_related,a3D6f0000019gA2EAI,0,0,2020-07-16 11:58:00,NaT,NaT,2020-07,nan-nan,nan-nan
2,819,General practicioner,region_1,brand_2,general,a3D6f0000019jSnEAI,1,0,2020-08-17 14:47:00,2020-08-17 23:16:00,NaT,2020-08,2020-08,nan-nan
3,819,General practicioner,region_1,brand_1,product_related,a3D6f000000MtZ3EAK,0,0,2020-09-03 16:08:00,NaT,NaT,2020-09,nan-nan,nan-nan
4,819,General practicioner,region_1,brand_1,general,a3D6f000000N0nhEAC,1,0,2020-10-16 13:35:00,2020-10-16 22:39:00,NaT,2020-10,2020-10,nan-nan


In [16]:
brand = 'brand_1'
query = """
    SELECT 
        rtes.region,
        month_sent,
        tier,
        COUNT(rtes.hcp) AS nr_contacts, 
        COUNT(DISTINCT rtes.hcp) AS hcp_contacted, 
        COUNT(month_opened) AS mails_opened, 
        COUNT(month_clicked) AS mails_clicked
        
    FROM rtes
    INNER JOIN hcps
    ON hcps.hcp = rtes.hcp
    WHERE brand = '{}'
    GROUP BY rtes.region, month_sent, tier
    
""".format(brand)
rtesMonthly = makeQuery(query)
rtesMonthly.head()

Unnamed: 0,region,month_sent,tier,nr_contacts,hcp_contacted,mails_opened,mails_clicked
0,region_0,2021-03,2,1,1,1,1
1,region_1,2020-09,1,10,5,10,10
2,region_1,2020-09,2,42,22,42,42
3,region_1,2020-10,2,1,1,1,1
4,region_1,2020-11,2,3,3,3,3


In [17]:
rtesMonthly.shape

(2100, 7)

In [18]:
rteTier = []
columns = ['nr_contacts', 'hcp_contacted', 'mails_opened', 'mails_clicked']
regions = ['region_{}'.format(i) for i in range(201)]
months = set([datetime.strptime(m, '%Y-%m') for m in salesTrain.month.values])
months = list(months)
months.sort()
months = ['{}-{}'.format(d.year, '0'+str(d.month) if d.month < 10 else d.month) for d in months]
for region in regions: 
    regionDf = rtesMonthly[rtesMonthly.region == region]
    for month in months: 
        monthDf = regionDf[regionDf.month_sent == month]
        row = [region, month]
        
        if monthDf.shape[0] == 0: 
            row.extend([0, 0, 0, 0, 0, 0, 0, 0])
            rteTier.append(row)
            continue 
            
        for tier in [1, 2]: 
            tierDf = monthDf[monthDf.tier == tier]
            if tierDf.shape[0] == 0: 
                row.extend([0, 0, 0, 0])
            else: 
                row.extend(tierDf[columns].values[0].tolist())
        rteTier.append(row)
            
rteTierColumns = ['region', 'month_sent']
for tier in [1, 2]: 
    rteTierColumns.extend(['{}_{}'.format(c, tier) for c in columns])
    
rteTier = pd.DataFrame(rteTier, columns=rteTierColumns)
rteTier.shape

(4020, 10)

In [19]:
rteTier.head(n=10)

Unnamed: 0,region,month_sent,nr_contacts_1,hcp_contacted_1,mails_opened_1,mails_clicked_1,nr_contacts_2,hcp_contacted_2,mails_opened_2,mails_clicked_2
0,region_0,2020-01,0,0,0,0,0,0,0,0
1,region_0,2020-02,0,0,0,0,0,0,0,0
2,region_0,2020-03,0,0,0,0,0,0,0,0
3,region_0,2020-04,0,0,0,0,0,0,0,0
4,region_0,2020-05,0,0,0,0,0,0,0,0
5,region_0,2020-06,0,0,0,0,0,0,0,0
6,region_0,2020-07,0,0,0,0,0,0,0,0
7,region_0,2020-08,0,0,0,0,0,0,0,0
8,region_0,2020-09,0,0,0,0,0,0,0,0
9,region_0,2020-10,0,0,0,0,0,0,0,0


In [20]:
rteTier.fillna(0, inplace=True)
rteTier.region = pd.Categorical(rteTier.region, regions)
rteTier.rename(columns={'month_sent': 'month'}, inplace=True)
rteTier.month = pd.Categorical(rteTier.month, months)

In [21]:
rteTier.sort_values(['region', 'month'], ascending=True, inplace=True)
rteTier.head(n=25)

Unnamed: 0,region,month,nr_contacts_1,hcp_contacted_1,mails_opened_1,mails_clicked_1,nr_contacts_2,hcp_contacted_2,mails_opened_2,mails_clicked_2
0,region_0,2020-01,0,0,0,0,0,0,0,0
1,region_0,2020-02,0,0,0,0,0,0,0,0
2,region_0,2020-03,0,0,0,0,0,0,0,0
3,region_0,2020-04,0,0,0,0,0,0,0,0
4,region_0,2020-05,0,0,0,0,0,0,0,0
5,region_0,2020-06,0,0,0,0,0,0,0,0
6,region_0,2020-07,0,0,0,0,0,0,0,0
7,region_0,2020-08,0,0,0,0,0,0,0,0
8,region_0,2020-09,0,0,0,0,0,0,0,0
9,region_0,2020-10,0,0,0,0,0,0,0,0


In [22]:
deltas = []
for region in regions: 
    regionDf = rteTier[rteTier.region == region].copy()
    firstIndex = regionDf.index.values[0]
    for c in regionDf.columns:  
        if c in ['region', 'month']: 
            continue
    
        regionDf['delta_{}'.format(c)] = regionDf[c].diff()
        regionDf.at[firstIndex, 'delta_{}'.format(c)] = 0
    deltas.append(regionDf)
deltas = pd.concat(deltas, axis=0)
deltas.head(n=25)

Unnamed: 0,region,month,nr_contacts_1,hcp_contacted_1,mails_opened_1,mails_clicked_1,nr_contacts_2,hcp_contacted_2,mails_opened_2,mails_clicked_2,delta_nr_contacts_1,delta_hcp_contacted_1,delta_mails_opened_1,delta_mails_clicked_1,delta_nr_contacts_2,delta_hcp_contacted_2,delta_mails_opened_2,delta_mails_clicked_2
0,region_0,2020-01,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,region_0,2020-02,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,region_0,2020-03,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,region_0,2020-04,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,region_0,2020-05,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,region_0,2020-06,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,region_0,2020-07,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,region_0,2020-08,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,region_0,2020-09,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,region_0,2020-10,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
deltas.drop([c for c in rteTier.columns if c not in ['region', 'month']], axis=1, inplace=True)
deltas.head()

Unnamed: 0,region,month,delta_nr_contacts_1,delta_hcp_contacted_1,delta_mails_opened_1,delta_mails_clicked_1,delta_nr_contacts_2,delta_hcp_contacted_2,delta_mails_opened_2,delta_mails_clicked_2
0,region_0,2020-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,region_0,2020-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,region_0,2020-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,region_0,2020-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,region_0,2020-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
np.any(deltas.isna().values)

False

In [25]:
deltas.to_csv('./data/data_files/time_features/rte_delta_{}.csv'.format(brand))

### Time based activity analysis

In [26]:
activity.head()

Unnamed: 0,hcp,specialty,region,brand,month,channel,count
0,39972,Internal medicine / pneumology,region_61,brand_1,2020-05,video,1
1,64026,General practicioner,region_168,brand_2,2020-05,phone,1
2,27892,Internal medicine,region_26,brand_2,2020-06,phone,1
3,33500,General practicioner,region_26,brand_2,2020-06,phone,1
4,61239,General practicioner,region_26,brand_2,2020-06,phone,1


In [34]:
brand = 'brand_2'
query = """
SELECT 
    activity.region,
    hcps.tier,  
    month, 
    channel,
    -- hcps.hcp, 
    SUM(count) AS interactions, 
    COUNT(DISTINCT hcps.hcp) AS distinct_interactions
FROM activity
INNER JOIN hcps 
ON hcps.hcp = activity.hcp
WHERE brand = '{}'
GROUP BY activity.region, month, tier, channel
""".format(brand)
df = makeQuery(query)
df.head(n=25)

Unnamed: 0,region,tier,month,channel,interactions,distinct_interactions
0,region_0,1,2020-07,f2f,10,9
1,region_0,1,2020-07,phone,1,1
2,region_0,1,2020-08,f2f,23,12
3,region_0,1,2020-08,phone,3,3
4,region_0,2,2020-08,f2f,4,4
5,region_0,1,2020-09,f2f,18,10
6,region_0,1,2020-09,phone,3,3
7,region_0,2,2020-09,f2f,1,1
8,region_0,2,2020-09,phone,2,2
9,region_0,1,2020-10,f2f,2,2


In [35]:
activityTier = []
columns = ['interactions', 'distinct_interactions']
regions = ['region_{}'.format(i) for i in range(201)]
channels = list(set(df.channel.values.tolist()))
tiers = [1, 2]
months = set([datetime.strptime(m, '%Y-%m') for m in salesTrain.month.values])
months = list(months)
months.sort()
months = ['{}-{}'.format(d.year, '0'+str(d.month) if d.month < 10 else d.month) for d in months]
for region in regions: 
    regionDf = df[df.region == region]
    for month in months: 
        monthDf = regionDf[regionDf.month == month]
        row = [region, month]
        
        if monthDf.shape[0] == 0: 
            row.extend([0]*len(tiers)*len(channels)*len(columns))
            activityTier.append(row)
            continue 
            
        for tier in tiers: 
            tierDf = monthDf[monthDf.tier == tier]
            if tierDf.shape[0] == 0: 
                row.extend([0]*len(channels) * len(columns))
                continue
            
            for channel in channels: 
                channelDf = tierDf[tierDf.channel == channel]

                if channelDf.shape[0] == 0: 
                    row.extend([0] * len(columns))
                else: 
                    row.extend(channelDf[columns].values[0].tolist())
                    
        activityTier.append(row)
            
activityTierColumns = ['region', 'month']
for tier in [1, 2]: 
    for channel in channels: 
        activityTierColumns.extend(['{}_{}_{}'.format(c, tier, channel) for c in columns])
    
activityTier = pd.DataFrame(activityTier, columns=activityTierColumns)
activityTier.shape

(4020, 18)

In [36]:
activityTier.head(n=25)

Unnamed: 0,region,month,interactions_1_f2f,distinct_interactions_1_f2f,interactions_1_other,distinct_interactions_1_other,interactions_1_phone,distinct_interactions_1_phone,interactions_1_video,distinct_interactions_1_video,interactions_2_f2f,distinct_interactions_2_f2f,interactions_2_other,distinct_interactions_2_other,interactions_2_phone,distinct_interactions_2_phone,interactions_2_video,distinct_interactions_2_video
0,region_0,2020-01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,region_0,2020-02,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,region_0,2020-03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,region_0,2020-04,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,region_0,2020-05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,region_0,2020-06,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,region_0,2020-07,10,9,0,0,1,1,0,0,0,0,0,0,0,0,0,0
7,region_0,2020-08,23,12,0,0,3,3,0,0,4,4,0,0,0,0,0,0
8,region_0,2020-09,18,10,0,0,3,3,0,0,1,1,0,0,2,2,0,0
9,region_0,2020-10,2,2,0,0,0,0,0,0,1,1,0,0,0,0,0,0


In [37]:
deltas = []
for region in regions: 
    regionDf = activityTier[activityTier.region == region].copy()
    firstIndex = regionDf.index.values[0]
    for c in regionDf.columns:  
        if c in ['region', 'month']: 
            continue
    
        regionDf['delta_{}'.format(c)] = regionDf[c].diff()
        regionDf.at[firstIndex, 'delta_{}'.format(c)] = 0
    deltas.append(regionDf)
deltas = pd.concat(deltas, axis=0)
deltas.head()

Unnamed: 0,region,month,interactions_1_f2f,distinct_interactions_1_f2f,interactions_1_other,distinct_interactions_1_other,interactions_1_phone,distinct_interactions_1_phone,interactions_1_video,distinct_interactions_1_video,...,delta_interactions_1_video,delta_distinct_interactions_1_video,delta_interactions_2_f2f,delta_distinct_interactions_2_f2f,delta_interactions_2_other,delta_distinct_interactions_2_other,delta_interactions_2_phone,delta_distinct_interactions_2_phone,delta_interactions_2_video,delta_distinct_interactions_2_video
0,region_0,2020-01,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,region_0,2020-02,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,region_0,2020-03,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,region_0,2020-04,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,region_0,2020-05,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
deltas.drop([c for c in activityTier.columns if c not in ['region', 'month']], axis=1, inplace=True)
deltas.head(n=25)

Unnamed: 0,region,month,delta_interactions_1_f2f,delta_distinct_interactions_1_f2f,delta_interactions_1_other,delta_distinct_interactions_1_other,delta_interactions_1_phone,delta_distinct_interactions_1_phone,delta_interactions_1_video,delta_distinct_interactions_1_video,delta_interactions_2_f2f,delta_distinct_interactions_2_f2f,delta_interactions_2_other,delta_distinct_interactions_2_other,delta_interactions_2_phone,delta_distinct_interactions_2_phone,delta_interactions_2_video,delta_distinct_interactions_2_video
0,region_0,2020-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,region_0,2020-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,region_0,2020-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,region_0,2020-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,region_0,2020-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,region_0,2020-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,region_0,2020-07,10.0,9.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,region_0,2020-08,13.0,3.0,0.0,0.0,2.0,2.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
8,region_0,2020-09,-5.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,-3.0,0.0,0.0,2.0,2.0,0.0,0.0
9,region_0,2020-10,-16.0,-8.0,0.0,0.0,-3.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,-2.0,0.0,0.0


In [39]:
np.any(deltas.isna().values)

False

In [40]:
deltas.to_csv('./data/data_files/time_features/activity_delta_{}.csv'.format(brand))

### Time-based sales analysis
get last sales, and relative change. also add market potential

In [86]:
salesTrain = pd.read_csv('./data/data_files/sales_train_splitted.csv', index_col=0)
salesTrain.head(n=25)

Unnamed: 0,month,region,brand,sales
0,2020-01,region_0,brand_1,0.0
1,2020-01,region_0,brand_2,0.0
2,2020-01,region_0,brand_3,65007.49
3,2020-01,region_0,brand_12_market,509023.69
4,2020-01,region_0,brand_3_market,940469.05
5,2020-01,region_1,brand_1,0.0
6,2020-01,region_1,brand_2,0.0
7,2020-01,region_1,brand_3,45929.88
8,2020-01,region_1,brand_12_market,344182.42
9,2020-01,region_1,brand_3_market,625300.1


In [87]:
salesTest = pd.read_csv('./data/data_files/sales_test_splitted.csv', index_col=0)
salesTest.head()

Unnamed: 0,month,region,brand,sales
755,2020-01,region_151,brand_3,35347.3
756,2020-01,region_151,brand_12_market,213189.25
757,2020-01,region_151,brand_3_market,423435.51
758,2020-01,region_152,brand_3,34091.04
759,2020-01,region_152,brand_12_market,188121.51


In [90]:
sales = pd.concat([salesTrain, salesTest], axis=0)
sales.region = pd.Categorical(sales.region, regions)
sales.month = pd.Categorical(sales.month, months)
sales.sort_values(['region', 'month'], ascending=True, inplace=True)
sales.reset_index(drop=True, inplace=True)
sales.head(n=25)

Unnamed: 0,month,region,brand,sales
0,2020-01,region_0,brand_1,0.0
1,2020-01,region_0,brand_2,0.0
2,2020-01,region_0,brand_3,65007.49
3,2020-01,region_0,brand_12_market,509023.69
4,2020-01,region_0,brand_3_market,940469.05
5,2020-02,region_0,brand_1,0.0
6,2020-02,region_0,brand_2,0.0
7,2020-02,region_0,brand_3,51971.16
8,2020-02,region_0,brand_12_market,458929.0
9,2020-02,region_0,brand_3_market,826222.47


In [91]:
sales.tail()

Unnamed: 0,month,region,brand,sales
18095,2021-07,region_200,brand_12_market,410435.21
18096,2021-07,region_200,brand_3_market,715514.89
18097,2021-08,region_200,brand_3,50417.07
18098,2021-08,region_200,brand_12_market,289230.73
18099,2021-08,region_200,brand_3_market,509048.09


In [113]:
brand = 'brand_2'
salesSub = sales[sales['brand'] == brand]
salesSub.reset_index(drop=True, inplace=True)
salesSub.head(n=10)

Unnamed: 0,month,region,brand,sales
0,2020-01,region_0,brand_2,0.0
1,2020-02,region_0,brand_2,0.0
2,2020-03,region_0,brand_2,0.0
3,2020-04,region_0,brand_2,0.0
4,2020-05,region_0,brand_2,0.0
5,2020-06,region_0,brand_2,0.0
6,2020-07,region_0,brand_2,0.0
7,2020-08,region_0,brand_2,253.22
8,2020-09,region_0,brand_2,56.27
9,2020-10,region_0,brand_2,480.06


In [114]:
marketPotential = sales[sales['brand'] == 'brand_12_market']
marketPotential.reset_index(drop=True, inplace=True)
marketPotential.head()

Unnamed: 0,month,region,brand,sales
0,2020-01,region_0,brand_12_market,509023.69
1,2020-02,region_0,brand_12_market,458929.0
2,2020-03,region_0,brand_12_market,477955.06
3,2020-04,region_0,brand_12_market,496981.12
4,2020-05,region_0,brand_12_market,473531.38


In [115]:
salesDiff = []
for region in regions[:151]: 
    regionDf = salesSub[salesSub.region == region][['region', 'month', 'sales']].copy()
    firstIndex = regionDf.index.values[0]
    regionDf['delta_sales'] = regionDf['sales'].diff()
    regionDf['rel_delta_sales'] = regionDf['sales'].pct_change()
    infIndex = regionDf[regionDf['rel_delta_sales'] == np.inf].index.values
    if infIndex.shape[0] > 0: 
        regionDf.loc[infIndex, 'rel_delta_sales'] = 100
    regionDf.at[firstIndex, 'delta_sales'] = 0
    naIndex = regionDf[regionDf['rel_delta_sales'].isna()].index.values
    if naIndex.shape[0] > 0: 
        regionDf.loc[naIndex, 'rel_delta_sales'] = 0
    regionDf['prevSales'] = regionDf['sales'].shift()
    regionDf.at[firstIndex, 'prevSales'] = 0
    # regionDf.fillna(0, inplace=True)
    salesDiff.append(regionDf)
salesDiff = pd.concat(salesDiff, axis=0)
salesDiff.head(n=10)

Unnamed: 0,region,month,sales,delta_sales,rel_delta_sales,prevSales
0,region_0,2020-01,0.0,0.0,0.0,0.0
1,region_0,2020-02,0.0,0.0,0.0,0.0
2,region_0,2020-03,0.0,0.0,0.0,0.0
3,region_0,2020-04,0.0,0.0,0.0,0.0
4,region_0,2020-05,0.0,0.0,0.0,0.0
5,region_0,2020-06,0.0,0.0,0.0,0.0
6,region_0,2020-07,0.0,0.0,0.0,0.0
7,region_0,2020-08,253.22,253.22,100.0,0.0
8,region_0,2020-09,56.27,-196.95,-0.777782,253.22
9,region_0,2020-10,480.06,423.79,7.531367,56.27


In [116]:
salesDiff.shape, marketPotential.shape

((3020, 6), (4020, 4))

In [117]:
salesDiff.to_csv('./data/data_files/time_features/sales_delta_{}_train.csv'.format(brand))

In [112]:
marketPotential.to_csv('./data/data_files/time_features/market_potential_12.csv')

### Merge datasets into training and test data

In [135]:
brands = ['brand_1', 'brand_2']
marketPotential = pd.read_csv('./data/data_files/time_features/market_potential_12.csv', index_col=0)
trainRegions = regions[:151]
testRegions = regions[151:]
for brand in brands: 
    activityDelta = pd.read_csv('./data/data_files/time_features/activity_delta_{}.csv'.format(brand), index_col=0)
    rteDelta = pd.read_csv('./data/data_files/time_features/rte_delta_{}.csv'.format(brand), index_col=0)
    
    salesDelta = pd.read_csv('./data/data_files/time_features/sales_delta_{}_train.csv'.format(brand), index_col=0)
    
    data = pd.concat([activityDelta, rteDelta.drop(['region', 'month'], axis=1)], axis=1)
    dataTrain = data[data['region'].apply(lambda x: x in trainRegions)]
    dataTest = data[data['region'].apply(lambda x: x in testRegions)]
    potentialTrain = marketPotential[marketPotential['region'].apply(lambda x: x in trainRegions)]
    potentialTest = marketPotential[marketPotential['region'].apply(lambda x: x in testRegions)]
    #dataTrain = pd.concat([dataTrain, salesDelta['prevSales'], potentialTrain], axis=0)
    dataTrain = pd.concat([dataTrain, potentialTrain.rename(columns={'sales': 'market_potential'})['market_potential']], axis=1)
    dataTest = pd.concat([dataTest, potentialTest.rename(columns={'sales': 'market_potential'})['market_potential']], axis=1)
    print(dataTrain.shape, dataTest.shape)
    
    dataTrain.to_csv('./data/data_files/time_features/time_features_{}_train.csv'.format(brand))
    dataTest.to_csv('./data/data_files/time_features/time_features_{}_test.csv'.format(brand))

(3020, 27) (1000, 27)
(3020, 27) (1000, 27)
