In [1]:
# All imports in one place: standard library first, then third-party
import datetime
import gc
import warnings

import numpy as np
import pandas as pd

# Seed NumPy's global random generator so random steps are repeatable
seed = 202
np.random.seed(seed)

# Suppress every warning for the rest of the session
warnings.simplefilter('ignore')

# Make sure automatic garbage collection is switched on
gc.enable()

In [2]:
# Load the preprocessed train/test sessions.
# fullVisitorId is forced to str so the identifiers are not parsed as numbers;
# nrows=None reads every row of each file.
csv_options = dict(dtype={'fullVisitorId': str}, nrows=None)
train_raw_df = pd.read_csv("(3)preprocessed_train.csv", **csv_options)
test_raw_df = pd.read_csv("(3)preprocessed_test.csv", **csv_options)
train_raw_df.shape, test_raw_df.shape

((1708337, 37), (401589, 37))

### Extract the date from `visitStartTime`

In [3]:
def extract_date(df):
    """Add a calendar ``date`` column derived from ``visitStartTime``.

    ``visitStartTime`` is parsed as a number of seconds since the Unix epoch
    and reduced to one ``datetime.date`` per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``visitStartTime`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``date`` column.
    """
    # Build the dates directly rather than through a temporary 'full_date'
    # column: a scratch column would overwrite, and then drop, any column of
    # that name already present in the caller's frame.
    df['date'] = pd.to_datetime(df['visitStartTime'], unit='s').dt.date
    return df

In [4]:
# extract_date adds the 'date' column to the frame it receives and returns that
# same frame, so train_df / test_df refer to the same objects as the raw frames.
train_df, test_df = (extract_date(raw_df) for raw_df in (train_raw_df, test_raw_df))
train_df.shape, test_df.shape

((1708337, 38), (401589, 38))

### Concatenate train and test to extract the labels

In [5]:
# Stack train and test sessions into a single frame; the label windows are
# later cut out of it. ignore_index=True gives the result a fresh 0..n-1 index:
# without it the row labels of train and test are both carried over, leaving
# duplicate index labels in total_df.
total_df = pd.concat([train_df, test_df], ignore_index=True)
total_df.shape

(2109926, 38)

### Core function definitions

In [6]:
# Keep only the (fullVisitorId, totals_transactionRevenue) pairs of the rows whose revenue is positive
def zip_df_on_revenue(df):
    """Return the visitor id and revenue of every row with a positive revenue.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``fullVisitorId`` and ``totals_transactionRevenue`` columns.

    Returns
    -------
    pandas.DataFrame
        A new two-column frame (``fullVisitorId``, ``totals_transactionRevenue``)
        with one row per input row whose revenue is ``> 0``, in the original
        row order and with a fresh 0..n-1 index. Rows whose revenue is NaN are
        excluded, since ``NaN > 0`` is False.
    """
    # A plain boolean array (not an index-aligned Series) keeps the selection
    # purely positional, so a frame with repeated index labels is handled too.
    positive_revenue = (df['totals_transactionRevenue'] > 0).values
    selected = df.loc[positive_revenue, ['fullVisitorId', 'totals_transactionRevenue']]
    return selected.reset_index(drop=True)

In [7]:
def compute_label(df, id_label):
    """Compute, for every row of ``df``, the total revenue of that row's visitor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``fullVisitorId`` column; one label is produced per row.
    id_label : pandas.DataFrame
        Frame with ``fullVisitorId`` and ``totals_transactionRevenue`` columns;
        a visitor may appear on several rows.

    Returns
    -------
    list
        One value per row of ``df``, in row order: the sum of
        ``totals_transactionRevenue`` over all ``id_label`` rows sharing the
        row's ``fullVisitorId``, or ``0`` when the visitor does not appear in
        ``id_label``.
    """
    # Aggregate the revenue per visitor in a single pass. Scanning id_label
    # once per row of df would cost len(df) * len(id_label) comparisons; here
    # the cost is len(df) + len(id_label). Revenues are added in row order,
    # starting from 0.
    revenue_by_visitor = {}
    visitor_ids = id_label['fullVisitorId'].tolist()
    revenues = id_label['totals_transactionRevenue'].tolist()
    for visitor_id, revenue in zip(visitor_ids, revenues):
        revenue_by_visitor[visitor_id] = revenue_by_visitor.get(visitor_id, 0) + revenue

    # Visitors without any recorded revenue get the integer 0.
    return [revenue_by_visitor.get(user, 0) for user in df['fullVisitorId'].values]

## Sliding window version (no overlapping)

In [8]:
# Calendar extent of the train sessions
train_start_date, train_end_date = min(train_df['date']), max(train_df['date'])

# Inclusive day count: the +1 counts both the first and the last day
train_span = (train_end_date - train_start_date).days + 1

print("The first date in the train set is %s" % train_start_date)
print("The last date in the train set is %s" % train_end_date)
print("Thus we have an amount of days in the train set equal to %d days" % train_span)

The first date in the train set is 2016-08-01
The last date in the train set is 2018-05-01
Thus we have an amount of days in the train set equal to 639 days


In [9]:
# Calendar extent of the test sessions
test_start_date, test_end_date = min(test_df['date']), max(test_df['date'])

# Plain difference in days between the two bounds (no +1 here, unlike
# train_span); the window cells below reuse it as a timedelta offset.
test_span = (test_end_date - test_start_date).days

print("The first date in the test set is %s" % test_start_date)
print("The last date in the test set is %s" % test_end_date)
print("Thus we have an amount of days in the test set equal to %d days" % test_span)

The first date in the test set is 2018-05-01
The last date in the test set is 2018-10-16
Thus we have an amount of days in the test set equal to 168 days


In [10]:
# Period whose revenue has to be predicted for the test visitors
test_label_start_date = datetime.date(2018, 12, 1)
test_label_end_date = datetime.date(2019, 1, 31)

# Days between the last test session and the start of the target period
missing_days = (test_label_start_date - test_end_date).days
# Difference in days between the two bounds of the target period
prediction_span = (test_label_end_date - test_label_start_date).days

print("We have to predict the sum of the transactionRevenue, for each user in the period between %s and %s, in the period that goes from %s to %s" % (test_start_date, test_end_date, test_label_start_date, test_label_end_date))
print("There is a gap of %d days between the last day of our dataset and the prediction" % missing_days)
print("We have to predict %d days after this gap" % prediction_span)

We have to predict the sum of the transactionRevenue, for each user in the period between 2018-05-01 and 2018-10-16, in the period that goes from 2018-12-01 to 2019-01-31
There is a gap of 46 days between the last day of our dataset and the prediction
We have to predict 61 days after this gap


### Set the periods for both the train and the train's label

#### First period

In [11]:
# Feature window 1: opens on the first train day and closes on a fixed date
first_split_start = train_start_date
first_split_end = datetime.date(2016, 11, 12)
first_split_span = (first_split_end - first_split_start).days

# Label window 1: starts missing_days after the feature window closes and
# ends prediction_span days after that start
first_split_label_start = first_split_end + datetime.timedelta(days=missing_days)
first_split_label_end = first_split_end + datetime.timedelta(days=missing_days + prediction_span)

print("The first split goes from %s to %s (%d days)" % (first_split_start, first_split_end, first_split_span))
print("The first split label goes from %s to %s (%d days)" % (first_split_label_start, first_split_label_end, (first_split_label_end - first_split_label_start).days))

The first split goes from 2016-08-01 to 2016-11-12 (103 days)
The first split label goes from 2016-12-28 to 2017-02-27 (61 days)


#### Second period

In [12]:
# Feature window 2: opens the day after window 1 closes and closes test_span days later
second_split_start = first_split_end + datetime.timedelta(days=1)
second_split_end = second_split_start + datetime.timedelta(days=test_span)
second_split_span = (second_split_end - second_split_start).days

# Label window 2: starts missing_days after the feature window closes and
# ends prediction_span days after that start
second_split_label_start = second_split_end + datetime.timedelta(days=missing_days)
second_split_label_end = second_split_end + datetime.timedelta(days=missing_days + prediction_span)

print("The second split goes from %s to %s (%d days)" % (second_split_start, second_split_end, second_split_span))
print("The second split label goes from %s to %s (%d days)" % (second_split_label_start, second_split_label_end, (second_split_label_end - second_split_label_start).days))

The second split goes from 2016-11-13 to 2017-04-30 (168 days)
The second split label goes from 2017-06-15 to 2017-08-15 (61 days)


#### Third period (exactly one year before the requested test and prediction periods)

In [13]:
# Feature window 3: opens the day after window 2 closes and closes test_span days later
third_split_start = second_split_end + datetime.timedelta(days=1)
third_split_end = third_split_start + datetime.timedelta(days=test_span)
third_split_span = (third_split_end - third_split_start).days

# Label window 3: starts missing_days after the feature window closes and
# ends prediction_span days after that start
third_split_label_start = third_split_end + datetime.timedelta(days=missing_days)
third_split_label_end = third_split_end + datetime.timedelta(days=missing_days + prediction_span)

print("The third split goes from %s to %s (%d days)" % (third_split_start, third_split_end, third_split_span))
print("The third split label goes from %s to %s (%d days)" % (third_split_label_start, third_split_label_end, (third_split_label_end - third_split_label_start).days))

The third split goes from 2017-05-01 to 2017-10-16 (168 days)
The third split label goes from 2017-12-01 to 2018-01-31 (61 days)


#### Fourth period

In [14]:
# Feature window 4: opens the day after window 3 closes and closes test_span days later
fourth_split_start = third_split_end + datetime.timedelta(days=1)
fourth_split_end = fourth_split_start + datetime.timedelta(days=test_span)
fourth_split_span = (fourth_split_end - fourth_split_start).days

# Label window 4: starts missing_days after the feature window closes and
# ends prediction_span days after that start
fourth_split_label_start = fourth_split_end + datetime.timedelta(days=missing_days)
fourth_split_label_end = fourth_split_end + datetime.timedelta(days=missing_days + prediction_span)

print("The fourth split goes from %s to %s (%d days)" % (fourth_split_start, fourth_split_end, fourth_split_span))
print("The fourth split label goes from %s to %s (%d days)" % (fourth_split_label_start, fourth_split_label_end, (fourth_split_label_end - fourth_split_label_start).days))

The fourth split goes from 2017-10-17 to 2018-04-03 (168 days)
The fourth split label goes from 2018-05-19 to 2018-07-19 (61 days)


#### Fifth period

In [15]:
# Feature window 5: opens the day after window 4 closes and stops one day
# before the last train day, so sessions dated on that last day fall outside
# every feature window
fifth_split_start = fourth_split_end + datetime.timedelta(days=1)
fifth_split_end = train_end_date - datetime.timedelta(days=1)
fifth_split_span = (fifth_split_end - fifth_split_start).days

# Label window 5: starts missing_days after the feature window closes and
# ends prediction_span days after that start
fifth_split_label_start = fifth_split_end + datetime.timedelta(days=missing_days)
fifth_split_label_end = fifth_split_end + datetime.timedelta(days=missing_days + prediction_span)

print("The fifth split goes from %s to %s (%d days)" % (fifth_split_start, fifth_split_end, fifth_split_span))
print("The fifth split label goes from %s to %s (%d days)" % (fifth_split_label_start, fifth_split_label_end, (fifth_split_label_end - fifth_split_label_start).days))

The fifth split goes from 2018-04-04 to 2018-04-30 (26 days)
The fifth split label goes from 2018-06-15 to 2018-08-15 (61 days)


### Split the data into the training windows and their label windows (the label rows are taken from the concatenated train + test dataframe)

#### First data split

In [16]:
# Pull the date columns out once as plain arrays
train_dates = train_df['date'].values
total_dates = total_df['date'].values

# Feature rows: train sessions dated inside the first window (both bounds included)
first_labeled_train = train_df[(train_dates >= first_split_start) & (train_dates <= first_split_end)]
# Label rows: train+test sessions dated inside the first label window (both bounds included)
first_temp_train_label = total_df[(total_dates >= first_split_label_start) & (total_dates <= first_split_label_end)]

print("Data %s to %s" % (min(first_labeled_train.date), max(first_labeled_train.date)))
print("Label from %s to %s" % (min(first_temp_train_label.date), max(first_temp_train_label.date)))
first_labeled_train.shape, first_temp_train_label.shape

Data 2016-08-01 to 2016-11-12
Label from 2016-12-28 to 2017-02-27


((286306, 38), (130267, 38))

#### Second data split

In [17]:
# Pull the date columns out once as plain arrays
train_dates = train_df['date'].values
total_dates = total_df['date'].values

# Feature rows: train sessions dated inside the second window (both bounds included)
second_labeled_train = train_df[(train_dates >= second_split_start) & (train_dates <= second_split_end)]
# Label rows: train+test sessions dated inside the second label window (both bounds included)
second_temp_train_label = total_df[(total_dates >= second_split_label_start) & (total_dates <= second_split_label_end)]

print("Data %s to %s" % (min(second_labeled_train.date), max(second_labeled_train.date)))
print("Label from %s to %s" % (min(second_temp_train_label.date), max(second_temp_train_label.date)))
second_labeled_train.shape, second_temp_train_label.shape

Data 2016-11-13 to 2017-04-30
Label from 2017-06-15 to 2017-08-15


((413571, 38), (145838, 38))

#### Third data split

In [18]:
# Pull the date columns out once as plain arrays
train_dates = train_df['date'].values
total_dates = total_df['date'].values

# Feature rows: train sessions dated inside the third window (both bounds included)
third_labeled_train = train_df[(train_dates >= third_split_start) & (train_dates <= third_split_end)]
# Label rows: train+test sessions dated inside the third label window (both bounds included)
third_temp_train_label = total_df[(total_dates >= third_split_label_start) & (total_dates <= third_split_label_end)]

print("Data %s to %s" % (min(third_labeled_train.date), max(third_labeled_train.date)))
print("Label from %s to %s" % (min(third_temp_train_label.date), max(third_temp_train_label.date)))
third_labeled_train.shape, third_temp_train_label.shape

Data 2017-05-01 to 2017-10-16
Label from 2017-12-01 to 2018-01-31


((427826, 38), (180572, 38))

#### Fourth data split

In [19]:
# Pull the date columns out once as plain arrays
train_dates = train_df['date'].values
total_dates = total_df['date'].values

# Feature rows: train sessions dated inside the fourth window (both bounds included)
fourth_labeled_train = train_df[(train_dates >= fourth_split_start) & (train_dates <= fourth_split_end)]
# Label rows: train+test sessions dated inside the fourth label window (both bounds included)
fourth_temp_train_label = total_df[(total_dates >= fourth_split_label_start) & (total_dates <= fourth_split_label_end)]

print("Data %s to %s" % (min(fourth_labeled_train.date), max(fourth_labeled_train.date)))
print("Label from %s to %s" % (min(fourth_temp_train_label.date), max(fourth_temp_train_label.date)))
fourth_labeled_train.shape, fourth_temp_train_label.shape

Data 2017-10-17 to 2018-04-03
Label from 2018-05-19 to 2018-07-19


((505866, 38), (150761, 38))

#### Fifth data split

In [20]:
# Pull the date columns out once as plain arrays
train_dates = train_df['date'].values
total_dates = total_df['date'].values

# Feature rows: train sessions dated inside the fifth window (both bounds included)
fifth_labeled_train = train_df[(train_dates >= fifth_split_start) & (train_dates <= fifth_split_end)]
# Label rows: train+test sessions dated inside the fifth label window (both bounds included)
fifth_temp_train_label = total_df[(total_dates >= fifth_split_label_start) & (total_dates <= fifth_split_label_end)]

print("Data %s to %s" % (min(fifth_labeled_train.date), max(fifth_labeled_train.date)))
print("Label from %s to %s" % (min(fifth_temp_train_label.date), max(fifth_temp_train_label.date)))
fifth_labeled_train.shape, fifth_temp_train_label.shape

Data 2018-04-04 to 2018-04-30
Label from 2018-06-15 to 2018-08-15


((74126, 38), (139432, 38))

### Compute the label for each data split

In [21]:
# Feature frames of the five windows, in chronological order
labeled_train_list = [
    first_labeled_train,
    second_labeled_train,
    third_labeled_train,
    fourth_labeled_train,
    fifth_labeled_train,
]

# For each label window, the (visitor id, revenue) pairs with a positive revenue
zipped_label_list = [
    zip_df_on_revenue(label_window)
    for label_window in (
        first_temp_train_label,
        second_temp_train_label,
        third_temp_train_label,
        fourth_temp_train_label,
        fifth_temp_train_label,
    )
]

# One list of labels per window, aligned row by row with that window's feature frame
labels = [
    compute_label(feature_window, revenue_pairs)
    for feature_window, revenue_pairs in zip(labeled_train_list, zipped_label_list)
]

In [22]:
# Attach each window's labels to its feature rows as a 'label' column
for i in range(len(labeled_train_list)):
    # The labels are positional (one per row, in row order), so each frame
    # first gets a fresh 0..n-1 index.
    # assign() replaces an existing 'label' column, which keeps this cell safe
    # to re-run; joining a second 'label' column would raise on the
    # overlapping column name. assign() also raises if the number of labels
    # does not match the number of rows, instead of silently padding with NaN.
    labeled_train_list[i] = (
        labeled_train_list[i]
        .reset_index(drop=True)
        .assign(label=labels[i])
    )

In [28]:
# First labels of the third window
labeled_train_list[2]['label'].head()

0    15990000.0
1           0.0
2           0.0
3           0.0
4           0.0
Name: label, dtype: float64

In [27]:
# Spot check: print the label of every third-window row of one given visitor
third_window = labeled_train_list[2]
visitor_rows = third_window['fullVisitorId'].values == '8934116514970143966'
for label_value in third_window.loc[visitor_rows, 'label'].tolist():
    print(label_value)

15990000.0
15990000.0
15990000.0
15990000.0
15990000.0


### Training set concatenation

In [50]:
# Stack the labeled windows into one training frame with a fresh 0..n-1 index
labeled_train_df = pd.concat(labeled_train_list, ignore_index=True)

# Size check: the windows are contiguous from the first train day up to
# fifth_split_end, so the only train rows left out are those dated after it.
# The bound comes from the window definition rather than a hard-coded date,
# so the check stays valid if the train data changes.
dropped_records = (train_df['date'] > fifth_split_end).sum()
if(labeled_train_df.shape[0] + dropped_records == train_df.shape[0]):
    print("The reconstruction is happended succesfully!")
else:
    # Report the mismatch explicitly instead of printing nothing
    print("Row count mismatch: %d labeled + %d dropped != %d train rows" % (labeled_train_df.shape[0], dropped_records, train_df.shape[0]))
labeled_train_df.shape

The reconstruction is happended succesfully!


(1707695, 39)

In [53]:
### TODO: consider dropping the 'date' column before exporting

### Export Dataframes

In [52]:
# Write both frames to CSV without the index column.
# test_df is written as it stands: no 'label' column is added to it in this notebook.
for frame, csv_path in (
    (labeled_train_df, '(4)labeled_train.csv'),
    (test_df, '(4)labeled_test.csv'),
):
    frame.to_csv(csv_path, index=False)