In [1]:
import pandas as pd
import numpy as np
import datetime

# Random seed for reproducibility (seeds NumPy's global random state)
seed = 202
np.random.seed(seed)

# Ignore warnings: every warning is silenced from here on.
# NOTE(review): this may also hide pandas / NumPy warnings raised by the
# feature-building cells further down — consider narrowing the filter.
import warnings
warnings.simplefilter('ignore')

# Garbage collector: make sure automatic collection is switched on
import gc
gc.enable()

In [2]:
# Import the preprocessed dataframes.
# fullVisitorId is forced to str so ids keep their leading zeros;
# nrows=None reads every row (pass an integer to load only a sample).
train_raw_df = pd.read_csv("(3)preprocessed_train.csv",
    dtype={'fullVisitorId': str}, nrows=None)
test_raw_df = pd.read_csv("(3)preprocessed_test.csv",
    dtype={'fullVisitorId': str}, nrows=None)
# (rows, columns) of both frames
train_raw_df.shape, test_raw_df.shape

((1708337, 37), (401589, 37))

### Extract the date from VisitStartTime

In [3]:
def extract_date(df):
    """Add a calendar ``date`` column derived from ``visitStartTime``.

    ``visitStartTime`` is read as seconds since the Unix epoch. The frame is
    modified in place and the very same object is returned; any column named
    ``full_date`` is gone afterwards.
    """
    timestamps = pd.to_datetime(df['visitStartTime'], unit='s')
    # Keep only the day part (datetime.date objects), dropping the time of day
    df['date'] = timestamps.dt.date
    # 'full_date' must not survive the call, whether or not it existed before
    df.drop(columns=['full_date'], errors='ignore', inplace=True)
    return df

In [4]:
# extract_date adds the 'date' column in place and returns the same object,
# so train_df / test_df are the raw frames themselves, not copies.
train_df = extract_date(train_raw_df)
test_df = extract_date(test_raw_df)
train_df.shape, test_df.shape

((1708337, 38), (401589, 38))

### Concatenate train and test to extract the labels

In [5]:
# Stack train on top of test; the label windows below are cut from this frame.
# No ignore_index is passed, so each part keeps its own row labels.
total_df = pd.concat([train_df, test_df])
total_df.shape

(2109926, 38)

In [8]:
# Preview only the first rows instead of rendering the whole frame
train_df.head(10)

Unnamed: 0,channelGrouping,customDimensions,fullVisitorId,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,...,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,date
0,3,2,3162355547410993243,1,1508198450,6,0,False,5,106,...,True,0.0,4,13,False,11,3,58,9,2017-10-17
1,5,3,8934116514970143966,6,1508176307,3,0,False,1,24,...,True,0.0,4,13,False,12,4,57,21,2017-10-16
2,1,3,7992466427990357681,1,1508201613,3,1,True,0,106,...,True,0.0,4,13,True,12,5,58,0,2017-10-17
3,3,2,9075655783635761930,1,1508169851,3,0,False,5,106,...,True,0.0,4,13,False,12,3,58,9,2017-10-16
4,3,1,6960673291025684308,1,1508190552,3,0,False,5,106,...,True,0.0,4,13,False,12,3,58,9,2017-10-16
5,5,3,0166277907528479249,1,1508196701,3,0,False,3,86,...,True,0.0,4,13,False,12,5,21,0,2017-10-16
6,5,2,8349655975937271469,1,1508152478,3,0,False,3,52,...,True,0.0,4,13,False,12,4,57,21,2017-10-16
7,3,5,1332629902468998662,1,1508206208,3,0,False,5,106,...,True,0.0,4,13,False,12,3,58,9,2017-10-17
8,3,1,632878546807742341,1,1508207516,3,0,False,3,57,...,True,0.0,4,13,False,12,3,58,9,2017-10-17
9,3,2,1259490915281096752,2,1508165159,10,1,True,7,106,...,True,0.0,4,13,True,12,3,58,9,2017-10-16


In [50]:
# Spot check: select one visitor by id and read device_browser from their first matching row
temp_id = '3162355547410993243'
train_df[train_df.fullVisitorId == temp_id].device_browser.values[0]

6

### Core functions definition

In [30]:
def zip_df_on_revenue(df):
    """Keep only the sessions that produced revenue.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``fullVisitorId`` and ``totals_transactionRevenue``.

    Returns
    -------
    pandas.DataFrame
        A new two-column frame (``fullVisitorId``, ``totals_transactionRevenue``)
        holding, in their original order, the rows whose revenue is strictly
        greater than zero. The index is renumbered from 0. Rows with missing
        revenue are excluded because ``NaN > 0`` is False.
    """
    # One vectorized comparison instead of a Python-level loop over every row
    has_revenue = df['totals_transactionRevenue'] > 0
    kept_columns = ['fullVisitorId', 'totals_transactionRevenue']
    return df.loc[has_revenue, kept_columns].reset_index(drop=True)

In [31]:
def compute_label(df, id_label):
    """Return, for every row of ``df``, the visitor's total revenue in ``id_label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label; only its ``fullVisitorId`` column is read.
    id_label : pandas.DataFrame
        Revenue records with ``fullVisitorId`` and ``totals_transactionRevenue``
        columns (the output of ``zip_df_on_revenue`` in this notebook).

    Returns
    -------
    list
        One value per row of ``df``, in row order: the sum of
        ``totals_transactionRevenue`` over the visitor's records in
        ``id_label``, or 0 when the visitor has none.
    """
    # Aggregate once per visitor instead of rescanning id_label for every row
    revenue_by_visitor = id_label.groupby('fullVisitorId')['totals_transactionRevenue'].sum()
    # The groupby index is unique, so reindexing repeats each total on every row
    # of that visitor and fills visitors without records with 0.
    row_visitors = pd.Index(df['fullVisitorId'])
    return revenue_by_visitor.reindex(row_visitors, fill_value=0).tolist()

In [70]:
def apply_retrospective_features(df, retro_df, time):
    """Append per-visitor look-back features computed on ``retro_df`` to ``df``.

    Six columns are added, in this order (``{time}`` is replaced by ``time``):

    * ``sum_user_revenue_{time}_days`` - sum of the visitor's strictly positive
      ``totals_transactionRevenue`` values in ``retro_df``.
    * ``mean_user_revenue_{time}_days`` - mean of those same values.
    * ``hits_number_{time}_days_before`` - sum of the visitor's ``totals_hits``.
    * ``hits_number_on_total_users_hits_{time}_days_before`` - the previous
      value divided by the sum of ``totals_hits`` over all of ``retro_df``.
    * ``pageviews_number_{time}_days_before`` and
      ``pageviews_number_on_total_users_pageviews_{time}_days_before`` - the
      same two quantities for ``totals_pageviews``.

    Visitors that do not appear in ``retro_df`` get 0 in every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to enrich; must contain ``fullVisitorId``. Not modified.
    retro_df : pandas.DataFrame
        Look-back data with ``fullVisitorId``, ``totals_transactionRevenue``,
        ``totals_hits`` and ``totals_pageviews``. Not modified.
    time : int
        Look-back length, used only to build the column names.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, the columns of ``df`` and the
        six feature columns.
    """
    # reset_index returns a new object: the caller's frame stays untouched and
    # positional assignment below is safe.
    df = df.reset_index(drop=True)
    row_visitors = pd.Index(df['fullVisitorId'])

    def _per_row(per_visitor):
        # per_visitor comes from a groupby, so its index holds unique visitor
        # ids; reindexing repeats each value on every row of that visitor and
        # writes 0 for visitors missing from retro_df.
        return per_visitor.reindex(row_visitors, fill_value=0).to_numpy()

    # Revenue features: only sessions with strictly positive revenue count.
    paying = retro_df.loc[retro_df['totals_transactionRevenue'] > 0]
    revenue = paying.groupby('fullVisitorId')['totals_transactionRevenue']
    df['sum_user_revenue_{}_days'.format(time)] = _per_row(revenue.sum())
    df['mean_user_revenue_{}_days'.format(time)] = _per_row(revenue.mean())

    # (source column, per-visitor total column, share-of-total column)
    activity_features = (
        ('totals_hits',
         'hits_number_{}_days_before',
         'hits_number_on_total_users_hits_{}_days_before'),
        ('totals_pageviews',
         'pageviews_number_{}_days_before',
         'pageviews_number_on_total_users_pageviews_{}_days_before'),
    )
    for source_col, count_col, share_col in activity_features:
        per_visitor_total = retro_df.groupby('fullVisitorId')[source_col].sum()
        overall_total = retro_df[source_col].sum()
        per_row_total = _per_row(per_visitor_total)
        df[count_col.format(time)] = per_row_total
        # Plain division: if the look-back data sums to 0 the share is NaN.
        df[share_col.format(time)] = per_row_total / overall_total

    return df

In [12]:
def apply_window_features(df):
    """Placeholder: returns ``df`` unchanged.

    Per the note below, the intent is to compute features like those of
    apply_retrospective_features on the current window itself; nothing is
    implemented yet.
    """
    # Like the previous but applied to current dataframes
    return df

## Sliding window version (no overlapping)

In [33]:
# First and last calendar day present in the train set
train_start_date = min(train_df.date)
train_end_date = max(train_df.date)

# +1 so that both the first and the last day are counted
train_span = (train_end_date - train_start_date).days + 1

print("The first date in the train set is %s" % train_start_date)
print("The last date in the train set is %s" % train_end_date)
print("Thus we have an amount of days in the train set equal to %d days" % train_span)

The first date in the train set is 2016-08-01
The last date in the train set is 2018-05-01
Thus we have an amount of days in the train set equal to 639 days


In [34]:
# First and last calendar day present in the test set
test_start_date = min(test_df.date)
test_end_date = max(test_df.date)

# Plain difference between the two dates (no +1, unlike train_span); this
# value is reused below as the length of the second and third train windows.
test_span = (test_end_date - test_start_date).days

print("The first date in the test set is %s" % test_start_date)
print("The last date in the test set is %s" % test_end_date)
print("Thus we have an amount of days in the test set equal to %d days" % test_span)

The first date in the test set is 2018-05-01
The last date in the test set is 2018-10-16
Thus we have an amount of days in the test set equal to 168 days


In [35]:
# Period whose revenue has to be predicted for the visitors of the test set
test_label_start_date = pd.to_datetime("2018-12-01").date()
test_label_end_date = pd.to_datetime("2019-01-31").date()

# Days between the last test day and the first day to predict
missing_days = (test_label_start_date - test_end_date).days
# Plain difference between the two bounds of the prediction period
prediction_span = (test_label_end_date - test_label_start_date).days

print("We have to predict the sum of the transactionRevenue, for each user in the period between %s and %s, in the period that goes from %s to %s" % (test_start_date, test_end_date, test_label_start_date, test_label_end_date))
print("There is a gap of %d days between the last day of our dataset and the prediction" % missing_days)
print("We have to predict %d days after this gap" % prediction_span)

We have to predict the sum of the transactionRevenue, for each user in the period between 2018-05-01 and 2018-10-16, in the period that goes from 2018-12-01 to 2019-01-31
There is a gap of 46 days between the last day of our dataset and the prediction
We have to predict 61 days after this gap


### Set the periods for both the train and the train's label

#### First period

In [36]:
# Feature window: from the first train day up to a hand-picked end date
first_split_start = train_start_date
first_split_end = pd.to_datetime("2016-11-12").date()
first_split_span = (first_split_end - first_split_start).days

# Label window: placed after the same gap (missing_days) and with the same
# length (prediction_span) as the real prediction period has w.r.t. the test set
first_split_label_start = first_split_end + datetime.timedelta(missing_days)
first_split_label_end = first_split_label_start + datetime.timedelta(prediction_span)

print("The first split goes from %s to %s (%d days)" % (first_split_start, first_split_end, first_split_span))
print("The first split label goes from %s to %s (%d days)" % (first_split_label_start, first_split_label_end, (first_split_label_end - first_split_label_start).days))

The first split goes from 2016-08-01 to 2016-11-12 (103 days)
The first split label goes from 2016-12-28 to 2017-02-27 (61 days)


#### Second period

In [37]:
# Feature window: starts the day after the first window ends and spans test_span days
second_split_start = first_split_end + datetime.timedelta(days = 1)
second_split_end = second_split_start + datetime.timedelta(days = test_span)
second_split_span = (second_split_end - second_split_start).days

# Label window: same gap and length as the real prediction period
second_split_label_start = second_split_end + datetime.timedelta(missing_days)
second_split_label_end = second_split_label_start + datetime.timedelta(prediction_span)

print("The second split goes from %s to %s (%d days)" % (second_split_start, second_split_end, second_split_span))
print("The second split label goes from %s to %s (%d days)" % (second_split_label_start, second_split_label_end, (second_split_label_end - second_split_label_start).days))

The second split goes from 2016-11-13 to 2017-04-30 (168 days)
The second split label goes from 2017-06-15 to 2017-08-15 (61 days)


#### Third period (exactly one year before the period to predict)

In [38]:
# Feature window: starts the day after the second window ends and spans test_span days
third_split_start = second_split_end + datetime.timedelta(days = 1)
third_split_end = third_split_start + datetime.timedelta(days = test_span)
third_split_span = (third_split_end - third_split_start).days

# Label window: same gap and length as the real prediction period
third_split_label_start = third_split_end + datetime.timedelta(missing_days)
third_split_label_end = third_split_label_start + datetime.timedelta(prediction_span)

print("The third split goes from %s to %s (%d days)" % (third_split_start, third_split_end, third_split_span))
print("The third split label goes from %s to %s (%d days)" % (third_split_label_start, third_split_label_end, (third_split_label_end - third_split_label_start).days))

The third split goes from 2017-05-01 to 2017-10-16 (168 days)
The third split label goes from 2017-12-01 to 2018-01-31 (61 days)


#### Fourth period

In [39]:
# Feature window: starts the day after the third window ends, with a hand-picked length.
# NOTE(review): the trailing "#80" looks like a previously tried length — confirm 97 is the intended value.
fourth_split_start = third_split_end + datetime.timedelta(days = 1)
fourth_split_end = fourth_split_start + datetime.timedelta(days = 97)#80
fourth_split_span = (fourth_split_end - fourth_split_start).days

# Label window: same gap and length as the real prediction period
fourth_split_label_start = fourth_split_end + datetime.timedelta(missing_days)
fourth_split_label_end = fourth_split_label_start + datetime.timedelta(prediction_span)

print("The fourth split goes from %s to %s (%d days)" % (fourth_split_start, fourth_split_end, fourth_split_span))
print("The fourth split label goes from %s to %s (%d days)" % (fourth_split_label_start, fourth_split_label_end, (fourth_split_label_end - fourth_split_label_start).days))

The fourth split goes from 2017-10-17 to 2018-01-22 (97 days)
The fourth split label goes from 2018-03-09 to 2018-05-09 (61 days)


#### Fifth period

In [40]:
# Feature window: starts the day after the fourth window ends and stops one day
# before the last train day, so rows dated train_end_date belong to no window.
fifth_split_start = fourth_split_end + datetime.timedelta(days = 1)
fifth_split_end = train_end_date - datetime.timedelta(days = 1)
fifth_split_span = (fifth_split_end - fifth_split_start).days

# Label window: same gap and length as the real prediction period
fifth_split_label_start = fifth_split_end + datetime.timedelta(missing_days)
fifth_split_label_end = fifth_split_label_start + datetime.timedelta(prediction_span)

print("The fifth split goes from %s to %s (%d days)" % (fifth_split_start, fifth_split_end, fifth_split_span))
print("The fifth split label goes from %s to %s (%d days)" % (fifth_split_label_start, fifth_split_label_end, (fifth_split_label_end - fifth_split_label_start).days))

The fifth split goes from 2018-01-23 to 2018-04-30 (97 days)
The fifth split label goes from 2018-06-15 to 2018-08-15 (61 days)


### Split the data for both the train and the train's label (this one, from the total dataframe)

In [41]:
# Look-back lengths, in days, used to define the retrospective features
long_time = 62
medium_time = 31

#### First data split

In [67]:
# Train rows inside the first feature window (both bounds included)
first_labeled_train = train_df[(train_df['date'].values <= first_split_end) & (train_df['date'].values >= first_split_start)]
# Train+test rows inside the first label window (both bounds included)
first_temp_train_label = total_df[(total_df['date'].values <= first_split_label_end) & (total_df['date'].values >= first_split_label_start)]

# Sanity check: the dates actually present in each selection
print("Data %s to %s" % (min(first_labeled_train.date), max(first_labeled_train.date)))
print("Label from %s to %s" % (min(first_temp_train_label.date), max(first_temp_train_label.date)))
first_labeled_train.shape, first_temp_train_label.shape

Data 2016-08-01 to 2016-11-12
Label from 2016-12-28 to 2017-02-27


((286306, 38), (130267, 38))

In [68]:
# Look-back frames: train rows dated from long_time / medium_time days before
# the window start up to the window start itself.
# first_split_start is the first train day, so nothing older exists and both
# frames can only hold rows of that single day.
# NOTE(review): the upper bound is inclusive, so the first day of the feature
# window is also part of the look-back data — confirm this overlap is intended.
first_retro_train_long = train_df[(train_df['date'].values <= first_split_start) & (train_df['date'].values >= first_split_start - datetime.timedelta(days = long_time))]
first_retro_train_medium = train_df[(train_df['date'].values <= first_split_start) & (train_df['date'].values >= first_split_start - datetime.timedelta(days = medium_time))]
first_retro_train_long.shape, first_retro_train_medium.shape

((1296, 38), (1296, 38))

In [71]:
# Add the six look-back features twice: for the long horizon, then for the medium one
first_labeled_train = apply_retrospective_features(first_labeled_train, first_retro_train_long, long_time)
first_labeled_train = apply_retrospective_features(first_labeled_train, first_retro_train_medium, medium_time)
first_labeled_train.shape

(286306, 50)

#### Second data split

In [80]:
# Train rows inside the second feature window (both bounds included)
second_labeled_train = train_df[(train_df['date'].values <= second_split_end) & (train_df['date'].values >= second_split_start)]
# Train+test rows inside the second label window (both bounds included)
second_temp_train_label = total_df[(total_df['date'].values <= second_split_label_end) & (total_df['date'].values >= second_split_label_start)]

# Sanity check: the dates actually present in each selection
print("Data %s to %s" % (min(second_labeled_train.date), max(second_labeled_train.date)))
print("Label from %s to %s" % (min(second_temp_train_label.date), max(second_temp_train_label.date)))
second_labeled_train.shape, second_temp_train_label.shape

Data 2016-11-13 to 2017-04-30
Label from 2017-06-15 to 2017-08-15


((413571, 38), (145838, 38))

In [81]:
# Look-back frames: train rows dated from long_time / medium_time days before
# the window start up to the window start itself.
# NOTE(review): the upper bound is inclusive, so the first day of the feature
# window is also part of the look-back data — confirm this overlap is intended.
second_retro_train_long = train_df[(train_df['date'].values <= second_split_start) & (train_df['date'].values >= second_split_start - datetime.timedelta(days = long_time))]
second_retro_train_medium = train_df[(train_df['date'].values <= second_split_start) & (train_df['date'].values >= second_split_start - datetime.timedelta(days = medium_time))]
second_retro_train_long.shape, second_retro_train_medium.shape

((190496, 38), (111453, 38))

In [None]:
# Add the six look-back features twice: for the long horizon, then for the medium one.
# NOTE(review): this cell has no execution count and no stored output — it
# does not appear to have completed in the saved run.
second_labeled_train = apply_retrospective_features(second_labeled_train, second_retro_train_long, long_time)
second_labeled_train = apply_retrospective_features(second_labeled_train, second_retro_train_medium, medium_time)
second_labeled_train.shape

#### Third data split

In [24]:
# Train rows inside the third feature window (both bounds included)
third_labeled_train = train_df[(train_df['date'].values <= third_split_end) & (train_df['date'].values >= third_split_start)]
# Train+test rows inside the third label window (both bounds included)
third_temp_train_label = total_df[(total_df['date'].values <= third_split_label_end) & (total_df['date'].values >= third_split_label_start)]

# Sanity check: the dates actually present in each selection
print("Data %s to %s" % (min(third_labeled_train.date), max(third_labeled_train.date)))
print("Label from %s to %s" % (min(third_temp_train_label.date), max(third_temp_train_label.date)))
third_labeled_train.shape, third_temp_train_label.shape

Data 2017-05-01 to 2017-10-16
Label from 2017-12-01 to 2018-01-31


((427826, 38), (180572, 38))

In [25]:
# Look-back frames: train rows dated from long_time / medium_time days before
# the window start up to the window start itself.
# NOTE(review): the upper bound is inclusive, so the first day of the feature
# window is also part of the look-back data — confirm this overlap is intended.
third_retro_train_long = train_df[(train_df['date'].values <= third_split_start) & (train_df['date'].values >= third_split_start - datetime.timedelta(days = long_time))]
third_retro_train_medium = train_df[(train_df['date'].values <= third_split_start) & (train_df['date'].values >= third_split_start - datetime.timedelta(days = medium_time))]
third_retro_train_long.shape, third_retro_train_medium.shape

((142111, 38), (72017, 38))

In [26]:
# Add the six look-back features twice: for the long horizon, then for the medium one.
# NOTE(review): the stored output reports 42 columns, while six new columns per
# call on a 38-column frame gives 50 — the output looks stale; re-run to refresh.
third_labeled_train = apply_retrospective_features(third_labeled_train, third_retro_train_long, long_time)
third_labeled_train = apply_retrospective_features(third_labeled_train, third_retro_train_medium, medium_time)
third_labeled_train.shape

(427826, 42)

#### Fourth data split

In [27]:
# Train rows inside the fourth feature window (both bounds included)
fourth_labeled_train = train_df[(train_df['date'].values <= fourth_split_end) & (train_df['date'].values >= fourth_split_start)]
# Train+test rows inside the fourth label window (both bounds included)
fourth_temp_train_label = total_df[(total_df['date'].values <= fourth_split_label_end) & (total_df['date'].values >= fourth_split_label_start)]

# Sanity check: the dates actually present in each selection
print("Data %s to %s" % (min(fourth_labeled_train.date), max(fourth_labeled_train.date)))
print("Label from %s to %s" % (min(fourth_temp_train_label.date), max(fourth_temp_train_label.date)))
fourth_labeled_train.shape, fourth_temp_train_label.shape

Data 2017-10-17 to 2018-01-22
Label from 2018-03-09 to 2018-05-09


((294701, 38), (178602, 38))

In [28]:
# Look-back frames: train rows dated from long_time / medium_time days before
# the window start up to the window start itself.
# NOTE(review): the upper bound is inclusive, so the first day of the feature
# window is also part of the look-back data — confirm this overlap is intended.
fourth_retro_train_long = train_df[(train_df['date'].values <= fourth_split_start) & (train_df['date'].values >= fourth_split_start - datetime.timedelta(days = long_time))]
fourth_retro_train_medium = train_df[(train_df['date'].values <= fourth_split_start) & (train_df['date'].values >= fourth_split_start - datetime.timedelta(days = medium_time))]
fourth_retro_train_long.shape, fourth_retro_train_medium.shape

((191324, 38), (108679, 38))

In [29]:
# Add the six look-back features twice: for the long horizon, then for the medium one.
# NOTE(review): the stored output reports 42 columns, while six new columns per
# call on a 38-column frame gives 50 — the output looks stale; re-run to refresh.
fourth_labeled_train = apply_retrospective_features(fourth_labeled_train, fourth_retro_train_long, long_time)
fourth_labeled_train = apply_retrospective_features(fourth_labeled_train, fourth_retro_train_medium, medium_time)
fourth_labeled_train.shape

(294701, 42)

#### Fifth data split

In [30]:
# Train rows inside the fifth feature window (both bounds included)
fifth_labeled_train = train_df[(train_df['date'].values <= fifth_split_end) & (train_df['date'].values >= fifth_split_start)]
# Train+test rows inside the fifth label window (both bounds included)
fifth_temp_train_label = total_df[(total_df['date'].values <= fifth_split_label_end) & (total_df['date'].values >= fifth_split_label_start)]

# Sanity check: the dates actually present in each selection
print("Data %s to %s" % (min(fifth_labeled_train.date), max(fifth_labeled_train.date)))
print("Label from %s to %s" % (min(fifth_temp_train_label.date), max(fifth_temp_train_label.date)))
fifth_labeled_train.shape, fifth_temp_train_label.shape

Data 2018-01-23 to 2018-04-30
Label from 2018-06-15 to 2018-08-15


((285291, 38), (139432, 38))

In [31]:
# Look-back frames: train rows dated from long_time / medium_time days before
# the window start up to the window start itself.
# NOTE(review): the upper bound is inclusive, so the first day of the feature
# window is also part of the look-back data — confirm this overlap is intended.
fifth_retro_train_long = train_df[(train_df['date'].values <= fifth_split_start) & (train_df['date'].values >= fifth_split_start - datetime.timedelta(days = long_time))]
fifth_retro_train_medium = train_df[(train_df['date'].values <= fifth_split_start) & (train_df['date'].values >= fifth_split_start - datetime.timedelta(days = medium_time))]
fifth_retro_train_long.shape, fifth_retro_train_medium.shape

((186385, 38), (79308, 38))

In [32]:
# Add the six look-back features twice: for the long horizon, then for the medium one.
# NOTE(review): the stored output reports 42 columns, while six new columns per
# call on a 38-column frame gives 50 — the output looks stale; re-run to refresh.
fifth_labeled_train = apply_retrospective_features(fifth_labeled_train, fifth_retro_train_long, long_time)
fifth_labeled_train = apply_retrospective_features(fifth_labeled_train, fifth_retro_train_medium, medium_time)
fifth_labeled_train.shape

(285291, 42)

### Test retrospective features

In [34]:
# Look-back frames for the test set, cut from train+test: rows dated from
# long_time / medium_time days before the first test day up to that day itself.
# NOTE(review): the upper bound is inclusive, so the first test day is also part
# of the look-back data — confirm this overlap is intended.
# NOTE(review): "test_tetro_long" looks like a typo for "test_retro_long"; it is
# left as is because the name is referenced again in the next cell.
test_tetro_long = total_df[(total_df['date'].values <= test_start_date) & (total_df['date'].values >= test_start_date - datetime.timedelta(days = long_time))]
test_retro_medium  = total_df[(total_df['date'].values <= test_start_date) & (total_df['date'].values >= test_start_date - datetime.timedelta(days = medium_time))]
test_tetro_long.shape, test_retro_medium.shape

((182921, 38), (87091, 38))

In [35]:
# Add the six look-back features to the test set, for the long and then the medium horizon.
# test_df is rebound to the enriched frame, so this cell is meant to run once per session.
# NOTE(review): the stored output reports 42 columns, while six new columns per
# call on a 38-column frame gives 50 — the output looks stale; re-run to refresh.
test_df = apply_retrospective_features(test_df, test_tetro_long, long_time)
test_df = apply_retrospective_features(test_df, test_retro_medium, medium_time)
test_df.shape

(401589, 42)

### Compute the label for each data split

In [36]:
# The five feature windows, in chronological order
labeled_train_list = [
    first_labeled_train,
    second_labeled_train,
    third_labeled_train,
    fourth_labeled_train,
    fifth_labeled_train,
]

# For each label window keep only the (visitor id, revenue) pairs with revenue > 0
zipped_label_list = [
    zip_df_on_revenue(label_window)
    for label_window in (
        first_temp_train_label,
        second_temp_train_label,
        third_temp_train_label,
        fourth_temp_train_label,
        fifth_temp_train_label,
    )
]

# One list of labels per split, aligned row by row with the split itself
labels = []
for i in range(len(labeled_train_list)):
    labels.append(compute_label(labeled_train_list[i], zipped_label_list[i]))

In [37]:
# Report, for every split, how many labels are strictly positive and their share in percent
for i in range(len(labels)):
    split_labels = labels[i]
    count = sum(1 for value in split_labels if value > 0)
    print("In the split number %d there are %d samples on %d greather than 0 (%f)" % (i, count, len(split_labels), (count/len(split_labels))*100))

In the split number 0 there are 474 samples on 286306 greather than 0 (0.165557)
In the split number 1 there are 664 samples on 413571 greather than 0 (0.160553)
In the split number 2 there are 643 samples on 427826 greather than 0 (0.150295)
In the split number 3 there are 501 samples on 294701 greather than 0 (0.170003)
In the split number 4 there are 178 samples on 285291 greather than 0 (0.062392)


In [38]:
# Attach each split's labels as a 'label' column.
# The index is reset first so rows and labels line up position by position;
# assign() overwrites an existing 'label' column, so re-running this cell is safe.
for i in range(len(labeled_train_list)):
    split_df = labeled_train_list[i].reset_index(drop=True)
    labeled_train_list[i] = split_df.assign(label=labels[i])

In [39]:
# Peek at the first labels of the third split
labeled_train_list[2].label.head()

0    15990000.0
1           0.0
2           0.0
3           0.0
4           0.0
Name: label, dtype: float64

In [40]:
# Check: print the label of every row of one visitor in the third split
third_split = labeled_train_list[2]
visitor_rows = third_split.fullVisitorId == '8934116514970143966'
for value in third_split.loc[visitor_rows, 'label']:
    print(value)

15990000.0
15990000.0
15990000.0
15990000.0
15990000.0


### Training set concatenation

In [41]:
# Stack the five labeled splits into one frame with a fresh 0..n-1 index
labeled_train_df = pd.concat(labeled_train_list[:5], ignore_index=True)

# Size check: the only train rows left out should be those dated on or after 2018-05-01
cutoff_date = pd.to_datetime("2018-05-01").date()
dropped_records = (train_df.date >= cutoff_date).sum()
rebuilt_rows = labeled_train_df.shape[0] + dropped_records
if rebuilt_rows == train_df.shape[0]:
    print("The reconstruction is happended succesfully!")
labeled_train_df.shape

The reconstruction is happended succesfully!


(1707695, 43)

In [42]:
# TODO: consider dropping the `date` column before exporting

### Export Dataframes

In [43]:
# Write the labeled train set and the feature-enriched test set to CSV, without the row index
labeled_train_df.to_csv('(4)labeled_train.csv', index = False)
test_df.to_csv('(4)labeled_test.csv', index = False)