In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
try:
    import dill as pickle
except:
    !pip install dill
    import dill as pickle
from tqdm import tqdm
import torch
from src import DataTransforms
from sklearn.model_selection import train_test_split
import os
%matplotlib inline

In [2]:
# Load the serialized transforms object from the artifacts directory
with open('artifacts/transforms.pth', mode = 'rb') as fh:
    transforms = pickle.load(fh)

In [3]:
# Load the serialized `pca` object from the artifacts directory
with open('artifacts/pca.pkl', mode = 'rb') as fh:
    pca = pickle.load(fh)

In [4]:
%%time
train = pd.read_csv('../data/Udacity_MAILOUT_052018_TRAIN.csv', sep = ';', low_memory = False, index_col = 'LNR')

CPU times: user 1.5 s, sys: 293 ms, total: 1.8 s
Wall time: 1.79 s


In [5]:
%%time
customers = pd.read_csv('../data/Udacity_CUSTOMERS_052018.csv', sep = ';', low_memory = False, index_col = 'LNR')

CPU times: user 11 s, sys: 1.01 s, total: 12 s
Wall time: 12 s


In [6]:
# Separate the RESPONSE target from the remaining feature columns
data_x = train.drop('RESPONSE', axis = 'columns')
data_y = train['RESPONSE']

In [7]:
# Run the training features through the project's TransformData helper using the
# loaded `transforms` object. The result replaces `data_x`, so this cell is not
# safe to run twice in a row.
# NOTE(review): presumably this applies the same per-column preprocessing that was
# fitted earlier - confirm against src/DataTransforms.
data_x = DataTransforms.TransformData(data_x, transforms)

100%|██████████| 365/365 [00:00<00:00, 599.30it/s]


In [8]:
%%time
#loading customers data, will merge it with the train data with target value 0.95 given the intuition that 
#customers are very likely to react postively to adds
customers = DataTransforms.TransformData(customers, transforms)
customers_y = pd.Series([0.95] * customers.shape[0])
customers_y.name = 'RESPONSE'

100%|██████████| 365/365 [00:01<00:00, 266.51it/s]


CPU times: user 11.9 s, sys: 802 ms, total: 12.7 s
Wall time: 1.79 s


In [9]:
%%time
#merge train and customer data
data_x = pd.concat([data_x, customers])
data_y = pd.concat([data_y, customers_y])
customers = customers_y = None

CPU times: user 2.38 s, sys: 878 ms, total: 3.26 s
Wall time: 1.18 s


In [10]:
%%time
#transform data
data_x = pca.transform(data_x.values)
data_y = data_y.values

CPU times: user 2.47 s, sys: 637 ms, total: 3.11 s
Wall time: 443 ms


In [11]:
#split into train/validation sets, holding out 15% of the rows for validation
#random_state is fixed so that every run produces the same split (and the same saved files)
train_x, val_x, train_y, val_y = train_test_split(data_x, data_y, test_size = 0.15, random_state = 42)

In [12]:
%time
test_x = pd.read_csv('../data/Udacity_MAILOUT_052018_TEST.csv', sep = ';', low_memory = False, index_col = 'LNR')

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.68 µs


In [13]:
%%time
test_x = DataTransforms.TransformData(test_x, transforms)
test_x = pca.transform(test_x.values)

100%|██████████| 365/365 [00:00<00:00, 616.27it/s]


CPU times: user 2 s, sys: 250 ms, total: 2.25 s
Wall time: 901 ms


In [14]:
# Sanity check: (rows, columns) of the train, validation and test feature matrices
print(*(features.shape for features in (train_x, val_x, test_x)))

(199421, 135) (35193, 135) (42833, 135)


In [15]:
# Turn each label vector into a single-column 2-D array so it can be joined
# column-wise with the feature matrix
train_y, val_y = (labels.reshape(-1, 1) for labels in (train_y, val_y))

In [16]:
# Training matrix: label in the first column, features after it.
# Note: this rebinds `train`, which previously held the raw mailout DataFrame.
train = np.hstack((train_y, train_x))

In [17]:
# Validation matrix: label in the first column, features after it
val = np.hstack((val_y, val_x))

In [18]:
# Write each split to <save_path>/<name>.csv without header row or index column.
# `train` and `val` hold the label in their first column; `test_x` holds features only.
save_path = 'data'
# exist_ok avoids the separate existence check and the gap between check and creation
os.makedirs(save_path, exist_ok = True)

dataset = {'train' : train, 'val' : val, 'test' : test_x}

for task in tqdm(dataset):
    path = os.path.join(save_path, task + '.csv')
    # header/index take booleans; False is the documented way to omit them
    pd.DataFrame(dataset[task]).to_csv(path, header = False, index = False)


100%|██████████| 3/3 [01:06<00:00, 22.16s/it]
