# Build the training and testing sets from the collected data

## Set-up

In [6]:
import numpy as np 
import pandas as pd 

In [7]:
import pickle as pkl

In [8]:
from sklearn.model_selection import train_test_split

## Dataset

In [9]:
# Path of the pickled dataset to load.
data_path = "./backup/vc_data_mod.pkl"
# Unpickle the dataset; the cells below access it as a pandas DataFrame
# (column labels, .drop, .loc).
data = pd.read_pickle(data_path)

In [10]:
# Inspect the column labels of the loaded DataFrame.
data.columns

Index(['num_likes', 'price', 'we_love_tag', 'men', 'women', 'bags', 'clothing',
       'jewellery', 'shoes', 'backpacks',
       ...
       'silver', 'turquoise', 'white', 'yellow', 'no size', 'not_common_size',
       'rare_size', 'very_common_size', 'other_countries', 'united kingdom'],
      dtype='object', length=158)

## Features and targets

In [11]:
# Feature matrix: every column except the three target columns,
# converted to a NumPy array.
_non_feature_columns = ["num_likes", "price", "we_love_tag"]
_feature_frame = data.drop(columns=_non_feature_columns)
X = np.array(_feature_frame)

In [12]:
# Target array, one column per target in this order:
#   0 -> price        (main target)
#   1 -> num_likes    (sub-target)
#   2 -> we_love_tag  (sub-target)
y = data[["price", "num_likes", "we_love_tag"]].values

## Train/test split

In [13]:
# Hold out 30% of the samples for testing; the fixed random_state makes
# the shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [16]:
# Bundle the training features and targets under the keys "X" and "y".
train_data = dict(X=X_train, y=y_train)

In [18]:
# Bundle the testing features and targets under the keys "X" and "y".
test_data = dict(X=X_test, y=y_test)

## Save training and testing sets 

In [20]:
# Output locations for the pickled training and testing sets.
save_path_train, save_path_test = (
    "./backup/train_data.pkl",
    "./backup/test_data.pkl",
)

In [21]:
# Serialize the training bundle to disk (binary mode is required by pickle).
with open(save_path_train, mode="wb") as train_file:
    pkl.dump(train_data, train_file)

In [22]:
# Serialize the testing bundle to disk (binary mode is required by pickle).
with open(save_path_test, mode="wb") as test_file:
    pkl.dump(test_data, test_file)