In [1]:
import s3fs
import pandas as pd
import numpy as np
import boto3
import io
import pickle
from sklearn.preprocessing import MinMaxScaler

# boto3 S3 client; used by the cells below to upload/download pickled arrays
# through in-memory buffers.
s3_client = boto3.client('s3')
# s3fs filesystem handle. It is not referenced again in the visible cells.
fs = s3fs.S3FileSystem()

In [2]:
# Both per-row tables are truncated to the same number of leading rows
# (presumably so that row k of `fondos` and row k of `perfiles` describe the
# same client -- TODO confirm).
N_ROWS = 79899

fondos = pd.read_csv('s3://bme-bucket/asignacion_fondos.csv', nrows=N_ROWS)
perfiles = pd.read_csv('s3://bme-bucket/perfiles_inversores.csv', nrows=N_ROWS)
# asignacion_dinero.csv is not loaded; its read is left disabled:
#dinero = pd.read_csv('s3://bme-bucket/asignacion_dinero.csv', nrows=N_ROWS)

# Fund reference table; its first CSV column becomes the index.
tabla_fondos = pd.read_csv('s3://bme-bucket/tabla_fondos.csv', index_col=0)

<h2>Target Engineering</h2>

In [3]:
# Min-max scale the fund feature columns (positions 2..11 of `tabla_fondos`)
# in place. Every column except 'aportacion_minima' is first parsed from text
# by dropping its last character and dividing by 100 (presumably values such
# as "12.5%" -- TODO confirm against the CSV).
#
# `scaler` is re-fitted on every column, so after the loop it only holds the
# statistics of the last column processed; it is reused later in the notebook.
scaler = MinMaxScaler()
for col in tabla_fondos.columns[2:12]:
    # Parse only while the column is still non-numeric. Without this guard a
    # second execution of the cell fails, because the values are already
    # floats and cannot be sliced with [:-1].
    if col != 'aportacion_minima' and not pd.api.types.is_numeric_dtype(tabla_fondos[col]):
        tabla_fondos[col] = tabla_fondos[col].str[:-1].astype(float) / 100
    # fit_transform expects a 2-D (n_samples, 1) input; flatten the result so a
    # plain 1-D array is stored in the column.
    tabla_fondos[col] = scaler.fit_transform(tabla_fondos[col].values.reshape(-1, 1)).ravel()



In [4]:
# Build the target matrix Y: one row per row of `fondos`, and for every column
# of `fondos` a slot holding the feature vector (columns 2.. of `tabla_fondos`)
# of the fund named in that cell. The placeholder 'Ninguno' maps to all zeros.
#
# The feature vectors are looked up once per fund instead of filtering
# `tabla_fondos` for every cell, and Y is assembled column-block by
# column-block instead of growing it with np.vstack on every row (which
# re-copies the whole matrix each iteration).
if tabla_fondos['Nombre fondo'].duplicated().any():
    # A duplicated name would make the lookup ambiguous; fail loudly.
    raise ValueError("tabla_fondos contains duplicated 'Nombre fondo' values")

fund_features = tabla_fondos.iloc[:, 2:].to_numpy(dtype=float)
n_fund_features = fund_features.shape[1]

fund_lookup = dict(zip(tabla_fondos['Nombre fondo'], fund_features))
# 'Ninguno' always wins over a same-named table entry.
fund_lookup['Ninguno'] = np.zeros(n_fund_features)

# A fund name missing from `tabla_fondos` raises KeyError naming that fund.
# reshape(-1, n) keeps the block 2-D even when `fondos` has zero rows.
Y = np.hstack([
    np.array([fund_lookup[name] for name in fondos[fondo]], dtype=float)
      .reshape(-1, n_fund_features)
    for fondo in fondos.columns
])

In [5]:
# Serialise Y straight into an in-memory buffer and stream it to S3, so no
# temporary file is written to local disk. A BytesIO created from initial
# bytes starts at position 0, ready to be read by upload_fileobj.
my_array_data = io.BytesIO(pickle.dumps(Y))
s3_client.upload_fileobj(
    my_array_data,
    'bme-bucket',
    'engineered_data/Y_minmaxscaled.pkl',
)

In [6]:
# download without using disk
# Read the uploaded pickle back into memory as a round-trip check of the
# upload. The result is stored in `Y2` so that the in-memory `Y` is not
# overwritten and the two can be compared with np.allclose(Y, Y2).
my_array_data2 = io.BytesIO()
s3_client.download_fileobj('bme-bucket', 'engineered_data/Y_minmaxscaled.pkl', my_array_data2)
my_array_data2.seek(0)  # rewind: download_fileobj leaves the cursor at the end
# NOTE: pickle.load can execute arbitrary code; only load objects from a
# bucket you trust.
Y2 = pickle.load(my_array_data2)

In [160]:
# check that everything is correct
# Compares the in-memory target matrix Y element-wise (within floating-point
# tolerance) with Y2, which must already exist in the kernel -- presumably the
# copy of Y re-loaded from S3 (TODO confirm).
# NOTE(review): this cell's execution count (160) is out of sequence with the
# rest of the notebook; verify it still runs after Restart Kernel -> Run All.
np.allclose(Y, Y2)

True

<h2>Feature engineering</h2>

In [7]:
# Build, for selected columns of `perfiles` (given by position in `columns`),
# a mapping {category value -> ordinal code 0.0, 1.0, ...}.
#
# The categories of a column are first listed in value_counts() order and then
# permuted by the matching entry of `orderlists`; the position of a category
# in the permuted list becomes its code.
orderlists = [[1,3,0,4,2],[2,1,0,3],[1,3,0,2],[2,0,1],[3,0,1,2],[0,2,1,4,3]]
columns = [0,1,2,3,7,9]
imputationsDict = {}
for col_pos, order in zip(columns, orderlists):
    col_name = perfiles.columns[col_pos]
    categories_by_count = perfiles.iloc[:, col_pos].value_counts().index.values
    ordered_categories = categories_by_count[order]
    n_categories = len(ordered_categories)
    codes = np.linspace(0, n_categories - 1, n_categories)
    imputationsDict[col_name] = dict(zip(ordered_categories, codes))

In [8]:
# Replace every value in the encoded columns of `perfiles` by its ordinal code
# from `imputationsDict`. A value without an entry raises KeyError.
for column_name, code_of in imputationsDict.items():
    perfiles[column_name] = perfiles[column_name].apply(lambda value, lookup=code_of: lookup[value])

<h2>Some sklearn preprocessing</h2>

In [9]:
# Rescale each column of `perfiles` independently with the shared `scaler`
# (created in the target-engineering cell), re-fitting it on every column.
for feature in perfiles.columns:
    column_2d = perfiles[feature].values.reshape(-1, 1)  # (n_samples, 1) for sklearn
    perfiles[feature] = scaler.fit_transform(column_2d)



In [10]:
# Feature matrix: the fully encoded and scaled `perfiles` frame as a NumPy array.
X = perfiles.to_numpy()

In [11]:
# Serialise X straight into an in-memory buffer and stream it to S3, so no
# temporary file is written to local disk. A BytesIO created from initial
# bytes starts at position 0, ready to be read by upload_fileobj.
my_array_data = io.BytesIO(pickle.dumps(X))
s3_client.upload_fileobj(
    my_array_data,
    'bme-bucket',
    'engineered_data/X.pkl',
)

<h2>Data split into train, validation and test sets</h2>

In [26]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 10% of the samples as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, Y,
    test_size=0.1,
    random_state=123,
)

# Step 2: split the remaining 90% into train and validation (20% of it, i.e.
# about 18% of all samples, goes to validation). The same seed is used for
# both steps so the partition is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.2,
    random_state=123,
)

In [27]:
# upload without using disk
def _upload_pickled(obj, key, bucket='bme-bucket'):
    """Pickle ``obj`` into an in-memory buffer and upload it to S3.

    Parameters
    ----------
    obj : object
        Any picklable object (here: NumPy arrays).
    key : str
        Destination object key inside ``bucket``.
    bucket : str
        Destination bucket name.
    """
    buffer = io.BytesIO()
    pickle.dump(obj, buffer)
    buffer.seek(0)  # rewind so upload_fileobj reads from the first byte
    s3_client.upload_fileobj(buffer, bucket, key)


# One S3 object per split, uploaded in the same order and under the same keys
# as before (note the capital "Y" in the target file names).
experiment_prefix = 'engineered_data/experiment-1/'
for file_name, split in [
    ('X_train.pkl', X_train),
    ('X_test.pkl', X_test),
    ('X_val.pkl', X_val),
    ('Y_train.pkl', y_train),
    ('Y_test.pkl', y_test),
    ('Y_val.pkl', y_val),
]:
    _upload_pickled(split, experiment_prefix + file_name)