## Vaex

### Vaex is a Python library for lazy Out-of-Core DataFrames (similar to Pandas), used to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation, etc., on an N-dimensional grid at up to a billion ($10^9$) objects/rows per second. Visualization is done using histograms, density plots and 3D volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, a zero memory copy policy, and lazy computations for best performance (no memory wasted).

### Documentação: https://vaex.readthedocs.io/en/latest/

In [None]:
!python3.7 -m pip install vaex

In [None]:
!python3.7 -m pip install lightgbm

In [1]:
import vaex
import vaex.ml

import pylab as plt


# Load the Iris sample dataset from vaex.ml.datasets and preview the first ten rows.
df = vaex.ml.datasets.load_iris()
df.head(10)

#,sepal_length,sepal_width,petal_length,petal_width,class_
0,5.9,3.0,4.2,1.5,1
1,6.1,3.0,4.6,1.4,1
2,6.6,2.9,4.6,1.3,1
3,6.7,3.3,5.7,2.1,2
4,5.5,4.2,1.4,0.2,0
5,5.1,3.4,1.5,0.2,0
6,6.3,2.3,4.4,1.3,1
7,5.0,3.5,1.3,0.3,0
8,6.7,3.1,5.6,2.4,2
9,6.0,2.2,4.0,1.0,1


In [2]:
# Write a shuffled copy of the data to disk and reopen it, so the split below
# operates on rows in random order.
df.export("shuffled.hdf5", shuffle=True)
df = vaex.open("shuffled.hdf5")
# Hold out 20% of the rows as the test set.
# NOTE(review): presumably the split is positional, which is why the data is shuffled first — confirm.
df_train, df_test = df.ml.train_test_split(test_size=0.2)



In [3]:
# Use the first four columns as model inputs; the cell output lists which ones they are.
features = df_train.column_names[:4]
features

['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

In [4]:
# Fit PCA on the training features and add the projected components as new
# columns (PCA_0 ... PCA_3 in the cell output).
# n_components equals the number of features, so every component is kept.
pca = vaex.ml.PCA(features=features, n_components=4)
df_train = pca.fit_transform(df_train)
df_train

#,sepal_length,sepal_width,petal_length,petal_width,class_,random_index,PCA_0,PCA_1,PCA_2,PCA_3
0,5.8,2.7,3.9,1.2,1,78,-0.018222084288481186,-0.3235052708555063,0.1837022143444292,0.06611091384145482
1,6.7,3.3,5.7,2.1,2,72,-2.1658172406057297,0.2941940746810152,-0.2697038796661374,-0.07778306449165195
2,4.9,3.1,1.5,0.1,0,145,2.787990594917355,-0.08841693229844566,0.21099795110923858,-0.029590783600891846
3,5.8,2.7,5.1,1.9,2,133,-1.2939741461746659,-0.6040979403527942,-0.28054105899306186,-0.02445736746290056
4,5.6,2.9,3.6,1.3,1,65,0.2913818447569352,-0.2608728371903721,-0.09250960003268069,0.1287975868376195
...,...,...,...,...,...,...,...,...,...,...
115,6.9,3.1,4.9,1.5,1,42,-1.356420482347309,0.47826452437634603,0.3442030059400766,0.04101631713551046
116,6.4,3.2,4.5,1.5,1,3,-0.8224633349262026,0.29724389972743415,0.026023022093870085,0.011304433161908406
117,4.6,3.4,1.4,0.3,0,77,2.9351913136333163,-0.06487365287140033,-0.2431103314969285,-0.06912975262895615
118,4.8,3.0,1.4,0.1,0,129,2.902993121978754,-0.2076437774048363,0.21568799943502487,0.02078164558582496


In [5]:
from sklearn.ensemble import RandomForestClassifier
import vaex.ml.sklearn

# Train a random forest on the PCA components and attach its predictions.
target = 'class_'
# Every column whose name matches the PCA_<n> pattern.
train_features = df_train.get_column_names(regex='PCA_.*')

# A fixed random_state keeps the fitted forest reproducible between runs.
randomForest = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

# Wrap the scikit-learn estimator so it can be fitted on a vaex DataFrame.
model = vaex.ml.sklearn.Predictor(
    model=randomForest,
    features=train_features,
    target=target,
    prediction_name='prediction',
)

model.fit(df=df_train)
# The transformed DataFrame carries the extra 'prediction' column.
df_train = model.transform(df=df_train)

df_train

#,sepal_length,sepal_width,petal_length,petal_width,class_,random_index,PCA_0,PCA_1,PCA_2,PCA_3,prediction
0,5.8,2.7,3.9,1.2,1,78,-0.018222084288481186,-0.3235052708555063,0.1837022143444292,0.06611091384145482,1
1,6.7,3.3,5.7,2.1,2,72,-2.1658172406057297,0.2941940746810152,-0.2697038796661374,-0.07778306449165195,2
2,4.9,3.1,1.5,0.1,0,145,2.787990594917355,-0.08841693229844566,0.21099795110923858,-0.029590783600891846,0
3,5.8,2.7,5.1,1.9,2,133,-1.2939741461746659,-0.6040979403527942,-0.28054105899306186,-0.02445736746290056,2
4,5.6,2.9,3.6,1.3,1,65,0.2913818447569352,-0.2608728371903721,-0.09250960003268069,0.1287975868376195,1
...,...,...,...,...,...,...,...,...,...,...,...
115,6.9,3.1,4.9,1.5,1,42,-1.356420482347309,0.47826452437634603,0.3442030059400766,0.04101631713551046,1
116,6.4,3.2,4.5,1.5,1,3,-0.8224633349262026,0.29724389972743415,0.026023022093870085,0.011304433161908406,1
117,4.6,3.4,1.4,0.3,0,77,2.9351913136333163,-0.06487365287140033,-0.2431103314969285,-0.06912975262895615,0
118,4.8,3.0,1.4,0.1,0,129,2.902993121978754,-0.2076437774048363,0.21568799943502487,0.02078164558582496,0
