# Chapter One - Data Processing with Optimus 

## Why Optimus exist? 

Create Optimus with pandas as engine

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append("../../Optimus")

In [4]:
from optimus import Optimus 
op = Optimus("pandas") 
op.engine

C:\Users\argenisleon\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\argenisleon\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
  stacklevel=1)


'pandas'

In [4]:
op = Optimus("dask") 
op.engine

'dask'

### One API to rule them all 

#### pandas

In [5]:
import pandas as pd 
import numpy as np 

data = {"A":[0, 1,2,3,4,5]} 
pdf = pd.DataFrame(data) 
pdf["A"] = np.sqrt(pdf["A"]) 
pdf 

Unnamed: 0,A
0,0.0
1,1.0
2,1.414214
3,1.732051
4,2.0
5,2.236068


#### cuDF

In [6]:
import cudf 

# You need to use dtype=”float” to return floats 

cdf = cudf.DataFrame(data, dtype = "float") 
cdf["A"] = cdf["A"].sqrt() 
cdf 

ModuleNotFoundError: No module named 'cudf'

#### dask

In [7]:
from dask import dataframe as dd  
# You cannot create a dataframe directly from a dict. You need to use a pandas dataframe. 
ddf = dd.from_pandas(pdf, npartitions=1) 

import dask.array as da 
ddf["A"]=da.sqrt(ddf.A) 

# You need to user compute to materialize the pandas dataframe 
ddf.compute() 

Unnamed: 0,A
0,0.0
1,1.0
2,1.189207
3,1.316074
4,1.414214
5,1.495349


#### dask cuDF

In [None]:
import dask_cudf 

dcdf = dask_cudf.from_cudf(cdf, npartitions=2) 
dcdf.map_partitions(cudf.sqrt).compute() 

#### vaex

In [None]:
import vaex 
import numpy as np 
  
data = {"A":[0, 1,2,3,4,5]} 
vdf = vaex.DataFrame(data) 
vdf["A"] = np.sqrt(pdf["A"]) 
vdf 

In [None]:
#### Ibis

#### Spark

In [None]:
import pyspark


In [None]:
import dask_cudf 
dcdf = dask_cudf.from_cudf(cdf, npartitions=2) 
dcdf.map_partitions(cudf.sqrt).compute() 

In [None]:
from optimus import Optimus 

op = Optimus(“dask”) 

df = op.create.dataframe({"A":[0, 1,2,3,4,5]}) 

df = df.cols.sqrt(“A”) 

## Expression

In [32]:
from optimus import Optimus 
op = Optimus("pandas") 
df = op.create.dataframe({"A":[1]}) 

from optimus.expressions import parser
expr = parser("SQRT(A) + SIN(A)", "df")


In [34]:
df.meta

{}

In [33]:
from optimus.functions import F
eval(expr)

A  1 (float64)  not nullable
1.8414709848078965


<class 'optimus.engines.pandas.dataframe.PandasDataFrame'>

In [6]:
df.cols.sqrt()

A  1 (float64)  not nullable
0.0
1.0
1.4142135623730951
1.7320508075688772
2.0
2.23606797749979


<class 'optimus.engines.pandas.dataframe.PandasDataFrame'>

In [11]:
df["A"] + df["A"]

A  1 (int64)  not nullable
0
2
4
6
8
10


<class 'optimus.engines.pandas.dataframe.PandasDataFrame'>

In [23]:
from optimus.functions import F
F.sin(df["A"]) + F.cos(df["A"])

A  1 (float64)  not nullable
1.0
1.3817732906760365
0.4931505902785393
-0.8488724885405782
-1.4104461161715405
-0.6752620891999122


<class 'optimus.engines.pandas.dataframe.PandasDataFrame'>

## Meta

In [10]:
from optimus import Optimus 
op = Optimus("pandas") 
df = op.load.csv("foo.txt", sep=",")
df.meta

{'file_name': 'foo.txt', 'name': 'foo.txt'}

In [12]:
df

name  1 (object)  not nullable,function  2 (object)  not nullable
Optimus,⋅leader
Bumblebee,⋅espionage
eject,⋅ELECTRONIC⋅SURVEILLANCE


<class 'optimus.engines.pandas.dataframe.PandasDataFrame'>

In [11]:
df.profile()

function function


{'columns': {'name': {'stats': {'match': 3,
    'missing': 0,
    'mismatch': 0,
    'profiler_dtype': {'dtype': 'string'},
    'frequency': [{'value': 'Optimus', 'count': 1},
     {'value': 'Bumblebee', 'count': 1},
     {'value': 'eject', 'count': 1}],
    'count_uniques': 3},
   'dtype': 'object'},
  'function': {'stats': {'match': 3,
    'missing': 0,
    'mismatch': 0,
    'profiler_dtype': {'dtype': 'object'},
    'frequency': [{'value': ' ELECTRONIC SURVEILLANCE', 'count': 1},
     {'value': ' leader', 'count': 1},
     {'value': ' espionage', 'count': 1}],
    'count_uniques': 3},
   'dtype': 'object'}},
 'name': 'foo.txt',
 'file_name': 'foo.txt',
 'summary': {'cols_count': 2,
  'rows_count': 3,
  'dtypes_list': ['object'],
  'total_count_dtypes': 1,
  'missing_count': 0,
  'p_missing': 0.0}}

In [59]:
df.cols.lower()

name  1 (object)  not nullable,function  2 (object)  not nullable
optimus,⋅leader
bumblebee,⋅espionage
eject,⋅electronic⋅surveillance


<class 'optimus.engines.pandas.dataframe.PandasDataFrame'>

### Actions

In [99]:
from optimus import Optimus 
op = Optimus("pandas") 
df = op.load.csv("foo.txt", sep=",")
df = df.cols.upper("*")

In [100]:
df.meta["transformations"]

{'actions': [[{'upper': ['name']}], [{'upper': ['function']}]]}

In [101]:
df.profile

name  1 (object)  not nullable,function  2 (object)  not nullable
OPTIMUS,⋅LEADER
BUMBLEBEE,⋅ESPIONAGE
EJECT,⋅ELECTRONIC⋅SURVEILLANCE


<bound method BaseDataFrame.profile of <class 'optimus.engines.pandas.dataframe.PandasDataFrame'>>

In [102]:
df.meta

{'file_name': 'foo.txt',
 'name': 'foo.txt',
 'transformations': {'actions': [[{'upper': ['name']}],
   [{'upper': ['function']}]]}}

## Internals

### Engine

In [103]:
from optimus import Optimus
op = Optimus("dask")

Perhaps you already have a cluster running?
Hosting the HTTP server on port 9679 instead
  http_address["port"], self.http_server.port


In [104]:
op.client

0,1
Client  Scheduler: inproc://192.168.86.249/12148/10  Dashboard: http://192.168.86.249:9679/status,Cluster  Workers: 1  Cores: 8  Memory: 4.00 GB


### Dataframe

In [11]:

from optimus import Optimus
op = Optimus("pandas")
df = op.load.csv("foo.txt", sep=",")

In [12]:
df.save.csv("as")

In [7]:
df.data
type(df.data)

pandas.core.frame.DataFrame

In [127]:
import pandas as pd
type(pd.DataFrame({"A":["A",2,3]})["A"].str.lower())

pandas.core.series.Series

In [128]:
from optimus import Optimus
op = Optimus("pandas")
type(op.create.dataframe({"A":["A",2,3]}).cols.lower().data)

pandas.core.frame.DataFrame

In [18]:
engine = "ibis"
op = Optimus(engine)

In [19]:
op.client

<module 'ibis' from 'C:\\Users\\argenisleon\\Anaconda3\\lib\\site-packages\\ibis\\__init__.py'>

In [None]:
## Diagnostic

In [5]:
from optimus import Optimus
op = Optimus("spark")

In [6]:
op.client

In [16]:
op.load.csv("foo.txt")

name  1 (string)  not nullable,function  2 (string)  not nullable
Optimus,⋅leader
Bumblebee,⋅espionage
eject,⋅ELECTRONIC⋅SURVEILLANCE


<class 'optimus.engines.spark.dataframe.SparkDataFrame'>

In [11]:
df.cols._names()

NameError: name 'df' is not defined

In [12]:
list(df.data.columns)

NameError: name 'df' is not defined