# Chapter One - Data Processing with Optimus 

## Hi Optimus! 

### One API to rule them all 

Some examples of how datasets are created without using Optimus:

#### pandas

In [None]:
import pandas as pd 
import numpy as np 

# Build a one-column pandas dataframe and replace column "A" with its
# element-wise square root; the bare name on the last line displays it.
data = {"A": list(range(6))}
pdf = pd.DataFrame(data).assign(A=lambda frame: np.sqrt(frame["A"]))
pdf

#### cuDF

In [None]:
# cuDF version of the same example. cuDF is optional, so any failure while
# importing or using it falls back to an explanatory message instead of
# stopping the notebook.
try:
    import cudf

    # You need to use dtype="float" to return floats
    cdf = cudf.DataFrame(data, dtype="float")
    cdf["A"] = cdf["A"].sqrt()
except Exception:
    # `except Exception` (not a bare `except:`) so that KeyboardInterrupt and
    # SystemExit still propagate when the user interrupts the cell.
    cdf = "This could not run in your machine if "\
          "you don't have cuDF or any compatible "\
          "drivers installed."
cdf

#### dask

In [None]:
from dask import dataframe as dd
import dask.array as da

# A dataframe cannot be created directly from a dict here, so the Dask
# dataframe is built from the existing pandas dataframe.
ddf = dd.from_pandas(pdf, npartitions=1)
ddf["A"] = da.sqrt(ddf["A"])

# You need to use compute() to materialize the result as a pandas dataframe.
ddf.compute()

#### dask cuDF

In [None]:
# Dask-cuDF version of the example. Dask-cuDF is optional, so any failure
# while importing or using it falls back to an explanatory message instead of
# stopping the notebook.
try:
    import dask_cudf

    # Split the cuDF dataframe `cdf` into two partitions.
    dcdf = dask_cudf.from_cudf(cdf, npartitions=2)
    # NOTE(review): the computed result is not assigned to anything, so the
    # `dcdf` displayed below is the dataframe created by from_cudf.
    dcdf.map_partitions(cudf.sqrt).compute()
except Exception:
    # `except Exception` (not a bare `except:`) so that KeyboardInterrupt and
    # SystemExit still propagate when the user interrupts the cell.
    dcdf = "This could not run in your machine if "\
           "you don't have Dask-cuDF or any compatible "\
           "drivers installed."
dcdf

#### vaex

In [None]:
import vaex 
import numpy as np 
  
# Same example with vaex: build the dataframe from a dict, then replace
# column "A" with its square root and display the result.
data = {"A": list(range(6))}
vdf = vaex.from_dict(data)
vdf["A"] = np.sqrt(vdf["A"])
vdf

## Using Optimus 

Instantiating Optimus

In [None]:
from optimus import Optimus 
# Create the Optimus entry point backed by the pandas engine.
op = Optimus(engine="pandas")

Using a Dask remote cluster

In [None]:
from dask.distributed import Client
# The address below is a placeholder; replace it with the address of your
# own Dask cluster.
client = Client("127.0.0.105")
# Hand the existing client to Optimus as the session for the dask engine.
op = Optimus(engine="dask", session=client)

Using Dask locally with a different number of workers

In [None]:
# No session is passed here; n_workers requests two local Dask workers.
op = Optimus(engine="dask", n_workers=2)

Loading a file

In [None]:
# Load a CSV file with the CSV-specific loader. The path is a placeholder and
# the result is not assigned, so it is only displayed as the cell output.
op.load.csv("path/to/file.csv")

In [None]:
# load.file is given the same placeholder path without naming a format --
# presumably it works out the file type itself; confirm in the Optimus docs.
df = op.load.file("path/to/file.csv")
# Display the loaded dataframe.
df

Connecting to a database or an external bucket and loading a file

In [None]:
# db_args is a placeholder for your database connection arguments; it is not
# defined in the cells shown here, so define it before running this cell.
db = op.connect.database(*db_args)
# Load a table through the connection created above ("table name" is a
# placeholder).
op.load.database_table("table name", connection=db)

In [None]:
# s3_args is a placeholder for your S3 connection arguments; it is not
# defined in the cells shown here, so define it before running this cell.
conn = op.connect.s3(*s3_args)
# Load a file through the connection; the path is written relative to it.
op.load.file("relative/path/to/file.csv", connection=conn)

Saving a file

In [None]:
# Write the dataframe to a CSV file (placeholder path).
df.save.csv("path/to/output-file.csv")

In [None]:
# Write the dataframe to a database table using the `db` connection object.
# NOTE(review): the load call passes the connection as `connection=`, while
# this call uses `db=` -- confirm the keyword against the Optimus API.
df.save.database_table("table_name", db=db)

In [None]:
# Same CSV save, but routed through the `conn` connection object; the path is
# written relative to it.
df.save.csv("relative/path/to/output-file.csv", connection=conn)

## The Optimus Dataframe

In [None]:
# Rename column "function" to "job". The result is not assigned here, so it
# is only shown as the cell output.
df.cols.rename("function", "job")

In [None]:
# Same rename, but assigned back to `df` so later cells see the "job" column.
df = df.cols.rename("function", "job")
df

In [None]:
# Chain two column operations: upper-case "name", then lower-case "job".
df = (
    df.cols.upper("name")
    .cols.lower("job")
)
df

In [None]:
# Drop the "name" column; the result is not assigned back to `df`.
df.cols.drop("name")

In [None]:
# Drop the rows whose "name" equals "MEGATRON" (upper-case, matching the
# earlier upper("name") call); the result is not assigned back to `df`.
df.rows.drop(df["name"]=="MEGATRON")

In [None]:
# Explicitly render the dataframe.
df.display()

In [None]:
# Column "A" deliberately mixes strings ("1", "4", "!"), an integer (2) and
# a missing value (None).
dfn = op.create.dataframe({"A":["1",2,"4","!",None]})

In [None]:
# Show the minimum and maximum of the mixed-type column "A" as a tuple.
dfn.cols.min("A"), dfn.cols.max("A")

In [None]:
# Capitalize "name"; output_cols presumably sends the result to a column
# called "cap_name" instead of overwriting "name" -- confirm in the Optimus docs.
df.cols.capitalize("name", output_cols="cap_name")

In [None]:
# Rebind `df` to a small two-column dataframe: "A" mixes strings, an integer
# and None; "B" holds three names followed by two None values.
df = op.create.dataframe({
    "A":["1",2,"4","!",None],
    "B":["Optimus","Bumblebee", "Eject", None, None]
})
# Profile the dataframe, passing bins=10.
df.profile(bins=10)

In [None]:
# Request a sample of the columns; "*" presumably selects every column.
df.columns_sample("*")

In [None]:
# NOTE(review): presumably forces any pending operations on `df` to run --
# confirm against the Optimus documentation.
df.execute()

In [None]:
# Recreate the pandas-backed Optimus instance and load a comma-separated file.
op = Optimus("pandas")
df = op.load.csv("foo.txt", sep=",")
# Show the type of the object stored in df.data (presumably the engine's own
# dataframe that the Optimus dataframe wraps).
type(df.data)

## Meta

In [None]:
from optimus import Optimus
# Load the same comma-separated file with the pandas engine.
op = Optimus("pandas")
df = op.load.csv("foo.txt", sep=",")
# Display the metadata attached to the dataframe.
df.meta

### Actions

In [None]:
from optimus import Optimus
# Load the same comma-separated file with the pandas engine.
op = Optimus("pandas")
df = op.load.csv("foo.txt", sep=",")
# Apply upper() with the "*" selector (presumably every column) and keep the
# result, so the next cell can inspect the metadata it leaves behind.
df = df.cols.upper("*")

In [None]:
# Look up the "transformations" entry of the dataframe's metadata.
df.meta["transformations"]