# Chapter Two - Data Loading, Saving and File Formats

In [None]:
from optimus import Optimus
# Create the Optimus entry point used by the cells below; the string argument
# names the backing engine (pandas here, "dask" is used later on).
op = Optimus("pandas")

## Loading a file

In [None]:
# Generic loader: only a path is given (a .json file in this example), no
# format-specific method is named.
# NOTE(review): presumably the format is detected from the file -- confirm in the Optimus docs.
df = op.load.file("my_file.json")
# Render the loaded dataframe in the notebook output.
display(df)

Fixed number of rows

In [None]:
# n_rows=5 limits the load to 5 rows instead of reading the whole CSV file.
df = op.load.csv("data/file.csv", n_rows=5) 
display(df)

Without a header

In [None]:
# header=None declares that the CSV file has no header row.
# NOTE(review): how the resulting columns are named is not shown here -- check the output.
df = op.load.csv("data/file.csv", header=None) 
display(df)

Specifying which value is treated as null

In [None]:
# null_value="Null" marks the literal text "Null" in the file as the null value.
df = op.load.csv("data/file.csv", null_value="Null")
display(df)

Using a specific separator

In [None]:
# sep=";" sets the semicolon as the field separator for this file.
df = op.load.csv("data/file.csv", sep=";")
display(df)

Wildcards

In [None]:
# "*" wildcard: the path covers every entry inside the csv/ directory.
# NOTE(review): presumably all matching files end up in a single dataframe -- confirm.
df = op.load.csv("csv/*")
display(df)

In [None]:
# Narrower pattern: only names in csv/ that start with "file-" and end with ".csv".
df = op.load.csv("csv/file-*.csv") 
display(df)

In [None]:
# Assuming glob-style matching, "?" stands for exactly one character, so this
# would match "file-1.csv" but not "file-10.csv" -- TODO confirm.
df = op.load.csv("csv/file-?.csv")
display(df)

In [None]:
# Assuming glob-style matching: "file-", any text, one digit ([0-9]), a dot,
# any text, then ".csv" (e.g. "file-a1.part.csv") -- TODO confirm.
df = op.load.csv("csv/file-*[0-9].*.csv")
display(df)

Loading large files

In [None]:
# Rebind `op` to a dask-backed Optimus instance; cells that run after this one
# and use `op` will get the dask instance, not the pandas one created earlier.
op = Optimus("dask") 
# The file is addressed directly by its s3:// URL; the result is not displayed here.
df = op.load.csv("s3://my-storage/massive-file.csv")

From a remote connection

In [None]:
# Build a reusable S3 connection object; `conn` is also used by later cells.
conn = op.connect.s3(endpoint_url="s3://my-storage/") 
# Pass the connection with conn=...; the path is then given without the s3:// prefix.
# NOTE(review): presumably the path is resolved relative to the connection's endpoint -- confirm.
df = op.load.csv("files/foo_file.csv", conn=conn) 

In [None]:
# Reuses `conn` from the previous cell with the generic loader (an .xml file here);
# the result goes into `df2`, so `df` is left untouched.
df2 = op.load.file("files/file_bar.xml", conn=conn) 

From a database

In [None]:
# Open a MySQL connection; only host and database are passed in this example.
db = op.connect.mysql(host="localhost", database="my_database")
# Load the table "foo_table" through that connection (passed as conn=...).
df = op.load.database_table("foo_table", conn=db)

### Memory usage

In [None]:
# Use a dask-backed instance for this example (re-created so the cell works on its own).
op = Optimus("dask") 
# Two integer columns; each 5-element list is repeated 10000 times, giving 50000 rows.
df = op.create.dataframe({ 
    "a": [1000,2000,3000,4000,5000]*10000, 
    "b": [1,2,3,4,5]*10000 
}) 
# Last expression of the cell, so the notebook shows the value returned by size().
# NOTE(review): presumably the memory footprint of the dataframe -- confirm the unit.
df.size() 

In [None]:
# optimize() returns a dataframe, which replaces `df`.
df = df.optimize()
# Call size() again so the value can be compared with the one from the previous cell.
df.size() 

## Creating a dataframe from scratch

In [None]:
# Build a dataframe from a dict: keys are the column names, lists hold the values
# (two text columns, three rows). The third "function" value starts with a space
# in the source data; it is kept exactly as written.
df = op.create.dataframe({ 
    "name":["OPTIMUS", "BUMBLEBEE", "EJECT"], 
    "function":["Leader", "Espionage", " Electronic Surveillance"] 
})
# Bare expression at the end of the cell: the notebook renders the dataframe.
# This `df` is the one the saving cells below operate on.
df

## Saving a dataframe

To a local file

In [None]:
# Write `df` to a local CSV file, using ";" as the field separator.
df.save.csv("foo_output.csv", sep=";")

To a remote connection

In [None]:
# Write `df` as XML through `conn`, the S3 connection created in the
# "From a remote connection" cell (that cell must have been run first).
df.save.xml("files/foo_output.xml", conn=conn) 

To a database table

In [None]:
# Write `df` to the table "foo_output_table" using `db`, the MySQL connection
# created in the "From a database" cell.
# NOTE(review): the connection is passed as db=... here, while the loading cell
# passes it as conn=... -- confirm the keyword against the Optimus save API.
df.save.database_table("foo_output_table", db=db) 

## Repartition

In [None]:
# repartition() returns a dataframe, which replaces `df`.
# NOTE(review): the argument 2 is presumably the target number of partitions -- confirm.
df = df.repartition(2)