# Connect to Dremio with R (use this as a template)

In [None]:
################################################
#              Connect to Dremio               #
################################################
# Don't edit this

# Install each package on first use, then attach it. library() stops with an
# error if the package is still unavailable, whereas require() would only
# return FALSE and let the script fail later with a confusing message.
if (!requireNamespace("odbc", quietly = TRUE)) {
  install.packages("odbc")
}
library(odbc)
if (!requireNamespace("getPass", quietly = TRUE)) {
  install.packages("getPass")
}
library(getPass)
library(DBI)

dremio_host <- "dremio-client.dremio.svc.cluster.local"
dremio_port <- 31010

# Prefer the ODBC driver named by the DREMIO_DRIVER environment variable and
# fall back to the stock driver name when the variable is unset or empty.
dremio_driver <- Sys.getenv("DREMIO_DRIVER")
if (!nzchar(dremio_driver)) {
  dremio_driver <- "Dremio ODBC Driver 64-bit"
}

# Credentials are prompted for inline so they are never stored in a variable.
cnxn <- DBI::dbConnect(
  odbc::odbc(),
  driver = dremio_driver,
  uid = getPass::getPass(prompt = "Dremio Username: "),
  pwd = getPass::getPass(prompt = "Dremio Password: "),
  host = dremio_host,
  port = dremio_port,
  AuthenticationType = "Basic Authentication",
  ConnectionType = "Direct"
)

print("Connected.")

## An overview of what's available

In [None]:
# Show a sample of the catalogs visible to this connection.
print("Catalogs:")
sql <- "SELECT * FROM INFORMATION_SCHEMA.CATALOGS LIMIT 5"
request <- dbSendQuery(cnxn, sql)
df <- dbFetch(request, n = 100)
# Release the result set; an open result blocks or warns on the next query.
dbClearResult(request)
df

In [None]:
# List the tables reachable through the connection opened above (`cnxn`).
print("Tables")
dbListTables(cnxn)

In [None]:
# Show a sample of the column metadata visible to this connection.
print("Columns:")
sql <- "SELECT * FROM INFORMATION_SCHEMA.COLUMNS LIMIT 5"
request <- dbSendQuery(cnxn, sql)
df <- dbFetch(request, n = 100)
# Release the result set; an open result blocks or warns on the next query.
dbClearResult(request)
df

In [None]:
# List the field names of the shared CSV dataset.
dbListFields(
  conn = cnxn,
  name = "dremiosharedstorage.shared.\"12100121.csv\""
)

In [None]:
# If you want to close the connection
# dbDisconnect(cnxn)

In [None]:
# For more commands, see the SQL Reference
# https://docs.dremio.com/sql-reference/

################################################
#           End of Connect to Dremio           #
################################################

# Get started with your analysis!

In [None]:
# Pull the whole shared CSV dataset into a data frame.
sql <- "SELECT * FROM dremiosharedstorage.shared.\"12100121.csv\""
request <- dbSendQuery(cnxn, sql)
df <- dbFetch(request)
# Release the result set now that every row has been fetched.
dbClearResult(request)
df

In [None]:
# The upstream data arrives without column labels: the first row holds them.
# Promote that row to the column names, then drop it from the data.
header_row <- df[1, ]
names(df) <- header_row
df <- df[-1, ]

In [None]:
# Show the column labels of the data frame.
colnames(df)

In [None]:
library(dplyr)

In [None]:
# Keep only the VALUE column and convert it from string to number.
values <- as.data.frame(lapply(select(df, VALUE), as.numeric))

In [None]:
# Mean of VALUE. Entries that could not be parsed as numbers are NA after the
# string-to-number conversion, so skip them rather than returning NA.
mean(values$VALUE, na.rm = TRUE)

In [None]:
# Standard deviation of VALUE. Entries that could not be parsed as numbers are
# NA after the string-to-number conversion, so skip them rather than returning NA.
sd(values$VALUE, na.rm = TRUE)

In [None]:
# Distribution of every VALUE entry, split into 100 bins.
v <- values[["VALUE"]]
# Earlier attempt, kept for reference:
# hist(c(values$VALUE), "Values", breaks = 20)
hist(
  v,
  breaks = 100,
  main = "legend"
)

In [None]:
# Same distribution with outliers excluded: only values below 4000 are plotted.
hist(
  v[v < 4000],
  breaks = 60,
  xlim = c(-50, 4000),
  main = "legend"
)