# Docker Container

In [1]:
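# Run on the host shell (not inside this notebook) to start the Jupyter container; the bind mount exposes the host folder as /app/data:
# docker run -it --name sparkr-jupy --mount type=bind,source=C:/Users/soumy/OneDrive/Coding,target=/app/data --rm -p 8888:8888 quay.io/jupyter/all-spark-notebook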

# Library

In [2]:
library(SparkR)


Attaching package: ‘SparkR’


The following objects are masked from ‘package:stats’:

    cov, filter, lag, na.omit, predict, sd, var, window


The following objects are masked from ‘package:base’:

    as.data.frame, colnames, colnames<-, drop, endsWith, intersect,
    rank, rbind, sample, startsWith, subset, summary, transform, union




# Creating Spark session

In [3]:
# Fall back to a default Spark location only when the environment has not
# already defined SPARK_HOME.
if (nchar(Sys.getenv("SPARK_HOME")) < 1) {
  Sys.setenv(SPARK_HOME = "/home/spark")
}

# Load SparkR from the library bundled with that Spark installation.
library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))

# Start the session exactly once, with the intended master and driver memory.
# Calling a bare sparkR.session() before this point creates the session with
# default settings, after which this configured call just returns the existing
# session and the master / spark.driver.memory settings are not applied.
sparkR.session(master = "local[*]",
               sparkConfig = list(spark.driver.memory = "2g"))

“SparkR is deprecated from Apache Spark 4.0.0 and will be removed in a future version.”
Spark package found in SPARK_HOME: /usr/local/spark



Launching java with spark-submit command /usr/local/spark/bin/spark-submit   sparkr-shell /tmp/RtmpDVW8AT/backend_portc746b55fc4c 


Java ref type org.apache.spark.sql.SparkSession id 1 

“SparkR is deprecated from Apache Spark 4.0.0 and will be removed in a future version.”


Java ref type org.apache.spark.sql.SparkSession id 1 

# Read Data

In [4]:
# Load the stock-price CSV from the bind-mounted data directory.
# The first row holds the column names, and column types are inferred by Spark.
df_raw <- read.df(
  "/app/data/R/SparkR_test/SparkR_test/all_stocks_5yr.csv",
  source = "csv",
  header = "true",
  inferSchema = "true"
)

# Spread the rows across 10 partitions before any further processing.
df <- repartition(df_raw, numPartitions = 10)

# Preview the first five rows.
showDF(df, numRows = 5)

+----------+-------+-------+-------+-------+--------+----+
|      date|   open|   high|    low|  close|  volume|Name|
+----------+-------+-------+-------+-------+--------+----+
|2014-05-05|   74.5|  74.66|  73.76|  74.51| 4348608| BAX|
|2016-05-26| 212.66|214.115| 210.23| 212.05|  422492| ADS|
|2015-06-17| 139.75|140.148| 139.08| 139.76|  650779| BDX|
|2015-09-21| 135.33| 136.12| 133.86| 134.87| 1059194| APD|
|2013-11-19|74.1471|74.7685|73.9956|74.2214|52234707|AAPL|
+----------+-------+-------+-------+-------+--------+----+
only showing top 5 rows


# Select columns

In [5]:
# Keep only the "date" and "open" columns of the DataFrame.
temp <- select(df, "date", "open")

# Preview the first five rows of the projection.
showDF(temp, numRows = 5)

+----------+------+
|      date|  open|
+----------+------+
|2016-12-22|123.72|
|2013-12-02| 23.55|
|2014-08-06|149.21|
|2017-07-26| 129.0|
|2016-08-16| 72.63|
+----------+------+
only showing top 5 rows


# Select rows

In [6]:
# Restrict the DataFrame to three rows; the result is still a SparkDataFrame.
temp <- limit(df, num = 3)

# Print the limited DataFrame.
showDF(x = temp)

+----------+------+-------+------+------+-------+----+
|      date|  open|   high|   low| close| volume|Name|
+----------+------+-------+------+------+-------+----+
|2014-05-05|  74.5|  74.66| 73.76| 74.51|4348608| BAX|
|2016-05-26|212.66|214.115|210.23|212.05| 422492| ADS|
|2015-06-17|139.75|140.148|139.08|139.76| 650779| BDX|
+----------+------+-------+------+------+-------+----+


# Filter Data

In [7]:
# Keep rows where the opening price is above 100 while the low is below 100.
temp <- where(
  df,
  df$open > 100 & df$low < 100
)

# Preview the first five matching rows.
showDF(temp, numRows = 5)

+----------+-------+------+-------+------+-------+----+
|      date|   open|  high|    low| close| volume|Name|
+----------+-------+------+-------+------+-------+----+
|2016-03-22| 100.08|100.37| 99.455| 99.93|1435782| AMT|
|2014-12-10| 100.22| 101.1|  99.97|100.17|1826854| AMT|
|2014-12-08|  104.0|104.17|99.6556|100.71|3484193|AVGO|
|2017-11-02| 100.96|102.45|  99.27|100.21| 630470| AIZ|
|2017-12-21|100.505|101.45|  99.82|101.12| 393034| AIZ|
+----------+-------+------+-------+------+-------+----+
only showing top 5 rows


# Schema

In [8]:
# Display the structure of df: each column's name, Spark type and nullability.
schema(df)

StructType
|-name = "date", type = "DateType", nullable = TRUE
|-name = "open", type = "DoubleType", nullable = TRUE
|-name = "high", type = "DoubleType", nullable = TRUE
|-name = "low", type = "DoubleType", nullable = TRUE
|-name = "close", type = "DoubleType", nullable = TRUE
|-name = "volume", type = "IntegerType", nullable = TRUE
|-name = "Name", type = "StringType", nullable = TRUE

# Pivot Longer

In [None]:
#' Reshape a SparkDataFrame from wide to long format.
#'
#' Every column of `df` that is not listed in `id_cols` is unpivoted into a
#' name/value pair of columns, while the identifier columns are kept as-is.
#'
#' @param df SparkDataFrame to reshape.
#' @param id_cols Character vector of column names to keep as identifiers.
#' @param name_col Name of the output column that receives the original
#'   column names.
#' @param value_col Name of the output column that receives the values.
#' @return The SparkDataFrame produced by `unpivot()`.
#' @details Stops with an R error if any of `id_cols` is not a column of `df`,
#'   or if no columns remain to be pivoted.
pivot_longer_sparkr <- function(df, id_cols, name_col, value_col) {
  # SparkR is attached once at the top of the notebook, so it is not
  # re-attached here.
  all_cols <- colnames(df)

  # Fail early with a readable message instead of a Spark analysis error.
  missing_ids <- setdiff(id_cols, all_cols)
  if (length(missing_ids) > 0) {
    stop("id_cols not found in df: ", paste(missing_ids, collapse = ", "),
         call. = FALSE)
  }

  # Everything that is not an identifier column gets pivoted.
  cols_to_longer <- setdiff(all_cols, id_cols)
  if (length(cols_to_longer) == 0) {
    stop("No columns left to pivot: id_cols covers every column of df",
         call. = FALSE)
  }

  # unpivot() turns the selected columns into (name_col, value_col) rows.
  unpivot(
    df,
    ids = id_cols,
    values = cols_to_longer,
    variableColumnName = name_col,
    valueColumnName = value_col
  )
}

# Reshape df to long format: "date" and "Name" stay as identifiers, and every
# other column becomes a (Metric, Values) row.
temp <- pivot_longer_sparkr(
  df,
  id_cols = c("date", "Name"),
  name_col = "Metric",
  value_col = "Values"
)

# Preview the first five rows of the long-format result.
showDF(temp, numRows = 5)

+----------+----+------+---------+
|      date|Name|Metric|   Values|
+----------+----+------+---------+
|2014-05-05| BAX|  open|     74.5|
|2014-05-05| BAX|  high|    74.66|
|2014-05-05| BAX|   low|    73.76|
|2014-05-05| BAX| close|    74.51|
|2014-05-05| BAX|volume|4348608.0|
+----------+----+------+---------+
only showing top 5 rows
