# Exponential Smoothing: SystemDS

Notes & Scripts
___
___

In [None]:
export JAVA_HOME=`/usr/libexec/java_home -v 11`
java -version

In [None]:
export SPARK_HOME="/opt/homebrew/opt/spark"

-exec option, must be one of [hadoop, singlenode, hybrid, HYBRID, spark]

In [None]:
-stats

___
___
## Dataset: wind_speed

In [3]:
import numpy as np 

# Get the data in a numpy array
data = np.genfromtxt('/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/wind_turbine_scada.csv', delimiter=',', skip_header=1)

# Slice the data to only get the wind speed column
wind_speed = data[:, 2]
wind_speed = wind_speed[~np.isnan(wind_speed)]

# Replication factor
factor = 500
base_wind_speed = wind_speed

# Concatenate the wind_speed array 'factor' times
for i in range(factor - 1):
    wind_speed = np.concatenate((wind_speed, base_wind_speed))

# Save the wind_speed array to a CSV file
np.savetxt('/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/prepocessed_datafiles/wind_speed.csv', wind_speed, delimiter=',')

print(f"Saved dataframe wind_speed has {wind_speed.size} rows.")

Saved dataframe wind_speed has 25265000 rows.


___
### #1

In [None]:
# Read the CSV file into a frame
wind_speed = read("/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/prepocessed_datafiles/wind_speed.csv", format="csv", header=TRUE, sep=",")

# Print the size of the data
print("Dataframe wind_speed has " + toString(nrow(wind_speed)) + " rows.")

# Parameters
alpha = 0.7

# Initialize smoothed_value
smoothed_value = wind_speed[1]

# Start timing
start_time = time()

# Perform exponential smoothing on the data
for (i in 2:nrow(wind_speed)) {
  smoothed_value = alpha * wind_speed[i] + (1 - alpha) * smoothed_value
}

# End timing and calculate execution time
end_time = time()
function_time = (end_time - start_time) / 1000000000 # Convert nanoseconds to seconds

# Print the results
print('### Dataset wind_speed')
print('### #1 Basic For Loop \n')
print('The last smoothed value for wind_speed is: ' + toString(smoothed_value))
print('The function was executed in ' + toString(function_time) + ' seconds')

In [None]:
# Read the CSV file into a frame
wind_speed = read("/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/prepocessed_datafiles/wind_speed.csv", format="csv", header=TRUE, sep=",", rows=25264999, cols=1)

# Print the size of the data
print("Dataframe wind_speed has " + toString(nrow(wind_speed)) + " rows.")

# Parameters
alpha = 0.7

# Vector to store execution times
number_of_executions = 10
execution_times = numeric(number_of_executions)

# Initialize smoothed_value
smoothed_value = wind_speed[1]

# Perform exponential smoothing on the data
for (i in 1:number_of_executions) {
    start_time <- time()

    # Perform exponential smoothing on the data
    for (i in 2:nrow(wind_speed)) {
    smoothed_value = alpha * wind_speed[i] + (1 - alpha) * smoothed_value
    }

    # End timing and calculate execution time
    end_time = time()
    function_time = (end_time - start_time) / 1000000000 # Convert nanoseconds to seconds

    # Stop the timer
    end_time <- Sys.time()
    execution_times[i] <- as.numeric(difftime(end_time, start_time, units = "secs"))
}

# Calculate the function time
function_time <- median(execution_times)

# Print the results
print('### Dataset wind_speed')
print('### #1 Basic For Loop \n')
print('The last smoothed value for wind_speed is: ' + toString(smoothed_value))
print('The function was executed in ' + toString(function_time) + ' seconds')

In [None]:
java -Xmx7g -Xms7g -cp "./lib/*:/Users/niklas/Documents/GitHub/systemds/target/SystemDS.jar" \
     org.apache.sysds.api.DMLScript \
     -f /Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/systemDS_Scripts/wind_speed/windspeed_experiment3.dml -exec singlenode

In [None]:
# New Run without -stats
Dataframe wind_speed has 25264999 rows.
1133.814014792
1133.425738708
### Dataset wind_speed
### #1 Basic For Loop 

The last smoothed value for wind_speed is: 9,729

The function was executed in 1133.61987675 seconds

In [None]:
Dataframe wind_speed has 25264999 rows.

### Dataset wind_speed
### #1 Basic For Loop 

The last smoothed value for wind_speed is: 9,729

The function was executed in 1113.519333333 seconds     # 18 minutes

SystemDS Statistics:
Total elapsed time:		1127,025 sec.
Total compilation time:		0,315 sec.
Total execution time:		1126,710 sec.
Cache hits (Mem/Li/WB/FS/HDFS):	101059994/0/0/0/1.
Cache writes (Li/WB/FS/HDFS):	0/0/0/0.
Cache times (ACQr/m, RLS, EXP):	16,096/1,618/3,913/0,000 sec.
HOP DAGs recompiled (PRED, SB):	0/25264999.
HOP DAGs recompile time:	1010,638 sec.
Total JIT compile time:		6.899 sec.
Total JVM GC count:		267.
Total JVM GC time:		1.573 sec.
Heavy hitter instructions:
  #  Instruction  Time(s)     Count
  1  createvar     25,641  75794996
  2  +*            24,696  25264998
  3  rightIndex    17,697  25264999
  4  nrow          13,110         2
  5  *             12,769  25264998
  6  rmvar          7,397  50530010
  7  mvvar          4,962  25265003
  8  toString       0,068         1
  9  -              0,007         2
 10  +              0,004         7

___
### #2

In [None]:
# Read the CSV file into a frame
wind_speed = read("/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/prepocessed_datafiles/wind_speed.csv", format="csv", header=TRUE, sep=",")

# Extract the wind speed column
wind_speed = data

# Print the size of the data
print("Dataframe wind_speed has " + toString(nrow(wind_speed)) + " rows.")

# Parameters
alpha = 0.7

# Start timing
start_time = time()

# Vectorized exponential smoothing
n = nrow(wind_speed)
weights = rev(alpha * (1 - alpha) ^ seq(0, n-1, 1))
smoothed = rev(cumsum(weights * wind_speed))
smoothed_value = smoothed[1] / sum(weights)

# End timing and calculate execution time
end_time = time()
function_time = (end_time - start_time) / 1000000000 # Convert nanoseconds to seconds

# Print the results
print('### Dataset wind_speed')
print('### #2 Vectorized \n')
print('The last smoothed value for wind_speed is: ' + toString(smoothed_value))
print('The function was executed in ' + toString(function_time) + ' seconds')


In [None]:
# Read the CSV file into a frame
wind_speed = read("/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/prepocessed_datafiles/wind_speed.csv", format="csv", header=TRUE, sep=",", rows=25264999, cols=1)

# Print the size of the data
print("Dataframe wind_speed has " + toString(nrow(wind_speed)) + " rows.")

# Parameters
alpha = 0.7

# Vector to store execution times
number_of_executions = 10
execution_times = matrix(0, number_of_executions, 1)

# Initialize smoothed_value
smoothed_value = wind_speed[1]

# Perform exponential smoothing on the data
for (i in 1:number_of_executions) {
    start_time = time()

    # Vectorized exponential smoothing
    n = nrow(wind_speed)
    weights = rev(alpha * (1 - alpha) ^ seq(0, n-1, 1))
    smoothed = rev(cumsum(weights * wind_speed))
    smoothed_value = smoothed[1] / sum(weights)

    # End timing and calculate execution time
    end_time = time()
    function_time = (end_time - start_time) / 1000000000 # Convert nanoseconds to seconds
    print(function_time)
    execution_times[i, 1] = function_time
}

# Calculate the function time
function_time <- avg(execution_times)

# Print the results
print('### Dataset wind_speed')
print('### #2 Vectorized \n')
print('The last smoothed value for wind_speed is: ' + toString(smoothed_value))
print('The function was executed in ' + toString(function_time) + ' seconds')

In [None]:
java -Xmx7g -Xms7g -cp "./lib/*:/Users/niklas/Documents/GitHub/systemds/target/SystemDS.jar" \
     org.apache.sysds.api.DMLScript \
     -f /Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/systemDS_Scripts/wind_speed/windspeed_experiment4.dml -exec singlenode

In [None]:
# New Run without -stats

Dataframe wind_speed has 25264999 rows.
3.93455125
3.384092208
3.316342208
3.485522208
3.425112125
3.322466666
3.326216833
3.384802458
3.427035
3.327018541

### Dataset wind_speed
### #2 Vectorized 

The last smoothed value for wind_speed is: 9,729

The function was executed in 3.4333159497 seconds

In [None]:
Dataframe wind_speed has 25264999 rows.

### Dataset wind_speed
### #2 Vectorized 

The last smoothed value for wind_speed is: 9,729

The function was executed in 3.79856075 seconds

SystemDS Statistics:
Total elapsed time:		13,974 sec.
Total compilation time:		0,266 sec.
Total execution time:		13,709 sec.
Cache hits (Mem/Li/WB/FS/HDFS):	11/0/0/0/1.
Cache writes (Li/WB/FS/HDFS):	1/6/0/0.
Cache times (ACQr/m, RLS, EXP):	9,898/0,000/0,002/0,000 sec.
HOP DAGs recompiled (PRED, SB):	0/2.
HOP DAGs recompile time:	0,009 sec.
Total JIT compile time:		1.706 sec.
Total JVM GC count:		21.
Total JVM GC time:		0.159 sec.
Heavy hitter instructions:
  #  Instruction  Time(s)  Count
  1  nrow           9,902      1
  2  ^              2,830      1
  3  *              0,421      2
  4  ucumk+         0,184      1
  5  seq            0,133      1
  6  rev            0,118      2
  7  uak+           0,099      1
  8  -              0,007      3
  9  toString       0,004      1
 10  rmvar          0,003     22

___
### #3

In [None]:
java -Xmx7g -Xms7g -cp "./lib/*:/Users/niklas/Documents/GitHub/systemds/target/SystemDS.jar" \
     org.apache.sysds.api.DMLScript \
     -f /Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/systemDS_Scripts/wind_speed/windspeed_experiment3.dml 

In [None]:
# New Run without -stats

Dataframe wind_speed has 25264999 rows.

51.312511375
53.538804916

### Dataset wind_speed
### #1 Basic For Loop 

The last smoothed value for wind_speed is: 9,729

The function was executed in 52.425658145499995 seconds
SystemDS Statistics:
Total execution time:           117,079 sec.
Number of executed Spark inst:  0.

In [None]:
Dataframe wind_speed has 25264999 rows.

### Dataset wind_speed
### #1 Basic For Loop 

The last smoothed value for wind_speed is: 9,729

The function was executed in 1130.060546416 seconds     # 19 minutes

SystemDS Statistics:
Total elapsed time:		1139,188 sec.
Total compilation time:		0,408 sec.
Total execution time:		1138,779 sec.
Number of compiled Spark inst:	8.
Number of executed Spark inst:	0.
Cache hits (Mem/Li/WB/FS/HDFS):	101059994/0/0/0/1.
Cache writes (Li/WB/FS/HDFS):	0/1/0/0.
Cache times (ACQr/m, RLS, EXP):	12,929/2,179/5,395/0,000 sec.
HOP DAGs recompiled (PRED, SB):	1/25265002.
HOP DAGs recompile time:	1024,849 sec.
Spark ctx create time (lazy):	0,000 sec.
Spark trans counts (par,bc,col):0/0/0.
Spark trans times (par,bc,col):	0,000/0,000/0,000 secs.
Total JIT compile time:		6.586 sec.
Total JVM GC count:		278.
Total JVM GC time:		1.4 sec.
Heavy hitter instructions:
  #  Instruction  Time(s)     Count
  1  createvar     25,370  75794998
  2  +*            25,244  25264998
  3  rightIndex    18,479  25264999
  4  *             12,859  25264998
  5  sp_csvrblk     8,638         1
  6  rmvar          7,475  50530002
  7  mvvar          6,140  25265011
  8  toString       0,015         1
  9  -              0,008         2
 10  time           0,002         2


___
### #4

In [None]:
java -Xmx7g -Xms7g -cp "./lib/*:/Users/niklas/Documents/GitHub/systemds/target/SystemDS.jar" \
     org.apache.sysds.api.DMLScript \
     -f /Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/systemDS_Scripts/wind_speed/windspeed_experiment4.dml 

In [None]:
# New Run without -stats

Dataframe wind_speed has 25264999 rows.
3.7066065
3.390889375
3.371399875
3.28144475
3.326555458
3.301074333
3.314003417
3.406505208
3.304629166
3.314920708
### Dataset wind_speed
### #2 Vectorized 

The last smoothed value for wind_speed is: 9,729

The function was executed in 3.371802879 seconds
SystemDS Statistics:
Total execution time:           46,782 sec.
Number of executed Spark inst:  0.

In [None]:
Dataframe wind_speed has 25264999 rows.

### Dataset wind_speed
### #2 Vectorized 

The last smoothed value for wind_speed is: 9,729

The function was executed in 3.914272 seconds

SystemDS Statistics:
Total elapsed time:		18,274 sec.
Total compilation time:		1,292 sec.
Total execution time:		16,983 sec.
Number of compiled Spark inst:	12.
Number of executed Spark inst:	0.
Cache hits (Mem/Li/WB/FS/HDFS):	11/0/0/0/1.
Cache writes (Li/WB/FS/HDFS):	1/7/0/0.
Cache times (ACQr/m, RLS, EXP):	13,050/0,000/0,003/0,000 sec.
HOP DAGs recompiled (PRED, SB):	0/3.
HOP DAGs recompile time:	0,011 sec.
Spark ctx create time (lazy):	0,577 sec.
Spark trans counts (par,bc,col):0/0/0.
Spark trans times (par,bc,col):	0,000/0,000/0,000 secs.
Total JIT compile time:		3.915 sec.
Total JVM GC count:		14.
Total JVM GC time:		0.262 sec.
Heavy hitter instructions:
  #  Instruction  Time(s)  Count
  1  sp_csvrblk    13,056      1
  2  ^              2,813      1
  3  *              0,509      2
  4  ucumk+         0,197      1
  5  rev            0,140      2
  6  seq            0,140      1
  7  uak+           0,108      1
  8  -              0,007      3
  9  toString       0,004      1
 10  rmvar          0,001     19


___
___

## Dataset: energy_generation_solar 

In [6]:
import numpy as np 

# Get the data in a numpy array
data = np.genfromtxt('/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/energy_dataset.csv', delimiter=',', skip_header=1)

# Slice the data to only get the energy_dataset column
energy_dataset = data[:, 18]
energy_dataset = energy_dataset[~np.isnan(energy_dataset)]

# Replication factor
factor = 1400
base_energy_dataset = energy_dataset

# Concatenate the wind_speed array 'factor' times
for i in range(factor - 1):
    energy_dataset = np.concatenate((energy_dataset, base_energy_dataset))

# Save the wind_speed array to a CSV file
np.savetxt('/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/prepocessed_datafiles/energy_generation_solar.csv', energy_dataset, delimiter=',')

print(f"Saved dataframe energy_dataset has {energy_dataset.size} rows.")

Saved dataframe energy_dataset has 49064400 rows.


___
### #1

In [None]:
# Read the CSV file into a frame
energy_generation_solar = read("/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/prepocessed_datafiles/energy_generation_solar.csv", format="csv", header=TRUE, sep=",")

# Print the size of the data
print("Dataframe energy_generation_solar has " + toString(nrow(energy_generation_solar)) + " rows.")

# Parameters
alpha = 0.7

# Initialize smoothed_value
smoothed_value = energy_generation_solar[1]

# Start timing
start_time = time()

# Perform exponential smoothing on the data
for (i in 2:nrow(energy_generation_solar)) {
  smoothed_value = alpha * energy_generation_solar[i] + (1 - alpha) * smoothed_value
}

# End timing and calculate execution time
end_time = time()
function_time = (end_time - start_time) / 1000000000 # Convert nanoseconds to seconds

# Print the results
print('### Dataset energy_generation_solar')
print('### #1 Basic For Loop \n')
print('The last smoothed value for energy_generation_solar is: ' + toString(smoothed_value))
print('The function was executed in ' + toString(function_time) + ' seconds')

In [None]:
# Read the CSV file into a frame
energy_generation_solar = read("/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/prepocessed_datafiles/energy_generation_solar.csv", format="csv", header=TRUE, sep=",", rows=49064400, cols=1)

# Print the size of the data
print("Dataframe energy_generation_solar has " + toString(nrow(energy_generation_solar)) + " rows.")

# Parameters
alpha = 0.7

# Initialize smoothed_value
smoothed_value = energy_generation_solar[1]

# Vector to store execution times
number_of_executions = 2
execution_times = matrix(0, number_of_executions, 1)

# Perform exponential smoothing on the data
for (i in 1:number_of_executions) {
    start_time = time()

    # Perform exponential smoothing on the data
    for (j in 2:nrow(energy_generation_solar)) {
        smoothed_value = alpha * energy_generation_solar[j] + (1 - alpha) * smoothed_value
    }

    # End timing and calculate execution time
    end_time = time()
    function_time = (end_time - start_time) / 1000000000 # Convert nanoseconds to seconds
    print(function_time)
    execution_times[i, 1] = function_time
}

# Calculate the function time
function_time = avg(execution_times)

# Print the results
print('### Dataset energy_generation_solar')
print('### #1 Basic For Loop \n')
print('The last smoothed value for wind_speed is: ' + toString(smoothed_value))
print('The function was executed in ' + toString(function_time) + ' seconds')

In [None]:
java -Xmx7g -Xms7g -cp "./lib/*:/Users/niklas/Documents/GitHub/systemds/target/SystemDS.jar" \
     org.apache.sysds.api.DMLScript \
     -f /Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/systemDS_Scripts/energy_generation/energy_experiment3.dml -exec singlenode

In [None]:
# New Run without -stats

Dataframe energy_generation_solar has 49064399 rows.

2221.434682917
2227.429070292

### Dataset energy_generation_solar
### #1 Basic For Loop 

The last smoothed value for wind_speed is: 33,090

The function was executed in 2224.4318766045 seconds

In [None]:
Dataframe energy_generation_solar has 49064399 rows.

### Dataset energy_generation_solar
### #1 Basic For Loop 

The last smoothed value for energy_generation_solar is: 33,090

The function was executed in 2156.596293625 seconds   # 36 minutes
SystemDS Statistics:
Total elapsed time:		2173,938 sec.
Total compilation time:		0,289 sec.
Total execution time:		2173,649 sec.
Cache hits (Mem/Li/WB/FS/HDFS):	196257594/0/0/0/1.
Cache writes (Li/WB/FS/HDFS):	0/0/0/0.
Cache times (ACQr/m, RLS, EXP):	24,023/4,192/8,546/0,000 sec.
HOP DAGs recompiled (PRED, SB):	0/49064399.
HOP DAGs recompile time:	1957,299 sec.
Total JIT compile time:		6.965 sec.
Total JVM GC count:		525.
Total JVM GC time:		2.426 sec.
Heavy hitter instructions:
  #  Instruction  Time(s)      Count
  1  +*            47,093   49064398
  2  createvar     46,813  147193196
  3  rightIndex    32,321   49064399
  4  *             23,832   49064398
  5  rmvar         20,621   98128810
  6  nrow          17,018          2
  7  mvvar          9,298   49064403
  8  toString       0,023          1
  9  -              0,007          2
 10  +              0,004          7


___
### #2

In [None]:
# Read the CSV file into a frame
energy_generation_solar = read("/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/prepocessed_datafiles/energy_generation_solar.csv", format="csv", header=TRUE, sep=",")

# Print the size of the data
print("Dataframe energy_generation_solar has " + toString(nrow(energy_generation_solar)) + " rows.")

# Parameters
alpha = 0.7

# Initialize smoothed_value
smoothed_value = energy_generation_solar[1]

# Start timing
start_time = time()

# Vectorized exponential smoothing
n = nrow(energy_generation_solar)
weights = rev(alpha * (1 - alpha) ^ seq(0, n-1, 1))
smoothed = rev(cumsum(weights * energy_generation_solar))
smoothed_value = smoothed[1] / sum(weights)

# End timing and calculate execution time
end_time = time()
function_time = (end_time - start_time) / 1000000000 # Convert nanoseconds to seconds

# Print the results
print('### Dataset energy_generation_solar')
print('### #2 Vectorized \n')
print('The last smoothed value for energy_generation_solar is: ' + toString(smoothed_value))
print('The function was executed in ' + toString(function_time) + ' seconds')

In [None]:
# Read the CSV file into a frame
energy_generation_solar = read("/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/prepocessed_datafiles/energy_generation_solar.csv", format="csv", header=TRUE, sep=",", rows=49064400, cols=1)

# Print the size of the data
print("Dataframe energy_generation_solar has " + toString(nrow(energy_generation_solar)) + " rows.")

# Parameters
alpha = 0.7

# Initialize smoothed_value
smoothed_value = energy_generation_solar[1]

# Vector to store execution times
number_of_executions = 10
execution_times = matrix(0, number_of_executions, 1)

# Perform exponential smoothing on the data
for (i in 1:number_of_executions) {
    start_time = time()

    # Vectorized exponential smoothing
    n = nrow(energy_generation_solar)
    weights = rev(alpha * (1 - alpha) ^ seq(0, n-1, 1))
    smoothed = rev(cumsum(weights * energy_generation_solar))
    smoothed_value = smoothed[1] / sum(weights)

    # End timing and calculate execution time
    end_time = time()
    function_time = (end_time - start_time) / 1000000000 # Convert nanoseconds to seconds
    print(function_time)
    execution_times[i, 1] = function_time
}

# Calculate the function time
function_time <- avg(execution_times)

# Print the results
print('### Dataset energy_generation_solar')
print('### #2 Vectorized \n')
print('The last smoothed value for wind_speed is: ' + toString(smoothed_value))
print('The function was executed in ' + toString(function_time) + ' seconds')

In [None]:
java -Xmx7g -Xms7g -cp "./lib/*:/Users/niklas/Documents/GitHub/systemds/target/SystemDS.jar" \
     org.apache.sysds.api.DMLScript \
     -f /Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/systemDS_Scripts/energy_generation/energy_experiment4.dml -exec singlenode

In [None]:
# New Run without -stats

Dataframe energy_generation_solar has 49064399 rows.

8.881965375
7.247304834
7.131209625
6.9798055
6.872244708
7.195350875
7.021430667
6.968126208
7.003603083
7.237802125

### Dataset energy_generation_solar
### #2 Vectorized 

The last smoothed value for wind_speed is: 33,090

The function was executed in 7.2538843 seconds

In [None]:
Dataframe energy_generation_solar has 49064399 rows.

### Dataset energy_generation_solar
### #2 Vectorized 

The last smoothed value for energy_generation_solar is: 33,090

The function was executed in 7.740103042 seconds

SystemDS Statistics:
Total elapsed time:		32,441 sec.
Total compilation time:		0,299 sec.
Total execution time:		32,142 sec.
Cache hits (Mem/Li/WB/FS/HDFS):	11/0/0/0/1.
Cache writes (Li/WB/FS/HDFS):	1/6/0/0.
Cache times (ACQr/m, RLS, EXP):	24,376/0,000/0,007/0,000 sec.
HOP DAGs recompiled (PRED, SB):	0/2.
HOP DAGs recompile time:	0,037 sec.
Total JIT compile time:		3.079 sec.
Total JVM GC count:		20.
Total JVM GC time:		0.247 sec.
Heavy hitter instructions:
  #  Instruction  Time(s)  Count
  1  nrow          24,380      1
  2  ^              5,589      1
  3  ucumk+         0,645      1
  4  *              0,524      2
  5  rev            0,413      2
  6  uak+           0,306      1
  7  seq            0,219      1
  8  +              0,014      7
  9  -              0,008      3
 10  rightIndex     0,004      1

___
### #3

In [None]:
java -Xmx7g -Xms7g -cp "./lib/*:/Users/niklas/Documents/GitHub/systemds/target/SystemDS.jar" \
     org.apache.sysds.api.DMLScript \
     -f /Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/systemDS_Scripts/energy_generation/energy_experiment3.dml

In [None]:
# New Run without -stats

Dataframe energy_generation_solar has 49064400 rows.

102.580072542
112.456740583

### Dataset energy_generation_solar
### #1 Basic For Loop 

The last smoothed value for energy_generation_solar is: 9,927

The function was executed in 107.51840656249999 seconds
SystemDS Statistics:
Total execution time:           236,813 sec.
Number of executed Spark inst:  0.

In [None]:
Dataframe energy_generation_solar has 49064399 rows.

### Dataset energy_generation_solar
### #1 Basic For Loop 

The last smoothed value for energy_generation_solar is: 33,090

The function was executed in 2265.317802125 seconds   # 38 minutes

SystemDS Statistics:
Total elapsed time:		2287,721 sec.
Total compilation time:		0,451 sec.
Total execution time:		2287,270 sec.
Number of compiled Spark inst:	8.
Number of executed Spark inst:	0.
Cache hits (Mem/Li/WB/FS/HDFS):	196257594/0/0/0/1.
Cache writes (Li/WB/FS/HDFS):	0/1/0/0.
Cache times (ACQr/m, RLS, EXP):	29,322/4,627/9,539/0,000 sec.
HOP DAGs recompiled (PRED, SB):	1/49064402.
HOP DAGs recompile time:	2063,036 sec.
Spark ctx create time (lazy):	0,000 sec.
Spark trans counts (par,bc,col):0/0/0.
Spark trans times (par,bc,col):	0,000/0,000/0,000 secs.
Total JIT compile time:		8.472 sec.
Total JVM GC count:		516.
Total JVM GC time:		2.596 sec.
Heavy hitter instructions:
  #  Instruction  Time(s)      Count
  1  +*            49,741   49064398
  2  createvar     48,842  147193198
  3  rightIndex    36,587   49064399
  4  *             25,001   49064398
  5  sp_csvrblk    21,863          1
  6  rmvar         13,095   98128802
  7  mvvar          7,665   49064411
  8  toString       0,018          1
  9  -              0,009          2
 10  time           0,002          2


___
### #4

In [None]:
java -Xmx7g -Xms7g -cp "./lib/*:/Users/niklas/Documents/GitHub/systemds/target/SystemDS.jar" \
     org.apache.sysds.api.DMLScript \
     -f /Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/systemDS_Scripts/energy_generation/energy_experiment4.dml

In [None]:
# New Run without -stats

Dataframe energy_generation_solar has 49064400 rows.
8.415589958
6.630199458
6.666481542
6.907580833
6.992634542
6.743489
6.789508666
6.863836583
6.690859
6.607737917
### Dataset energy_generation_solar
### #2 Vectorized 

The last smoothed value for wind_speed is: 9,927

The function was executed in 6.9307917499 seconds
SystemDS Statistics:
Total execution time:           89,965 sec.
Number of executed Spark inst:  0.

In [None]:
Dataframe energy_generation_solar has 49064399 rows.

### Dataset energy_generation_solar
### #2 Vectorized 

The last smoothed value for energy_generation_solar is: 33,090

The function was executed in 8.207912375 seconds

SystemDS Statistics:
Total elapsed time:		29,302 sec.
Total compilation time:		1,221 sec.
Total execution time:		28,081 sec.
Number of compiled Spark inst:	12.
Number of executed Spark inst:	0.
Cache hits (Mem/Li/WB/FS/HDFS):	11/0/0/0/1.
Cache writes (Li/WB/FS/HDFS):	1/7/1/0.
Cache times (ACQr/m, RLS, EXP):	19,843/0,000/0,740/0,000 sec.
HOP DAGs recompiled (PRED, SB):	0/3.
HOP DAGs recompile time:	0,025 sec.
Spark ctx create time (lazy):	0,578 sec.
Spark trans counts (par,bc,col):0/0/0.
Spark trans times (par,bc,col):	0,000/0,000/0,000 secs.
Total JIT compile time:		3.334 sec.
Total JVM GC count:		21.
Total JVM GC time:		0.291 sec.
Heavy hitter instructions:
  #  Instruction  Time(s)  Count
  1  sp_csvrblk    19,851      1
  2  ^              5,707      1
  3  ucumk+         1,055      1
  4  *              0,595      2
  5  uak+           0,291      1
  6  rev            0,285      2
  7  seq            0,253      1
  8  -              0,006      3
  9  rmvar          0,005     18
 10  toString       0,004      1


___
___

## Dataset: heart_rate

In [7]:
import numpy as np 

# Get the data in a numpy array
data = np.genfromtxt('/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/heartrate_seconds_merged.csv', delimiter=',', skip_header=1)

# Slice the data to only get the energy_dataset column
heart_rate = data[:, 2]
heart_rate = heart_rate[~np.isnan(heart_rate)]

# Replication factor
factor = 90
base_heart_rate = heart_rate

# Concatenate the wind_speed array 'factor' times
for i in range(factor - 1):
    heart_rate = np.concatenate((heart_rate, base_heart_rate))

# Save the wind_speed array to a CSV file
np.savetxt('/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/prepocessed_datafiles/heart_rate.csv', heart_rate, delimiter=',')

print(f"Saved dataframe heart_rate has {heart_rate.size} rows.")

Saved dataframe heart_rate has 103921290 rows.


___
### #1

In [None]:
# Single Run

# Read the CSV file into a frame
heart_rate = read("/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/prepocessed_datafiles/heart_rate.csv", format="csv", header=TRUE, sep=",", rows=103921290, cols=1)

# Print the size of the data
print("Dataframe heart_rate has " + toString(nrow(heart_rate)) + " rows.")

# Parameters
alpha = 0.7

# Initialize smoothed_value
smoothed_value = heart_rate[1]

# Start timing
start_time = time()

# Perform exponential smoothing on the data
for (i in 2:nrow(heart_rate)) {
  smoothed_value = alpha * heart_rate[i] + (1 - alpha) * smoothed_value
}

# End timing and calculate execution time
end_time = time()
function_time = (end_time - start_time) / 1000000000 # Convert nanoseconds to seconds

# Print the results
print('### Dataset heart_rate')
print('### #1 Basic For Loop \n')
print('The last smoothed value for heart_rate is: ' + toString(smoothed_value))
print('The function was executed in ' + toString(function_time) + ' seconds')

In [None]:
# Multiple Runs

# Read the CSV file into a frame
heart_rate = read("/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/prepocessed_datafiles/heart_rate.csv", format="csv", header=TRUE, sep=",", rows=103921290, cols=1)

# Print the size of the data
print("Dataframe heart_rate has " + toString(nrow(heart_rate)) + " rows.")

# Parameters
alpha = 0.7

# Initialize smoothed_value
smoothed_value = heart_rate[1]

# Vector to store execution times
number_of_executions = 2
execution_times = matrix(0, number_of_executions, 1)

# Perform exponential smoothing on the data
for (i in 1:number_of_executions) {
    start_time = time()

    # Perform exponential smoothing on the data
    for (j in 2:nrow(heart_rate)) {
        smoothed_value = alpha * heart_rate[j] + (1 - alpha) * smoothed_value
    }

    # End timing and calculate execution time
    end_time = time()
    function_time = (end_time - start_time) / 1000000000 # Convert nanoseconds to seconds
    print(function_time)
    execution_times[i, 1] = function_time
}

# Calculate the function time
function_time = avg(execution_times)

# Print the results
print('### Dataset heart_rate')
print('### #1 Basic For Loop \n')
print('The last smoothed value for wind_speed is: ' + toString(smoothed_value))
print('The function was executed in ' + toString(function_time) + ' seconds')

In [None]:
java -Xmx7g -Xms7g -cp "./lib/*:/Users/niklas/Documents/GitHub/systemds/target/SystemDS.jar" \
     org.apache.sysds.api.DMLScript \
     -f /Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/systemDS_Scripts/heart_rate/hr_experiment3.dml -exec singlenode

In [None]:
# New Run without -stats

Dataframe heart_rate has 103921289 rows.

4595.563075042
4572.6757565

### Dataset heart_rate
### #1 Basic For Loop 

The last smoothed value for wind_speed is: 98,778

The function was executed in 4584.119415771 seconds

In [None]:
Dataframe heart_rate has 103921289 rows.

### Dataset heart_rate
### #1 Basic For Loop 

The last smoothed value for heart_rate is: 98,778

The function was executed in 4582.313366125 seconds     # 76 minutes

SystemDS Statistics:
Total elapsed time:		4630,489 sec.
Total compilation time:		0,337 sec.
Total execution time:		4630,152 sec.
Cache hits (Mem/Li/WB/FS/HDFS):	415685154/0/0/0/1.
Cache writes (Li/WB/FS/HDFS):	0/0/0/0.
Cache times (ACQr/m, RLS, EXP):	62,608/6,769/16,628/0,000 sec.
HOP DAGs recompiled (PRED, SB):	0/103921289.
HOP DAGs recompile time:	4163,696 sec.
Total JIT compile time:		7.297 sec.
Total JVM GC count:		1082.
Total JVM GC time:		5.337 sec.
Heavy hitter instructions:
  #  Instruction  Time(s)      Count
  1  createvar    108,146  311763866
  2  +*            97,272  103921288
  3  rightIndex    68,063  103921289
  4  *             48,838  103921288
  5  nrow          47,757          2
  6  rmvar         36,506  207842590
  7  mvvar         21,832  103921293
  8  toString       0,047          1
  9  +              0,018          7
 10  -              0,008          2

___
### #2

In [None]:
# Read the CSV file into a frame
heart_rate = read("/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/prepocessed_datafiles/heart_rate.csv", format="csv", header=TRUE, sep=",", rows=103921290, cols=1)

# Print the size of the data
print("Dataframe heart_rate has " + toString(nrow(heart_rate)) + " rows.")

# Parameters
alpha = 0.7

# Initialize smoothed_value
smoothed_value = heart_rate[1]

# Start timing
start_time = time()

# Vectorized exponential smoothing
n = nrow(heart_rate)
weights = rev(alpha * (1 - alpha) ^ seq(0, n-1, 1))
smoothed = rev(cumsum(weights * heart_rate))
smoothed_value = smoothed[1] / sum(weights)

# End timing and calculate execution time
end_time = time()
function_time = (end_time - start_time) / 1000000000 # Convert nanoseconds to seconds

# Print the results
print('### Dataset heart_rate')
print('### #1 Basic For Loop \n')
print('The last smoothed value for heart_rate is: ' + toString(smoothed_value))
print('The function was executed in ' + toString(function_time) + ' seconds')

In [None]:
# Read the CSV file into a frame
heart_rate = read("/Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/data_files/prepocessed_datafiles/heart_rate.csv", format="csv", header=TRUE, sep=",", rows=103921290, cols=1)

# Print the size of the data
print("Dataframe heart_rate has " + toString(nrow(heart_rate)) + " rows.")

# Parameters
alpha = 0.7

# Initialize smoothed_value
smoothed_value = heart_rate[1]

# Vector to store execution times
number_of_executions = 10
execution_times = matrix(0, number_of_executions, 1)

# Perform exponential smoothing on the data
for (i in 1:number_of_executions) {
    start_time = time()

    # Vectorized exponential smoothing
    n = nrow(heart_rate)
    weights = rev(alpha * (1 - alpha) ^ seq(0, n-1, 1))
    smoothed = rev(cumsum(weights * heart_rate))
    smoothed_value = smoothed[1] / sum(weights)

    # End timing and calculate execution time
    end_time = time()
    function_time = (end_time - start_time) / 1000000000 # Convert nanoseconds to seconds
    print(function_time)
    execution_times[i, 1] = function_time
}

# Calculate the function time
function_time <- avg(execution_times)

# Print the results
print('### Dataset heart_rate')
print('### #1 Basic For Loop \n')
print('The last smoothed value for wind_speed is: ' + toString(smoothed_value))
print('The function was executed in ' + toString(function_time) + ' seconds')

In [None]:
java -Xmx7g -Xms7g -cp "./lib/*:/Users/niklas/Documents/GitHub/systemds/target/SystemDS.jar" \
     org.apache.sysds.api.DMLScript \
     -f /Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/systemDS_Scripts/heart_rate/hr_experiment4.dml -exec singlenode

In [None]:
# New Run without -stats

Dataframe heart_rate has 103921289 rows.

19.149284084
17.080834125
16.240216375
15.887738625
15.558642542
15.554881666
15.2110245
15.661771958
15.327253833
15.048907625

### Dataset heart_rate
### #2 Vectorized 

The last smoothed value for wind_speed is: 98,778

The function was executed in 16.072055533300002 seconds

In [None]:
Dataframe heart_rate has 103921289 rows.

### Dataset heart_rate
### #1 Basic For Loop 

The last smoothed value for heart_rate is: 98,778

The function was executed in 16.197294208 seconds

SystemDS Statistics:
Total elapsed time:		55,973 sec.
Total compilation time:		0,274 sec.
Total execution time:		55,700 sec.
Cache hits (Mem/Li/WB/FS/HDFS):	11/0/0/0/1.
Cache writes (Li/WB/FS/HDFS):	1/6/5/0.
Cache times (ACQr/m, RLS, EXP):	39,488/0,000/0,618/0,000 sec.
HOP DAGs recompiled (PRED, SB):	0/2.
HOP DAGs recompile time:	0,022 sec.
Total JIT compile time:		1.882 sec.
Total JVM GC count:		41.
Total JVM GC time:		0.249 sec.
Heavy hitter instructions:
  #  Instruction  Time(s)  Count
  1  nrow          39,492      1
  2  ^             11,773      1
  3  *              1,262      2
  4  ucumk+         1,086      1
  5  rev            0,926      2
  6  uak+           0,574      1
  7  seq            0,547      1
  8  -              0,007      3
  9  rmvar          0,006     21
 10  toString       0,004      1


___
### #3

In [None]:
java -Xmx7g -Xms7g -cp "./lib/*:/Users/niklas/Documents/GitHub/systemds/target/SystemDS.jar" \
     org.apache.sysds.api.DMLScript \
     -f /Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/systemDS_Scripts/heart_rate/hr_experiment3.dml

In [None]:
# New Run without -stats

Dataframe heart_rate has 103921289 rows.

4522.076746792
4569.310003709

### Dataset heart_rate
### #1 Basic For Loop 

The last smoothed value for wind_speed is: 98,778

The function was executed in 4545.6933752505 seconds

In [None]:
Dataframe heart_rate has 103921289 rows.

### Dataset heart_rate
### #3 For Loop 

The last smoothed value for heart_rate is: 98,778

The function was executed in 4637.079888875 seconds   # 77 minutes

SystemDS Statistics:
Total elapsed time:		4683,562 sec.
Total compilation time:		0,411 sec.
Total execution time:		4683,152 sec.
Number of compiled Spark inst:	8.
Number of executed Spark inst:	0.
Cache hits (Mem/Li/WB/FS/HDFS):	415685154/0/0/0/1.
Cache writes (Li/WB/FS/HDFS):	0/1/0/0.
Cache times (ACQr/m, RLS, EXP):	60,577/7,298/18,413/0,000 sec.
HOP DAGs recompiled (PRED, SB):	1/103921292.
HOP DAGs recompile time:	4226,406 sec.
Spark ctx create time (lazy):	0,000 sec.
Spark trans counts (par,bc,col):0/0/0.
Spark trans times (par,bc,col):	0,000/0,000/0,000 secs.
Total JIT compile time:		10.718 sec.
Total JVM GC count:		1091.
Total JVM GC time:		5.234 sec.
Heavy hitter instructions:
  #  Instruction  Time(s)      Count
  1  createvar    104,203  311763868
  2  +*            94,436  103921288
  3  rightIndex    75,585  103921289
  4  *             49,593  103921288
  5  sp_csvrblk    45,948          1
  6  rmvar         27,169  207842582
  7  mvvar         18,915  103921301
  8  toString       0,023          1
  9  -              0,009          2
 10  +              0,004          7


___
### #4

In [None]:
java -Xmx7g -Xms7g -cp "./lib/*:/Users/niklas/Documents/GitHub/systemds/target/SystemDS.jar" \
     org.apache.sysds.api.DMLScript \
     -f /Users/niklas/Documents/GitHub/Uni/10_Masterarbeit/systemDS_Scripts/heart_rate/hr_experiment4.dml

In [None]:
# New Run without -stats

Dataframe heart_rate has 103921289 rows.

20.793400541
17.764628916
18.353478125
16.967458917
17.003416208
16.24676275
16.053267083
16.778378583
16.063659917
16.149777709

### Dataset heart_rate
### #2 Vectorized 

The last smoothed value for wind_speed is: 98,778

The function was executed in 17.2174228749 seconds

In [None]:
Dataframe heart_rate has 103921289 rows.

### Dataset heart_rate
### #4 For Loop 

The last smoothed value for heart_rate is: 98,778

The function was executed in 18.972938417 seconds

SystemDS Statistics:
Total elapsed time:		65,439 sec.
Total compilation time:		1,238 sec.
Total execution time:		64,201 sec.
Number of compiled Spark inst:	12.
Number of executed Spark inst:	0.
Cache hits (Mem/Li/WB/FS/HDFS):	11/0/0/0/1.
Cache writes (Li/WB/FS/HDFS):	1/7/6/0.
Cache times (ACQr/m, RLS, EXP):	45,198/0,000/3,404/0,000 sec.
HOP DAGs recompiled (PRED, SB):	0/3.
HOP DAGs recompile time:	0,016 sec.
Spark ctx create time (lazy):	0,583 sec.
Spark trans counts (par,bc,col):0/0/0.
Spark trans times (par,bc,col):	0,000/0,000/0,000 secs.
Total JIT compile time:		3.796 sec.
Total JVM GC count:		31.
Total JVM GC time:		0.491 sec.
Heavy hitter instructions:
  #  Instruction  Time(s)  Count
  1  sp_csvrblk    45,209      1
  2  ^             14,188      1
  3  rev            1,311      2
  4  *              1,247      2
  5  ucumk+         1,089      1
  6  uak+           0,652      1
  7  seq            0,472      1
  8  -              0,008      3
  9  toString       0,007      1
 10  rmvar          0,004     18

___
___

## Warnings & Errors:

In [None]:
WARN utils.SettingsChecker: Low memory budget of total:   8192 GB set to:      8 GB
WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable

In [None]:
ERROR spark.SparkContext: Error initializing SparkContext.
org.apache.spark.SparkException: A master URL must be set in your configuration
	at org.apache.spark.SparkContext.<init>(SparkContext.scala:414)
	at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
	at org.apache.sysds.runtime.controlprogram.context.SparkExecutionContext.createContext(SparkExecutionContext.java:277)
	at org.apache.sysds.runtime.controlprogram.context.SparkExecutionContext.initSparkContext(SparkExecutionContext.java:248)
	at org.apache.sysds.runtime.controlprogram.context.SparkExecutionContext.getSparkContextStatic(SparkExecutionContext.java:163)
	at org.apache.sysds.runtime.controlprogram.context.SparkExecutionContext$SparkClusterConfig.analyzeSparkParallelismConfiguation(SparkExecutionContext.java:1975)
	at org.apache.sysds.runtime.controlprogram.context.SparkExecutionContext$SparkClusterConfig.analyzeSparkConfiguation(SparkExecutionContext.java:1949)
	at org.apache.sysds.runtime.controlprogram.context.SparkExecutionContext$SparkClusterConfig.<init>(SparkExecutionContext.java:1870)
	at org.apache.sysds.runtime.controlprogram.context.SparkExecutionContext.getSparkClusterConfig(SparkExecutionContext.java:1780)
	at org.apache.sysds.runtime.controlprogram.context.SparkExecutionContext.getBroadcastMemoryBudget(SparkExecutionContext.java:1790)
	at org.apache.sysds.hops.OptimizerUtils.checkSparkBroadcastMemoryBudget(OptimizerUtils.java:585)
	at org.apache.sysds.hops.UnaryOp.constructCumOffBinary(UnaryOp.java:320)
	at org.apache.sysds.hops.UnaryOp.constructLopsSparkCumulativeUnary(UnaryOp.java:308)
	at org.apache.sysds.hops.UnaryOp.constructLops(UnaryOp.java:168)
	at org.apache.sysds.hops.ReorgOp.constructLops(ReorgOp.java:171)
	at org.apache.sysds.hops.IndexingOp.constructLops(IndexingOp.java:142)
	at org.apache.sysds.hops.BinaryOp.constructLopsBinaryDefault(BinaryOp.java:444)
	at org.apache.sysds.hops.BinaryOp.constructLops(BinaryOp.java:237)
	at org.apache.sysds.hops.DataOp.constructLops(DataOp.java:311)
	at org.apache.sysds.parser.DMLTranslator.constructLops(DMLTranslator.java:435)
	at org.apache.sysds.parser.DMLTranslator.constructLops(DMLTranslator.java:348)
	at org.apache.sysds.api.DMLScript.execute(DMLScript.java:457)
	at org.apache.sysds.api.DMLScript.executeScript(DMLScript.java:319)
	at org.apache.sysds.api.DMLScript.main(DMLScript.java:205)

In [None]:
java -Xmx8g -Xms8g -cp "./lib/*:/Users/niklas/Documents/GitHub/systemds/target/SystemDS.jar"      
org.apache.sysds.api.DMLScript      
-f /Users/niklas/Documents/SystemDS/systemds-3.2.0-bin/experiments/heart_rate/hr_experiment2.dml 
-exec hadoop -stats
24/06/20 10:17:12 ERROR api.DMLScript: 
Parsing Exception Invalid argument specified for -exec option, must be one of [hadoop, singlenode, hybrid, HYBRID, spark]

___
Gelöst:

In [None]:
Exception in thread "main" java.lang.NoClassDefFoundError: scala/collection/immutable/List
	at org.apache.sysds.lops.Checkpoint.<clinit>(Checkpoint.java:43)
	at org.apache.sysds.hops.Hop.constructAndSetCheckpointLopIfRequired(Hop.java:494)
	at org.apache.sysds.hops.Hop.constructAndSetLopsDataFlowProperties(Hop.java:427)
	at org.apache.sysds.hops.DataOp.constructLops(DataOp.java:333)
	at org.apache.sysds.hops.DataOp.constructLops(DataOp.java:311)
	at org.apache.sysds.parser.DMLTranslator.constructLops(DMLTranslator.java:435)
	at org.apache.sysds.parser.DMLTranslator.constructLops(DMLTranslator.java:348)
	at org.apache.sysds.api.DMLScript.execute(DMLScript.java:457)
	at org.apache.sysds.api.DMLScript.executeScript(DMLScript.java:319)
	at org.apache.sysds.api.DMLScript.main(DMLScript.java:205)
Caused by: java.lang.ClassNotFoundException: scala.collection.immutable.List
	at java.base/jdk.internal.loader.BuiltinClassLoader.loadClass(BuiltinClassLoader.java:641)
	at java.base/jdk.internal.loader.ClassLoaders$AppClassLoader.loadClass(ClassLoaders.java:188)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:526)
	... 10 more