In [1]:
#https://github.com/jupyter/docker-stacks/tree/master/pyspark-notebook
#docker run -v "$PWD":/home/jovyan/work -p 8888:8888 jupyter/pyspark-notebook

from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('ada').master('local[*]').getOrCreate()

In [10]:
from pyspark.sql.functions import round, max, min

raw_df = spark.read.csv('ADA-INR.csv', header=True, inferSchema=True)

df = raw_df.select(raw_df['Date'], 
                       round(raw_df['Open'], 2).alias('Open'),
                       round(raw_df['High'], 2).alias('High'),
                       round(raw_df['Low'], 2).alias('Low'),
                       round(raw_df['Close'], 2).alias('Close'),
                       raw_df['Volume']
                      )

raw_df.show(1000)

+----------+----------+----------+----------+----------+----------+-------------+
|      Date|      Open|      High|       Low|     Close| Adj Close|       Volume|
+----------+----------+----------+----------+----------+----------+-------------+
|2020-06-02|  6.115325|  6.233381|  5.549793|  5.889389|  5.889389|  44193849302|
|2020-06-03|  5.894048|  6.455020|  5.814571|  6.435926|  6.435926|  42063885739|
|2020-06-04|  6.430285|  6.787288|  6.377632|  6.715764|  6.715764|  44777270285|
|2020-06-05|  6.719553|  6.731802|  6.376356|  6.456381|  6.456381|  27030732557|
|2020-06-06|  6.456912|  6.587191|  6.436296|  6.500499|  6.500499|  17441407946|
|2020-06-07|  6.501228|  6.695285|  6.273045|  6.543957|  6.543957|  23949835909|
|2020-06-08|  6.547653|  6.571022|  6.385729|  6.517197|  6.517197|  16048840006|
|2020-06-09|  6.518255|  6.524736|  6.298049|  6.298049|  6.298049|  16093010309|
|2020-06-10|  6.297122|  6.340122|  6.035568|  6.321566|  6.321566|  24718885748|
|2020-06-11|  6.

In [4]:
df = df.orderBy(df["Open"].desc())
df.show()

+----------+------+------+------+------+-------------+
|      Date|  Open|  High|   Low| Close|       Volume|
+----------+------+------+------+------+-------------+
|2021-05-17|168.56|170.37|141.21| 148.9| 824305968603|
|2021-05-16|159.12| 180.4|147.53|169.21| 917759147869|
|2021-05-18| 148.6|156.97|144.63|146.52| 493238877940|
|2021-05-15|147.04| 173.8|144.11|159.31|1137134724941|
|2021-05-19|146.74|148.31| 75.27|108.56|1194550682967|
|2021-05-14|141.55|149.64|133.28|146.91| 560893232556|
|2021-05-21|131.63|136.45| 96.74|113.18| 660279170969|
|2021-05-27|129.44|130.12|117.33|119.63| 310123951164|
|2021-05-10|129.44|133.74|112.34|120.89| 475056418044|
|2021-05-12| 129.4|133.55|114.41|114.41| 393348294249|
|2021-06-02|126.62|131.65|125.52|128.16| 277639593984|
|2021-06-01|126.02| 129.0|120.96|126.39| 300453686668|
|2021-05-07|121.37|128.11|112.64|121.18| 536821673294|
|2021-05-08|121.26|121.26|114.64|118.71| 290191552227|
|2021-05-11|120.67| 130.5|117.39|129.44| 398323356188|
|2021-05-2

In [9]:
diff_df = df.withColumn("diff", round(df["Close"]-df["Open"], 2))
diff_df = diff_df.orderBy(diff_df["diff"])

diff_df.show(100)

+----------+------+------+------+------+-------------+------+
|      Date|  Open|  High|   Low| Close|       Volume|  diff|
+----------+------+------+------+------+-------------+------+
|2020-10-12|  null|  null|  null|  null|         null|  null|
|2020-10-09|  null|  null|  null|  null|         null|  null|
|2020-10-13|  null|  null|  null|  null|         null|  null|
|2021-05-19|146.74|148.31| 75.27|108.56|1194550682967|-38.18|
|2021-05-17|168.56|170.37|141.21| 148.9| 824305968603|-19.66|
|2021-05-21|131.63|136.45| 96.74|113.18| 660279170969|-18.45|
|2021-05-12| 129.4|133.55|114.41|114.41| 393348294249|-14.99|
|2021-02-23| 79.61| 82.14| 59.67|  69.1| 810819933353|-10.51|
|2021-05-28|119.97| 121.7|105.31|109.46| 388979650750|-10.51|
|2021-05-23|106.87|112.26|  77.0| 96.64| 677892268569|-10.23|
|2021-03-18|100.03|106.79| 89.92| 90.04| 827920184497| -9.99|
|2021-05-27|129.44|130.12|117.33|119.63| 310123951164| -9.81|
|2021-05-10|129.44|133.74|112.34|120.89| 475056418044| -8.55|
|2021-04

In [6]:
total_days = diff_df.count()
profitable_days = diff_df.filter(diff_df["diff"] > 0.0).count()
loss_days = diff_df.filter(diff_df["diff"] <= 0.0).count()

print("Total Days : "+str(total_days))
print("Profit Days : "+str(profitable_days))
print("Loss Days : "+str(loss_days))


Total Days : 366
Profit Days : 194
Loss Days : 169


In [7]:
# If I buy daily at Open and sell at Close 
# todo : with 10k budget
from pyspark.sql.functions import sum

total_profit = diff_df.filter(diff_df["diff"] > 0.0).select(sum("diff").alias("total_profit"))
total_loss = diff_df.filter(diff_df["diff"] < 0.0).select(sum("diff").alias("total_loss"))

total_profit.show()
total_loss.show()

# print("Total Profit : "+str(total_profit["total_profit"]))
# print("Total Loss : "+str(total_loss["total_loss"]))


+------------------+
|      total_profit|
+------------------+
|463.95000000000005|
+------------------+

+-------------------+
|         total_loss|
+-------------------+
|-341.24000000000007|
+-------------------+

