In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists
# (getOrCreate). The application is named 'ada' and uses the 'local[*]' master.
spark = (
    SparkSession.builder
    .appName('ada')
    .master('local[*]')
    .getOrCreate()
)

In [63]:
from pyspark.sql.functions import round, max, min

# Load the CSV, taking column names from the header row and letting Spark infer the column types.
raw_df = spark.read.csv('ADA-INR.csv', header=True, inferSchema=True)

# Price columns that are rounded to 2 decimal places; Date and Volume pass through unchanged.
price_columns = ['Open', 'High', 'Low', 'Close']

# NOTE: `round` is pyspark.sql.functions.round (imported at the top of this cell),
# which shadows the Python builtin and operates on Spark columns.
rounded_prices = [round(raw_df[name], 2).alias(name) for name in price_columns]

df = raw_df.select(raw_df['Date'], *rounded_prices, raw_df['Volume'])

In [67]:
# Re-order the rows so the highest "Open" value comes first, then print the
# leading rows of the result.
df = df.orderBy("Open", ascending=False)
df.show()

+----------+------+------+------+------+-------------+
|      Date|  Open|  High|   Low| Close|       Volume|
+----------+------+------+------+------+-------------+
|2021-05-17|168.56|170.37|141.21| 148.9| 824305968603|
|2021-05-16|159.12| 180.4|147.53|169.21| 917759147869|
|2021-05-18| 148.6|156.97|144.63|146.52| 493238877940|
|2021-05-15|147.04| 173.8|144.11|159.31|1137134724941|
|2021-05-19|146.74|148.31| 75.27|108.56|1194550682967|
|2021-05-14|141.55|149.64|133.28|146.91| 560893232556|
|2021-05-21|131.63|136.45| 96.74|113.18| 660279170969|
|2021-05-27|129.44|130.12|117.33|119.63| 310123951164|
|2021-05-10|129.44|133.74|112.34|120.89| 475056418044|
|2021-05-12| 129.4|133.55|114.41|114.41| 393348294249|
|2021-06-02|126.62|131.65|125.52|130.03| 278171582464|
|2021-06-01|126.02| 129.0|120.96|126.39| 300453686668|
|2021-05-07|121.37|128.11|112.64|121.18| 536821673294|
|2021-05-08|121.26|121.26|114.64|118.71| 290191552227|
|2021-05-11|120.67| 130.5|117.39|129.44| 398323356188|
|2021-05-2

In [86]:
# Per-row change between Close and Open, rounded to 2 decimal places.
# `round` is the pyspark.sql.functions version imported earlier in the notebook.
close_minus_open = round(df["Close"] - df["Open"], 2)

# Attach the change as the "diff" column and sort with the largest value first.
diff_df = df.withColumn("diff", close_minus_open).orderBy("diff", ascending=False)

diff_df.show()

+----------+------+------+------+------+-------------+-----+
|      Date|  Open|  High|   Low| Close|       Volume| diff|
+----------+------+------+------+------+-------------+-----+
|2021-05-13|115.57|144.68|110.91|142.02|1065943180819|26.45|
|2021-05-20|107.78|141.99|  97.2| 132.2| 860823260502|24.42|
|2021-05-24| 95.97|116.46| 92.62|112.86| 536509494888|16.89|
|2021-05-26|113.17|133.32|113.17|129.55| 482465403143|16.38|
|2021-03-16| 74.93| 91.28| 72.88| 90.13| 689114371026| 15.2|
|2021-05-05| 93.77| 110.2| 93.02|108.89| 354017130756|15.12|
|2021-05-30|101.66|122.27| 97.86|114.19| 360142630151|12.53|
|2021-05-06|109.08|125.11|105.81|121.47| 786187756197|12.39|
|2021-05-15|147.04| 173.8|144.11|159.31|1137134724941|12.27|
|2021-05-31|114.07| 126.9|110.57|126.33| 325548134822|12.26|
|2021-05-09|118.76|134.12|115.51|129.77| 591125146906|11.01|
|2021-04-26| 81.62| 93.47| 81.02| 92.32| 239489815158| 10.7|
|2021-05-16|159.12| 180.4|147.53|169.21| 917759147869|10.09|
|2021-03-17| 90.18|102.6

In [98]:
# Split the rows by the sign of "diff": strictly positive counts as profit,
# zero or negative counts as loss (so break-even rows land in the loss bucket).
is_profit = diff_df["diff"] > 0.0
is_loss = diff_df["diff"] <= 0.0

total_days = diff_df.count()
profitable_days = diff_df.where(is_profit).count()
loss_days = diff_df.where(is_loss).count()

print(f"Total Days : {total_days}")
print(f"Profit Days : {profitable_days}")
print(f"Loss Days : {loss_days}")


Total Days : 93
Profit Days : 48
Loss Days : 45
