In [2]:
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.appName('Stock-Data-Analysis').getOrCreate()

23/03/28 16:51:24 WARN Utils: Your hostname, PJ-Ubuntu resolves to a loopback address: 127.0.1.1; using 192.168.0.86 instead (on interface wlo1)
23/03/28 16:51:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/28 16:51:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
spark

In [5]:
# Read the dataset 
df = spark.read.option('header', 'true').csv("./Data/FS_sp500_Value.csv").drop("_c0")
zts_df = df.select("*").where("ticker == 'ZTS'")
zts_df.tail(10)

                                                                                

[Row(Ticker='ZTS', Date='2022-04-01', High='191.5500030517578', Low='186.6999969482422', Open='188.77000427246094', Close='191.11000061035156', Volume='1831400', Adj Close='191.11000061035156'),
 Row(Ticker='ZTS', Date='2022-04-04', High='191.97999572753906', Low='187.17999267578125', Open='191.13999938964844', Close='188.74000549316406', Volume='2603600', Adj Close='188.74000549316406'),
 Row(Ticker='ZTS', Date='2022-04-05', High='190.74000549316406', Low='187.69000244140625', Open='188.74000549316406', Close='188.39999389648438', Volume='2059300', Adj Close='188.39999389648438'),
 Row(Ticker='ZTS', Date='2022-04-06', High='192.3300018310547', Low='186.52000427246094', Open='188.16000366210938', Close='191.3800048828125', Volume='2843500', Adj Close='191.3800048828125'),
 Row(Ticker='ZTS', Date='2022-04-07', High='199.33999633789062', Low='190.97000122070312', Open='191.1999969482422', Close='197.92999267578125', Volume='2781400', Adj Close='197.92999267578125'),
 Row(Ticker='ZTS', Da

In [30]:
# Analysis functions 
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def calcSimpleMovingAvg(df, col, span):
  # 14 day periods
  window = Window.partitionBy("Ticker").orderBy("Date").rowsBetween(-span, 0)
  df = df.withColumn("moving_avg", F.avg(col).over(window))
  return df.rdd.map(lambda x: x.moving_avg).collect()

def calcDailyPercentChange(df):
  window = Window.partitionBy("Ticker").orderBy("Date")
  df = df.withColumn("prev_close", F.lag(df.Close).over(window))
  df = df.withColumn("change", F.when(F.isnull( (df.Close - df.prev_close)/df.prev_close ), 0).otherwise(F.round(F.abs((df.Close - df.prev_close)/df.prev_close) *100, 3)) )
  return df.rdd.map(lambda x: x.change).collect()

def calcATR(df):
  window = Window.partitionBy("Ticker").orderBy("Date")
  df = df.withColumn("prev_close", F.lag(df.Close).over(window))
  df = df.withColumn("h-l", df.High-df.Low)
  df = df.withColumn("h-p", F.when(F.isnull( F.abs(df.High-df.prev_close)), 0).otherwise( F.abs(df.High-df.prev_close))) 
  df = df.withColumn("l-p", F.when(F.isnull( F.abs(df.Low-df.prev_close)), 0).otherwise( F.abs(df.Low-df.prev_close))) 
  df = df.withColumn("true_range", F.greatest("h-l", "h-p", "l-p"))
  return calcSimpleMovingAvg(df, "true_range", 14)

def calcRSI(df):
  window = Window.partitionBy("Ticker").orderBy("Date")
  df = df.withColumn("prev_close", F.lag(df.Close).over(window))
  df = df.withColumn("change", F.when(F.isnull(df.Close - df.prev_close), 0).otherwise(df.Close - df.prev_close))

  window = Window.partitionBy("Ticker").orderBy("Date").rowsBetween(-14, 0)
  df = df.withColumn("change_up", F.when(df.change < 0, 0).otherwise(df.change))
  df = df.withColumn("change_down", F.when(df.change > 0, 0).otherwise(df.change))
  df = df.withColumn("avg_up", F.avg(df.change_up).over(window)).drop(df.change_up)
  df = df.withColumn("avg_down", F.avg(df.change_down).over(window)).drop(df.change_down)
  df = df.withColumn("rsi", F.round((100 * df.avg_up / (df.avg_up + F.abs(df.avg_down))), 4))
  return df.rdd.map(lambda x: x.rsi).collect()

def calcVPT(df):
  window = Window.partitionBy("Ticker").orderBy("Date")
  df = df.withColumn("prev_close", F.lag(df.Close).over(window))
  df = df.withColumn("temp_vpt", F.when(F.isnull((df.Volume * (df.Close - df.prev_close))/df.prev_close), 0).otherwise((df.Volume * (df.Close - df.prev_close))/df.prev_close))
  df = df.withColumn("prev_vpt", F.lag(df.temp_vpt).over(window))
  df = df.withColumn("vpt", F.when(F.isnull( df.prev_vpt), df.temp_vpt).otherwise(df.temp_vpt + df.prev_vpt)).drop(df.temp_vpt).drop(df.prev_vpt)
  return df.rdd.map(lambda x: x.vpt).collect()

In [31]:
# print("Moving Average closing of ZTS:", calcMovingAvg(zts_df, 'Close'))
# print("Daily Percent Change of ZTS: ", calcDailyPercentChange(zts_df))
# print("ATR of ZTS: ", calcATR(zts_df))
# print("VPT of ZTS: ", calcVPT(zts_df))
print("RSI of ZTS", calcRSI(zts_df))



RSI of ZTS [None, 100.0, 100.0, 75.0, 99.0099, 99.5146, 99.5595, 99.6364, 93.1972, 85.0931, 87.7862, 84.7666, 66.474, 64.486, 64.684, 58.983, 58.6149, 60.356, 66.5753, 67.3797, 59.7305, 57.8867, 50.8474, 57.204, 55.6944, 51.5198, 49.8529, 60.9215, 66.4038, 55.6591, 55.0725, 55.0, 57.4097, 50.1441, 37.9479, 35.5725, 36.3914, 41.2323, 40.7643, 38.6707, 37.5951, 39.6469, 33.9763, 28.4566, 45.6522, 55.0847, 52.8455, 46.789, 47.8102, 42.8105, 55.832, 52.8358, 48.0406, 44.6677, 50.613, 43.5897, 37.5813, 48.1715, 53.8202, 48.3668, 47.3077, 51.5228, 50.7519, 49.8747, 54.8209, 47.4016, 52.8861, 58.5987, 53.6662, 52.9231, 62.3656, 72.6514, 70.6278, 66.2372, 62.963, 50.2294, 51.7778, 45.6863, 52.2124, 60.6796, 50.4711, 43.4279, 39.9481, 47.3424, 46.1446, 44.6407, 45.7177, 38.5732, 38.5731, 38.0214, 40.4309, 34.6253, 36.2162, 31.9885, 25.7862, 31.3575, 30.7692, 37.2727, 26.1569, 42.0593, 40.0332, 39.9668, 44.4241, 36.6412, 32.8767, 31.5271, 42.4471, 50.9972, 50.1449, 48.9614, 49.4012, 59.2063, 58.

                                                                                