In [1]:
import findspark # here we are importing findspark module to locate the spark in the system
findspark.init() # here we are initializing the spark

import pyspark # here we are importing pyspark module
from pyspark.sql.types import * # here we are importing all the classes from the module
from pyspark.sql import functions as F # here we are importing functions from the module

# Build the Spark session through the builder so this cell can be re-run in a
# live kernel: getOrCreate() hands back the already-running session/context,
# whereas calling pyspark.SparkContext(...) a second time raises because a
# context is already active.
ss = pyspark.sql.SparkSession.builder.appName("ENDSEM").getOrCreate()

# The SparkContext backing the session, kept under its original name.
sc = ss.sparkContext

# DataFrameReader bound to the session; nothing is read from disk at this point.
dfr = ss.read

# Explicit schema handed to the CSV reader instead of letting Spark infer types.
# Fields are listed in the order they are applied to the file's columns; every
# field is declared nullable.
schemaStruct = StructType([
    StructField("SYMBOL", StringType(), True),
    StructField("SERIES", StringType(), True),
    StructField("OPEN", DoubleType(), True),
    StructField("HIGH", DoubleType(), True),
    StructField("LOW", DoubleType(), True),
    StructField("CLOSE", DoubleType(), True),
    StructField("LAST", DoubleType(), True),
    StructField("PREVCLOSE", DoubleType(), True),
    StructField("TOTTRDQTY", LongType(), True),
    StructField("TOTTRDVAL", DoubleType(), True),
    StructField("TIMESTAMP", StringType(), True),  # kept as a plain string, not parsed as a date
    StructField("ADDNL", StringType(), True),
])

# Define the DataFrame over the CSV; header=True treats the first line of the
# file as column names rather than data.
df = dfr.csv("./nsedata.csv", schema=schemaStruct, header=True)

24/11/21 10:28:43 WARN Utils: Your hostname, maverick resolves to a loopback address: 127.0.1.1; using 10.59.4.64 instead (on interface wlp0s20f3)
24/11/21 10:28:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/21 10:28:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Import necessary functions for processing the data
from pyspark.sql import functions as F

# Keep only the rows that belong to the 'EQ' series.
df_eq = df.where(F.col('SERIES') == 'EQ')

# One row per SYMBOL holding the sample standard deviation of its OPEN values.
open_sample_stddev = F.stddev_samp("OPEN").alias("sample_stddev")
result = df_eq.groupBy("SYMBOL").agg(open_sample_stddev)

# Discard symbols for which the aggregate came back NULL.
result_filtered = result.where(F.col('sample_stddev').isNotNull())

# Least volatile symbols first.
result_sorted = result_filtered.sort("sample_stddev")

# Print the leading rows of the ranking to the cell output.
result_sorted.show()


                                                                                

+----------+-------------------+
|    SYMBOL|      sample_stddev|
+----------+-------------------+
|     ASHCO| 0.0474341649025257|
|ANTGRAPHIC|0.08867592036759682|
|  DYNACONS|0.09995166357568239|
|  LICNMFET|0.12080734956455377|
|STEELTUBES|0.12866839377079198|
|   DYNASYS|0.14473621325905772|
|       ARL|0.14666293048558598|
|SRGINFOTEC|0.14751855655988164|
|  NIVINFRA|0.15434872662825802|
|TELEMARINE|0.16541619358413254|
|DCMFINSERV| 0.1943153470683162|
|  BIRLACOT|0.19607784047316004|
|PARASPETRO|0.19867396375347673|
|LCCINFOTEC|0.24022834961831302|
|    NANDAN| 0.3037950525512987|
|TELEDATAIT| 0.3637086874686068|
|  LLOYDFIN|0.36433095560963713|
|  BLUECHIP| 0.3818539875089487|
|FTCSF5YDIV|0.39483050671428355|
|     JCTEL|0.43513357404736036|
+----------+-------------------+
only showing top 20 rows



In [3]:
# Rows for the SOUTHBANK symbol, restricted to the 'EQ' series.
southbank_eq_rows = (F.col('SERIES') == 'EQ') & (F.col('SYMBOL') == 'SOUTHBANK')
df_southbank = df.where(southbank_eq_rows)

# Collapse those rows into a single aggregate row carrying the sample standard
# deviation of OPEN, and pull that row back to the driver.
southbank_open_stddev = F.stddev_samp('OPEN').alias('sample_stddev')
stddev_southbank = df_southbank.agg(southbank_open_stddev).collect()

# Report the value held by the single collected row.
print(f"The sample standard deviation of the OPEN price for SOUTHBANK is: {stddev_southbank[0]['sample_stddev']}")

[Stage 3:==>                                                      (1 + 19) / 20]

The sample standard deviation of the OPEN price for SOUTHBANK is: 25.450003476301827


                                                                                

In [4]:
# All SOUTHBANK rows, regardless of SERIES. Unlike the 'EQ'-only variant of
# this computation, no SERIES condition is applied here, so the resulting
# figure covers every series the symbol appears in.
df_southbank = df.where(F.col('SYMBOL') == 'SOUTHBANK')

# Sample standard deviation of OPEN over those rows, collected to the driver.
all_series_open_stddev = F.stddev_samp('OPEN').alias('sample_stddev')
stddev_southbank = df_southbank.agg(all_series_open_stddev).collect()

# Keep the all-series figure under its own name so it is still available after
# `stddev_southbank` is reassigned.
southbank_stddev2 = stddev_southbank[0]['sample_stddev']

# Report the value.
print(f"The sample standard deviation of the OPEN price for SOUTHBANK is: {stddev_southbank[0]['sample_stddev']}")

The sample standard deviation of the OPEN price for SOUTHBANK is: 25.358977154536202


The sample standard deviation of the OPEN price for SOUTHBANK (with symbol EQ only) is: 25.450003476301827
                                                                                
The sample standard deviation of the OPEN price for SOUTHBANK is: 25.358977154536202


In [5]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType

def _with_stddev_gap(frame, target):
    """Return `frame` plus a 'stddev_diff' column holding |sample_stddev - target|."""
    return frame.withColumn('stddev_diff', F.abs(F.col('sample_stddev') - target))


# Reference value: SOUTHBANK's sample standard deviation of OPEN within 'EQ'.
df_southbank = df.where((F.col('SYMBOL') == 'SOUTHBANK') & (F.col('SERIES') == 'EQ'))
stddev_southbank = df_southbank.agg(F.stddev_samp('OPEN').alias('sample_stddev')).collect()
southbank_stddev = stddev_southbank[0]['sample_stddev']

# Candidate pool: every other symbol in the 'EQ' series.
df_eq = df.where((F.col('SERIES') == 'EQ') & (F.col('SYMBOL') != 'SOUTHBANK'))

# Sample standard deviation of OPEN for each candidate symbol.
stddevs = df_eq.groupBy('SYMBOL').agg(F.stddev_samp('OPEN').alias('sample_stddev'))

# Candidates whose standard deviation is NULL cannot be compared; drop them.
stddevs_filtered = stddevs.where(F.col('sample_stddev').isNotNull())

# Distance of every candidate from each of the two SOUTHBANK reference values.
# NOTE: `southbank_stddev2` is not assigned in this cell; it has to exist in
# the kernel already (the all-series SOUTHBANK cell assigns it).
closest_stock = _with_stddev_gap(stddevs_filtered, southbank_stddev)
closest_stock2 = _with_stddev_gap(stddevs_filtered, southbank_stddev2)

# Three nearest candidates for each reference value.
result = closest_stock.orderBy('stddev_diff').limit(3)
result2 = closest_stock2.orderBy('stddev_diff').limit(3)

# Print both shortlists.
result.show()
result2.show()


24/11/21 10:28:58 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

+----------+------------------+-------------------+
|    SYMBOL|     sample_stddev|        stddev_diff|
+----------+------------------+-------------------+
|       DCM| 25.41755732203406|0.03244615426776676|
|DEEPAKFERT|25.518956403600654| 0.0689529272988274|
|KABRAEXTRU|25.538015324232415| 0.0880118479305878|
+----------+------------------+-------------------+





+----------+------------------+--------------------+
|    SYMBOL|     sample_stddev|         stddev_diff|
+----------+------------------+--------------------+
| FLEXITUFF|25.321819790015663| 0.03715736452053875|
|SPECIALITY|25.316802159300913|   0.042174995235289|
|       DCM| 25.41755732203406|0.058580167497858326|
+----------+------------------+--------------------+



                                                                                

In [6]:
from pyspark.sql import functions as F

# 'EQ' rows for the two symbols being compared.
df_southbank_dcm = df.where(
    (F.col('SERIES') == 'EQ') & F.col('SYMBOL').isin('SOUTHBANK', 'DCM')
)

# Self-join on TIMESTAMP so that rows sharing a timestamp are paired up, then
# keep only the pairs whose left side is SOUTHBANK and right side is DCM.
southbank_side = df_southbank_dcm.alias('df1')
dcm_side = df_southbank_dcm.alias('df2')
southbank_vs_dcm = (F.col('df1.SYMBOL') == 'SOUTHBANK') & (F.col('df2.SYMBOL') == 'DCM')
df_joined = southbank_side.join(dcm_side, on='TIMESTAMP', how='inner').where(southbank_vs_dcm)

# Absolute gap between the two OPEN values of each pair.
open_gap = F.abs(F.col('df1.OPEN') - F.col('df2.OPEN'))
df_difference = df_joined.withColumn('abs_diff', open_gap)

# Number of pairs whose gap is strictly below 100.
count_less_than_100 = df_difference.where(F.col('abs_diff') < 100).count()

# Report the count.
print(f"The count of days on which the absolute difference of the OPEN prices of SOUTHBANK and DCM was less than 100 is: {count_less_than_100}")


                                                                                

The count of days on which the absolute difference of the OPEN prices of SOUTHBANK and DCM was less than 100 is: 1013


In [7]:
from pyspark.sql import functions as F

# 'EQ' rows for the two symbols being compared.
df_southbank_flexituff = df.where(
    (F.col('SERIES') == 'EQ') & F.col('SYMBOL').isin('SOUTHBANK', 'FLEXITUFF')
)

# Self-join on TIMESTAMP so that rows sharing a timestamp are paired up, then
# keep only the pairs whose left side is SOUTHBANK and right side is FLEXITUFF.
southbank_rows = df_southbank_flexituff.alias('df1')
flexituff_rows = df_southbank_flexituff.alias('df2')
wanted_pairing = (F.col('df1.SYMBOL') == 'SOUTHBANK') & (F.col('df2.SYMBOL') == 'FLEXITUFF')
df_joined = southbank_rows.join(flexituff_rows, on='TIMESTAMP', how='inner').where(wanted_pairing)

# Absolute gap between the two OPEN values of each pair.
gap_in_open = F.abs(F.col('df1.OPEN') - F.col('df2.OPEN'))
df_difference = df_joined.withColumn('abs_diff', gap_in_open)

# Number of pairs whose gap is strictly below 100.
count_less_than_100 = df_difference.where(F.col('abs_diff') < 100).count()

# Report the count.
print(f"The count of days on which the absolute difference of the OPEN prices of SOUTHBANK and FLEXITUFF was less than 100 is: {count_less_than_100}")


                                                                                

The count of days on which the absolute difference of the OPEN prices of SOUTHBANK and FLEXITUFF was less than 100 is: 0


In [8]:
import pandas as pd

# Load the CSV into pandas under its own name. Assigning it to `df` would
# replace the Spark DataFrame that the Spark cells read from, so re-running
# any of them after this cell would fail on a pandas object.
nse_pd = pd.read_csv('./nsedata.csv')

# Keep only the rows that belong to the 'EQ' series.
df_eq = nse_pd[nse_pd['SERIES'] == 'EQ']

# Sample standard deviation of OPEN per symbol; Series.std() uses ddof=1 by
# default, i.e. the sample (not population) estimator.
stddev = df_eq.groupby('SYMBOL')['OPEN'].std()

# Symbols whose standard deviation is NaN are dropped.
stddev = stddev.dropna()

# Least volatile symbols first.
stddev_sorted = stddev.sort_values()

# Display the (truncated) ranking.
print(stddev_sorted)


SYMBOL
ASHCO             0.047434
ANTGRAPHIC        0.088676
DYNACONS          0.099952
LICNMFET          0.120807
STEELTUBES        0.128668
                  ...     
PAGEIND        3896.806592
EICHERMOT      5552.851859
BOSCHLTD       5605.167544
MRF           10841.948052
ORISSAMINE    22957.451062
Name: OPEN, Length: 1873, dtype: float64


In [9]:
                                                                                
# +----------+-------------------+
# |    SYMBOL|      sample_stddev|
# +----------+-------------------+
# |     ASHCO| 0.0474341649025257|
# |ANTGRAPHIC|0.08867592036759682|
# |  DYNACONS|0.09995166357568239|
# |  LICNMFET|0.12080734956455377|
# |STEELTUBES|0.12866839377079198|
# |   DYNASYS|0.14473621325905772|
# |       ARL|0.14666293048558598|
# |SRGINFOTEC|0.14751855655988164|
# |  NIVINFRA|0.15434872662825802|
# |TELEMARINE|0.16541619358413254|
# |DCMFINSERV| 0.1943153470683162|
# |  BIRLACOT|0.19607784047316004|
# |PARASPETRO|0.19867396375347673|
# |LCCINFOTEC|0.24022834961831302|
# |    NANDAN| 0.3037950525512987|
# |TELEDATAIT| 0.3637086874686068|
# |  LLOYDFIN|0.36433095560963713|
# |  BLUECHIP| 0.3818539875089487|
# |FTCSF5YDIV|0.39483050671428355|
# |     JCTEL|0.43513357404736036|
# +----------+-------------------+
# only showing top 20 rows



In [10]:
import pandas as pd

# Load the CSV into pandas under its own name. Assigning it to `df` would
# replace the Spark DataFrame that the Spark cells read from, so re-running
# any of them after this cell would fail on a pandas object.
nse_pd = pd.read_csv('./nsedata.csv')

# Rows for the SOUTHBANK symbol, restricted to the 'EQ' series.
df_southbank = nse_pd[(nse_pd['SYMBOL'] == 'SOUTHBANK') & (nse_pd['SERIES'] == 'EQ')]

# Sample standard deviation of OPEN; Series.std() uses ddof=1 by default.
sample_stddev = df_southbank['OPEN'].std()

# Report the value.
print(f"The sample standard deviation of the OPEN price for SOUTHBANK is: {sample_stddev}")


The sample standard deviation of the OPEN price for SOUTHBANK is: 25.450003476301827


In [11]:
import pandas as pd

# Load the CSV into pandas under its own name. Assigning it to `df` would
# replace the Spark DataFrame that the Spark cells read from, so re-running
# any of them after this cell would fail on a pandas object.
nse_pd = pd.read_csv('./nsedata.csv')

# 'EQ' rows for the two symbols being compared.
df_southbank_dcm = nse_pd[
    (nse_pd['SERIES'] == 'EQ') & (nse_pd['SYMBOL'].isin(['SOUTHBANK', 'DCM']))
]

# Pair each SOUTHBANK row with the DCM row(s) sharing its TIMESTAMP. The join
# is an inner join (merge's default, spelled out here), so timestamps present
# for only one of the two symbols are dropped. Columns that exist on both
# sides get the symbol name as a suffix.
df_joined = df_southbank_dcm[df_southbank_dcm['SYMBOL'] == 'SOUTHBANK'].merge(
    df_southbank_dcm[df_southbank_dcm['SYMBOL'] == 'DCM'],
    on='TIMESTAMP',
    how='inner',
    suffixes=('_SOUTHBANK', '_DCM')
)

# Absolute gap between the two OPEN values of each pair.
df_joined['abs_diff'] = (df_joined['OPEN_SOUTHBANK'] - df_joined['OPEN_DCM']).abs()

# Number of pairs whose gap is strictly below 100 (True counts as 1 in the sum).
count_less_than_100 = (df_joined['abs_diff'] < 100).sum()

# Report the count.
print(f"The count of days on which the absolute difference of the OPEN prices of SOUTHBANK and DCM was less than 100 is: {count_less_than_100}")


The count of days on which the absolute difference of the OPEN prices of SOUTHBANK and DCM was less than 100 is: 1013


In [12]:
import pandas as pd

# Load the CSV into pandas under its own name. Assigning it to `df` would
# replace the Spark DataFrame that the Spark cells read from, so re-running
# any of them after this cell would fail on a pandas object.
nse_pd = pd.read_csv('./nsedata.csv')

# 'EQ' rows for the two symbols being compared. The frame is named after the
# symbols it actually holds (SOUTHBANK and FLEXITUFF) rather than reusing a
# DCM-specific name.
southbank_flexituff_pd = nse_pd[
    (nse_pd['SERIES'] == 'EQ') & (nse_pd['SYMBOL'].isin(['SOUTHBANK', 'FLEXITUFF']))
]

# Pair each SOUTHBANK row with the FLEXITUFF row(s) sharing its TIMESTAMP. The
# join is an inner join (merge's default, spelled out here), so timestamps
# present for only one of the two symbols are dropped. Columns that exist on
# both sides get the symbol name as a suffix.
df_joined = southbank_flexituff_pd[southbank_flexituff_pd['SYMBOL'] == 'SOUTHBANK'].merge(
    southbank_flexituff_pd[southbank_flexituff_pd['SYMBOL'] == 'FLEXITUFF'],
    on='TIMESTAMP',
    how='inner',
    suffixes=('_SOUTHBANK', '_FLEXITUFF')
)

# Absolute gap between the OPEN values of SOUTHBANK and FLEXITUFF for each pair.
df_joined['abs_diff'] = (df_joined['OPEN_SOUTHBANK'] - df_joined['OPEN_FLEXITUFF']).abs()

# Number of pairs whose gap is strictly below 100 (True counts as 1 in the sum).
count_less_than_100 = (df_joined['abs_diff'] < 100).sum()

# Report the count.
print(f"The count of days on which the absolute difference of the OPEN prices of SOUTHBANK and FLEXITUFF was less than 100 is: {count_less_than_100}")


The count of days on which the absolute difference of the OPEN prices of SOUTHBANK and FLEXITUFF was less than 100 is: 0
