In [0]:
# Path of the raw share-market CSV file to load.
file_location = "/FileStore/tables/sharemarket.csv"

# Read the file as an RDD of raw text lines (each element is one unparsed line).
rdd = sc.textFile(file_location)

In [0]:
# Peek at the first two raw lines to check the file layout.
rdd.take(2)

Out[7]: ['N,N1,IRFC,BOND 8.00% PA TAX FREE S1,1086,1085,1085,1084.54,1085,3371959.44,3108, ,8,1194,1050',
 'N,N1,JNPT,BOND 6.82% PA TAX FREE S1,1001,1007.1,1015,1007.1,1015,51370,51, ,2,1529.99,1000']

In [0]:
# Split every raw line into its comma-separated fields.
# The RDD is rebuilt from the source file instead of from `rdd` itself, so
# re-running this cell never tries to call .split() on rows that an earlier
# run already turned into lists.
# NOTE(review): a plain split(',') does not honour quoted fields; this assumes
# no field in the file contains an embedded comma - confirm against the data.
rdd = sc.textFile(file_location).map(lambda line: line.split(','))
rdd.take(3)

Out[8]: [['N',
  'N1',
  'IRFC',
  'BOND 8.00% PA TAX FREE S1',
  '1086',
  '1085',
  '1085',
  '1084.54',
  '1085',
  '3371959.44',
  '3108',
  ' ',
  '8',
  '1194',
  '1050'],
 ['N',
  'N1',
  'JNPT',
  'BOND 6.82% PA TAX FREE S1',
  '1001',
  '1007.1',
  '1015',
  '1007.1',
  '1015',
  '51370',
  '51',
  ' ',
  '2',
  '1529.99',
  '1000'],
 ['N',
  'N1',
  'NHAI',
  'BOND 8.20% PA TAX FREE S1',
  '1091.38',
  '1092',
  '1092',
  '1087',
  '1089.98',
  '3861145.49',
  '3544',
  ' ',
  '39',
  '1140',
  '1045']]

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType
# Schema for the parsed rows: one nullable string column per CSV field, listed
# in the same order as the fields appear in each line.  Numeric-looking fields
# are kept as strings here as well.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "MARKET",
        "SERIES",
        "SYMBOL",
        "SECURITY",
        "PREV_CL_PR",
        "OPEN_PRICE",
        "HIGH_PRICE",
        "LOW_PRICE",
        "CLOSE_PRICE",
        "NET_TRDVAL",
        "NET_TRDQTY",
        "CORP_IND",
        "TRADES",
        "HI_52_WK",
        "LO_52_WK",
    )
])


In [0]:
# Build a DataFrame from the RDD of field lists, applying the all-string schema.
df = spark.createDataFrame(rdd, schema)

In [0]:
# Preview the first five rows of the freshly created DataFrame.
df.show(5)

+------+------+------+--------------------+----------+----------+----------+---------+-----------+----------+----------+--------+------+--------+--------+
|MARKET|SERIES|SYMBOL|            SECURITY|PREV_CL_PR|OPEN_PRICE|HIGH_PRICE|LOW_PRICE|CLOSE_PRICE|NET_TRDVAL|NET_TRDQTY|CORP_IND|TRADES|HI_52_WK|LO_52_WK|
+------+------+------+--------------------+----------+----------+----------+---------+-----------+----------+----------+--------+------+--------+--------+
|     N|    N1|  IRFC|BOND 8.00% PA TAX...|      1086|      1085|      1085|  1084.54|       1085|3371959.44|      3108|        |     8|    1194|    1050|
|     N|    N1|  JNPT|BOND 6.82% PA TAX...|      1001|    1007.1|      1015|   1007.1|       1015|     51370|        51|        |     2| 1529.99|    1000|
|     N|    N1|  NHAI|BOND 8.20% PA TAX...|   1091.38|      1092|      1092|     1087|    1089.98|3861145.49|      3544|        |    39|    1140|    1045|
|     N|    N1|  NTPC|8.41%S-R-NCD SERI...|   1198.99|      1135|     

In [0]:
# Remove the CORP_IND column (the sample rows shown above hold only a blank there).
df = df.drop('CORP_IND')

In [0]:
# Preview again to confirm CORP_IND is no longer among the columns.
df.show(5)

+------+------+------+--------------------+----------+----------+----------+---------+-----------+----------+----------+------+--------+--------+
|MARKET|SERIES|SYMBOL|            SECURITY|PREV_CL_PR|OPEN_PRICE|HIGH_PRICE|LOW_PRICE|CLOSE_PRICE|NET_TRDVAL|NET_TRDQTY|TRADES|HI_52_WK|LO_52_WK|
+------+------+------+--------------------+----------+----------+----------+---------+-----------+----------+----------+------+--------+--------+
|     N|    N1|  IRFC|BOND 8.00% PA TAX...|      1086|      1085|      1085|  1084.54|       1085|3371959.44|      3108|     8|    1194|    1050|
|     N|    N1|  JNPT|BOND 6.82% PA TAX...|      1001|    1007.1|      1015|   1007.1|       1015|     51370|        51|     2| 1529.99|    1000|
|     N|    N1|  NHAI|BOND 8.20% PA TAX...|   1091.38|      1092|      1092|     1087|    1089.98|3861145.49|      3544|    39|    1140|    1045|
|     N|    N1|  NTPC|8.41%S-R-NCD SERI...|   1198.99|      1135|      1135|     1135|       1135|    113500|       100|    

In [0]:
# Register the DataFrame as the temp view "ShareMarket" so it can be queried with spark.sql.
df.createOrReplaceTempView("ShareMarket")

In [0]:
# Sanity check: query the temp view through SQL and show five rows.
spark.sql("select * from ShareMarket limit 5").show()


+------+------+------+--------------------+----------+----------+----------+---------+-----------+----------+----------+------+--------+--------+
|MARKET|SERIES|SYMBOL|            SECURITY|PREV_CL_PR|OPEN_PRICE|HIGH_PRICE|LOW_PRICE|CLOSE_PRICE|NET_TRDVAL|NET_TRDQTY|TRADES|HI_52_WK|LO_52_WK|
+------+------+------+--------------------+----------+----------+----------+---------+-----------+----------+----------+------+--------+--------+
|     N|    N1|  IRFC|BOND 8.00% PA TAX...|      1086|      1085|      1085|  1084.54|       1085|3371959.44|      3108|     8|    1194|    1050|
|     N|    N1|  JNPT|BOND 6.82% PA TAX...|      1001|    1007.1|      1015|   1007.1|       1015|     51370|        51|     2| 1529.99|    1000|
|     N|    N1|  NHAI|BOND 8.20% PA TAX...|   1091.38|      1092|      1092|     1087|    1089.98|3861145.49|      3544|    39|    1140|    1045|
|     N|    N1|  NTPC|8.41%S-R-NCD SERI...|   1198.99|      1135|      1135|     1135|       1135|    113500|       100|    

In [0]:
# 1. Number of distinct series present in the data (Spark SQL on the temp view).
result1 = spark.sql(
    "select count(distinct SERIES) as Total_Count from ShareMarket"
)

# Collapse to one partition so a single CSV part file (with header) is written.
# NOTE(review): mode("append") adds another part file on every run - confirm
# that accumulating output across runs is intended.
(
    result1
    .coalesce(1)
    .write
    .format("csv")
    .option("header", "true")
    .mode("append")
    .save("/FileStore/tables/output1.txt")
)

In [0]:
# 2. List the distinct series present in the data (Spark SQL on the temp view).
result2 = spark.sql(
    "select distinct SERIES from ShareMarket "
)

# Write the result as one CSV part file with a header row, appending to any
# output already saved under this path.
(
    result2
    .coalesce(1)
    .write
    .format("csv")
    .option("header", "true")
    .mode("append")
    .save("/FileStore/tables/output2.txt")
)

In [0]:
# 3. Per-series totals of each of the five price columns.
# The price columns are declared as strings in the schema, so sum() depends on
# Spark converting them to numbers implicitly.
result3 = spark.sql(
    "select SERIES, sum(PREV_CL_PR) , sum(OPEN_PRICE) , sum(HIGH_PRICE) , sum(LOW_PRICE) , sum(CLOSE_PRICE) from ShareMarket group by SERIES"
)

# Write the result as one CSV part file with a header row, appending to any
# output already saved under this path.
(
    result3
    .coalesce(1)
    .write
    .format("csv")
    .option("header", "true")
    .mode("append")
    .save("/FileStore/tables/output3.txt")
)

In [0]:
# 4. Security and series of the row(s) with the highest net trade value.
# NET_TRDVAL is stored as a string, so a plain max(NET_TRDVAL) would pick the
# lexicographically largest text (e.g. '51370' sorts above '3371959.44').
# Both sides are cast to double so the comparison is numeric.  The selected
# NET_TRDVAL column itself is still returned in its original string form.
result4 = spark.sql(
    "select SECURITY, SERIES, NET_TRDVAL from ShareMarket "
    "where cast(NET_TRDVAL as double) = "
    "(select max(cast(NET_TRDVAL as double)) from ShareMarket)"
)
# Write the result as one CSV part file with a header row, appending to any
# output already saved under this path.
result4.coalesce(1).write.format("csv").option("header", "true").mode("append").save("/FileStore/tables/output4.txt")

In [0]:
# 5. Rows whose five prices added together exceed the net trade value.
# The comparison is made row by row (no grouping); Total_price is the rounded
# sum of the five price columns.
# NOTE(review): the task wording says "series" - confirm whether a per-series
# aggregate was intended instead of this per-row filter.
result5 = spark.sql(
    "select SERIES, round(PREV_CL_PR + OPEN_PRICE + HIGH_PRICE + LOW_PRICE + CLOSE_PRICE) as Total_price, NET_TRDVAL from ShareMarket where (PREV_CL_PR + OPEN_PRICE + HIGH_PRICE + LOW_PRICE + CLOSE_PRICE) > NET_TRDVAL "
)

# Write the result as one CSV part file with a header row, appending to any
# output already saved under this path.
(
    result5
    .coalesce(1)
    .write
    .format("csv")
    .option("header", "true")
    .mode("append")
    .save("/FileStore/tables/output5.txt")
)

In [0]:
# 6. Series of the row(s) with the highest net trade quantity.
# NET_TRDQTY is stored as a string, so max(NET_TRDQTY) on the raw column would
# compare text (e.g. '51' sorts above '3544').  Both sides are cast to double
# so the maximum is found numerically.
result6 = spark.sql(
    "select series from ShareMarket "
    "where cast(NET_TRDQTY as double) = "
    "(select max(cast(NET_TRDQTY as double)) from ShareMarket)"
)
# Write the result as one CSV part file with a header row, appending to any
# output already saved under this path.
result6.coalesce(1).write.format("csv").option("header", "true").mode("append").save("/FileStore/tables/output6.txt")

In [0]:
# 7. Highest and lowest open price across all rows.
# OPEN_PRICE is stored as a string, so max()/min() on the raw column would
# return the lexicographic extremes rather than the numeric ones.  The column
# is cast to double first; the two result columns are therefore doubles.
result7 = spark.sql(
    "select max(cast(OPEN_PRICE as double)) as max_open_price, "
    "min(cast(OPEN_PRICE as double)) as min_open_price "
    "from ShareMarket"
)
# Write the result as one CSV part file with a header row, appending to any
# output already saved under this path.
result7.coalesce(1).write.format("csv").option("header", "true").mode("append").save("/FileStore/tables/output7.txt")

In [0]:
# 8. Series of every row with more than 80 trades.
# One output row per matching input row, so a series can appear repeatedly.
# TRADES is a string column; the "> 80" test depends on Spark converting it
# to a number implicitly.
result8 = spark.sql(
    "select series from ShareMarket where Trades  > 80"
)

# Write the result as one CSV part file with a header row, appending to any
# output already saved under this path.
(
    result8
    .coalesce(1)
    .write
    .format("csv")
    .option("header", "true")
    .mode("append")
    .save("/FileStore/tables/output8.txt")
)

In [0]:
# 9. For each series, the summed difference between net trade value and
# net trade quantity (value minus quantity, totalled over the series' rows).
result9 = spark.sql(
    "select series, sum(net_trdval - net_trdqty) as difference from ShareMarket group by SERIES "
)

# Write the result as one CSV part file with a header row, appending to any
# output already saved under this path.
(
    result9
    .coalesce(1)
    .write
    .format("csv")
    .option("header", "true")
    .mode("append")
    .save("/FileStore/tables/output9.txt")
)