In [1]:
# # **Loading Hive Tables and Data Preparation for Analysis**

import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.sql.functions import * 
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("Stock Analysis").config(
    "spark.ui.port", "0").config(
        "spark.sql.catalogImplementation=hive").config(
        "spark.sql.warehouse.dir",
        "hdfs://nameservice1/user/itv003722/warehouse").config(
            "spark.serializer",
    "org.apache.spark.serializer.KryoSerializer").enableHiveSupport().getOrCreate()
spark.sparkContext.setLogLevel('OFF')

In [4]:
SA = spark.table('bdsh_project.stock_detail')

In [5]:
SA.createOrReplaceTempView('SA')

In [6]:
SA.show()

+----+-----+------+--------------------+-----------+-----------+--------------------+-----+-----+-----+-----+--------+
|year|month|symbol|         companyname|      state|     sector|        sub_industry| open|close|  low| high|  volume|
+----+-----+------+--------------------+-----------+-----------+--------------------+-----+-----+-----+-----+--------+
|2010|    1|     A|Agilent Technolog...| California|Health Care|Health Care Equip...|20.69|20.05|19.97|20.87| 5923000|
|2010|    1|     A|Agilent Technolog...| California|Health Care|Health Care Equip...|20.84|20.52|20.38|21.01| 4937100|
|2010|    1|     A|Agilent Technolog...| California|Health Care|Health Care Equip...|20.89|20.87|20.55|20.95| 4362100|
|2010|    1|     A|Agilent Technolog...| California|Health Care|Health Care Equip...|21.01|20.95|20.73| 21.1| 2696800|
|2010|    1|     A|Agilent Technolog...| California|Health Care|Health Care Equip...|21.04|21.07|20.91|21.21| 3608500|
|2010|    1|     A|Agilent Technolog...| Califor

In [7]:
SA.printSchema()
SA.count()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- symbol: string (nullable = true)
 |-- companyname: string (nullable = true)
 |-- state: string (nullable = true)
 |-- sector: string (nullable = true)
 |-- sub_industry: string (nullable = true)
 |-- open: double (nullable = true)
 |-- close: double (nullable = true)
 |-- low: double (nullable = true)
 |-- high: double (nullable = true)
 |-- volume: integer (nullable = true)



851259

In [8]:
maxyearInDataset = SA.select(
    greatest(max(col('year')), max(col('month')))).show()

maxyearInDataset = SA.select(
    greatest(max(col('year')), max(col('month')))).head()[0]

+-------------------------------+
|greatest(max(year), max(month))|
+-------------------------------+
|                           2016|
+-------------------------------+



In [9]:
#To check the Growth of eac company to  Add the column growth
SA=SA.withColumn('growth',round(((F.col("open")-F.col("close"))/F.col("open"))*100,2))


In [6]:
SA.createOrReplaceTempView('SA')
SA.show()

+----+-----+------+--------------------+-----------+-----------+--------------------+-----+-----+-----+-----+--------+------+
|year|month|symbol|         companyname|      state|     sector|        sub_industry| open|close|  low| high|  volume|growth|
+----+-----+------+--------------------+-----------+-----------+--------------------+-----+-----+-----+-----+--------+------+
|2010|    1|     A|Agilent Technolog...| California|Health Care|Health Care Equip...|20.69|20.05|19.97|20.87| 5923000|  3.09|
|2010|    1|     A|Agilent Technolog...| California|Health Care|Health Care Equip...|20.84|20.52|20.38|21.01| 4937100|  1.54|
|2010|    1|     A|Agilent Technolog...| California|Health Care|Health Care Equip...|20.89|20.87|20.55|20.95| 4362100|   0.1|
|2010|    1|     A|Agilent Technolog...| California|Health Care|Health Care Equip...|21.01|20.95|20.73| 21.1| 2696800|  0.29|
|2010|    1|     A|Agilent Technolog...| California|Health Care|Health Care Equip...|21.04|21.07|20.91|21.21| 3608500|

In [8]:
SA.select('growth').show(5)

+------+
|growth|
+------+
|  3.09|
|  1.54|
|   0.1|
|  0.29|
| -0.14|
+------+
only showing top 5 rows



In [11]:
#Find the top five companies that are good for investment
spark.sql("""
 select companyname,max(growth) as growth from SA group by companyname order by growth desc
         """).show(5)

+--------------------+------+
|         companyname|growth|
+--------------------+------+
|     First Solar Inc| 25.54|
|       Williams Cos.| 25.25|
|Michael Kors Hold...|  24.2|
|   Chesapeake Energy| 20.31|
|Freeport-McMoran ...| 20.19|
+--------------------+------+
only showing top 5 rows



In [20]:
#Show the best-growing industry by each state, having at least two or more industries mapped.
spark.sql("""

 select state,sub_industry,max(growth) as growth from SA
 group by state,sub_industry order by growth desc
 
 """).show(5)

+---------+--------------------+------+
|    state|        sub_industry|growth|
+---------+--------------------+------+
|  Arizona|      Semiconductors| 25.54|
| Oklahoma|Oil & Gas Explora...| 25.25|
| New York|Apparel; Accessor...|  24.2|
| Oklahoma|Integrated Oil & Gas| 20.31|
|  Arizona|              Copper| 20.19|
+---------+--------------------+------+
only showing top 5 rows



In [44]:
#1) Best Year for each Sector
SA.groupBy('sector','year').agg(
    max('growth').alias('max_growth')).orderBy(
        col('max_growth').desc()).show()

+--------------------+----+----------+
|              sector|year|max_growth|
+--------------------+----+----------+
|Information Techn...|2011|     25.54|
|              Energy|2016|     25.25|
|Consumer Discreti...|2015|      24.2|
|           Materials|2016|     20.19|
|         Health Care|2016|     20.01|
|              Energy|2012|      18.9|
|Consumer Discreti...|2016|      18.8|
|         Industrials|2015|     18.73|
|              Energy|2010|      18.7|
|         Health Care|2011|     18.01|
|           Utilities|2015|     17.73|
|    Consumer Staples|2013|     17.36|
|         Health Care|2015|     17.13|
|         Industrials|2014|      17.1|
|Information Techn...|2013|     16.86|
|Information Techn...|2016|     16.81|
|Information Techn...|2012|     15.94|
|         Industrials|2011|     15.08|
|         Industrials|2016|     14.93|
|              Energy|2015|     14.79|
+--------------------+----+----------+
only showing top 20 rows



In [10]:
#Worst year of each Sector
SA.groupBy('sector','year').agg(
    min('growth').alias('min_growth')).orderBy(
        col('min_growth').desc()).show(5)

+--------------------+----+----------+
|              sector|year|min_growth|
+--------------------+----+----------+
|           Utilities|2012|     -5.41|
|Telecommunication...|2015|      -6.0|
|           Utilities|2013|     -6.47|
|           Utilities|2014|     -6.47|
|Telecommunication...|2013|     -6.74|
+--------------------+----+----------+
only showing top 5 rows



In [18]:

SA[col('year') == 2010].select(
    'sector', 'year','month','growth').orderBy(col('month').asc()).show()

+-----------+----+-----+------+
|     sector|year|month|growth|
+-----------+----+-----+------+
|Health Care|2010|    1|  3.09|
|Health Care|2010|    1|  1.54|
|Health Care|2010|    1|   0.1|
|Health Care|2010|    1|  0.29|
|Health Care|2010|    1| -0.14|
|Health Care|2010|    1|  3.87|
|Health Care|2010|    1| -1.43|
|Health Care|2010|    1| -0.69|
|Health Care|2010|    1| -0.32|
|Health Care|2010|    1|  0.37|
|Health Care|2010|    1| -1.83|
|Health Care|2010|    1|  -0.5|
|Health Care|2010|    1| -0.09|
|Health Care|2010|    1|   0.0|
|Health Care|2010|    1|  0.18|
|Health Care|2010|    1|  1.53|
|Health Care|2010|    1|  0.76|
|Health Care|2010|    1|  2.51|
|Health Care|2010|    1|  0.27|
|Industrials|2010|    1|-10.86|
+-----------+----+-----+------+
only showing top 20 rows



In [32]:
#Year wise maximum growth of Companies
spark.sql("""

 select sector,(case when year=2010 then max(growth) else 0 end  )as growth2010,
 (case when year=2011 then max(growth) else 0 end )as growth2011 ,
 (case when year=2012 then max(growth) else 0 end )as growth2012,
 (case when year=2013 then max(growth) else 0 end )as growth2013 ,
 (case when year=2014 then max(growth) else 0 end )as growth2014,
 (case when year=2015 then max(growth) else 0 end )as growth2015,
 (case when year=2016 then max(growth) else 0 end )as growth2016  from SA 
 group by year,sector
 
 """).show()

+--------------------+----------+----------+----------+----------+----------+----------+----------+
|              sector|growth2010|growth2011|growth2012|growth2013|growth2014|growth2015|growth2016|
+--------------------+----------+----------+----------+----------+----------+----------+----------+
|         Industrials|       0.0|       0.0|       0.0|     13.26|       0.0|       0.0|       0.0|
|           Materials|       0.0|       0.0|       0.0|       0.0|      9.34|       0.0|       0.0|
|Telecommunication...|       0.0|       0.0|       0.0|       5.0|       0.0|       0.0|       0.0|
|    Consumer Staples|       0.0|       0.0|       0.0|       0.0|       0.0|       0.0|     14.49|
|Consumer Discreti...|      9.16|       0.0|       0.0|       0.0|       0.0|       0.0|       0.0|
|Information Techn...|       0.0|       0.0|       0.0|       0.0|       0.0|       0.0|     16.81|
|Information Techn...|       0.0|       0.0|       0.0|       0.0|       0.0|     10.54|       0.0|
