In [None]:
!pip install pandas numpy pyspark findspark boto3

In [None]:
#Importing all necessary libraries

import os
from datetime import datetime

import boto3
import findspark
import numpy
import pandas
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

findspark.init()

In [2]:
# Print the version banner of the installed Spark distribution (Spark, Scala and Java versions).
!pyspark --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.0.3
      /_/
                        
Using Scala version 2.12.10, Java HotSpot(TM) 64-Bit Server VM, 1.8.0_321
Branch HEAD
Compiled by user ubuntu on 2021-06-17T04:08:22Z
Revision 65ac1e75dc468f53fc778cd2ce1ba3f21067aab8
Url https://github.com/apache/spark
Type --help for more information.


In [3]:
# Create the Spark session shared by every later cell.

# Base configuration: application name and a local master with two worker threads.
conf = SparkConf().setAppName("Purush_ETL").setMaster("local[2]")

# NOTE(review): with a `local[2]` master the executor-* and dynamicAllocation
# settings below are presumably ignored by Spark - confirm before relying on them.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .config("spark.executor.memory", "2g")
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "5")
    .config("spark.driver.memory", "2g")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "1")
    .config("spark.dynamicAllocation.maxExecutors", "3")
    .enableHiveSupport()
    .getOrCreate()
)

NameError: name 'SparkConf' is not defined

In [None]:
# Output location for the data warehouse - presumably where results are written;
# left empty here and must be filled in before running (TODO confirm intended use).
pathforDWH = ""
# Base S3 location of the stock CSV files; each symbol name is appended to it
# (separated by "/") to form the paths that are read.
s3_bucket_path = ""

In [None]:
# Configure AWS credentials
# Secrets must not be typed into the notebook (they end up in version control
# and shared copies). Read them from the environment instead; when a variable
# is unset the value is None and boto3 falls back to its default credential
# chain (shared credentials file, instance profile, ...).
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")

# Set up the AWS session
session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

## Reading the data from S3

### Reading the Metadata

In [None]:
# Load the symbol metadata.
# `options` takes keyword arguments (the original `"delimiter": ","` form is a
# SyntaxError). `header=True` is required because later cells address the
# columns by name (Symbol, Name, Country, Sector, Industry).
sym_meta = spark.read.options(delimiter=",", header=True).csv("symbol_metadata.csv")

sym_meta.printSchema() #to check the Schema of the dataframe
#sym_meta.show()

In [None]:
## Before reading the Stock data, we need to make sure we read the Stock data
## only for those companies/symbols that are listed in the Metadata file.
# A Spark Column cannot be iterated on the driver, so the symbol values are
# collected explicitly. Nulls and duplicates are dropped so that every entry
# can be turned into exactly one input path.
sym_list = [
    row["Symbol"]
    for row in sym_meta.select("Symbol").dropna().distinct().collect()
]

## Reading the Stock data

In [None]:
# One input path per symbol listed in the metadata.
stock_paths = [s3_bucket_path + "/" + symbol for symbol in sym_list]

# Read every stock file and tag each row with the symbol it belongs to.
# - header=True: the reports reference columns by name (timestamp, open, ...).
# - inferSchema=True: prices/volume must be numeric, otherwise max/min would
#   compare strings lexicographically (costs one extra pass over the data).
# - DataFrames are immutable, so the derived column must be part of the value
#   assigned to `stock_data`; a bare `withColumn(...)` statement is discarded.
# - input_file_name() returns the full file path; only its last segment is
#   kept, which equals the symbol given how `stock_paths` is built above.
#   NOTE(review): if the real objects carry an extension (e.g. ".csv"), both
#   the path construction and this extraction need adjusting - confirm.
stock_data = (
    spark.read
    .options(delimiter=",", header=True, inferSchema=True)
    .csv(stock_paths)
    .withColumn("Symbol", F.regexp_extract(F.input_file_name(), r"([^/]+)$", 1))
    .repartition("Symbol")
)

stock_data.printSchema() #to check the Schema of the dataframe
#stock_data.show()

In [None]:
# Enrich every stock row with its company metadata.
# Joining on the column *name* keeps a single `Symbol` column in the result;
# joining on an equality expression and selecting both sides' `Symbol` leaves
# two identically named columns, which makes `stock_data_full["Symbol"]`
# ambiguous in the reports below.
stock_data_full = stock_data.join(
    sym_meta.select("Symbol", "Name", "Country", "Sector", "Industry"),
    on="Symbol",
    how="left",
)

stock_data_full.printSchema()

### Summary Report (All Time)

In [None]:
def summary_report_all_func(stock_data_full, industries):
    """Build the all-time summary report, one row per sector.

    Args:
        stock_data_full: Spark DataFrame with at least the columns
            ``Sector``, ``open``, ``close``, ``high``, ``low`` and ``volume``.
        industries: Values to keep; they are matched against the ``Sector``
            column (the parameter name is historical).

    Returns:
        A Spark DataFrame grouped by ``Sector`` with the average open, close
        and volume, the maximum high and the minimum low.
    """
    # The aggregates must come from pyspark.sql.functions: the Python builtins
    # `max`/`min` try to iterate the Column and raise TypeError.
    summary_report_output__all_time = (
        stock_data_full
        .filter(stock_data_full["Sector"].isin(industries))
        .groupBy(stock_data_full["Sector"])
        .agg(
            F.avg(stock_data_full["open"]).alias("Avg Open Price"),
            F.avg(stock_data_full["close"]).alias("Avg Close Price"),
            F.max(stock_data_full["high"]).alias("Max High Price"),
            F.min(stock_data_full["low"]).alias("Min Low Price"),
            F.avg(stock_data_full["volume"]).alias("Avg Volume"),
        )
    )

    summary_report_output__all_time.printSchema()

    # Kept in a named variable (rather than returned directly) to ease
    # inspection while developing.
    return summary_report_output__all_time

In [None]:
# Ask which sectors the all-time summary report should cover.
no_industries = int(input("Enter the number of industries for which you wanted Summary Report (All Time):"))
arr = input()   # takes the whole line of no_industries strings
# Split on ',' and trim each name: without trimming, "Technology, Finance"
# yields " Finance", which never matches a Sector value. Empty entries
# (e.g. from a trailing comma) are dropped.
industries = [name.strip() for name in arr.split(',') if name.strip()]

summary_report_all_func(stock_data_full, industries).show(n = len(sym_list))


### Summary Report (Period)

In [None]:
def summary_report_period_func(stock_data_full, sectors, start_date, end_date):
    """Build the per-sector summary report for a date range.

    Args:
        stock_data_full: Spark DataFrame with at least the columns ``Sector``,
            ``timestamp``, ``open``, ``close``, ``high``, ``low``, ``volume``.
        sectors: Sector names to include.
        start_date: Inclusive lower bound of the period (a date literal such
            as ``datetime.date`` / ISO string, or a Column).
        end_date: Inclusive upper bound of the period (same kinds as above).

    Returns:
        A Spark DataFrame grouped by ``Sector`` with the average open, close
        and volume, the maximum high and the minimum low within the period.
    """
    # The default to_date conversion handles ISO "yyyy-MM-dd" text as well as
    # timestamp-typed columns. The former "YYYY-MM-DD" pattern means
    # week-year / day-of-year and is rejected by Spark 3.
    trade_date = F.to_date(stock_data_full["timestamp"])

    # DataFrame.filter accepts a single condition, so both predicates are
    # combined with `&` instead of being passed as two arguments.
    in_scope = (
        stock_data_full["Sector"].isin(sectors)
        & trade_date.between(start_date, end_date)
    )

    # Aggregates come from pyspark.sql.functions; the Python builtins
    # `max`/`min` cannot operate on a Column.
    summary_report_output__period = (
        stock_data_full
        .filter(in_scope)
        .groupBy(stock_data_full["Sector"])
        .agg(
            F.avg(stock_data_full["open"]).alias("Avg Open Price"),
            F.avg(stock_data_full["close"]).alias("Avg Close Price"),
            F.max(stock_data_full["high"]).alias("Max High Price"),
            F.min(stock_data_full["low"]).alias("Min Low Price"),
            F.avg(stock_data_full["volume"]).alias("Avg Volume"),
        )
    )

    # Kept in a named variable (rather than returned directly) to ease
    # inspection while developing.
    return summary_report_output__period

In [None]:
# Parse the period bounds on the driver. Passing the typed text straight to
# Spark's to_date() would make Spark look for a *column* with that name; parsing
# here also rejects malformed dates immediately with a ValueError.
start_date = datetime.strptime(
    input("Enter the start date of the period for Summary Report [YYYY-MM-DD]:").strip(), "%Y-%m-%d"
).date()
end_date = datetime.strptime(
    input("Enter the end date of the period for Summary Report [YYYY-MM-DD]:").strip(), "%Y-%m-%d"
).date()
if start_date > end_date:
    raise ValueError("The start date must not be after the end date.")

no_sectors = int(input("Enter the number of sectors for which you wanted Summary Report (Given Period):"))
arr = input()   # takes the whole line of no_sectors strings
# Split on ',' and trim each name so "A, B" matches the Sector values exactly;
# empty entries are dropped.
sectors = [name.strip() for name in arr.split(',') if name.strip()]

summary_report_period_func(stock_data_full, sectors, start_date, end_date).show(n = len(sym_list))

### Detailed Reports (Period)

In [None]:
def detailed_report_period_func(stock_data_full, sectors, start_date, end_date):
    """Build the per-company detailed report for a date range.

    Args:
        stock_data_full: Spark DataFrame with at least the columns ``Symbol``,
            ``Name``, ``Sector``, ``timestamp``, ``open``, ``close``, ``high``,
            ``low`` and ``volume``.
        sectors: Sector names to include.
        start_date: Inclusive lower bound of the period (a date literal such
            as ``datetime.date`` / ISO string, or a Column).
        end_date: Inclusive upper bound of the period (same kinds as above).

    Returns:
        A Spark DataFrame grouped by ``Symbol`` and ``Name`` with the average
        open, close and volume, the maximum high and the minimum low within
        the period.
    """
    # The default to_date conversion handles ISO "yyyy-MM-dd" text as well as
    # timestamp-typed columns. The former "YYYY-MM-DD" pattern means
    # week-year / day-of-year and is rejected by Spark 3.
    trade_date = F.to_date(stock_data_full["timestamp"])

    # DataFrame.filter accepts a single condition, so both predicates are
    # combined with `&` instead of being passed as two arguments.
    in_scope = (
        stock_data_full["Sector"].isin(sectors)
        & trade_date.between(start_date, end_date)
    )

    # Aggregates come from pyspark.sql.functions; the Python builtins
    # `max`/`min` cannot operate on a Column.
    detailed_report_output__period = (
        stock_data_full
        .filter(in_scope)
        .groupBy(stock_data_full["Symbol"], stock_data_full["Name"])
        .agg(
            F.avg(stock_data_full["open"]).alias("Avg Open Price"),
            F.avg(stock_data_full["close"]).alias("Avg Close Price"),
            F.max(stock_data_full["high"]).alias("Max High Price"),
            F.min(stock_data_full["low"]).alias("Min Low Price"),
            F.avg(stock_data_full["volume"]).alias("Avg Volume"),
        )
    )

    detailed_report_output__period.printSchema()

    # The report has to be returned: callers chain `.show()` on the result,
    # which fails on the implicit None of a function without `return`.
    return detailed_report_output__period

In [None]:
# Parse the period bounds on the driver. Passing the typed text straight to
# Spark's to_date() would make Spark look for a *column* with that name; parsing
# here also rejects malformed dates immediately with a ValueError.
start_date = datetime.strptime(
    input("Enter the start date of the period for Detailed Reports [YYYY-MM-DD]:").strip(), "%Y-%m-%d"
).date()
end_date = datetime.strptime(
    input("Enter the end date of the period for Detailed Reports [YYYY-MM-DD]:").strip(), "%Y-%m-%d"
).date()
if start_date > end_date:
    raise ValueError("The start date must not be after the end date.")

no_sectors = int(input("Enter the number of sectors for which you wanted Detailed Report (Given Period):"))
arr = input()   # takes the whole line of no_sectors strings
# Split on ',' and trim each name so "A, B" matches the Sector values exactly;
# empty entries are dropped.
sectors = [name.strip() for name in arr.split(',') if name.strip()]

# Call the report *function*; `detailed_report_output__period` is only a local
# variable inside it and does not exist at notebook level.
detailed_report_period_func(stock_data_full, sectors, start_date, end_date).show(n = len(sym_list))

In [None]:
# Persist the enriched stock data as Parquet.
# Fixes relative to the original one-liner: `repartion` -> `repartition`,
# DataFrameWriter has no `.path()` (the target goes to `.save()`), and the
# undefined `pathforDL` is replaced by the `pathforDWH` setting defined earlier.
# NOTE(review): the original referenced an undefined `df` and a non-existent
# `Month` column. This assumes the joined `stock_data_full` is what should be
# stored and that "Month" means the calendar month (yyyy-MM) of `timestamp` -
# confirm both.
stock_data_monthly = stock_data_full.withColumn(
    "Month",
    F.date_format(F.to_date(stock_data_full["timestamp"]), "yyyy-MM"),
)
stock_data_monthly.repartition("Month").write.mode("overwrite").format("parquet").save(pathforDWH)

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()