# Processamento

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session for this notebook (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName("Stock Price Analysis")
    .getOrCreate()
)

In [4]:
# Read the CSV data at "StockData", using the first line as the column names.
stocks = spark.read.option("header", True).csv("StockData")

In [5]:
# Preview the first 5 rows.
stocks.show(5)

+------+----------+----------+-------+--------+--------+--------+
|Ticker|      Date|Close/Last| Volume|    Open|    High|     Low|
+------+----------+----------+-------+--------+--------+--------+
| BRK-B|05/31/2023|  $321.08 |6175417|$321.12 |$322.41 |$319.39 |
| BRK-B|05/30/2023|  $322.19 |3232461|$321.86 |$322.47 |$319.00 |
| BRK-B|05/26/2023|  $320.60 |3229873|$320.44 |$322.63 |$319.67 |
| BRK-B|05/25/2023|  $319.02 |4251935|$320.56 |$320.56 |$317.71 |
| BRK-B|05/24/2023|  $320.20 |3075393|$322.71 |$323.00 |$319.56 |
+------+----------+----------+-------+--------+--------+--------+
only showing top 5 rows



In [8]:
stocks.printSchema() # columns and their data types

root
 |-- Ticker: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)



In [None]:
stocks.select("Ticker").show(3) # select only the Ticker column

In [10]:
# Select several columns of the DataFrame at once.
stocks.select("Ticker", "Date", "Open").show(5)

+------+----------+--------+
|Ticker|      Date|    Open|
+------+----------+--------+
| BRK-B|05/31/2023|$321.12 |
| BRK-B|05/30/2023|$321.86 |
| BRK-B|05/26/2023|$320.44 |
| BRK-B|05/25/2023|$320.56 |
| BRK-B|05/24/2023|$322.71 |
+------+----------+--------+
only showing top 5 rows



In [13]:
# Filtering: keep the Microsoft (MSFT) rows for the selected date.
stocks.where(
    (stocks["Ticker"] == "MSFT") & (stocks["Date"] == "05/31/2023")
).show(10)

+------+----------+----------+--------+--------+--------+--------+
|Ticker|      Date|Close/Last|  Volume|    Open|    High|     Low|
+------+----------+----------+--------+--------+--------+--------+
|  MSFT|05/31/2023|  $328.39 |45950550|$332.29 |$335.94 |$327.33 |
+------+----------+----------+--------+--------+--------+--------+



In [18]:
# Select the rows of two different tickers on the selected date.
# The two ticker conditions must be joined with | rather than &: with &, a row's
# Ticker would have to be MSFT and V at the same time, which is impossible.
stocks.where(
    (stocks["Date"] == "05/31/2023")
    & ((stocks["Ticker"] == "MSFT") | (stocks["Ticker"] == "V"))
).show(10)

+------+----+----------+------+----+----+---+
|Ticker|Date|Close/Last|Volume|Open|High|Low|
+------+----+----------+------+----+----+---+
+------+----+----------+------+----+----+---+



In [19]:
# Ticker is one of the five listed values, on the selected date.
stocks.where(
    stocks["Ticker"].isin("MSFT", "QQQ", "SPY", "V", "TSLA")
    & (stocks["Date"] == "05/31/2023")
).show(10)

+------+----------+----------+---------+--------+--------+--------+
|Ticker|      Date|Close/Last|   Volume|    Open|    High|     Low|
+------+----------+----------+---------+--------+--------+--------+
|  MSFT|05/31/2023|  $328.39 | 45950550|$332.29 |$335.94 |$327.33 |
|  TSLA|05/31/2023|  $203.93 |150711700|$199.78 |$203.95 |$195.12 |
|     V|05/31/2023|  $221.03 | 20460620|$219.96 |$221.53 |$216.14 |
|   SPY|05/31/2023|    417.85|110811800|  418.28|  419.22|  416.22|
|   QQQ|05/31/2023|    347.99| 65105380|  348.37|   350.6|  346.51|
+------+----------+----------+---------+--------+--------+--------+



In [20]:
# Using UDFs: User Defined Functions
from pyspark.sql.functions import udf
from pyspark.sql.types import DateType
from datetime import datetime


def _parse_us_date(date):
    """Convert an "MM/DD/YYYY" string into a datetime.date.

    Returns None for a null or blank input, so a single missing value yields
    a null cell instead of failing the whole Spark job. A non-blank string
    that does not match the format still raises ValueError.
    """
    if date is None or not date.strip():
        return None
    # .date() drops the midnight time component so the value matches DateType.
    return datetime.strptime(date.strip(), "%m/%d/%Y").date()


# udf(arg1, arg2): arg1 is the Python function to run on each value and arg2 is
# the Spark type of the values it returns. date_parser therefore takes a string
# column and produces a DateType column.
date_parser = udf(_parse_us_date, DateType())

In [23]:
# Apply the UDF: it is evaluated for every row of the Date column and the
# result is stored in a new ParsedDate column.
stocks = stocks.withColumn(
    "ParsedDate",
    date_parser(stocks["Date"]),
)
stocks.show(n=5)

+------+----------+----------+-------+--------+--------+--------+----------+
|Ticker|      Date|Close/Last| Volume|    Open|    High|     Low|ParsedDate|
+------+----------+----------+-------+--------+--------+--------+----------+
| BRK-B|05/31/2023|  $321.08 |6175417|$321.12 |$322.41 |$319.39 |2023-05-31|
| BRK-B|05/30/2023|  $322.19 |3232461|$321.86 |$322.47 |$319.00 |2023-05-30|
| BRK-B|05/26/2023|  $320.60 |3229873|$320.44 |$322.63 |$319.67 |2023-05-26|
| BRK-B|05/25/2023|  $319.02 |4251935|$320.56 |$320.56 |$317.71 |2023-05-25|
| BRK-B|05/24/2023|  $320.20 |3075393|$322.71 |$323.00 |$319.56 |2023-05-24|
+------+----------+----------+-------+--------+--------+--------+----------+
only showing top 5 rows



In [None]:
# Print the schema to inspect the column types after adding ParsedDate.
stocks.printSchema()

In [29]:
# Function that turns a price such as "$321.08 " into a float
def num_parser(value):
    """Parse a price value into a Python float.

    Args:
        value: A string (optionally padded with whitespace, prefixed or
            suffixed with "$", and containing "," thousands separators),
            an int, or a float.

    Returns:
        The value as a float, or None when the input is not a str/int/float
        or is a blank or unparseable string.
    """
    if isinstance(value, str):
        # Trim whitespace first so a padded "$" is still removed, then drop
        # the currency sign and any thousands separators.
        cleaned = value.strip().strip("$").strip().replace(",", "")
        if not cleaned:
            return None
        try:
            return float(cleaned)
        except ValueError:
            # Inside a UDF an exception aborts the whole job; a null cell
            # for a malformed value is easier to inspect afterwards.
            return None
    if isinstance(value, (int, float)):
        # Always hand back a float: a UDF declared with FloatType that
        # returns a Python int produces null instead of the number.
        return float(value)
    return None

# quick test
print(num_parser("$456.89"))

456


In [34]:
from pyspark.sql.types import FloatType

# A plain Python function can be wrapped as a Spark UDF.
number_parser = udf(num_parser, returnType=FloatType())

# Bracket indexing is used for every column because "Close/Last" is not a valid
# Python attribute name (stocks.Close/Last would be parsed as a division).
stocks = (
    stocks
    .withColumn("Open", number_parser(stocks["Open"]))
    .withColumn("Close", number_parser(stocks["Close/Last"]))
    .withColumn("High", number_parser(stocks["High"]))
    .withColumn("Low", number_parser(stocks["Low"]))
)

stocks.show(n=10)

+------+----------+----------+-------+------+------+------+----------+------+
|Ticker|      Date|Close/Last| Volume|  Open|  High|   Low|ParsedDate| Close|
+------+----------+----------+-------+------+------+------+----------+------+
| BRK-B|05/31/2023|  $321.08 |6175417|321.12|322.41|319.39|2023-05-31|321.08|
| BRK-B|05/30/2023|  $322.19 |3232461|321.86|322.47| 319.0|2023-05-30|322.19|
| BRK-B|05/26/2023|  $320.60 |3229873|320.44|322.63|319.67|2023-05-26| 320.6|
| BRK-B|05/25/2023|  $319.02 |4251935|320.56|320.56|317.71|2023-05-25|319.02|
| BRK-B|05/24/2023|  $320.20 |3075393|322.71| 323.0|319.56|2023-05-24| 320.2|
| BRK-B|05/23/2023|  $323.11 |4031342|328.19|329.27|322.97|2023-05-23|323.11|
| BRK-B|05/22/2023|  $329.13 |2763422|330.75|331.49|328.35|2023-05-22|329.13|
| BRK-B|05/19/2023|  $330.39 |4323538| 331.0|333.94|329.12|2023-05-19|330.39|
| BRK-B|05/18/2023|  $329.76 |2808329|326.87|329.98|325.85|2023-05-18|329.76|
| BRK-B|05/17/2023|  $327.39 |3047626|325.02|328.26|324.82|2023-

In [37]:
stocks.printSchema()

from pyspark.sql.types import IntegerType


def _to_int(num):
    """Convert a value to int, passing null through as None.

    Without the None check, int(None) raises TypeError and a single null
    Volume would fail the whole Spark job.
    """
    return None if num is None else int(num)


# UDF that converts Volume to an integer.
# NOTE: IntegerType is a 32-bit type; use LongType if volumes can exceed 2**31 - 1.
integer_parser = udf(_to_int, IntegerType())

root
 |-- Ticker: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- ParsedDate: date (nullable = true)
 |-- Close: float (nullable = true)



In [39]:
# Convert Volume to int.
stocks = stocks.withColumn("Volume", integer_parser(stocks.Volume))
# printSchema() is called without arguments: its optional `level` parameter only
# exists from PySpark 3.5 onwards (older versions raise TypeError) and has no
# effect on a flat schema like this one.
stocks.printSchema()

root
 |-- Ticker: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- ParsedDate: date (nullable = true)
 |-- Close: float (nullable = true)



In [40]:
# Drop the Date and Close/Last columns by selecting only the columns to keep.
stocks = stocks.select(
    "Ticker", "ParsedDate", "Close", "Volume", "Open", "High", "Low"
)
stocks.show(n=5)

+------+----------+------+-------+------+------+------+
|Ticker|ParsedDate| Close| Volume|  Open|  High|   Low|
+------+----------+------+-------+------+------+------+
| BRK-B|2023-05-31|321.08|6175417|321.12|322.41|319.39|
| BRK-B|2023-05-30|322.19|3232461|321.86|322.47| 319.0|
| BRK-B|2023-05-26| 320.6|3229873|320.44|322.63|319.67|
| BRK-B|2023-05-25|319.02|4251935|320.56|320.56|317.71|
| BRK-B|2023-05-24| 320.2|3075393|322.71| 323.0|319.56|
+------+----------+------+-------+------+------+------+
only showing top 5 rows



In [42]:
# Basic summary statistics for the numeric columns.
stocks.describe("Close", "Volume", "Open", "High", "Low").show()

+-------+------------------+--------------------+------------------+------------------+------------------+
|summary|             Close|              Volume|              Open|              High|               Low|
+-------+------------------+--------------------+------------------+------------------+------------------+
|  count|             15108|               15108|             15108|             15108|             15108|
|   mean| 180.1256089860054|5.1868408793685466E7|180.09656566181036| 182.1253348687101| 177.9982781513109|
| stddev|101.14891782168563| 5.496484129953464E7|101.16125813324399|101.96625521621753|100.26590135955216|
|    min|             11.93|              961133|             12.07|             12.45|              11.8|
|    max|            477.71|           914080943|            479.22|            479.98|            476.06|
+-------+------------------+--------------------+------------------+------------------+------------------+



# Análise

#### Estrutura dos Dados

- *Ticker:* abreviação usada na bolsa para representar uma empresa
- *Date:* data de registro dos dados; cada data corresponde a um dia de negociação na bolsa (no DataFrame processado, essa coluna se chama `ParsedDate`)
- *Close:* último preço da ação no final do dia de negociação (é o valor citado quando se diz que "a ação da empresa X fechou hoje em tanto")
- *Volume:* quantidade de ações que foram compradas e vendidas no dia
- *Open:* primeiro preço da ação na abertura do mercado daquele dia
- *High:* maior preço atingido pela ação no dia
- *Low:* menor preço atingido pela ação no dia

In [None]:
# Calcular preço máximo de estoque entre vários estoques
stocks.