In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import pandas as pd

# start spark session and setup minio configuration
spark = SparkSession.builder \
    .appName("MinIO Integration") \
    .master("spark://spark-master:7077") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.connection.maximum", "50") \
    .config("spark.hadoop.fs.s3a.threads.core", "20") \
    .config("spark.hadoop.fs.s3a.connection.timeout", "5000") \
    .config("spark.hadoop.fs.s3a.retry.limit", "10") \
    .config("spark.hadoop.fs.s3a.attempts.maximum", "10") \
    .config("spark.hadoop.fs.s3a.multipart.size", "104857600") \
    .config("spark.default.parallelism", "8") \
    .config("spark.sql.shuffle.partitions", "8") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", "4") \
    .getOrCreate()


In [2]:
# read AAPL data from minio
df = spark.read.csv("s3a://your-bucket/AAPL.csv", header=True, inferSchema=True)
df.show()

Py4JJavaError: An error occurred while calling o60.csv.
: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
	at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2688)
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3431)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$1(DataSource.scala:724)
	at scala.collection.immutable.List.map(List.scala:293)
	at org.apache.spark.sql.execution.datasources.DataSource$.checkAndGlobPathIfNecessary(DataSource.scala:722)
	at org.apache.spark.sql.execution.datasources.DataSource.checkAndGlobPathIfNecessary(DataSource.scala:551)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:404)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:538)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
	at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2592)
	at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2686)
	... 29 more


In [None]:

# Cast 'Adj Close' and 'Date' to correct types
df = df.withColumn("Adj Close", col("Adj Close").cast("double")) \
       .withColumn("Date", col("Date").cast("date"))

# Define a function to add moving average columns
def add_moving_average(df, window_size):
    window_spec = Window.orderBy("Date").rowsBetween(-window_size + 1, 0)
    return df.withColumn(f"Moving_Avg_{window_size}", avg("Adj Close").over(window_spec))

# Add multiple moving averages (5, 10, 20, 30, 60, and 90 days)
for window in [5, 10, 20, 30, 60, 90]:
    df = add_moving_average(df, window)

print(f"total process data count: {df.count()}")
df.show()

# Convert the PySpark DataFrame back to Pandas for plotting
pd_df = df.select("Date", "Adj Close", "Moving_Avg_5", "Moving_Avg_10", "Moving_Avg_20", 
                  "Moving_Avg_30", "Moving_Avg_60", "Moving_Avg_90").toPandas()


In [None]:

# Plot the data using matplotlib
plt.figure(figsize=(14, 8))
plt.plot(pd_df['Date'], pd_df['Adj Close'], label='Adj Close', color='blue')
plt.plot(pd_df['Date'], pd_df['Moving_Avg_5'], label='5-Day MA', linestyle='--')
plt.plot(pd_df['Date'], pd_df['Moving_Avg_10'], label='10-Day MA', linestyle='--')
plt.plot(pd_df['Date'], pd_df['Moving_Avg_20'], label='20-Day MA', linestyle='--')
plt.plot(pd_df['Date'], pd_df['Moving_Avg_30'], label='30-Day MA', linestyle='--')
plt.plot(pd_df['Date'], pd_df['Moving_Avg_60'], label='60-Day MA', linestyle='--')
plt.plot(pd_df['Date'], pd_df['Moving_Avg_90'], label='90-Day MA', linestyle='--')

# Formatting the plot
plt.title('Stock Price and Moving Averages')
plt.xlabel('Date')
plt.ylabel('Adjusted Close Price')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()

# Show the plot
plt.show()