In [1]:
import datetime as dt
import pandas as pd
#!pip install yfinance
import yfinance as yf

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.sql.functions import to_date
from pyspark.ml.evaluation import RegressionEvaluator


In [3]:
# SparkSession başlatma
spark = SparkSession.builder.appName("NVDA Price Prediction").getOrCreate()

24/03/25 12:50:49 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Veriyi yükleme ve DataFrame oluşturma
df = spark.read.csv("file:///home/hduser/Desktop/NVDA/NVDA_histrical_data.csv", header=True, inferSchema=True)

# 'Date' sütununu tarih tipine dönüştürme (gerekirse)
df = df.withColumn("Date", to_date(df["Date"]))

In [5]:
df.show(5)

+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+
|      Date|               Open|               High|                Low|              Close|          Adj Close|  Volume|
+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+
|1999-03-22| 0.4466150104999542|0.44791701436042786|0.42447900772094727|0.42447900772094727| 0.3893754184246063| 3667200|
|1999-03-23|0.42708298563957214|0.42708298563957214|           0.390625| 0.3984380066394806|0.36548805236816406|16396800|
|1999-03-24|0.39583298563957214| 0.3984380066394806|0.38020798563957214|0.39583298563957214|0.36309847235679626| 6086400|
|1999-03-25| 0.3945310115814209|0.41666701436042786|0.39322900772094727|0.40104201436042786|0.36787667870521545| 4032000|
|1999-03-26|            0.40625|             0.4375|            0.40625|0.43619799613952637| 0.4001253545284271| 8827200|
+----------+------------

In [6]:
spark.createDataFrame(df.tail(5)).show()

+----------+------------------+------------------+------------------+------------------+------------------+--------+
|      Date|              Open|              High|               Low|             Close|         Adj Close|  Volume|
+----------+------------------+------------------+------------------+------------------+------------------+--------+
|2023-03-15|237.61000061035156|242.86000061035156|233.60000610351562|242.27999877929688|242.20230102539062|52448600|
|2023-03-16|240.27000427246094| 255.8800048828125|238.94000244140625|255.41000366210938|255.32810974121094|58325300|
|2023-03-17|259.82000732421875|  263.989990234375|256.67999267578125|            257.25|257.16754150390625|84854700|
|2023-03-20| 256.1499938964844|  260.239990234375| 251.3000030517578|             259.0| 258.9169616699219|43274700|
|2023-03-21|261.79998779296875| 263.9200134277344|253.80999755859375|  261.989990234375| 261.9059753417969|54740800|
+----------+------------------+------------------+--------------

In [7]:
from pyspark.sql.functions import col, count, when

# Her bir sütun için eksik değer sayısını hesaplama
missing_values = df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns])
missing_values.show()

+----+----+----+---+-----+---------+------+
|Date|Open|High|Low|Close|Adj Close|Volume|
+----+----+----+---+-----+---------+------+
|   0|   0|   0|  0|    0|        0|     0|
+----+----+----+---+-----+---------+------+



In [9]:
from pyspark.ml.feature import StandardScaler


In [10]:

# Özellikleri bir vektör haline getirme
assembler = VectorAssembler(inputCols=['Open', 'High', 'Low', 'Volume'], outputCol="features")
df_assembled = assembler.transform(df)

# Ölçeklendirme için StandardScaler kullanma
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
scalerModel = scaler.fit(df_assembled)
df_scaled = scalerModel.transform(df_assembled)

df_scaled.show(5)

+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+--------------------+--------------------+
|      Date|               Open|               High|                Low|              Close|          Adj Close|  Volume|            features|      scaledFeatures|
+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+--------------------+--------------------+
|1999-03-22| 0.4466150104999542|0.44791701436042786|0.42447900772094727|0.42447900772094727| 0.3893754184246063| 3667200|[0.44661501049995...|[-0.5119521473795...|
|1999-03-23|0.42708298563957214|0.42708298563957214|           0.390625| 0.3984380066394806|0.36548805236816406|16396800|[0.42708298563957...|[-0.5122838808479...|
|1999-03-24|0.39583298563957214| 0.3984380066394806|0.38020798563957214|0.39583298563957214|0.36309847235679626| 6086400|[0.39583298563957...|[-0.5128146333409...|
|1999-03-25| 0.3

# ozellik muhendisligi

In [11]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag

windowSpec = Window.orderBy('Date')
df = df.withColumn('Prev_Close', lag(df['Close']).over(windowSpec))
df.show(5)

+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+
|      Date|               Open|               High|                Low|              Close|          Adj Close|  Volume|         Prev_Close|
+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+
|1999-03-22| 0.4466150104999542|0.44791701436042786|0.42447900772094727|0.42447900772094727| 0.3893754184246063| 3667200|               null|
|1999-03-23|0.42708298563957214|0.42708298563957214|           0.390625| 0.3984380066394806|0.36548805236816406|16396800|0.42447900772094727|
|1999-03-24|0.39583298563957214| 0.3984380066394806|0.38020798563957214|0.39583298563957214|0.36309847235679626| 6086400| 0.3984380066394806|
|1999-03-25| 0.3945310115814209|0.41666701436042786|0.39322900772094727|0.40104201436042786|0.36787667870521545| 4032000|0.39583298563957214|
|1999-

24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [12]:
from pyspark.sql.functions import avg

short_window = Window.orderBy('Date').rowsBetween(-30, 0)
long_window = Window.orderBy('Date').rowsBetween(-90, 0)

df = df.withColumn('Short_Average', avg('Close').over(short_window))
df = df.withColumn('Long_Average', avg('Close').over(long_window))
df.show(5)

+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+-------------------+-------------------+
|      Date|               Open|               High|                Low|              Close|          Adj Close|  Volume|         Prev_Close|      Short_Average|       Long_Average|
+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+-------------------+-------------------+
|1999-03-22| 0.4466150104999542|0.44791701436042786|0.42447900772094727|0.42447900772094727| 0.3893754184246063| 3667200|               null|0.42447900772094727|0.42447900772094727|
|1999-03-23|0.42708298563957214|0.42708298563957214|           0.390625| 0.3984380066394806|0.36548805236816406|16396800|0.42447900772094727|0.41145850718021393|0.41145850718021393|
|1999-03-24|0.39583298563957214| 0.3984380066394806|0.38020798563957214|0.3958329856395721

24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [13]:
from pyspark.sql.functions import col

df = df.withColumn('Day_Pct_Change', (col('Close') - col('Prev_Close')) / col('Prev_Close'))
df.show(5)

24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+-------------------+-------------------+--------------------+
|      Date|               Open|               High|                Low|              Close|          Adj Close|  Volume|         Prev_Close|      Short_Average|       Long_Average|      Day_Pct_Change|
+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+-------------------+-------------------+--------------------+
|1999-03-22| 0.4466150104999542|0.44791701436042786|0.42447900772094727|0.42447900772094727| 0.3893754184246063| 3667200|               null|0.42447900772094727|0.42447900772094727|                null|
|1999-03-23|0.42708298563957214|0.42708298563957214|           0.390625| 0.3984380066394806|0.36548805236816406|16396800|0.42447900772094727|0.41145850718021393|0.41145850718021393|-0.0613

24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [14]:
from pyspark.sql.functions import log

df = df.withColumn('Log_Volume', log(col('Volume')))
df.show(5)

24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+-------------------+-------------------+--------------------+------------------+
|      Date|               Open|               High|                Low|              Close|          Adj Close|  Volume|         Prev_Close|      Short_Average|       Long_Average|      Day_Pct_Change|        Log_Volume|
+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+-------------------+-------------------+--------------------+------------------+
|1999-03-22| 0.4466150104999542|0.44791701436042786|0.42447900772094727|0.42447900772094727| 0.3893754184246063| 3667200|               null|0.42447900772094727|0.42447900772094727|                null|15.114938986062503|
|1999-03-23|0.42708298563957214|0.42708298563957214|           0.390625| 0.3984380066394806|0.36548805236816406|

In [15]:
# Specifying properties and target variable
feature_columns = ['Open', 'High', 'Low', 'Volume']  # Örnek özellikler
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(df)

In [16]:
# Target variable
data = data.withColumn("label", data["Close"])

In [17]:
# Separate training and test sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

In [18]:
# Model creation and training
lr = LinearRegression(featuresCol='features', labelCol='label')
model = lr.fit(train_data)

24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 1

In [19]:
# Prediction on the test set
predictions = model.transform(test_data)

In [20]:
# Evaluating performance
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")

Root Mean Squared Error (RMSE) on test data: 0.8952458650775283


24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
