In [1]:
import datetime as dt
import pandas as pd
#!pip install yfinance
import yfinance as yf

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.sql.functions import to_date
from pyspark.ml.evaluation import RegressionEvaluator


In [3]:
# SparkSession başlatma
spark = SparkSession.builder.appName("NVDA Price Prediction").getOrCreate()

24/03/26 22:14:10 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Veriyi yükleme ve DataFrame oluşturma
data = spark.read.csv("file:///home/hduser/Desktop/NVDA/NVDA_histrical_data.csv", header=True, inferSchema=True)

# 'Date' sütununu tarih tipine dönüştürme 
data = data.withColumn("Date", to_date(data["Date"]))

In [5]:
data.show(5)

+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+
|      Date|               Open|               High|                Low|              Close|          Adj Close|  Volume|
+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+
|1999-03-22| 0.4466150104999542|0.44791701436042786|0.42447900772094727|0.42447900772094727| 0.3893754184246063| 3667200|
|1999-03-23|0.42708298563957214|0.42708298563957214|           0.390625| 0.3984380066394806|0.36548805236816406|16396800|
|1999-03-24|0.39583298563957214| 0.3984380066394806|0.38020798563957214|0.39583298563957214|0.36309847235679626| 6086400|
|1999-03-25| 0.3945310115814209|0.41666701436042786|0.39322900772094727|0.40104201436042786|0.36787667870521545| 4032000|
|1999-03-26|            0.40625|             0.4375|            0.40625|0.43619799613952637| 0.4001253545284271| 8827200|
+----------+------------

In [6]:
spark.createDataFrame(data.tail(5)).show()

+----------+------------------+------------------+------------------+------------------+------------------+--------+
|      Date|              Open|              High|               Low|             Close|         Adj Close|  Volume|
+----------+------------------+------------------+------------------+------------------+------------------+--------+
|2023-03-15|237.61000061035156|242.86000061035156|233.60000610351562|242.27999877929688|242.20230102539062|52448600|
|2023-03-16|240.27000427246094| 255.8800048828125|238.94000244140625|255.41000366210938|255.32810974121094|58325300|
|2023-03-17|259.82000732421875|  263.989990234375|256.67999267578125|            257.25|257.16754150390625|84854700|
|2023-03-20| 256.1499938964844|  260.239990234375| 251.3000030517578|             259.0| 258.9169616699219|43274700|
|2023-03-21|261.79998779296875| 263.9200134277344|253.80999755859375|  261.989990234375| 261.9059753417969|54740800|
+----------+------------------+------------------+--------------

In [7]:
# 'Close' dışındaki tüm sütunları at
df = data.select("Date", "Close")

# İlk 5 satırı göstermek için
df.show(5)


+----------+-------------------+
|      Date|              Close|
+----------+-------------------+
|1999-03-22|0.42447900772094727|
|1999-03-23| 0.3984380066394806|
|1999-03-24|0.39583298563957214|
|1999-03-25|0.40104201436042786|
|1999-03-26|0.43619799613952637|
+----------+-------------------+
only showing top 5 rows



In [8]:
# 'Close' sütunu ve 'Date' sütunu ile sınırlı DataFrame'i CSV olarak kaydet
#df.write.csv("file:///home/hduser/Desktop/NVDA/NVDA.csv", header=True)


In [9]:
from pyspark.sql.functions import col, count, when

# Her bir sütun için eksik değer sayısını hesaplama
missing_values = df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns])
missing_values.show()

+----+-----+
|Date|Close|
+----+-----+
|   0|    0|
+----+-----+



1. Veri Setini Yükleme ve İlk İşleme

In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date

# Spark session başlatma
spark = SparkSession.builder.appName("Time Series Forecasting").getOrCreate()

# Veri setinin, 'Date' sütununu tarih tipine çevirme
df = df.withColumn("Date", to_date(df["Date"], "yyyy-MM-dd"))


24/03/26 22:49:16 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


2. Öznitelik Mühendisliği ve Veri Setini Bölme

In [26]:
from pyspark.sql.functions import lag
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler

# Window tanımı yapma
windowSpec = Window.orderBy("Date")

# Geçmiş kapanış fiyatlarından öznitelikler oluşturma (lag features)
for i in range(1, 6):
    df = df.withColumn(f"lag_{i}", lag("Close", i).over(windowSpec))

# Null değerleri kaldırma
df = df.na.drop()

# Öznitelikleri bir vektör haline getirme
vectorAssembler = VectorAssembler(inputCols=[f"lag_{i}" for i in range(1, 6)], outputCol="features")

# Veri setini öznitelikler ve etiket (label) olarak ayarlama
df = vectorAssembler.transform(df)


3. Model Eğitimi ve Değerlendirme

In [27]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Modeli tanımlama ve eğitim veri seti üzerinde eğitme
lr = LinearRegression(featuresCol="features", labelCol="Close")
model = lr.fit(df)

# Eğitim veri setindeki tahminleri hesaplama
predictions = model.transform(df)

# MSE ve RMSE değerlerini hesaplama
evaluator = RegressionEvaluator(labelCol="Close", predictionCol="prediction", metricName="rmse")
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
print(f"MSE: {mse}, RMSE: {rmse}")


24/03/26 22:49:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:49:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:49:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:49:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:49:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:49:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 2

MSE: 5.197889861262216, RMSE: 2.2798881247250304


24/03/26 22:49:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:49:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:49:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:49:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:49:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:49:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 2

4. Geleceğe Yönelik Tahminlerin Yapılması

Bu adım, mevcut veri setinizin sonundaki tarihler için 22-26 Mart tarihleri arasında 5 gün ekleyerek geleceğe yönelik tahminler yapılmasını içerir. Eğer bu tarihler veri setinizin dışındaysa, bu tahminleri yapabilmek için ekstra öznitelikler oluşturmanız gerekecektir.

In [28]:
from datetime import datetime, timedelta
import pandas as pd

# En son tarihi al ve geleceğe dönük tarihler oluştur
last_date = df.select("Date").rdd.max()[0]
future_dates = [last_date + timedelta(days=x) for x in range(1, 6)]
future_df = pd.DataFrame(future_dates, columns=["Date"])

# Spark DataFrame'e dönüştür ve öznitelikler oluştur
future_sdf = spark.createDataFrame(future_df)
# Burada geleceğe yönelik öznitelikler oluşturulacak ve tahminler yapılacak


24/03/26 22:50:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:50:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:50:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:50:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:50:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:50:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 2

In [29]:
future_sdf.show()

+----------+
|      Date|
+----------+
|2023-03-22|
|2023-03-23|
|2023-03-24|
|2023-03-25|
|2023-03-26|
+----------+



In [32]:
future_predictions.show()

+----------+----------------+-----+------+------------------+------------------+--------------------+-----------------+
|      Date|           lag_1|lag_2| lag_3|             lag_4|             lag_5|            features|       prediction|
+----------+----------------+-----+------+------------------+------------------+--------------------+-----------------+
|2023-03-22|261.989990234375|259.0|257.25|255.41000366210938|242.27999877929688|[261.989990234375...|261.7372108475563|
|2023-03-23|261.989990234375|259.0|257.25|255.41000366210938|242.27999877929688|[261.989990234375...|261.7372108475563|
|2023-03-24|261.989990234375|259.0|257.25|255.41000366210938|242.27999877929688|[261.989990234375...|261.7372108475563|
|2023-03-25|261.989990234375|259.0|257.25|255.41000366210938|242.27999877929688|[261.989990234375...|261.7372108475563|
|2023-03-26|261.989990234375|259.0|257.25|255.41000366210938|242.27999877929688|[261.989990234375...|261.7372108475563|
+----------+----------------+-----+-----

In [31]:
# Önceki adımlarda oluşturduğunuz modeli kullanarak tahmin yapma
# 'future_sdf' içindeki gecikmeli özniteliklerin adları: 'lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5'
# Bu özniteliklerin nasıl hesaplanacağı modelinize ve veri setinize bağlıdır

# Örneğin, son 5 günün kapanış fiyatlarını kullanarak bu öznitelikleri oluşturun:
# Bu örnekte, 'df' mevcut veri setinizdir ve son günün kapanış fiyatlarına sahip olduğunuzu varsayıyoruz

from pyspark.sql import functions as F

# 'future_sdf' için gerekli öznitelikleri oluşturma
for i in range(1, 6):
    # Son günün kapanış fiyatını 'lag_1' olarak kullanarak geriye doğru ilerleyin
    # Burada veri setinizden uygun bir şekilde öznitelikleri doldurmanız gerekmekte
    future_sdf = future_sdf.withColumn(f"lag_{i}", F.lit(df.select(F.col("Close")).collect()[-i][0]))

# Öznitelik vektörünü oluşturun
future_sdf = vectorAssembler.transform(future_sdf)

# Tahmin yapma
future_predictions = model.transform(future_sdf)

# Tahmin edilen değerleri gösterme
future_predictions.select("Date", "prediction").show()


24/03/26 22:53:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:53:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:53:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:53:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:53:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:53:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 2

+----------+-----------------+
|      Date|       prediction|
+----------+-----------------+
|2023-03-22|261.7372108475563|
|2023-03-23|261.7372108475563|
|2023-03-24|261.7372108475563|
|2023-03-25|261.7372108475563|
|2023-03-26|261.7372108475563|
+----------+-----------------+



In [33]:
# En son gerçek veriyi veya önceki tahmini kullanarak lag özniteliklerini güncelleme işlevi
def update_lag_features(last_values, new_prediction):
    # Son tahmin değerini ilk gecikme olarak ekleyin ve diğerlerini bir adım öteleme
    updated_values = [new_prediction] + last_values[:-1]
    return updated_values

# İlk gecikme değerleri olarak mevcut bilinen son kapanış fiyatlarını kullanma
last_known_values = [df.select(F.col("Close")).collect()[-i][0] for i in range(1, 6)]

# Gelecekteki her bir tarih için tahminler yapma
for future_date in future_dates:
    # Yeni lag özniteliklerini oluşturma
    new_row = [future_date] + last_known_values
    new_sdf = spark.createDataFrame([new_row], schema=future_sdf.schema)
    
    # Öznitelik vektörünü oluştur
    new_sdf = vectorAssembler.transform(new_sdf)
    
    # Model ile tahmin yapma
    new_prediction = model.transform(new_sdf).select("prediction").collect()[0][0]
    
    # Tahminleri saklama ve sonraki adım için lag özniteliklerini güncelleme
    last_known_values = update_lag_features(last_known_values, new_prediction)

# Son tahminleri içeren DataFrame'i gösterme


24/03/26 22:57:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:57:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:57:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:57:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:57:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:57:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 2

ValueError: Length of object (6) does not match with length of fields (7)

In [34]:
from pyspark.sql import Row

# Gelecekteki her bir tarih için tahmin yapmak için döngü
for i in range(1, 6):
    # Yeni tahmin için gerekli gecikmeleri hazırla
    # Burada son bilinen gerçek değerleri kullanıyorsunuz
    lag_values = [float(df.select(F.col("Close")).collect()[-j][0]) for j in range(i, i+5)]
    future_date = last_date + timedelta(days=i)
    
    # Yeni bir satır oluştur. Features sütunu daha sonra doldurulacak
    new_row = Row(Date=future_date, lag_1=lag_values[0], lag_2=lag_values[1], 
                  lag_3=lag_values[2], lag_4=lag_values[3], lag_5=lag_values[4])
    
    # Spark DataFrame'ine dönüştür ve öznitelik vektörünü oluştur
    new_sdf = spark.createDataFrame([new_row])
    new_sdf = vectorAssembler.transform(new_sdf)
    
    # Tahmin yap
    new_prediction = model.transform(new_sdf).select("prediction").collect()[0][0]
    
    # İleri tahminler için yeni gecikmeli değerler oluştur
    lag_values.pop(0) # En eski lag değerini çıkar
    lag_values.append(new_prediction) # Yeni tahmini ekle
    
    # Tahmini kaydet
    future_predictions.append((future_date, new_prediction))


24/03/26 22:59:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:59:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:59:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:59:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:59:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:59:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 2

AttributeError: 'DataFrame' object has no attribute 'append'

In [35]:
# Gelecekteki tahminleri saklayacak bir liste oluştur
future_predictions_list = []

# Gelecekteki her bir tarih için tahmin yapmak için döngü
for i in range(1, 6):
    # Yeni tahmin için gerekli gecikmeleri hazırla
    # Burada son bilinen gerçek değerleri kullanıyorsunuz
    lag_values = [float(df.select(F.col("Close")).collect()[-j][0]) for j in range(i, i+5)]
    future_date = last_date + timedelta(days=i)
    
    # Yeni bir satır oluştur. Features sütunu daha sonra doldurulacak
    new_row = Row(Date=future_date, lag_1=lag_values[0], lag_2=lag_values[1], 
                  lag_3=lag_values[2], lag_4=lag_values[3], lag_5=lag_values[4])
    
    # Spark DataFrame'ine dönüştür ve öznitelik vektörünü oluştur
    new_sdf = spark.createDataFrame([new_row])
    new_sdf = vectorAssembler.transform(new_sdf)
    
    # Tahmin yap
    new_prediction = model.transform(new_sdf).select("prediction").collect()[0][0]
    
    # İleri tahminler için yeni gecikmeli değerler oluştur
    lag_values.pop(0) # En eski lag değerini çıkar
    lag_values.append(new_prediction) # Yeni tahmini ekle
    
    # Tahmini ve tarihi listeye ekle
    future_predictions_list.append((future_date, new_prediction))

# Listeyi DataFrame'e dönüştür
future_predictions_df = spark.createDataFrame(future_predictions_list, ["Date", "Prediction"])

# Son tahminleri göster
future_predictions_df.show()


24/03/26 23:01:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 23:01:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 23:01:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 23:01:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 23:01:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 23:01:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 2

+----------+------------------+
|      Date|        Prediction|
+----------+------------------+
|2023-03-22| 261.7372108475563|
|2023-03-23|259.15790149520495|
|2023-03-24| 256.5088494127346|
|2023-03-25|255.16916597172144|
|2023-03-26|241.96367562703267|
+----------+------------------+



Adım 1: Öznitelik Mühendisliği
Veri setinizi Spark DataFrame'e dönüştürün.
lag fonksiyonunu kullanarak, her bir tarih için önceki günlerin kapanış fiyatlarını içeren yeni sütunlar ekleyin. Bu sütunlar, modelin öğrenme sürecinde kullanılacak öznitelikler olacak.

In [12]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Time Series Forecasting").getOrCreate()

In [14]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, col

window = Window.orderBy("Date")
for i in range(1, 6):  # 5 günlük gecikmeler
    df = df.withColumn(f"lag_{i}", lag("Close", i).over(window))
df.show(5)

Adım 2: Model Eğitimi
Öznitelikler hazır olduğunda, bir makine öğrenimi modeli seçin ve eğitin. Örneğin, RandomForestRegressor kullanabilirsiniz.

In [16]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline

assembler = VectorAssembler(inputCols=[f"lag_{i}" for i in range(1, 6)], outputCol="features")
rf = RandomForestRegressor(featuresCol="features", labelCol="Close")

pipeline = Pipeline(stages=[assembler, rf])

# Eğitim veri setinde null değerler içeren satırlar kaldırılmalıdır
df = df.na.drop()
model = pipeline.fit(df)


24/03/26 22:30:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:30:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:30:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:30:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:30:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Adım 3: Tahmin
Modeli eğittikten sonra, gelecek değerleri tahmin etmek için kullanabilirsiniz.

In [21]:
predictions = model.transform(df)
predictions.select("Date", "Close", "prediction").show(5)

+----------+-------------------+------------------+
|      Date|              Close|        prediction|
+----------+-------------------+------------------+
|1999-03-29| 0.4492189884185791|1.4970622882490372|
|1999-03-30|0.41145798563957214|1.4970622882490372|
|1999-03-31|0.44010400772094727|1.4970622882490372|
|1999-04-01|0.42708298563957214|1.4970622882490372|
|1999-04-05| 0.4140630066394806|1.4970622882490372|
+----------+-------------------+------------------+
only showing top 5 rows



24/03/26 22:32:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:32:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:32:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:32:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:32:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [20]:
spark.createDataFrame(predictions.select("Date", "Close", "prediction").tail(5)).show()

24/03/26 22:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/26 22:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+------------------+------------------+
|      Date|             Close|        prediction|
+----------+------------------+------------------+
|2023-03-15|242.27999877929688|254.12904826549556|
|2023-03-16|255.41000366210938|254.12904826549556|
|2023-03-17|            257.25|254.12904826549556|
|2023-03-20|             259.0|254.12904826549556|
|2023-03-21|  261.989990234375|254.12904826549556|
+----------+------------------+------------------+



In [23]:
from datetime import datetime, timedelta
from pyspark.sql import SparkSession

# Spark session başlatma
spark = SparkSession.builder.appName("Future Dates").getOrCreate()

# Veri setinizin son tarihini string olarak alın
last_date_str = '2023-03-21'
last_date = datetime.strptime(last_date_str, "%Y-%m-%d")

# Gelecekteki 5 gün için tarih listesi oluştur
future_dates = [last_date + timedelta(days=x) for x in range(1, 6)]

# Gelecekteki tarihler için pandas DataFrame oluştur
future_df = pd.DataFrame(future_dates, columns=['Date'])

# Pandas DataFrame'ini Spark DataFrame'ine dönüştür
future_sdf = spark.createDataFrame(future_df)

# Sonuçları göster
future_sdf.show()


+-------------------+
|               Date|
+-------------------+
|2023-03-22 00:00:00|
|2023-03-23 00:00:00|
|2023-03-24 00:00:00|
|2023-03-25 00:00:00|
|2023-03-26 00:00:00|
+-------------------+



In [24]:
from pyspark.ml.regression import LinearRegression

# Öznitelikleri hazırlama ve vektör haline getirme
vectorAssembler = VectorAssembler(inputCols=["lag_1", "lag_2", "lag_3", "lag_4", "lag_5"], outputCol="features")

# Lineer regresyon modelini tanımlama
lr = LinearRegression(featuresCol="features", labelCol="Close")

# Pipeline oluşturma
pipeline = Pipeline(stages=[vectorAssembler, lr])

# Modeli eğitme
model = pipeline.fit(train_data)

# Son 5 günlük tahminler için özniteliklerin oluşturulması
# Bu kısım, mevcut verinizin son gününden itibaren ileri tarihler için yapılmalıdır
# Örneğin, son gün 2023-03-21 ise, 2023-03-22 için öznitelikler oluşturup model.transform kullanarak tahmin yapmalısınız

# Not: Bu örnekte, 'future_dates' adında ileri tarihleri içeren bir DataFrame'iniz olduğunu varsayıyoruz
# Bu DataFrame, tahmin etmek istediğiniz tarihler için gerekli öznitelikleri (gecikmeleri vb.) içermelidir

# Tahmin yapma
predictions = model.transform(future_sdf)
predictions.select("Date", "prediction").show()


NameError: name 'train_data' is not defined

In [9]:
from pyspark.ml.feature import StandardScaler


In [10]:

# Özellikleri bir vektör haline getirme
assembler = VectorAssembler(inputCols=['Open', 'High', 'Low', 'Volume'], outputCol="features")
df_assembled = assembler.transform(df)

# Ölçeklendirme için StandardScaler kullanma
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
scalerModel = scaler.fit(df_assembled)
df_scaled = scalerModel.transform(df_assembled)

df_scaled.show(5)

+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+--------------------+--------------------+
|      Date|               Open|               High|                Low|              Close|          Adj Close|  Volume|            features|      scaledFeatures|
+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+--------------------+--------------------+
|1999-03-22| 0.4466150104999542|0.44791701436042786|0.42447900772094727|0.42447900772094727| 0.3893754184246063| 3667200|[0.44661501049995...|[-0.5119521473795...|
|1999-03-23|0.42708298563957214|0.42708298563957214|           0.390625| 0.3984380066394806|0.36548805236816406|16396800|[0.42708298563957...|[-0.5122838808479...|
|1999-03-24|0.39583298563957214| 0.3984380066394806|0.38020798563957214|0.39583298563957214|0.36309847235679626| 6086400|[0.39583298563957...|[-0.5128146333409...|
|1999-03-25| 0.3

# ozellik muhendisligi

In [11]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag

windowSpec = Window.orderBy('Date')
df = df.withColumn('Prev_Close', lag(df['Close']).over(windowSpec))
df.show(5)

+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+
|      Date|               Open|               High|                Low|              Close|          Adj Close|  Volume|         Prev_Close|
+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+
|1999-03-22| 0.4466150104999542|0.44791701436042786|0.42447900772094727|0.42447900772094727| 0.3893754184246063| 3667200|               null|
|1999-03-23|0.42708298563957214|0.42708298563957214|           0.390625| 0.3984380066394806|0.36548805236816406|16396800|0.42447900772094727|
|1999-03-24|0.39583298563957214| 0.3984380066394806|0.38020798563957214|0.39583298563957214|0.36309847235679626| 6086400| 0.3984380066394806|
|1999-03-25| 0.3945310115814209|0.41666701436042786|0.39322900772094727|0.40104201436042786|0.36787667870521545| 4032000|0.39583298563957214|
|1999-

24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [12]:
from pyspark.sql.functions import avg

short_window = Window.orderBy('Date').rowsBetween(-30, 0)
long_window = Window.orderBy('Date').rowsBetween(-90, 0)

df = df.withColumn('Short_Average', avg('Close').over(short_window))
df = df.withColumn('Long_Average', avg('Close').over(long_window))
df.show(5)

+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+-------------------+-------------------+
|      Date|               Open|               High|                Low|              Close|          Adj Close|  Volume|         Prev_Close|      Short_Average|       Long_Average|
+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+-------------------+-------------------+
|1999-03-22| 0.4466150104999542|0.44791701436042786|0.42447900772094727|0.42447900772094727| 0.3893754184246063| 3667200|               null|0.42447900772094727|0.42447900772094727|
|1999-03-23|0.42708298563957214|0.42708298563957214|           0.390625| 0.3984380066394806|0.36548805236816406|16396800|0.42447900772094727|0.41145850718021393|0.41145850718021393|
|1999-03-24|0.39583298563957214| 0.3984380066394806|0.38020798563957214|0.3958329856395721

24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [13]:
from pyspark.sql.functions import col

df = df.withColumn('Day_Pct_Change', (col('Close') - col('Prev_Close')) / col('Prev_Close'))
df.show(5)

24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+-------------------+-------------------+--------------------+
|      Date|               Open|               High|                Low|              Close|          Adj Close|  Volume|         Prev_Close|      Short_Average|       Long_Average|      Day_Pct_Change|
+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+-------------------+-------------------+--------------------+
|1999-03-22| 0.4466150104999542|0.44791701436042786|0.42447900772094727|0.42447900772094727| 0.3893754184246063| 3667200|               null|0.42447900772094727|0.42447900772094727|                null|
|1999-03-23|0.42708298563957214|0.42708298563957214|           0.390625| 0.3984380066394806|0.36548805236816406|16396800|0.42447900772094727|0.41145850718021393|0.41145850718021393|-0.0613

24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [14]:
from pyspark.sql.functions import log

df = df.withColumn('Log_Volume', log(col('Volume')))
df.show(5)

24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+-------------------+-------------------+--------------------+------------------+
|      Date|               Open|               High|                Low|              Close|          Adj Close|  Volume|         Prev_Close|      Short_Average|       Long_Average|      Day_Pct_Change|        Log_Volume|
+----------+-------------------+-------------------+-------------------+-------------------+-------------------+--------+-------------------+-------------------+-------------------+--------------------+------------------+
|1999-03-22| 0.4466150104999542|0.44791701436042786|0.42447900772094727|0.42447900772094727| 0.3893754184246063| 3667200|               null|0.42447900772094727|0.42447900772094727|                null|15.114938986062503|
|1999-03-23|0.42708298563957214|0.42708298563957214|           0.390625| 0.3984380066394806|0.36548805236816406|

In [15]:
# Specifying properties and target variable
feature_columns = ['Open', 'High', 'Low', 'Volume']  # Örnek özellikler
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(df)

In [16]:
# Target variable
data = data.withColumn("label", data["Close"])

In [17]:
# Separate training and test sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

In [18]:
# Model creation and training
lr = LinearRegression(featuresCol='features', labelCol='label')
model = lr.fit(train_data)

24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 1

In [19]:
# Prediction on the test set
predictions = model.transform(test_data)

In [20]:
# Evaluating performance
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")

Root Mean Squared Error (RMSE) on test data: 0.8952458650775283


24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/25 12:50:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
