# **Stock Price Prediction: Research, Benchmarking, and Modeling**

In [None]:
import pandas as pd
import os
import time
import pyarrow.parquet as pq
import polars as pl
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import numpy as np
import matplotlib.pyplot as plt

## **Part 1: Storing and Retrieving Data**

In [None]:
# Load the raw stock price CSV into the notebook-wide frame `df`.
csv_path = "all_stocks_5yr.csv"
df = pd.read_csv(csv_path)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

In [None]:
# Report the on-disk size of the source CSV (bytes / 1024 / 1024).
bytes_per_mb = 1024 * 1024
csv_size = os.path.getsize(csv_path) / bytes_per_mb
print(f"CSV File Size (1x): {csv_size:.2f} MB")

# Write the same frame as a gzip-compressed Parquet file (row index omitted)
# and report its size for comparison.
parquet_path = "all_stocks_5yr.parquet"
df.to_parquet(parquet_path, engine='pyarrow', compression='gzip', index=False)
parquet_size = os.path.getsize(parquet_path) / bytes_per_mb
print(f"Parquet File Size (1x, Gzip): {parquet_size:.2f} MB")

## **Benchmarking CSV vs Parquet (1x, 10x, 100x)**

In [None]:
# Build 10x and 100x copies of the base frame by stacking it on itself with a
# fresh 0..n-1 index.
df_10x, df_100x = (
    pd.concat([df] * factor, ignore_index=True) for factor in (10, 100)
)

# Persist each scaled frame as both CSV (no index column) and gzip Parquet.
scaled_outputs = (
    (df_10x, "all_stocks_5yr_10x.csv", "all_stocks_5yr_10x.parquet"),
    (df_100x, "all_stocks_5yr_100x.csv", "all_stocks_5yr_100x.parquet"),
)
for scaled_frame, scaled_csv_path, scaled_parquet_path in scaled_outputs:
    scaled_frame.to_csv(scaled_csv_path, index=False)
    scaled_frame.to_parquet(scaled_parquet_path, compression='gzip')

In [None]:
# One row per benchmarked file: [label, file size in MB, read seconds, write seconds].
benchmark_results = []

def benchmark_read_write(file_path, read_func, write_func, label):
    """Time one read and one write of ``file_path`` and record the result.

    The file is first loaded with ``read_func(file_path)``; the loaded object
    is then written back to the *same* path with ``write_func(obj, file_path)``,
    so the file on disk is overwritten. The size reported is the size of the
    file after that rewrite.

    Parameters
    ----------
    file_path : str
        Path of the file to read and then overwrite.
    read_func : callable
        Called as ``read_func(file_path)``; its return value is passed to
        ``write_func``.
    write_func : callable
        Called as ``write_func(loaded_object, file_path)``.
    label : str
        Name stored in the first column of the result row.

    Returns
    -------
    None
        A row ``[label, size_mb, read_seconds, write_seconds]`` is appended to
        the module-level ``benchmark_results`` list.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations; time.time() can jump if the system clock is
    # adjusted and may have coarse resolution.
    start_time = time.perf_counter()
    frame = read_func(file_path)
    read_time = time.perf_counter() - start_time

    start_time = time.perf_counter()
    write_func(frame, file_path)
    write_time = time.perf_counter() - start_time

    file_size = os.path.getsize(file_path) / (1024 * 1024)
    benchmark_results.append([label, file_size, read_time, write_time])

def _write_csv(frame, path):
    """Write ``frame`` to ``path`` as CSV without the index column."""
    frame.to_csv(path, index=False)


def _write_parquet(frame, path):
    """Write ``frame`` to ``path`` as gzip-compressed Parquet."""
    frame.to_parquet(path, compression='gzip')


# (file path, reader, writer, label) for every format/scale combination,
# listed in the order the benchmarks are executed.
benchmark_cases = [
    ("all_stocks_5yr.csv", pd.read_csv, _write_csv, "CSV 1x"),
    ("all_stocks_5yr.parquet", pd.read_parquet, _write_parquet, "Parquet 1x"),
    ("all_stocks_5yr_10x.csv", pd.read_csv, _write_csv, "CSV 10x"),
    ("all_stocks_5yr_10x.parquet", pd.read_parquet, _write_parquet, "Parquet 10x"),
    ("all_stocks_5yr_100x.csv", pd.read_csv, _write_csv, "CSV 100x"),
    ("all_stocks_5yr_100x.parquet", pd.read_parquet, _write_parquet, "Parquet 100x"),
]
for case_path, case_reader, case_writer, case_label in benchmark_cases:
    benchmark_read_write(case_path, case_reader, case_writer, case_label)

# Tabulate the collected rows, save them, and display the table.
result_columns = ["Dataset", "File Size (MB)", "Read Time (s)", "Write Time (s)"]
benchmark_df = pd.DataFrame(benchmark_results, columns=result_columns)
benchmark_df.to_csv("benchmark_results.csv", index=False)
benchmark_df

## **Part 2: Data Manipulation & Pandas vs Polars Performance**

In [None]:
# Time how long pandas and Polars each take to load the same Parquet file.
# time.perf_counter() is a monotonic, high-resolution clock meant for interval
# timing; time.time() can jump if the system clock is adjusted.
# NOTE(review): pandas reads the file first, so the Polars read may benefit
# from a warm OS file cache; treat a single run as indicative only.
start_time = time.perf_counter()
df_pandas = pd.read_parquet("all_stocks_5yr.parquet")
pandas_load_time = time.perf_counter() - start_time

start_time = time.perf_counter()
df_polars = pl.read_parquet("all_stocks_5yr.parquet")
polars_load_time = time.perf_counter() - start_time

# Save the two load times and display them as a small table.
benchmark_pandas_polars = pd.DataFrame({
    "Library": ["Pandas", "Polars"],
    "Load Time (s)": [pandas_load_time, polars_load_time]
})
benchmark_pandas_polars.to_csv("pandas_vs_polars.csv", index=False)
benchmark_pandas_polars

## **Enhancing Data with Technical Indicators**

In [None]:
# Add 20-period simple and exponential moving averages of the closing price.
# NOTE(review): both windows run over the whole frame in row order; if the CSV
# holds several tickers, windows will straddle ticker boundaries. Confirm
# whether a per-ticker groupby is needed.
close_prices = df["close"]
df["SMA_20"] = close_prices.rolling(window=20).mean()
df["EMA_20"] = close_prices.ewm(span=20, adjust=False).mean()

def calculate_rsi(series, window=14):
    """Compute a Relative Strength Index from simple rolling means.

    Parameters
    ----------
    series : pandas.Series
        Values (e.g. closing prices) in chronological order.
    window : int, default 14
        Number of consecutive changes averaged for the gain and loss terms.

    Returns
    -------
    pandas.Series
        ``100 - 100 / (1 + avg_gain / avg_loss)`` aligned with ``series``.
        The first ``window`` positions are NaN because a full window of
        ``window`` changes needs ``window + 1`` values. The result is also NaN
        where both the average gain and the average loss are zero (0 / 0).
        An average loss of zero with a positive average gain yields 100.
    """
    delta = series.diff()
    # clip() keeps the leading NaN produced by diff(), so the rolling mean only
    # becomes valid once `window` real changes are available. Replacing that
    # NaN with 0 would make the first RSI value average in a fabricated
    # "no change" observation.
    gain = delta.clip(lower=0).rolling(window).mean()
    loss = (-delta.clip(upper=0)).rolling(window).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

# Add the 14-period RSI and the MACD line (12-span EMA minus 26-span EMA) of
# the closing price.
closing = df["close"]
df["RSI_14"] = calculate_rsi(closing)
ema_fast = closing.ewm(span=12, adjust=False).mean()
ema_slow = closing.ewm(span=26, adjust=False).mean()
df["MACD"] = ema_fast - ema_slow

# Drop every row that has a missing value in any column; this rebinds `df`, so
# the unfiltered frame is no longer reachable under that name.
df = df.dropna()
print(df.head())

## **Part 3: Building & Evaluating Prediction Models**

In [None]:
# Model inputs: raw price/volume columns plus the engineered indicators.
features = [
    "open", "high", "low", "volume",
    "SMA_20", "EMA_20", "RSI_14", "MACD",
]
target = "close"

X, y = df[features], df[target]

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
# NOTE(review): train_test_split shuffles rows by default, so the test set is
# not a later time period than the training set. Confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, "scaler.pkl")

# Both regressors are trained with the same hyperparameters.
boosting_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
)

# Fit scikit-learn's gradient boosting regressor and persist it.
gbr_model = GradientBoostingRegressor(**boosting_params).fit(X_train_scaled, y_train)
joblib.dump(gbr_model, "gbr_model.pkl")

# Fit the XGBoost regressor on the same scaled training data and persist it.
xgb_model = XGBRegressor(**boosting_params).fit(X_train_scaled, y_train)
joblib.dump(xgb_model, "xgb_model.pkl")

print("✅ Gradient Boosting and XGBoost Models Saved Correctly!")