In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import cudf, cupy as cp
from cuml.svm import SVC as cuSVC
from cuml.preprocessing import StandardScaler as cuScaler

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, roc_curve
)

In [4]:
merged_df = pd.read_csv('../Data/merged.csv')
merged_df['Date'] = pd.to_datetime(merged_df['Date'], format='mixed', dayfirst=True, errors='coerce')
merged_df = merged_df.sort_values(["Symbol", "Date"])

merged_df.head()

Unnamed: 0,Symbol,GICS Sector,Headquarters Location,Founded,Date,Close,Volume,News - Positive Sentiment,News - Negative Sentiment,News - New Products,...,Close_lag2,Close_lag3,Close_lag4,Close_lag5,Close_lag6,Close_lag7,Close_lag8,Close_lag9,Close_lag10,Profit
0,A,Health Care,"Santa Clara, California",1999,2020-10-15,105.32,723000,1.0,0.0,0.0,...,105.419998,105.43,105.760002,104.160004,103.879997,101.629997,103.120003,100.010002,101.220001,1
1,A,Health Care,"Santa Clara, California",1999,2020-10-16,106.699997,1039400,0.0,0.0,1.0,...,105.059998,105.419998,105.43,105.760002,104.160004,103.879997,101.629997,103.120003,100.010002,1
2,A,Health Care,"Santa Clara, California",1999,2020-10-19,105.489998,636000,0.0,0.0,1.0,...,105.32,105.059998,105.419998,105.43,105.760002,104.160004,103.879997,101.629997,103.120003,0
3,A,Health Care,"Santa Clara, California",1999,2020-10-20,105.610001,771000,2.0,0.0,0.0,...,106.699997,105.32,105.059998,105.419998,105.43,105.760002,104.160004,103.879997,101.629997,1
4,A,Health Care,"Santa Clara, California",1999,2020-10-21,104.830002,894000,0.0,0.0,0.0,...,105.489998,106.699997,105.32,105.059998,105.419998,105.43,105.760002,104.160004,103.879997,0


In [5]:
gdf = cudf.DataFrame.from_pandas(merged_df)

In [7]:
gdf = cudf.get_dummies(gdf, columns=["GICS Sector"], prefix="GICS", dtype="int8")
lag_cols  = [f"Close_lag{k}" for k in range(1, 11)]
news_cols = [c for c in gdf.columns if c.startswith("News -")]
sector_dummies = [c for c in gdf.columns if c.startswith("GICS_")]
num_features = lag_cols + ["Volume"] + news_cols
feature_cols = num_features + sector_dummies

In [9]:
cutoff_date = gdf["Date"].quantile(0.8)
train_gdf   = gdf[gdf["Date"] <= cutoff_date]
test_gdf    = gdf[gdf["Date"]  > cutoff_date]

X_train = train_gdf[feature_cols]
y_train = train_gdf["Profit"]
X_test  = test_gdf [feature_cols]
y_test  = test_gdf ["Profit"]

In [13]:
svc_gpu = cuSVC(
    kernel="rbf",
    probability=True,
    random_state=42
)
svc_gpu.fit(X_train, y_train)



In [20]:
y_pred_gpu = svc_gpu.predict(X_test)
y_test_np      = cp.asnumpy(y_test)
y_pred_np      = cp.asnumpy(y_pred_gpu)

print("Accuracy :", round(accuracy_score(y_test_np, y_pred_np),4))

Accuracy : 0.4894


In [21]:
train_gdf = train_gdf[train_gdf["Symbol"] == "AAPL"]
test_gdf = test_gdf[test_gdf["Symbol"] == "AAPL"]

X_train = train_gdf[feature_cols]
y_train = train_gdf["Profit"]
X_test  = test_gdf [feature_cols]
y_test  = test_gdf ["Profit"]

svc_gpu = cuSVC(
    kernel="rbf",
    probability=True,
    random_state=42
)
svc_gpu.fit(X_train, y_train)

y_pred_gpu = svc_gpu.predict(X_test)
y_test_np      = cp.asnumpy(y_test)
y_pred_np      = cp.asnumpy(y_pred_gpu)

print("Accuracy :", round(accuracy_score(y_test_np, y_pred_np),4))

Accuracy : 0.5412
