# 1. Import thư viện 

In [66]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# 2. Merge Data

In [None]:
data = pd.read_csv("data/data_2019_2025.csv")
df_ratios = pd.read_csv("data/fundamental_ratios_extracted_2019_2025.csv")


data["date"] = pd.to_datetime(data["timestamp"])
df_ratios["report_date"] = pd.to_datetime(
    df_ratios["year"].astype(str) + "Q" + df_ratios["quarter"].astype(str)
)

data = pd.merge_asof(
    data.sort_values("date"),
    df_ratios.sort_values("report_date"),
    by="ticker",
    left_on="date",
    right_on="report_date",
    direction="backward"
)
print("Sau merge:", data.shape)

Sau merge: (90585, 29)


  df_ratios["report_date"] = pd.to_datetime(


# 3. Predict whether this ticker will increase by 10% in 20 days later: 

In [68]:
future_days = 20
threshold = 0.1

data["future_close"] = data.groupby("ticker")["close"].shift(-future_days)
data["future_return"] = (data["future_close"] - data["close"]) / data["close"]
data["label"] = (data["future_return"] >= threshold).astype(int)

print("Số label 1:", data["label"].sum(), "/", len(data))

Số label 1: 15077 / 90585


# 4. SVM regression: 

In [69]:
features = ["RSI_14", "MA20", "MA50", "MA200", "ATR20", "Vol_Avg20", "PE", "ROE"]

df_ml = data.dropna(subset=features + ["label"]).copy()
print("Sau dropna:", df_ml.shape)

if df_ml.empty:
    raise ValueError("❌ Không còn sample nào sau khi dropna. Kiểm tra dữ liệu merge hoặc label.")

X = df_ml[features].values
y = df_ml["label"].values
time_index = df_ml["date"]

# Train/Test/Pred split
train_mask = time_index.dt.year <= 2023
test_mask = time_index.dt.year == 2024
pred_mask = time_index.dt.year == 2025

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]
X_pred, df_pred = X[pred_mask], df_ml[pred_mask]

print("Train samples:", X_train.shape, "Test samples:", X_test.shape, "Predict samples:", X_pred.shape)

if len(X_train) == 0 or len(X_test) == 0:
    raise ValueError("❌ Train/Test rỗng. Kiểm tra lại khoảng thời gian và dữ liệu.")

Sau dropna: (72684, 32)
Train samples: (39214, 8) Test samples: (20105, 8) Predict samples: (13365, 8)


# 5. Grid SearchCV & SMOTE for class imbalancing 

In [70]:
pipeline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),  
    ('clf', LinearSVC(max_iter=5000, class_weight="balanced", random_state=42))
])

param_grid = {'clf__C': [0.01, 0.1, 1, 10, 100]}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best params: {'clf__C': 1}
Best CV score: 0.3368950285793506


# 6. Validation for 2024 

In [71]:
best_model = grid_search.best_estimator_

y_pred_test = best_model.predict(X_test)
print("\nKết quả trên Test (2024):")
print(classification_report(y_test, y_pred_test, digits=4))
cm_test = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix 2024:\n", cm_test)


Kết quả trên Test (2024):
              precision    recall  f1-score   support

           0     0.8904    0.7274    0.8007     17846
           1     0.1196    0.2926    0.1698      2259

    accuracy                         0.6785     20105
   macro avg     0.5050    0.5100    0.4852     20105
weighted avg     0.8038    0.6785    0.7298     20105

Confusion Matrix 2024:
 [[12981  4865]
 [ 1598   661]]


# 8. Prediction for 2025 

In [72]:
if len(X_pred) > 0:
    y_pred_2025 = best_model.predict(X_pred)
    df_pred = df_pred.copy()
    df_pred["pred"] = y_pred_2025

    # So khớp với label thực tế
    df_pred["correct"] = (df_pred["pred"] == df_pred["label"]).astype(int)
    accuracy_2025 = df_pred["correct"].mean()

    print("\nKết quả dự đoán cho 2025:")
    print(f"Accuracy 2025: {accuracy_2025:.2%}")
    print("\nSo sánh nhãn thực tế vs dự đoán (sample):")
    print(df_pred[["ticker","date","close","future_close","future_return","label","pred"]].head(20))

    # Confusion Matrix 2025
    cm = confusion_matrix(df_pred["label"], df_pred["pred"])
    print("\nConfusion Matrix 2025:")
    print(cm)

    print("\nBáo cáo chi tiết 2025:")
    print(classification_report(df_pred["label"], df_pred["pred"], digits=4))

    # Backtest lợi nhuận chiến lược
    df_pred["strategy_return"] = df_pred["pred"] * df_pred["future_return"]
    total_return = df_pred["strategy_return"].sum()
    avg_return = df_pred["strategy_return"].mean()
    print(f"\nTổng lợi nhuận chiến lược 2025: {total_return:.2f}")
    print(f"Lợi nhuận trung bình mỗi giao dịch: {avg_return:.4f}")
else:
    print("⚠️ Không có dữ liệu 2025 để dự đoán.")


Kết quả dự đoán cho 2025:
Accuracy 2025: 69.49%

So sánh nhãn thực tế vs dự đoán (sample):
      ticker       date       close  future_close  future_return  label  pred
74085    HHV 2025-01-02   11143.080     12095.480       0.085470      0     1
74087    PDR 2025-01-02   19166.130     18332.820      -0.043478      0     0
74089    FTS 2025-01-02   37964.250     37111.625      -0.022459      0     0
74090    PVT 2025-01-02   21174.920     20190.040      -0.046512      0     0
74091    VIC 2025-01-02   40550.000     40450.000      -0.002466      0     0
74092    KDC 2025-01-02   58900.000     58800.000      -0.001698      0     0
74093    POW 2025-01-02   12000.000     11800.000      -0.016667      0     1
74094    GMD 2025-01-02   64068.360     60971.400      -0.048338      0     0
74095    VHC 2025-01-02   71100.000     70500.000      -0.008439      0     0
74097    BMP 2025-01-02  127429.940    118430.380      -0.070624      0     0
74098    HSG 2025-01-02   18120.340     17245.900 

# 9. Potential tickers (having label = 1 & prediction = 1)

In [73]:

if len(X_pred) > 0:
    df_selected = df_pred[(df_pred["label"] == 1) & (df_pred["pred"] == 1)].copy()
    if not df_selected.empty:
        print("\nDanh sách ticker được dự đoán và thực tế tăng (label=1 & pred=1):")
        print(df_selected[["ticker","date","close","future_close","future_return","label","pred"]].reset_index(drop=True))
        
        # Lưu vào CSV
        df_selected[["ticker","date","close","future_close","future_return","label","pred"]].to_csv("SVM_selected.csv", index=False)
        print("\n✅ Đã lưu danh sách vào file SVM_selected.csv")
    else:
        print("\n⚠️ Không có ticker nào mà label=1 và pred=1 trong năm 2025.")



Danh sách ticker được dự đoán và thực tế tăng (label=1 & pred=1):
     ticker       date         close  future_close  future_return  label  pred
0       GEX 2025-01-02  17757.212280  20046.958074       0.128947      1     1
1       SBT 2025-01-02  11681.935000  13500.000000       0.155630      1     1
2       GEE 2025-01-02  26457.298491  35753.106069       0.351351      1     1
3       VSC 2025-01-02  12417.280000  14047.048000       0.131250      1     1
4       GEE 2025-01-03  25781.962043  35117.495294       0.362096      1     1
...     ...        ...           ...           ...            ...    ...   ...
1044    DXS 2025-08-05  11000.000000  13600.000000       0.236364      1     1
1045    PDR 2025-08-05  19000.000000  26000.000000       0.368421      1     1
1046    VSC 2025-08-05  24600.000000  32800.000000       0.333333      1     1
1047    DIG 2025-08-05  21250.000000  25400.000000       0.195294      1     1
1048    HAG 2025-08-05  14900.000000  16450.000000       0.10402

## **Prediction for abnormal period**


### An example: 

* **April 1, 2025**: The United States unexpectedly imposed a **45% tariff** on Vietnamese goods.
* **July 1, 2025**: The tariff was reduced to **20%**.
* Our team will conduct an **evaluation of the model’s predictive capability** during this volatile period.


In [74]:
start_date = pd.to_datetime("2025-04-01")
end_date   = pd.to_datetime("2025-07-01")

mask_period = (df_pred["date"] >= start_date) & (df_pred["date"] <= end_date)
df_period = df_pred[mask_period].copy()

if not df_period.empty:
    X_period = df_period[features].values
    X_period_scaled = best_model.named_steps['scaler'].transform(X_period)
    
    y_pred_period = best_model.named_steps['clf'].predict(X_period_scaled)
    df_period["pred"] = y_pred_period
    df_period["correct"] = (df_period["pred"] == df_period["label"]).astype(int)
    accuracy_period = df_period["correct"].mean()
    
    print(f"\nKết quả dự đoán 02/04/2025 - 02/07/2025:")
    print(f"Accuracy: {accuracy_period:.2%}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(df_period["label"], df_period["pred"]))
    print("\nClassification Report:")
    print(classification_report(df_period["label"], df_period["pred"], digits=4))
    df_period["strategy_return"] = df_period["pred"] * df_period["future_return"]
    total_return_period = df_period["strategy_return"].sum()
    avg_return_period = df_period["strategy_return"].mean()
    print(f"Tổng lợi nhuận chiến lược: {total_return_period:.2f}")
    print(f"Lợi nhuận trung bình mỗi giao dịch: {avg_return_period:.4f}")
else:
    print("⚠️ Không có dữ liệu trong khoảng 02/04/2025 - 02/07/2025 để dự đoán.")



Kết quả dự đoán 02/04/2025 - 02/07/2025:
Accuracy: 62.74%

Confusion Matrix:
[[2454 1026]
 [ 845  697]]

Classification Report:
              precision    recall  f1-score   support

           0     0.7439    0.7052    0.7240      3480
           1     0.4045    0.4520    0.4270      1542

    accuracy                         0.6274      5022
   macro avg     0.5742    0.5786    0.5755      5022
weighted avg     0.6397    0.6274    0.6328      5022

Tổng lợi nhuận chiến lược: 172.73
Lợi nhuận trung bình mỗi giao dịch: 0.0344


**Model Evaluation:**

* The model achieves relatively good accuracy and can be considered a reliable reference for investors, especially during volatile market periods.
* However, Class 1 data remains significantly smaller compared to Class 0. Although SMOTE was applied to balance the data, this imbalance still affects the overall accuracy.

**Proposed Solutions:**

* Collect more data from diverse sources to minimize imbalance.
* Apply more advanced models that can incorporate political, social, and macroeconomic factors to improve predictive accuracy.
* Examples:

  * **Hybrid Models** (combining FA + TA + Macro).
  * Integration of fundamental analysis (FA), technical analysis (TA), and macro-political-social data.
  * For instance, using a **Graph Neural Network (GNN)** to model relationships between industries and political events.

