<a href="https://colab.research.google.com/github/Shufen-Yin/Artificial-Intelligence/blob/main/Project_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [78]:
# Step 1 — Imports and setup
import os
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import joblib



In [79]:
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 200)

In [80]:
# Step 2 — Load CSV
INPUT_CSV = "clean_data.csv"
OUTPUT_DIR = "outputs"
MODEL_DIR = "models"

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

df = pd.read_csv(INPUT_CSV, parse_dates=["date"], low_memory=False)


In [81]:
# Reset index to avoid alignment issues in groupby operations
df = df.reset_index(drop=True)

print("Loaded data shape:", df.shape)
print("Columns:", df.columns.tolist())

Loaded data shape: (26703, 51)
Columns: ['ticker', 'open', 'close', 'adj_close', 'low', 'high', 'volume', 'date', 'ma_7', 'ma_30', 'volatility_30', 'daily_return', 'future_close', 'sector_CONSUMER NON-DURABLES', 'sector_CONSUMER SERVICES', 'sector_ENERGY', 'sector_FINANCE', 'sector_HEALTH CARE', 'sector_PUBLIC UTILITIES', 'sector_TECHNOLOGY', 'sector_Unknown', 'industry_COMPUTER MANUFACTURING', 'industry_COMPUTER SOFTWARE: PREPACKAGED SOFTWARE', 'industry_INVESTMENT MANAGERS', 'industry_MAJOR BANKS', 'industry_MEAT/POULTRY/FISH', 'industry_MEDICAL SPECIALITIES', 'industry_OIL & GAS PRODUCTION', 'industry_OTHER CONSUMER SERVICES', 'industry_OTHER SPECIALTY STORES', 'industry_REAL ESTATE', 'industry_SEMICONDUCTORS', 'industry_SPECIALTY CHEMICALS', 'industry_TELECOMMUNICATIONS EQUIPMENT', 'industry_Unknown', 'exchange_NYSE', 'name_8X8 INC', 'name_AMERICAN SOFTWARE, INC.', 'name_APOLLO GLOBAL MANAGEMENT, LLC', 'name_APPLE INC.', 'name_ARMADA HOFFLER PROPERTIES, INC.', 'name_BLACKROCK MUNIH

In [82]:
# Step 3 — Indicator calculation functions

def compute_ema(series: pd.Series, span: int) -> pd.Series:
    """Compute exponential moving average."""
    return series.ewm(span=span, adjust=False).mean()


In [83]:
def compute_macd(df: pd.DataFrame, price_col: str = "close") -> pd.DataFrame:
    """Compute MACD and MACD signal per ticker."""
    df = df.copy()
    df["MACD"] = df.groupby("ticker")[price_col].transform(lambda s: compute_ema(s, 12) - compute_ema(s, 26))
    df["MACD_signal"] = df.groupby("ticker")["MACD"].transform(lambda s: compute_ema(s, 9))
    return df
print("MACD and MACD signal computed.")
print(df.head())

MACD and MACD signal computed.
  ticker      open     close  adj_close       low      high    volume  \
0    AHH -0.007841 -0.021475   8.163107 -0.001743 -0.020478  0.421138   
1    AHH -0.017332 -0.004863   8.265792 -0.002944 -0.023999  0.980897   
2    AHH -0.013773 -0.010796   8.229120 -0.053383 -0.021652  0.080530   
3    AHH -0.004282  0.008190   8.346467 -0.019757 -0.008744 -0.288463   
4    AHH  0.014699  0.008190   8.346467 -0.029365 -0.001704  0.228391   

        date      ma_7     ma_30  volatility_30  daily_return  future_close  \
0 2013-06-20  0.037043  0.133896      -3.641527     -0.322969         11.56   
1 2013-06-21  0.026048  0.128910      -3.637299     -0.320840         11.49   
2 2013-06-24  0.012554  0.122143      -3.632317     -0.322152         11.49   
3 2013-06-25  0.009056  0.117335      -3.631825     -0.320711         11.45   
4 2013-06-26  0.000559  0.114664      -3.630818     -0.321810         11.55   

   sector_CONSUMER NON-DURABLES  sector_CONSUMER SERVIC

In [84]:
def compute_rsi(df: pd.DataFrame, price_col: str = "close", period: int = 14) -> pd.DataFrame:
    """Compute RSI per ticker."""
    df = df.copy()

    def rsi_for_series(close: pd.Series) -> pd.Series:
        delta = close.diff()
        gain = delta.where(delta > 0, 0.0)
        loss = -delta.where(delta < 0, 0.0)

        avg_gain = gain.ewm(span=period, adjust=False).mean()
        avg_loss = loss.ewm(span=period, adjust=False).mean()

        rs = avg_gain / (avg_loss.replace(0, np.nan))
        rsi = 100 - (100 / (1 + rs))
        return rsi.fillna(50)

    df["RSI"] = df.groupby("ticker")[price_col].transform(rsi_for_series)
    return df


In [85]:
# Step 4 — Generate signals
def generate_indicator_signals(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generate MACD, RSI, and combined Buy/Sell/Hold signals per ticker.
    """
    df = df.copy().sort_values(["ticker", "date"])

    # MACD cross detection
    df["MACD_prev"] = df.groupby("ticker")["MACD"].shift(1)
    df["MACD_signal_prev"] = df.groupby("ticker")["MACD_signal"].shift(1)

    def macd_signal_row(row):
        """Generate MACD signal for one row based on crossover."""
        if pd.isna(row["MACD_prev"]) or pd.isna(row["MACD_signal_prev"]):
            return "Neutral"
        if (row["MACD_prev"] < row["MACD_signal_prev"]) and (row["MACD"] > row["MACD_signal"]):
            return "Buy"
        if (row["MACD_prev"] > row["MACD_signal_prev"]) and (row["MACD"] < row["MACD_signal"]):
            return "Sell"
        return "Neutral"

    df["MACD_flag"] = df.apply(macd_signal_row, axis=1)

    # RSI flag
    df["RSI_flag"] = df["RSI"].apply(
        lambda r: "Buy" if r < 30 else ("Sell" if r > 70 else "Neutral")
    )

    # Combined signal
    def combined_flag(row):
        if row["MACD_flag"] == "Buy" and row["RSI_flag"] == "Buy":
            return "Buy"
        if row["MACD_flag"] == "Sell" and row["RSI_flag"] == "Sell":
            return "Sell"
        return "Hold"

    df["signal"] = df.apply(combined_flag, axis=1)

    # Drop temporary columns
    df.drop(columns=["MACD_prev", "MACD_signal_prev"], inplace=True)

    return df


In [86]:
#  Compute MACD and -NOT REQUIRED
df = compute_macd(df)
df = compute_rsi(df)

#  Generate Buy/Hold/Sell signals
df = generate_indicator_signals(df)

# Quick check
print("Columns after signal generation:", df.columns.tolist())
print(df[["ticker","date","MACD","MACD_signal","RSI","MACD_flag","RSI_flag","signal"]].head())

Columns after signal generation: ['ticker', 'open', 'close', 'adj_close', 'low', 'high', 'volume', 'date', 'ma_7', 'ma_30', 'volatility_30', 'daily_return', 'future_close', 'sector_CONSUMER NON-DURABLES', 'sector_CONSUMER SERVICES', 'sector_ENERGY', 'sector_FINANCE', 'sector_HEALTH CARE', 'sector_PUBLIC UTILITIES', 'sector_TECHNOLOGY', 'sector_Unknown', 'industry_COMPUTER MANUFACTURING', 'industry_COMPUTER SOFTWARE: PREPACKAGED SOFTWARE', 'industry_INVESTMENT MANAGERS', 'industry_MAJOR BANKS', 'industry_MEAT/POULTRY/FISH', 'industry_MEDICAL SPECIALITIES', 'industry_OIL & GAS PRODUCTION', 'industry_OTHER CONSUMER SERVICES', 'industry_OTHER SPECIALTY STORES', 'industry_REAL ESTATE', 'industry_SEMICONDUCTORS', 'industry_SPECIALTY CHEMICALS', 'industry_TELECOMMUNICATIONS EQUIPMENT', 'industry_Unknown', 'exchange_NYSE', 'name_8X8 INC', 'name_AMERICAN SOFTWARE, INC.', 'name_APOLLO GLOBAL MANAGEMENT, LLC', 'name_APPLE INC.', 'name_ARMADA HOFFLER PROPERTIES, INC.', 'name_BLACKROCK MUNIHOLDINGS

In [88]:
df = generate_indicator_signals(df)

print("signal in df? ->", "signal" in df.columns)
print(df[["ticker", "date", "MACD", "MACD_signal", "RSI", "MACD_flag", "RSI_flag", "signal"]].head())

signal in df? -> True
     ticker       date      MACD  MACD_signal        RSI MACD_flag RSI_flag  \
5987   AAPL 1983-01-11  0.000000     0.000000  50.000000   Neutral  Neutral   
0       AHH 2013-06-20  0.000000     0.000000  50.000000   Neutral  Neutral   
1       AHH 2013-06-21  0.001325     0.000265  50.000000   Neutral  Neutral   
2       AHH 2013-06-24  0.001875     0.000587  70.817092   Neutral     Sell   
3       AHH 2013-06-25  0.003799     0.001229  85.952991   Neutral     Sell   

     signal  
5987   Hold  
0      Hold  
1      Hold  
2      Hold  
3      Hold  


In [89]:
# Step 5 — Prepare features and labels

def prepare_ml_dataset(df: pd.DataFrame, features: list):
    """
    Prepare features and labels for ML based on signal column.

    Args:
        df (pd.DataFrame): DataFrame containing 'signal' column.
        features (list): List of feature column names to use as X.

    Returns:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Target labels (0=Hold, 1=Buy, 2=Sell).
        df (pd.DataFrame): Original DataFrame with 'label' column added.
    """
    df = df.copy()

    # Map Buy/Hold/Sell signals to numeric labels
    df["label"] = df["signal"].map({"Buy": 1, "Hold": 0, "Sell": 2})

    # Features and target
    X = df[features].copy()
    y = df["label"].copy()

    return X, y, df


In [91]:
# Step 6 — Split data for training/testing

from sklearn.model_selection import train_test_split

features = ["MACD", "MACD_signal", "RSI"]
X, y, df_full = prepare_ml_dataset(df, features)

print("ML dataset shape:", X.shape)
print("Label distribution:\n", y.value_counts())

# Time-series safe split
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.15, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.17647, shuffle=False
)

print("Shapes — Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)


ML dataset shape: (26703, 3)
Label distribution:
 label
0    26685
2       10
1        8
Name: count, dtype: int64
Shapes — Train: (18691, 3) Val: (4006, 3) Test: (4006, 3)


In [92]:
# Step 7 — Train & evaluate Logistic Regression, Random Forest, SVM- Perform all models one by one and then pas them through
# the pipeline concept

def train_and_evaluate(X_train, X_test, y_train, y_test):
    trained_models = {}
    results = []

    # Models and hyperparameter grids
    pipelines = {
        "LogisticRegression": (Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(max_iter=200, class_weight="balanced"))
        ]), {
            "clf__C": [0.01, 0.1, 1.0]
        }),
        "RandomForest": (Pipeline([
            ("scaler", StandardScaler()),
            ("clf", RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=42))
        ]), {
            "clf__n_estimators": [50, 100],
            "clf__max_depth": [5, 10, None]
        }),
        "SVM": (Pipeline([
            ("scaler", StandardScaler()),
            ("clf", SVC(class_weight="balanced", probability=True))
        ]), {
            "clf__C": [0.1, 1],
            "clf__kernel": ["rbf"]
        })
    }

    skf = StratifiedKFold(n_splits=3, shuffle=False)

    for name, (pipe, params) in pipelines.items():
        print(f"\nTraining {name}...")
        grid = GridSearchCV(pipe, params, cv=skf, scoring="f1_weighted", n_jobs=-1)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        trained_models[name] = best_model

        y_pred = best_model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        rec = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        print(f"{name} — Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

        results.append({
            "model": name,
            "best_params": grid.best_params_,
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1_weighted": f1
        })

    return trained_models, results


In [93]:
# Step 8 — Main workflow: train models, save models & predictions
trained_models, results = train_and_evaluate(X_train, X_test, y_train, y_test)

# Save models and predictions
for model_name, model in trained_models.items():
    model_path = os.path.join(MODEL_DIR, f"{model_name.lower()}_model.joblib")
    joblib.dump(model, model_path)
    print(f"Saved {model_name} to {model_path}")

    # Save predictions
    df_pred = df_full.loc[X.index, ["date", "ticker", "close", "signal"]].copy()
    df_pred[f"pred_{model_name}"] = model.predict(X)
    df_pred.to_csv(os.path.join(OUTPUT_DIR, f"predictions_{model_name}.csv"), index=False)
    print(f"Saved predictions for {model_name}")



Training LogisticRegression...
LogisticRegression — Accuracy: 0.8258, Precision: 0.9995, Recall: 0.8258, F1: 0.9041
Confusion Matrix:
 [[3306  329  369]
 [   0    1    0]
 [   0    0    1]]

Training RandomForest...
RandomForest — Accuracy: 0.9995, Precision: 0.9990, Recall: 0.9995, F1: 0.9993
Confusion Matrix:
 [[4004    0    0]
 [   1    0    0]
 [   1    0    0]]

Training SVM...
SVM — Accuracy: 0.9873, Precision: 0.9993, Recall: 0.9873, F1: 0.9931
Confusion Matrix:
 [[3954    4   46]
 [   1    0    0]
 [   0    0    1]]
Saved LogisticRegression to models/logisticregression_model.joblib
Saved predictions for LogisticRegression
Saved RandomForest to models/randomforest_model.joblib
Saved predictions for RandomForest
Saved SVM to models/svm_model.joblib
Saved predictions for SVM


In [49]:
# Step 9 — Evaluation summary
summary_df = pd.DataFrame(results)
summary_df.to_csv(os.path.join(OUTPUT_DIR, "evaluation_summary.csv"), index=False)
print("Saved evaluation summary to:", os.path.join(OUTPUT_DIR, "evaluation_summary.csv"))

Saved evaluation summary to: outputs/evaluation_summary.csv
