In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, precision_recall_curve, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, train_test_split
import matplotlib.pyplot as plt
import joblib

In [None]:
def convert_ticks_to_candles(data, tick_count=20):
    """Aggregate tick rows into fixed-size tick candles.

    Rows are ordered by ``timestamp`` and cut into consecutive groups of
    ``tick_count`` rows; the final group may be shorter.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``timestamp`` and ``price`` columns.
    tick_count : int, default 20
        Number of ticks per candle. Must be positive.

    Returns
    -------
    pandas.DataFrame
        One row per candle with columns ``timestamp`` (first tick of the
        group), ``open``, ``high``, ``low``, ``close`` and ``volume`` (number
        of ticks in the group). The columns exist even when ``data`` is empty,
        so downstream column access does not fail with ``KeyError``.

    Raises
    ------
    ValueError
        If ``tick_count`` is not positive.
    """
    if tick_count <= 0:
        raise ValueError(f"tick_count must be a positive integer, got {tick_count!r}")

    columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume']

    # Stable sort: ticks sharing a timestamp keep their input order, which is
    # what decides each candle's open and close.
    data_sorted = data.sort_values('timestamp', kind='mergesort').reset_index(drop=True)

    candles = []
    for start in range(0, len(data_sorted), tick_count):
        chunk = data_sorted.iloc[start:start + tick_count]
        prices = chunk['price']
        candles.append({
            'timestamp': chunk['timestamp'].iloc[0],
            'open':      prices.iloc[0],
            'high':      prices.max(),
            'low':       prices.min(),
            'close':     prices.iloc[-1],
            'volume':    len(chunk)
        })
    return pd.DataFrame(candles, columns=columns)


def calculate_rsi(prices, period=14):
    """Relative Strength Index based on simple rolling means of gains and losses.

    Parameters
    ----------
    prices : pandas.Series
        Price series (the caller in this file passes candle closes).
    period : int, default 14
        Rolling window length used for the average gain and average loss.

    Returns
    -------
    pandas.Series
        RSI in [0, 100], NaN while the rolling window is still filling.
        A window with no price movement at all (average gain and average loss
        both zero) yields the neutral value 50 instead of NaN.
    """
    delta = prices.diff()
    avg_gain = delta.where(delta > 0, 0).rolling(window=period).mean()
    avg_loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()

    # 100 - 100 / (1 + gain/loss) is algebraically 100 * gain / (gain + loss);
    # this form never passes through an infinite ratio when there are no losses.
    total = avg_gain + avg_loss
    rsi = 100 * (avg_gain / total.where(total != 0))

    # 0/0 case: a completely flat window carries no directional information.
    # Warm-up rows have total == NaN, so they are left as NaN here.
    return rsi.mask(total == 0, 50.0)


def calculate_technical_indicators(df):
    """Return a copy of the candle frame with indicator columns appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Candles with ``open``, ``high``, ``low``, ``close`` and ``volume``
        columns. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with these added columns: ``ema_9``, ``ema_15``,
        ``ema_21``, ``adx``, ``is_green``, ``returns``, ``body_size``,
        ``upper_wick``, ``lower_wick``, ``price_range``, ``volatility``,
        ``volume_sma``, ``volume_ratio``, ``rsi`` and ``price_position``.
        Leading rows of the rolling / shifted indicators are NaN, and ratios
        with a zero denominator are left as NaN/inf for the caller to handle.
    """
    df = df.copy()
    close, high, low = df['close'], df['high'], df['low']

    for span in (9, 15, 21):
        df[f'ema_{span}'] = close.ewm(span=span, adjust=False).mean()

    def _adx(high, low, close, period=5):
        """EWM-smoothed directional index built from true range and +/- DM."""
        prev_close = close.shift(1)
        tr = np.maximum(high - low,
                        np.maximum(np.abs(high - prev_close), np.abs(low - prev_close)))

        up_move = high - high.shift(1)
        down_move = low.shift(1) - low
        # np.where returns bare ndarrays. Re-attach the candle index so the
        # divisions below align row-by-row with `tr`; with a fresh RangeIndex
        # any frame whose index is not 0..n-1 would produce an all-NaN ADX.
        dm_plus = pd.Series(np.where(up_move > down_move, np.maximum(up_move, 0), 0),
                            index=high.index)
        dm_minus = pd.Series(np.where(down_move > up_move, np.maximum(down_move, 0), 0),
                             index=high.index)

        tr_s = tr.ewm(span=period, adjust=False).mean()
        dmp_s = dm_plus.ewm(span=period, adjust=False).mean()
        dmm_s = dm_minus.ewm(span=period, adjust=False).mean()
        di_plus = 100 * (dmp_s / tr_s)
        di_minus = 100 * (dmm_s / tr_s)
        dx = 100 * np.abs(di_plus - di_minus) / (di_plus + di_minus)
        return dx.ewm(span=period, adjust=False).mean()

    df['adx'] = _adx(high, low, close)

    # Candle-shape features, all expressed relative to the close.
    body_top = df[['open', 'close']].max(axis=1)
    body_bottom = df[['open', 'close']].min(axis=1)
    df['is_green']    = close > df['open']
    df['returns']     = close.pct_change()
    df['body_size']   = np.abs(close - df['open']) / close
    df['upper_wick']  = (high - body_top) / close
    df['lower_wick']  = (body_bottom - low) / close
    df['price_range'] = (high - low) / close

    # Short-window volatility and volume context.
    df['volatility']   = df['returns'].rolling(5).std()
    df['volume_sma']   = df['volume'].rolling(5).mean()
    df['volume_ratio'] = df['volume'] / df['volume_sma']

    df['rsi'] = calculate_rsi(close)

    # Where the close sits inside the 10-candle low/high range.
    low_10 = low.rolling(10).min()
    high_10 = high.rolling(10).max()
    df['price_position'] = (close - low_10) / (high_10 - low_10)
    return df


def create_enhanced_features(df, lookback=5):
    """Build (features, labels) for "two green candles in a row" setups.

    For every candle position ``i`` (starting at ``max(25, lookback + 2)`` and
    excluding the last row) whose two preceding candles are both green, one
    sample is emitted:

    * 12 base features taken from candle ``i-1``: ``ema_9 - ema_15``,
      ``ema_9 - ema_21``, ``adx``, ``rsi``, ``volatility``, ``volume_ratio``,
      ``body_size``, ``upper_wick``, ``lower_wick``, ``price_position``,
      ``returns``, ``price_range``;
    * for each of the ``lookback`` candles before ``i-1`` (nearest first):
      ``returns``, ``body_size`` and ``is_green`` as 0/1;
    * label: 1 if candle ``i`` is green, else 0.

    NaN and +/-inf feature values are replaced with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Candle frame carrying the indicator columns listed above plus
        ``is_green``.
    lookback : int, default 5
        Number of earlier candles contributing history features.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``features`` of shape ``(n_samples, 12 + 3 * lookback)`` and integer
        ``labels`` of shape ``(n_samples,)``. When no sample qualifies the
        feature matrix is still 2-D, with shape ``(0, 12 + 3 * lookback)``.
    """
    n_features = 12 + 3 * lookback
    green = df['is_green'].to_numpy(dtype=bool)

    # Candidate label positions; keep those preceded by two green candles.
    first = max(25, lookback + 2)
    positions = np.arange(first, len(df) - 1)
    positions = positions[green[positions - 2] & green[positions - 1]]
    if positions.size == 0:
        return np.empty((0, n_features)), np.empty(0, dtype=int)

    names = ['ema_9', 'ema_15', 'ema_21', 'adx', 'rsi', 'volatility', 'volume_ratio',
             'body_size', 'upper_wick', 'lower_wick', 'price_position', 'returns',
             'price_range']
    values = {name: df[name].to_numpy(dtype=float) for name in names}

    # Features describe the candle just before the one being predicted.
    cur = positions - 1
    columns = [values['ema_9'][cur] - values['ema_15'][cur],
               values['ema_9'][cur] - values['ema_21'][cur]]
    columns += [values[name][cur] for name in names[3:]]

    # `first >= lookback + 2` guarantees cur - j >= 1, so indices never wrap.
    for j in range(1, lookback + 1):
        prev = cur - j
        columns += [values['returns'][prev], values['body_size'][prev],
                    green[prev].astype(float)]

    features = np.nan_to_num(np.column_stack(columns), nan=0, posinf=0, neginf=0)
    labels = green[positions].astype(int)
    return features, labels

def train_and_save_model(train_file, model_path, tick_count=20, lookback=5):
    """Fit a random forest on tick-candle features and persist it with a threshold.

    Pipeline: read ticks from ``train_file`` -> tick candles -> indicators ->
    feature matrix -> grid-searched ``RandomForestClassifier`` (scored on
    precision) -> decision threshold picked on a held-out validation slice.

    Parameters
    ----------
    train_file : str
        CSV path readable by ``pandas.read_csv``.
    model_path : str
        Destination of the joblib artifact, a dict with keys ``'model'`` and
        ``'threshold'``.
    tick_count : int, default 20
        Ticks per candle.
    lookback : int, default 5
        Number of history candles per sample.

    Returns
    -------
    (RandomForestClassifier, float)
        The best estimator found by the grid search and the chosen threshold.

    Raises
    ------
    ValueError
        If no training samples can be built, or the training slice contains a
        single class (``predict_proba`` would then lack the positive column).
    """
    ticks   = pd.read_csv(train_file)
    candles = convert_ticks_to_candles(ticks, tick_count)
    candles = calculate_technical_indicators(candles)
    X, y    = create_enhanced_features(candles, lookback)

    if len(y) == 0:
        raise ValueError(f"No training samples could be built from {train_file!r}")

    # Chronological hold-out: samples are built from overlapping windows of
    # consecutive candles, so a shuffled split would put near-duplicates of the
    # validation rows (and rows from their future) into the training set and
    # make the validation precision used for threshold selection optimistic.
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

    if np.unique(y_tr).size < 2:
        raise ValueError("Training slice contains a single class; cannot fit a binary classifier")

    rf = RandomForestClassifier(random_state=42, class_weight='balanced')
    grid = GridSearchCV(rf, {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }, cv=3, scoring='precision', n_jobs=-1)
    grid.fit(X_tr, y_tr)
    best_model = grid.best_estimator_

    proba = best_model.predict_proba(X_val)[:,1]
    precisions, recalls, thresholds = precision_recall_curve(y_val, proba)

    # `thresholds` has one entry fewer than `precisions`/`recalls`; the final
    # (precision=1, recall=0) point has no threshold, hence the [:-1] slices.
    idx_perfect = np.where(precisions[:-1] == 1.0)[0]
    if idx_perfect.size > 0:
        # Among thresholds with perfect validation precision keep the one
        # with the highest recall.
        best_idx = idx_perfect[np.argmax(recalls[idx_perfect])]
    else:
        # Otherwise fall back to the highest precision reached.
        best_idx = np.argmax(precisions[:-1])
    best_thresh = thresholds[best_idx]

    joblib.dump({'model': best_model, 'threshold': best_thresh}, model_path)
    print(f"[+] Saved RF + perfect-precision threshold={best_thresh:.3f} to {model_path}")
    return best_model, best_thresh

def plot_confusion_matrix(y_true, y_pred, labels=("Not Green", "Green")):
    """Draw the confusion matrix of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted class labels.
    labels : sequence of str or None, default ("Not Green", "Green")
        Tick labels, one per class, in class order. ``None`` lets
        ``ConfusionMatrixDisplay`` choose its own labels.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    display_labels = None if labels is None else list(labels)

    cm = confusion_matrix(y_true, y_pred)
    if display_labels is not None and cm.shape[0] != len(display_labels):
        # A class missing from both y_true and y_pred shrinks the inferred
        # matrix (e.g. 1x1), which cannot be drawn with the requested tick
        # labels. Pin the classes to 0..k-1, the encoding used by the labels
        # produced in this file.
        cm = confusion_matrix(y_true, y_pred, labels=list(range(len(display_labels))))

    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
    disp.plot(cmap='Blues', values_format='d')
    plt.title("Confusion Matrix")
    plt.grid(False)
    plt.show()


def load_model(model_path):
    """Load and return the object stored at ``model_path`` via ``joblib.load``.

    NOTE(review): joblib artifacts are pickle-based, so loading one can execute
    arbitrary code. Only load files from a trusted source.
    """
    artifact = joblib.load(model_path)
    return artifact


def evaluate_on_test(test_file, loaded, tick_count=20, lookback=5, threshold_offset=0.2):
    """Score a saved model on a tick file and plot its confusion matrix.

    Parameters
    ----------
    test_file : str
        CSV path readable by ``pandas.read_csv``.
    loaded : dict
        Artifact with keys ``'model'`` (fitted classifier exposing
        ``predict_proba``) and ``'threshold'``.
    tick_count : int, default 20
        Ticks per candle.
    lookback : int, default 5
        Number of history candles per sample.
    threshold_offset : float, default 0.2
        Amount subtracted from the stored threshold before classifying; the
        result is floored at 0. Pass 0 to use the stored threshold unchanged.

    Returns
    -------
    (float, float, float)
        Precision, recall and accuracy at the applied threshold.

    Raises
    ------
    ValueError
        If no evaluation samples can be built from ``test_file``.
    """
    model = loaded['model']
    threshold = max(0.0, loaded['threshold'] - threshold_offset)

    ticks = pd.read_csv(test_file)
    candles = convert_ticks_to_candles(ticks, tick_count)
    candles = calculate_technical_indicators(candles)
    X_test, y_test = create_enhanced_features(candles, lookback)

    if len(y_test) == 0:
        raise ValueError(f"No evaluation samples could be built from {test_file!r}")

    # Probability of the positive ("green") class, thresholded manually.
    proba = model.predict_proba(X_test)[:, 1]
    y_pred = (proba >= threshold).astype(int)

    p = precision_score(y_test, y_pred)
    r = recall_score(y_test, y_pred)
    a = accuracy_score(y_test, y_pred)
    print(f"Precision @thr {threshold:.3f}: {p:.4f}")
    print(f"Recall:                 {r:.4f}")
    print(f"Accuracy:               {a:.4f}")

    plot_confusion_matrix(y_test, y_pred)

    return p, r, a

if __name__ == '__main__':
    # Input tick dumps and the location the fitted artifact is written to.
    TRAIN_FILE = '/Users/sushanth/Desktop/Nano_scalping/Crypto/Data/btcusdc_ticks.csv'
    TEST_FILE = '/Users/sushanth/Desktop/Nano_scalping/Crypto/Data/BTC_USDC/binance_btcusdc_trades.csv'
    MODEL_FILE = 'rf_third_green_precision.pkl'

    # NOTE(review): training builds 10-tick candles while evaluation builds
    # 100-tick candles. Confirm this is intentional (e.g. the two files have
    # different tick densities); otherwise the model is scored on a different
    # candle scale than it was trained on.
    train_kwargs = dict(tick_count=10, lookback=5)
    eval_kwargs = dict(tick_count=100, lookback=5)

    # Train and persist, then reload from disk so evaluation exercises the saved artifact.
    model, thresh = train_and_save_model(TRAIN_FILE, MODEL_FILE, **train_kwargs)
    loaded = load_model(MODEL_FILE)
    evaluate_on_test(TEST_FILE, loaded, **eval_kwargs)
