In [None]:
# PHASE 4: ML DATA PREPARATION
from pyspark.ml.feature import VectorAssembler, StandardScaler, Imputer
from pyspark.sql.functions import col, year, expr, randn, abs as spark_abs

# Banner for this notebook phase.
print('\\n' + '='*70)
print('PHASE 4: ML DATA PREPARATION')
print('='*70)

# Names of the columns that are imputed, assembled and scaled into the model's
# feature vector further down in this cell.
feature_cols = [
    'hourly_return', 'hl_range', 'close_ma_24h', 'close_ma_7d', 'volatility_24h', 'volume_ratio',
    'tx_count', 'tx_count_ratio', 'total_volume_btc', 'volume_btc_ratio',
    'avg_inputs_log', 'avg_outputs_log', 'price_to_onchain', 'io_ratio'
]

print(f'\\nFeatures: {len(feature_cols)} total')

# Rows where tx_count is populated.
df_features_filtered = df_features.filter(col('tx_count').isNotNull())

# count() is a Spark action: every call runs a job over the whole lineage.
# Evaluate it once and reuse the number for both the log line and the branch.
blockchain_row_count = df_features_filtered.count()
print(f'Rows with blockchain: {blockchain_row_count}')

if blockchain_row_count > 0:
    df_ml = df_features_filtered
    print('Using REAL blockchain data')
else:
    # No row carries a tx_count: overwrite the tx_count-related columns with
    # synthetic values drawn from seeded randn() so the rest of the notebook
    # can still run end to end.
    print('NO OVERLAP - Using synthetic fallback')
    df_ml = df_features.filter(col('price_direction').isNotNull())
    # |randn * 500 + 1000|; the ratio column divides by the same 1000 offset.
    df_ml = df_ml.withColumn('tx_count', spark_abs(expr('cast(randn(42) * 500 + 1000 as double)')))
    df_ml = df_ml.withColumn('tx_count_ratio', expr('tx_count / 1000.0'))
    # |randn * 100 + 200|; the ratio column divides by the same 200 offset.
    df_ml = df_ml.withColumn('total_volume_btc', spark_abs(expr('randn(43) * 100 + 200')))
    df_ml = df_ml.withColumn('volume_btc_ratio', expr('total_volume_btc / 200.0'))
    # log1p of |randn + 2|, with a distinct seed per column.
    df_ml = df_ml.withColumn('avg_inputs_log', expr('log1p(abs(randn(44) * 1 + 2))'))
    df_ml = df_ml.withColumn('avg_outputs_log', expr('log1p(abs(randn(45) * 1 + 2))'))
    # Derived columns are computed from the synthetic columns above; the +1 and
    # +0.1 terms keep the denominators away from zero.
    df_ml = df_ml.withColumn('price_to_onchain', expr('Close / (total_volume_btc + 1)'))
    df_ml = df_ml.withColumn('io_ratio', expr('exp(avg_inputs_log) / (exp(avg_outputs_log) + 0.1)'))

# Rows without a label can be used for neither training nor evaluation, so drop
# them before any statistics are computed.
df_ml = df_ml.filter(col('price_direction').isNotNull())

# Derive the split key first so that every fitted transformer below can be
# estimated on the training period only.
df_ml = df_ml.withColumn('year', year('Open time'))
is_train_row = col('year') < 2024

# Fit the median imputer on training rows only, then apply it to all rows.
# Fitting on the full frame would leak test-period (2024+) statistics into the
# values the model is trained on.
imputer = Imputer(inputCols=feature_cols, outputCols=feature_cols, strategy='median')
imputer_model = imputer.fit(df_ml.filter(is_train_row))
df_ml = imputer_model.transform(df_ml)

# handleInvalid='skip' drops any row that still has an invalid feature value.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features_raw', handleInvalid='skip')
df_ml = assembler.transform(df_ml)

# Same rule for standardisation: mean/std come from the training rows and are
# then applied unchanged to the test rows.
scaler = StandardScaler(inputCol='features_raw', outputCol='features', withMean=True, withStd=True)
scaler_model = scaler.fit(df_ml.filter(is_train_row))
df_ml = scaler_model.transform(df_ml)

# Chronological split: everything before 2024 trains, 2024 onwards tests.
df_train = df_ml.filter(col('year') < 2024)
df_test = df_ml.filter(col('year') >= 2024)

print(f'Train: {df_train.count()} rows (2018-2023)')
print(f'Test: {df_test.count()} rows (2024-2025)')
print('PHASE 4 COMPLETE!')

In [None]:
# PHASE 5: MODEL TRAINING
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Banner for this notebook phase.
print('\\n' + '='*70)
print('PHASE 5: MODEL TRAINING')
print('='*70)

# Evaluators shared by both models: area under ROC from the raw prediction
# column, and plain accuracy from the predicted label.
evaluator_auc = BinaryClassificationEvaluator(
    labelCol='price_direction',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC',
)
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol='price_direction',
    predictionCol='prediction',
    metricName='accuracy',
)

# --- Logistic regression: fit on train, predict and score on test ---
lr = LogisticRegression(
    labelCol='price_direction',
    featuresCol='features',
    maxIter=100,
    regParam=0.01,
)
model_lr = lr.fit(df_train)
pred_lr = model_lr.transform(df_test)
auc_lr = evaluator_auc.evaluate(pred_lr)
acc_lr = evaluator_acc.evaluate(pred_lr)

# --- Random forest: fit on train, predict and score on test (seeded) ---
rf = RandomForestClassifier(
    labelCol='price_direction',
    featuresCol='features',
    numTrees=50,
    maxDepth=10,
    seed=42,
)
model_rf = rf.fit(df_train)
pred_rf = model_rf.transform(df_test)
auc_rf = evaluator_auc.evaluate(pred_rf)
acc_rf = evaluator_acc.evaluate(pred_rf)

# Report both models, then name the one with the higher AUC. A tie goes to
# logistic regression because the comparison is strict.
print(f'\\nLogistic Regression - AUC: {auc_lr:.4f}, Accuracy: {acc_lr:.4f}')
print(f'Random Forest - AUC: {auc_rf:.4f}, Accuracy: {acc_rf:.4f}')

if auc_rf > auc_lr:
    best_name = 'Random Forest'
else:
    best_name = 'Logistic Regression'
best_auc = max(auc_rf, auc_lr)
print(f'\\nBest Model: {best_name} (AUC: {best_auc:.4f})')