# 🫀 Heart Disease Prediction - Modeling

Training Logistic Regression với K-Fold Cross Validation sử dụng PySpark.

## 📦 1. Import Libraries & Initialize Spark

In [None]:
import sys
import os
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Setup PySpark
# Remove an inherited SPARK_HOME (no-op when it is not set).
# NOTE(review): presumably so the pip-installed PySpark is used - confirm.
os.environ.pop('SPARK_HOME', None)
# Driver and workers both run on the interpreter executing this notebook.
for _env_key in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[_env_key] = sys.executable

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, roc_curve, precision_recall_curve, auc

# Plot appearance: base style first, then palette, then rcParams overrides.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})

# Create (or reuse) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName('HeartDiseaseModeling')
    .getOrCreate()
)
print('‚úÖ Spark Session Created Successfully!')
# Bare expression so the notebook renders the session summary.
spark

## 📂 2. Load Processed Data

In [None]:
# Load the preprocessed dataset, or rebuild it from the raw CSV if it is missing.
# Either way `df` ends up with a 'scaled_features' vector column and the
# 'cardio' label column, which the cells below rely on.
from pyspark.sql import functions as F

data_path = '../data/processed/cardio_processed.parquet'

if os.path.exists(data_path):
    df = spark.read.parquet(data_path)
    print(f'‚úÖ Loaded processed data: {df.count()} rows')
else:
    print('‚ö†Ô∏è Processed data not found, loading raw data...')
    # NOTE(review): confirm the raw file really is comma-separated.
    df = spark.read.csv('../data/raw/cardio_train.csv', header=True, sep=',', inferSchema=True)

    # Quick preprocessing, done with Spark column expressions so the rows are
    # never collected onto the driver (the previous version round-tripped the
    # whole dataset through pandas just to add three columns).
    # NOTE(review): with ANSI mode off Spark yields null (pandas gave inf) for
    # a zero height; VectorAssembler will then raise instead of passing inf on.
    df = (
        df.withColumn('age_years', F.col('age') / 365)  # 'age' divided by 365
          .withColumn('bmi', F.col('weight') / F.pow(F.col('height') / 100, 2))
          .withColumn('pulse_pressure', F.col('ap_hi') - F.col('ap_lo'))
    )

    feature_cols = ['age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'bmi', 'pulse_pressure']
    assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
    df = assembler.transform(df)
    # NOTE(review): the scaler is fit on the full dataset, i.e. before the
    # train/test split performed later, so test rows influence the scaling.
    scaler = StandardScaler(inputCol='features', outputCol='scaled_features', withStd=True, withMean=True)
    df = scaler.fit(df).transform(df)
    print(f'‚úÖ Preprocessed raw data: {df.count()} rows')

df.select('scaled_features', 'cardio').show(5, truncate=False)

## 📊 3. Train/Test Split

In [None]:
# Randomly split the rows 80/20 with a fixed seed, then report both sizes.
splits = df.randomSplit([0.8, 0.2], seed=42)
train_data, test_data = splits

train_count, test_count = train_data.count(), test_data.count()
total = train_count + test_count  # denominator for the printed percentages

print(f'üìä Training Data: {train_count:,} ({train_count/total*100:.1f}%)')
print(f'üìä Test Data: {test_count:,} ({test_count/total*100:.1f}%)')

In [None]:
# Three-panel figure: split ratio pie, then label counts for train and test.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
split_ax, train_ax, test_ax = axes

# Panel 1 - share of rows in each split
split_ax.pie([train_count, test_count], labels=['Train (80%)', 'Test (20%)'],
             autopct='%1.1f%%', colors=['#3498db', '#e74c3c'], explode=(0.02, 0.02), shadow=True)
split_ax.set_title('üìä Train/Test Split', fontsize=14, fontweight='bold')

# Only the label column is pulled to the driver for the count plots.
train_pdf = train_data.select('cardio').toPandas()
test_pdf = test_data.select('cardio').toPandas()

# Panels 2 and 3 share everything except data, title and annotation offset.
class_panels = (
    (train_ax, train_pdf, 'Training Set Class Distribution', 200),
    (test_ax, test_pdf, 'Test Set Class Distribution', 50),
)
for panel_ax, panel_pdf, panel_title, text_offset in class_panels:
    sns.countplot(x='cardio', data=panel_pdf, ax=panel_ax, palette=['#2ecc71', '#e74c3c'])
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.set_xticklabels(['Healthy', 'Disease'])
    # Write each class count just above its bar.
    class_counts = panel_pdf['cardio'].value_counts().sort_index()
    for bar_pos, class_count in enumerate(class_counts):
        panel_ax.text(bar_pos, class_count + text_offset, f'{class_count:,}', ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig('../results/12_train_test_split.png', dpi=150, bbox_inches='tight')
plt.show()
print('‚úÖ Chart saved to ../results/12_train_test_split.png')

## 🎯 4. Logistic Regression with K-Fold Cross Validation

In [None]:
# Estimator: logistic regression on the scaled feature vector.
lr = LogisticRegression(
    featuresCol='scaled_features',
    labelCol='cardio',
    maxIter=100,
)

# Search grid: 3 regularisation strengths x 3 elastic-net mixes.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(lr.regParam, [0.001, 0.01, 0.1])
grid_builder = grid_builder.addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
paramGrid = grid_builder.build()

print(f'üìä Total parameter combinations: {len(paramGrid)}')
print('\nüîß Parameters to tune:')
print('  ‚Ä¢ regParam: [0.001, 0.01, 0.1]')
print('  ‚Ä¢ elasticNetParam: [0.0, 0.5, 1.0]')

In [None]:
# K-Fold Cross Validator (K=5)
# Each parameter map in `paramGrid` is scored by ROC-AUC; `cvModel.bestModel`
# is the winning estimator.
evaluator = BinaryClassificationEvaluator(labelCol='cardio', metricName='areaUnderROC')

cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    parallelism=2,  # evaluate up to two parameter maps concurrently
    seed=42,  # fix fold assignment so reruns pick folds reproducibly (matches the split seed)
)

print('üöÄ Training with 5-Fold Cross Validation...')
print('‚è≥ This may take a few minutes...')

cvModel = cv.fit(train_data)
bestModel = cvModel.bestModel

print('\n‚úÖ Training completed!')
print(f'\nüèÜ Best Model Parameters:')
print(f'  ‚Ä¢ regParam: {bestModel.getRegParam()}')
print(f'  ‚Ä¢ elasticNetParam: {bestModel.getElasticNetParam()}')

In [None]:
# Bar chart of the mean CV ROC-AUC for every grid point, best one highlighted.
avg_metrics = cvModel.avgMetrics

# One two-line tick label per parameter map, in grid order.
param_labels = [
    f'reg={grid_point[lr.regParam]}\nelastic={grid_point[lr.elasticNetParam]}'
    for grid_point in paramGrid
]

n_combos = len(avg_metrics)
fig, ax = plt.subplots(figsize=(14, 6))
colors = plt.cm.viridis(np.linspace(0, 0.8, n_combos))
bars = ax.bar(range(n_combos), avg_metrics, color=colors, edgecolor='black')
ax.set_xticks(range(len(param_labels)))
ax.set_xticklabels(param_labels, rotation=45, ha='right', fontsize=9)
ax.set_xlabel('Parameter Combination')
ax.set_ylabel('Average ROC-AUC (5-Fold CV)')
ax.set_title('üìä 5-Fold Cross Validation Results', fontsize=14, fontweight='bold')

# Zoom the y-axis onto the observed score range.
lowest_score, highest_score = min(avg_metrics), max(avg_metrics)
ax.set_ylim([lowest_score - 0.02, highest_score + 0.02])

# Highlight best (first occurrence of the maximum)
best_idx = avg_metrics.index(highest_score)
best_bar = bars[best_idx]
best_bar.set_color('#e74c3c')
best_bar.set_edgecolor('black')
best_bar.set_linewidth(2)

# Score above each bar; the winner is printed in bold.
for bar_pos, score in enumerate(avg_metrics):
    label_weight = 'bold' if bar_pos == best_idx else 'normal'
    ax.text(bar_pos, score + 0.002, f'{score:.4f}', ha='center', fontsize=8, fontweight=label_weight)

plt.tight_layout()
plt.savefig('../results/13_cv_results.png', dpi=150, bbox_inches='tight')
plt.show()
print('‚úÖ Chart saved to ../results/13_cv_results.png')

## 📈 5. Model Evaluation on Test Set

In [None]:
# Score the held-out split with the selected model.
predictions = bestModel.transform(test_data)

# Label-based metrics. Note that precision and recall are the *weighted*
# variants, as requested through metricName.
multiclass_scores = {
    metric_name: MulticlassClassificationEvaluator(
        labelCol='cardio', metricName=metric_name
    ).evaluate(predictions)
    for metric_name in ('accuracy', 'weightedPrecision', 'weightedRecall', 'f1')
}
accuracy = multiclass_scores['accuracy']
precision = multiclass_scores['weightedPrecision']
recall = multiclass_scores['weightedRecall']
f1 = multiclass_scores['f1']

# Ranking metrics from the binary evaluator.
binary_scores = {
    metric_name: BinaryClassificationEvaluator(
        labelCol='cardio', metricName=metric_name
    ).evaluate(predictions)
    for metric_name in ('areaUnderROC', 'areaUnderPR')
}
roc_auc = binary_scores['areaUnderROC']
pr_auc = binary_scores['areaUnderPR']

print('üìä Model Evaluation Metrics:')
print('=' * 40)
print(f'  Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)')
print(f'  Precision: {precision:.4f}')
print(f'  Recall:    {recall:.4f}')
print(f'  F1-Score:  {f1:.4f}')
print(f'  ROC-AUC:   {roc_auc:.4f}')
print(f'  PR-AUC:    {pr_auc:.4f}')

In [None]:
# Visualize metrics: one bar per test-set metric, with a 0.7 reference line.
fig, ax = plt.subplots(figsize=(10, 6))

metrics_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'PR-AUC']
metrics_values = [accuracy, precision, recall, f1, roc_auc, pr_auc]
colors = ['#3498db', '#2ecc71', '#e74c3c', '#9b59b6', '#f39c12', '#1abc9c']

bars = ax.bar(metrics_names, metrics_values, color=colors, edgecolor='black', linewidth=1.5)
ax.set_ylim([0, 1.1])  # headroom above 1.0 for the value labels
ax.set_ylabel('Score')
ax.set_title('üìä Model Evaluation Metrics', fontsize=14, fontweight='bold')
ax.axhline(y=0.7, color='gray', linestyle='--', alpha=0.5, label='Baseline (0.7)')

# Print each value just above its bar.
for bar, val in zip(bars, metrics_values):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02, f'{val:.3f}', ha='center', fontweight='bold', fontsize=11)

# The baseline line carries a label, so draw the legend that displays it
# (it was previously never shown).
ax.legend(loc='upper right')

plt.tight_layout()
plt.savefig('../results/14_evaluation_metrics.png', dpi=150, bbox_inches='tight')
plt.show()
print('‚úÖ Chart saved to ../results/14_evaluation_metrics.png')

## 🎯 6. Confusion Matrix

In [None]:
# Get predictions as pandas. Both columns are cast to int so the true labels
# and the model's predictions share one dtype.
pred_pdf = predictions.select('cardio', 'prediction').toPandas()
y_true = pred_pdf['cardio'].astype(int).values
y_pred = pred_pdf['prediction'].astype(int).values

# Confusion Matrix
# Pinning labels=[0, 1] guarantees a 2x2 matrix (rows = actual, columns =
# predicted) even if one class is absent from both vectors; without it the
# four-way unpacking of cm.ravel() below would fail.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Heatmap - counts
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0],
            xticklabels=['Healthy', 'Disease'], yticklabels=['Healthy', 'Disease'],
            cbar_kws={'shrink': 0.8})
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Actual')
axes[0].set_title('üìä Confusion Matrix (Counts)', fontsize=12, fontweight='bold')

# Heatmap - percentages of all test rows (the four cells sum to 100)
cm_pct = cm.astype('float') / cm.sum() * 100
sns.heatmap(cm_pct, annot=True, fmt='.1f', cmap='Greens', ax=axes[1],
            xticklabels=['Healthy', 'Disease'], yticklabels=['Healthy', 'Disease'],
            cbar_kws={'shrink': 0.8})
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('Actual')
axes[1].set_title('üìä Confusion Matrix (Percentages %)', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('../results/15_confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()
print('‚úÖ Chart saved to ../results/15_confusion_matrix.png')

# Print detailed metrics (row-major order of the 2x2 matrix: tn, fp, fn, tp)
tn, fp, fn, tp = cm.ravel()
print(f'\nüìä Detailed Metrics:')
print(f'  True Negatives:  {tn:,}')
print(f'  False Positives: {fp:,}')
print(f'  False Negatives: {fn:,}')
print(f'  True Positives:  {tp:,}')

## 📈 7. ROC Curve

In [None]:
# Get probability scores
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

# Extract probability of positive class
def extract_prob(v):
    """Return the element at index 1 of ``v`` converted to a Python float."""
    positive_entry = v[1]
    return float(positive_entry)

# Wrap extract_prob as a Spark UDF and pull (label, score) pairs to pandas.
# The UDF is declared DoubleType rather than FloatType so the probabilities
# are not truncated to 32-bit precision before the curves are computed.
from pyspark.sql.types import DoubleType

extract_prob_udf = udf(extract_prob, DoubleType())
pred_with_prob = predictions.withColumn('prob_positive', extract_prob_udf('probability'))
prob_pdf = pred_with_prob.select('cardio', 'prob_positive').toPandas()

# Arrays consumed by the ROC and precision-recall cells.
y_true = prob_pdf['cardio'].values
y_scores = prob_pdf['prob_positive'].values

# ROC points and the area under them, computed from the pandas score arrays.
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
roc_auc_score = auc(fpr, tpr)

roc_color = '#e74c3c'
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, color=roc_color, lw=3, label=f'ROC Curve (AUC = {roc_auc_score:.4f})')
# Diagonal reference line for a random classifier.
ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--', label='Random Classifier')
ax.fill_between(fpr, tpr, alpha=0.3, color=roc_color)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('üìà ROC Curve', fontsize=14, fontweight='bold')
ax.legend(loc='lower right', fontsize=11)
ax.grid(True, alpha=0.3)

roc_chart_path = '../results/16_roc_curve.png'
plt.tight_layout()
plt.savefig(roc_chart_path, dpi=150, bbox_inches='tight')
plt.show()
print('‚úÖ Chart saved to ../results/16_roc_curve.png')

## 📈 8. Precision-Recall Curve

In [None]:
# Precision-recall points from the same label/score arrays used for the ROC
# curve; the area is integrated with recall on the x-axis.
precision_curve, recall_curve, thresholds_pr = precision_recall_curve(y_true, y_scores)
pr_auc_score = auc(recall_curve, precision_curve)

pr_color = '#3498db'
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(recall_curve, precision_curve, color=pr_color, lw=3, label=f'PR Curve (AUC = {pr_auc_score:.4f})')
ax.fill_between(recall_curve, precision_curve, alpha=0.3, color=pr_color)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('Recall', fontsize=12)
ax.set_ylabel('Precision', fontsize=12)
ax.set_title('üìà Precision-Recall Curve', fontsize=14, fontweight='bold')
ax.legend(loc='lower left', fontsize=11)
ax.grid(True, alpha=0.3)

pr_chart_path = '../results/17_precision_recall_curve.png'
plt.tight_layout()
plt.savefig(pr_chart_path, dpi=150, bbox_inches='tight')
plt.show()
print('‚úÖ Chart saved to ../results/17_precision_recall_curve.png')

## 📊 9. Feature Importance

In [None]:
# Get feature coefficients (one per entry of the model's feature vector)
coefficients = bestModel.coefficients.toArray()

# Load feature names, one per line. Blank lines are skipped so a trailing
# newline in the file cannot create a spurious empty name.
feature_names_path = '../data/processed/feature_columns.txt'
try:
    with open(feature_names_path, 'r', encoding='utf-8') as f:
        feature_names = [line.strip() for line in f if line.strip()]
except (OSError, UnicodeDecodeError):
    # Missing or unreadable file: fall back to generic names below.
    feature_names = []

# Generic names are used whenever the file is unavailable or disagrees with
# the model; mismatched lengths would otherwise make the DataFrame constructor
# raise.
if len(feature_names) != len(coefficients):
    if feature_names:
        print(f'Feature name count ({len(feature_names)}) does not match '
              f'coefficient count ({len(coefficients)}); using generic names')
    feature_names = [f'Feature_{i}' for i in range(len(coefficients))]

# Create DataFrame, sorted so the largest |coefficient| comes last
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients,
    'Abs_Coefficient': np.abs(coefficients)
}).sort_values('Abs_Coefficient', ascending=True)

print('üìä Feature Coefficients:')
for _, row in feature_importance.iterrows():
    # Zero counts as non-negative, matching how the chart colours and aligns it.
    sign = '+' if row['Coefficient'] >= 0 else '-'
    print(f"  {sign} {row['Feature']}: {row['Coefficient']:.4f}")

In [None]:
# Horizontal bar chart of the signed coefficients, in the DataFrame's order.
from matplotlib.patches import Patch

fig, ax = plt.subplots(figsize=(12, 8))

# Red for negative coefficients, green for everything else.
colors = []
for coef_value in feature_importance['Coefficient']:
    colors.append('#e74c3c' if coef_value < 0 else '#2ecc71')

ax.barh(feature_importance['Feature'], feature_importance['Coefficient'], color=colors, edgecolor='black')
ax.set_xlabel('Coefficient Value')
ax.set_title('üìä Feature Importance (Logistic Regression Coefficients)', fontsize=14, fontweight='bold')
ax.axvline(x=0, color='black', linewidth=0.8)

# Value label just outside the tip of each bar, on the side the bar points to.
for bar_row, (_, row) in enumerate(feature_importance.iterrows()):
    coef_value = row['Coefficient']
    if coef_value >= 0:
        text_x, text_align = coef_value + 0.01, 'left'
    else:
        text_x, text_align = coef_value - 0.01, 'right'
    ax.text(text_x, bar_row, f"{coef_value:.3f}", va='center', ha=text_align, fontsize=9)

# Add legend
legend_elements = [
    Patch(facecolor='#2ecc71', label='Positive (‚Üë Risk)'),
    Patch(facecolor='#e74c3c', label='Negative (‚Üì Risk)'),
]
ax.legend(handles=legend_elements, loc='lower right')

plt.tight_layout()
plt.savefig('../results/18_feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()
print('‚úÖ Chart saved to ../results/18_feature_importance.png')

## 📊 10. K-Fold Performance Analysis

In [None]:
# Summary statistics over cvModel.avgMetrics. There is one averaged score per
# parameter combination, so the spread shown here is across grid points.
cv_scores = cvModel.avgMetrics
best_cv_score = max(cv_scores)
mean_score = np.mean(cv_scores)
std_score = np.std(cv_scores)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
box_ax, line_ax = axes

# Left panel: box plot of the scores with the best one marked
box_style = dict(facecolor='#3498db', color='black')
median_style = dict(color='red', linewidth=2)
box_ax.boxplot(cv_scores, vert=True, patch_artist=True,
               boxprops=box_style,
               medianprops=median_style)
box_ax.scatter([1], [best_cv_score], color='#e74c3c', s=100, zorder=5, label=f'Best: {best_cv_score:.4f}')
box_ax.set_ylabel('ROC-AUC Score')
box_ax.set_title('üì¶ Cross-Validation Scores Distribution', fontsize=12, fontweight='bold')
box_ax.set_xticklabels(['5-Fold CV'])
box_ax.legend()

# Right panel: score per combination with a mean line and +/- one std band
combo_numbers = range(1, len(cv_scores)+1)
line_ax.plot(combo_numbers, cv_scores, 'o-', color='#3498db', linewidth=2, markersize=8)
line_ax.axhline(y=mean_score, color='#2ecc71', linestyle='--', linewidth=2, label=f'Mean: {mean_score:.4f}')
line_ax.fill_between(combo_numbers, mean_score - std_score, mean_score + std_score, alpha=0.2, color='#2ecc71')
line_ax.set_xlabel('Parameter Combination')
line_ax.set_ylabel('ROC-AUC Score')
line_ax.set_title('üìà CV Scores by Parameter Combination', fontsize=12, fontweight='bold')
line_ax.legend()

plt.tight_layout()
plt.savefig('../results/19_kfold_analysis.png', dpi=150, bbox_inches='tight')
plt.show()
print('‚úÖ Chart saved to ../results/19_kfold_analysis.png')
print(f'\nüìä CV Statistics: Mean={mean_score:.4f}, Std={std_score:.4f}')

## 💾 11. Save Model & Results

In [None]:
# Persist the selected model, replacing any earlier copy at the same path.
model_path = '../model/logistic_regression_model'
model_writer = bestModel.write().overwrite()
model_writer.save(model_path)
print(f'‚úÖ Model saved to {model_path}')

In [None]:
# Write a plain-text report: intercept, per-feature coefficients, and the
# chosen hyperparameters.
section_rule = '=' * 50
with open('../model/model_weights.txt', 'w') as f:
    report_lines = [
        section_rule + '\n',
        'LOGISTIC REGRESSION MODEL WEIGHTS\n',
        section_rule + '\n\n',
        f'Intercept: {bestModel.intercept}\n\n',
        'Coefficients:\n',
    ]
    # One line per (feature name, coefficient) pair.
    report_lines.extend(
        f'  {feat}: {coef:.6f}\n' for feat, coef in zip(feature_names, coefficients)
    )
    report_lines += [
        '\n' + section_rule + '\n',
        'HYPERPARAMETERS\n',
        section_rule + '\n\n',
        f'regParam: {bestModel.getRegParam()}\n',
        f'elasticNetParam: {bestModel.getElasticNetParam()}\n',
    ]
    f.writelines(report_lines)

print('‚úÖ Model weights saved to ../model/model_weights.txt')

In [None]:
# Write the test-set metrics and the confusion-matrix cells to a text report.
metrics_dict = dict(zip(
    ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'PR-AUC'],
    [accuracy, precision, recall, f1, roc_auc, pr_auc],
))

section_rule = '=' * 40
with open('../results/model_metrics.txt', 'w') as f:
    report_lines = [
        section_rule + '\n',
        'MODEL EVALUATION METRICS\n',
        section_rule + '\n\n',
    ]
    # One "name: value" line per metric, in insertion order.
    report_lines.extend(f'{metric}: {value:.4f}\n' for metric, value in metrics_dict.items())
    report_lines += [
        '\n' + section_rule + '\n',
        'CONFUSION MATRIX\n',
        section_rule + '\n\n',
        f'True Negatives:  {tn:,}\n',
        f'False Positives: {fp:,}\n',
        f'False Negatives: {fn:,}\n',
        f'True Positives:  {tp:,}\n',
    ]
    f.writelines(report_lines)

print('‚úÖ Metrics saved to ../results/model_metrics.txt')

## 📊 12. Final Summary

In [None]:
# Closing recap: headline metrics, chosen hyperparameters, and output files.
banner = '=' * 60
summary_lines = [
    banner,
    'üéâ MODELING COMPLETED SUCCESSFULLY!',
    banner,
    '\nüìä Model Performance:',
    f'  ‚Ä¢ Accuracy:  {accuracy*100:.2f}%',
    f'  ‚Ä¢ ROC-AUC:   {roc_auc:.4f}',
    f'  ‚Ä¢ F1-Score:  {f1:.4f}',
    '\nüèÜ Best Hyperparameters:',
    f'  ‚Ä¢ regParam: {bestModel.getRegParam()}',
    f'  ‚Ä¢ elasticNetParam: {bestModel.getElasticNetParam()}',
    '\nüíæ Saved Files:',
    '  ‚Ä¢ model/logistic_regression_model/',
    '  ‚Ä¢ model/model_weights.txt',
    '  ‚Ä¢ results/model_metrics.txt',
    '  ‚Ä¢ results/*.png (8 visualization charts)',
    '\n‚úÖ Ready for deployment!',
]
for summary_line in summary_lines:
    print(summary_line)

In [None]:
# Reminder only: this cell does not stop Spark itself; it prints a hint
# telling the user to call spark.stop() manually when finished.
print('üí° Run spark.stop() when done to release resources')

---
## 📌 Charts Generated

| # | Chart | File |
|---|-------|------|
| 12 | Train/Test Split | 12_train_test_split.png |
| 13 | CV Results | 13_cv_results.png |
| 14 | Evaluation Metrics | 14_evaluation_metrics.png |
| 15 | Confusion Matrix | 15_confusion_matrix.png |
| 16 | ROC Curve | 16_roc_curve.png |
| 17 | Precision-Recall Curve | 17_precision_recall_curve.png |
| 18 | Feature Importance | 18_feature_importance.png |
| 19 | K-Fold Analysis | 19_kfold_analysis.png |