In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

print("üöÄ CARDIO PREPROCESSING (Using cardio_clean.csv)")
print("=" * 50)

üöÄ CARDIO PREPROCESSING (Using cardio_clean.csv)


In [3]:
df = pd.read_csv('cardio_clean.csv')
print(f"Clean dataset loaded: {df.shape}")
print("\nColumns:", list(df.columns))
print(f"Cardio positive: {df['cardio'].mean():.1%}")

feature_cols = ['age_years', 'gender', 'height', 'weight', 'bmi', 'ap_lo', 'ap_hi', 
                'cholesterol', 'gluc', 'smoke', 'alco', 'active']
X = df[feature_cols]
y = df['cardio']

print(f"\nFeatures shape: {X.shape}")

Clean dataset loaded: (70000, 14)

Columns: ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio', 'age_years', 'bmi']
Cardio positive: 50.0%

Features shape: (70000, 12)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("‚úÖ SPLIT COMPLETE")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"Train balance: {y_train.mean():.1%} (matches original)")

‚úÖ SPLIT COMPLETE
X_train: (56000, 12)
X_test: (14000, 12)
Train balance: 50.0% (matches original)


In [5]:
numeric_features = ['age_years', 'height', 'weight', 'bmi', 'ap_lo', 'ap_hi']
categorical_features = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

print("üìã Feature Types:")
print(f"Numeric ({len(numeric_features)}): {numeric_features}")
print(f"Categorical ({len(categorical_features)}): {categorical_features}")

üìã Feature Types:
Numeric (6): ['age_years', 'height', 'weight', 'bmi', 'ap_lo', 'ap_hi']
Categorical (6): ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']


In [6]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
], remainder='passthrough')

# Fit and transform
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("‚úÖ FULL PIPELINE APPLIED")
print(f"Final X_train: {X_train_processed.shape}")
print(f"Final X_test:  {X_test_processed.shape}")

‚úÖ FULL PIPELINE APPLIED
Final X_train: (56000, 14)
Final X_test:  (14000, 14)


In [7]:
print("üîç QUALITY CHECK:")
print(f"Numeric mean:  {np.mean(X_train_processed[:, :6]):.3f} ‚úì")
print(f"Numeric std:   {np.std(X_train_processed[:, :6]):.3f} ‚úì")
print(f"No NaN values: {np.isnan(X_train_processed).sum() == 0}")
print(f"Finite values: {np.isfinite(X_train_processed).all()}")

üîç QUALITY CHECK:
Numeric mean:  -0.000 ‚úì
Numeric std:   1.000 ‚úì
No NaN values: True
Finite values: True


In [8]:
train_df = pd.DataFrame(X_train_processed)
train_df['target'] = y_train.values
train_df.to_csv('cardio_train_ml_ready.csv', index=False)

test_df = pd.DataFrame(X_test_processed)
test_df['target'] = y_test.values
test_df.to_csv('cardio_test_ml_ready.csv', index=False)

pipeline_info = pd.DataFrame({
    'numeric_features': [numeric_features],
    'categorical_features': [categorical_features],
    'train_shape': [X_train_processed.shape],
    'test_shape': [X_test_processed.shape]
})
pipeline_info.to_csv('cardio_pipeline_info.csv', index=False)

print("‚úÖ ML-READY FILES SAVED:")
print("‚Ä¢ cardio_train_ml_ready.csv")
print("‚Ä¢ cardio_test_ml_ready.csv") 
print("‚Ä¢ cardio_pipeline_info.csv")

‚úÖ ML-READY FILES SAVED:
‚Ä¢ cardio_train_ml_ready.csv
‚Ä¢ cardio_test_ml_ready.csv
‚Ä¢ cardio_pipeline_info.csv


In [9]:
print("\nüìà SAMPLE TRAIN DATA:")
print(train_df.head())
print(f"\nüéØ READY FOR ML MODELS:")
print("- LogisticRegression")
print("- RandomForestClassifier") 
print("- XGBoost")
print("- Neural Networks")
print("\nüöÄ Load with: pd.read_csv('cardio_train_ml_ready.csv')")
print("üéâ PREPROCESSING 100% COMPLETE!")


üìà SAMPLE TRAIN DATA:
          0         1         2         3         4         5    6    7    8  \
0 -0.197690 -0.284930  0.610019  0.656711 -0.086783 -0.055968  1.0  0.0  0.0   
1 -0.879726 -0.771139 -0.710377 -0.314112 -0.086783 -0.055968  0.0  0.0  0.0   
2 -0.182863  0.079728  1.443954  1.186445  0.016541  0.191632  0.0  1.0  0.0   
3  0.380558 -0.041825  0.610019  0.532441  0.016541  0.129732  0.0  0.0  0.0   
4 -0.494227 -1.014244 -1.544312 -1.006144 -0.153944 -0.179769  0.0  0.0  0.0   

     9   10   11   12   13  target  
0  0.0  0.0  0.0  0.0  0.0       1  
1  0.0  0.0  0.0  0.0  1.0       0  
2  0.0  0.0  0.0  0.0  1.0       1  
3  0.0  0.0  0.0  0.0  1.0       1  
4  0.0  0.0  0.0  0.0  0.0       0  

üéØ READY FOR ML MODELS:
- LogisticRegression
- RandomForestClassifier
- XGBoost
- Neural Networks

üöÄ Load with: pd.read_csv('cardio_train_ml_ready.csv')
üéâ PREPROCESSING 100% COMPLETE!
