In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import Adam

In [None]:
# Load the semicolon-delimited CSV into a DataFrame and preview the first rows
df = pd.read_csv("cardio_train.csv", sep=";")
print(df.head())  # quick check that the columns were parsed as expected


In [None]:
# Preview the first five rows of the loaded frame
print(df.head(n=5))


In [None]:
# Age arrives in days: divide by 365 and truncate to whole years
df['age'] = df['age'].div(365).astype(int)

In [None]:
# Count the null entries in every column
print("Missing values per column:")
print(df.isna().sum())

In [None]:
# Record the frame's dimensions and descriptive statistics before any row filtering
print(f"Dataset shape before cleaning: {df.shape}")
print(df.describe())  # the min/max rows expose implausible values


In [None]:
# Removing outliers
# Keep only rows where systolic (ap_hi) exceeds diastolic (ap_lo) and both
# readings fall inside their accepted bands (bounds are inclusive).
systolic_above_diastolic = df['ap_hi'] > df['ap_lo']
systolic_in_range = df['ap_hi'].between(90, 180)
diastolic_in_range = df['ap_lo'].between(60, 110)
df = df[systolic_above_diastolic & systolic_in_range & diastolic_in_range]


In [None]:
# Height and weight constraints: keep height in [140, 200] and weight in [45, 150]
height_ok = df['height'].between(140, 200)
weight_ok = df['weight'].between(45, 150)
df = df[height_ok & weight_ok]


In [None]:
# Report the frame's dimensions once the outlier filters have been applied
print(f"Dataset shape after cleaning: {df.shape}")


In [None]:
# Feature Engineering
# Both derived columns are appended in one step; `assign` returns a new frame.
df = df.assign(
    BMI=df['weight'] / (df['height'] / 100) ** 2,  # Body Mass Index
    BP_diff=df['ap_hi'] - df['ap_lo'],             # Blood Pressure Difference
)

In [None]:
# The row identifier is not useful for prediction, so remove that column
df = df.drop('id', axis=1)


In [None]:
# Row filtering leaves gaps in the index; renumber from 0 and discard the old labels
df = df.reset_index(drop=True)


In [None]:
numeric_cols = ['height', 'weight', 'ap_hi', 'ap_lo']
if len(df) == 0:
    # Nothing survived the filters: report it instead of fitting a scaler
    print("No data left after cleaning. Adjust filtering conditions.")
else:
    # Standardize the four continuous measurements and write them back in place
    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


In [None]:
# Feature scaling (verification only)
# height/weight/ap_hi/ap_lo were already standardized by the guarded cell above.
# Fitting a second StandardScaler on those standardized values would overwrite
# `scaler` with near-identity statistics (mean ~0, std ~1) and would run even on
# an empty frame, bypassing that guard. Confirm the result instead of refitting.
scaled_columns = ['height', 'weight', 'ap_hi', 'ap_lo']
print(df[scaled_columns].agg(['mean', 'std']).round(3))


In [None]:
# Show descriptive statistics for the processed frame
print("Processed dataset summary:")
print(df.describe())

In [None]:
# Visualizing correlation matrix
# Draw the pairwise correlations of all columns on an explicit Axes object
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Feature Correlation Matrix")
plt.show()

In [None]:
# Input columns for the model and the label column
# NOTE(review): the engineered BMI and BP_diff columns are not part of this
# list - confirm that leaving them out is intentional.
features = [
    'age', 'gender', 'height', 'weight',
    'ap_hi', 'ap_lo', 'cholesterol', 'gluc',
    'smoke', 'alco', 'active',
]
target = 'cardio'
X, y = df[features], df[target]


In [None]:
# Prepare Features & Target
# Names of the label column and of the model's input columns
target = 'cardio'
features = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
            'cholesterol', 'gluc', 'smoke', 'alco', 'active']

In [None]:
# Slice the feature matrix and the label vector out of the processed frame
X = df.loc[:, features]
y = df.loc[:, target]

In [None]:
# Feature Scaling
# Standardize every selected feature column; X is replaced by the transformed values.
# NOTE(review): the scaler is fit on all rows, before any train/test split.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)

In [None]:
from sklearn.model_selection import train_test_split # Importing train_test_split

In [None]:
# Train-Test Split
# Stratify on the target so both partitions keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# X was standardized earlier with statistics computed over ALL rows, which leaks
# information about the held-out rows into training. Standardization is invariant
# to a prior affine rescaling, so refitting on the training rows alone yields the
# train-only standardization (up to floating-point error); the held-out rows are
# then transformed with those same training statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Build Improved Deep Learning Model
# Seed the Python, NumPy and backend RNGs so weight initialisation, dropout masks
# and batch shuffling are repeatable from run to run.
keras.utils.set_random_seed(42)

model = keras.Sequential([
    # Explicit Input layer: passing `input_shape` to the first Dense layer is
    # deprecated in Keras 3 and emits a warning there.
    keras.Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),  # Dropout to reduce overfitting
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])


In [None]:
# Compile Model
# Adam with a 0.001 learning rate, binary cross-entropy loss, accuracy as the metric
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

In [None]:
# Train Model
# Carve the validation set out of the TRAINING data. Using the test set as
# validation_data lets it steer the learning-rate schedule, so the accuracy later
# reported on that same test set would no longer be an unbiased estimate.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Halve the learning rate whenever validation loss has not improved for 5 epochs
lr_schedule = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=5, factor=0.5)

history = model.fit(
    X_fit, y_fit,
    epochs=50, batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[lr_schedule],
)

In [None]:
# Evaluate Model
# evaluate() yields the loss followed by the compiled metric (accuracy here)
test_loss, test_acc = model.evaluate(x=X_test, y=y_test)
print(f"\n🔥 Improved Deep Learning Model Accuracy: {test_acc * 100:.2f}% 🔥")