In [3]:
# Import the libraries used throughout the notebook (install them beforehand if missing)


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib  # For saving the trained model


In [5]:


# Load the heart-disease CSV. Every field is read as text first so that a
# badly delimited file can still be repaired below before numeric conversion.
# NOTE(review): this is an absolute, machine-specific path; point it at your own copy.
df = pd.read_csv(r"C:\Users\shari\Downloads\heart.csv", delimiter=",", skipinitialspace=True, dtype=str)

# If the parser produced a single column, split each row on commas manually
if df.shape[1] == 1:
    df = df.iloc[:, 0].str.split(',', expand=True)

# Column names the rest of the notebook relies on
expected_columns = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach",
                    "exang", "oldpeak", "slope", "ca", "thal", "target"]

# Assign the names positionally when the column count matches
if df.shape[1] == len(expected_columns):
    df.columns = expected_columns

# Fail here with a clear message instead of a confusing KeyError in a later cell
missing_columns = [name for name in expected_columns if name not in df.columns]
if missing_columns:
    raise ValueError(f"Loaded data is missing expected columns: {missing_columns}")

# Convert data to numeric types; values that cannot be parsed become NaN,
# so report how many were coerced rather than hiding them
nulls_before = int(df.isna().sum().sum())
df = df.apply(pd.to_numeric, errors='coerce')
coerced_count = int(df.isna().sum().sum()) - nulls_before
if coerced_count:
    print(f"Warning: {coerced_count} non-numeric value(s) were coerced to NaN")

# Display dataset info. DataFrame.info() prints its own report and returns
# None, so it must not be wrapped in print() (that would emit a stray "None").
df.info()
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB
None
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0    

In [6]:
# Check for missing values
print("Missing values:\n", df.isnull().sum())

# Identical rows that land in both the train and the test split would inflate
# the test score, so surface them before modelling.
# NOTE(review): decide whether these rows should be dropped - confirm against the data source.
n_duplicates = int(df.duplicated().sum())
if n_duplicates:
    print(f"Warning: {n_duplicates} duplicated row(s) found; test metrics may be optimistic")

# 'sex' and 'thal' are already integer-coded after the numeric conversion, so
# no label encoding is applied. An encoder fitted here would not be persisted
# or applied to the raw values passed at prediction time, which would make
# training and inference inputs inconsistent.

# Features & target
X = df.drop(columns=["target"])  # All 13 features
y = df["target"]  # Target column

# Train-test split (80-20%), stratified on the target. Split BEFORE scaling so
# the test rows never influence the scaler's statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features: fit on the training rows only, then apply the same
# transform to the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix, kept for any code that still reads X_scaled
X_scaled = scaler.transform(X)


Missing values:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [7]:
# Build the XGBoost classifier; random_state pins the seed so reruns match
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=42,
)

# Fit on the training split
xgb_model.fit(X_train, y_train)

# Predict the held-out split and compute every metric up front
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report: overall accuracy, per-class precision/recall/F1, then the confusion matrix
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", conf_matrix)


Accuracy: 1.0000

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       100
           1       1.00      1.00      1.00       105

    accuracy                           1.00       205
   macro avg       1.00      1.00      1.00       205
weighted avg       1.00      1.00      1.00       205


Confusion Matrix:
 [[100   0]
 [  0 105]]


In [8]:
# Persist the fitted classifier and the fitted scaler to the working directory;
# both are needed together to score new rows later
for artifact, filename in (
    (xgb_model, "xgboost_heart_model.pkl"),
    (scaler, "scaler.pkl"),
):
    joblib.dump(artifact, filename)

print("Model and scaler saved successfully!")


Model and scaler saved successfully!


In [9]:
# Restore the persisted scaler and classifier from the working directory
scaler_loaded = joblib.load("scaler.pkl")
xgb_loaded = joblib.load("xgboost_heart_model.pkl")

# Feature names, in the column order of the training feature frame
feature_names = list(X.columns)

# One example patient, with values listed in feature_names order
input_data = [[63, 1, 3, 145, 233, 1, 0, 150, 0, 2.3, 0, 0, 1]]

# Wrap the row in a DataFrame so the scaler receives named columns
input_df = pd.DataFrame(data=input_data, columns=feature_names)

# Apply the saved scaling, then classify
input_scaled = scaler_loaded.transform(input_df)
prediction = xgb_loaded.predict(input_scaled)

print("Prediction:", prediction[0])


Prediction: 1


In [10]:
# Second example patient, with values listed in feature_names order.
# NOTE(review): the fifth value (chol) is 0 - confirm this is an intended test value.
test_input_1 = [[45, 0, 2, 119, 0, 0, 0, 170, 0, 1.2, 1, 0, 2]]  # Example input
test_df_1 = pd.DataFrame(test_input_1, columns=feature_names)

# Use the reloaded scaler together with the reloaded model so this cell
# exercises the persisted artifacts end to end, matching the previous cell
test_scaled_1 = scaler_loaded.transform(test_df_1)
prediction_1 = xgb_loaded.predict(test_scaled_1)
print("Prediction:", prediction_1[0])  # Check output


Prediction: 1
