# Logistic Regression Classifier
## Dataset: Heart Disease UCI

### 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef,
    confusion_matrix, classification_report, roc_curve
)
import warnings
warnings.filterwarnings('ignore')

### 2. Load and Preprocess Data

In [None]:
# Read the Heart Disease UCI table from its CSV file (path is relative to the
# working directory), report its dimensions, and preview the first rows.
DATA_PATH = '../heart_disease_uci.csv'
df = pd.read_csv(DATA_PATH)

print(f"Original Dataset Shape: {df.shape}")
df.head()

In [None]:
# Remove the 'id' and 'dataset' columns, which are not used as features.
df = df.drop(columns=['id', 'dataset'])

# Report how many values are missing in each remaining column.
missing_per_column = df.isna().sum()
print("Missing values before:")
print(missing_per_column)

# Keep only the rows that have a value in every column.
df = df.dropna(axis=0, how='any')
print(f"\nDataset Shape after dropping missing values: {df.shape}")

In [None]:
# Derive the binary label from 'num': values above 0 become 1 (disease),
# everything else becomes 0 (no disease). 'num' itself is then discarded.
has_disease = df['num'].gt(0)
df['target'] = has_disease.astype(int)
df = df.drop(columns='num')

# Show how many rows fall into each class.
class_counts = df['target'].value_counts()
print("Target Distribution:")
print(class_counts)

In [None]:
# Integer-encode the categorical columns.
#
# A fresh LabelEncoder is fitted for each column and stored in
# `label_encoders` (keyed by column name), so every column's
# category -> code mapping stays available afterwards, e.g. through
# `label_encoders[col].classes_` or `.inverse_transform(...)`.
# Re-fitting a single shared encoder would retain only the mapping of the
# last column processed.
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
label_encoders = {}
le = LabelEncoder()  # name kept for backward compatibility; rebound per column

for col in categorical_cols:
    if col not in df.columns:
        # Columns that are absent from the frame are skipped silently.
        continue
    le = LabelEncoder()
    # Cast to str so the encoder always sees string labels.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

print("Dataset after encoding:")
df.head()

### 3. Prepare Features and Target

In [None]:
# Separate the label column from the predictors and summarise both.
y = df['target']
X = df.drop(columns='target')

print(f"Features Shape: {X.shape}")
print(f"Target Shape: {y.shape}")
print(f"\nFeature Names: {X.columns.tolist()}")

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed random_state so it is reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

In [None]:
# Standardise the features. The scaler is fitted on the training rows only,
# and that same fitted transform is then applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 4. Train Logistic Regression Model

In [None]:
# Fit a logistic-regression classifier on the standardised training data,
# with max_iter set to 1000 and a fixed random_state.
model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_scaled, y_train
)

print("Model trained successfully!")

### 5. Model Evaluation

In [None]:
# Score the held-out rows: hard class predictions, plus the predicted
# probability taken from the second column of predict_proba.
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]

# Collect the evaluation metrics in display order.
# NOTE(review): precision/recall/F1 use average='weighted'. With that setting
# recall is arithmetically equal to accuracy -- confirm weighted averaging is
# what is wanted for this two-class target.
weighted = {'average': 'weighted'}
metrics = {}
metrics['Accuracy'] = accuracy_score(y_test, y_pred)
metrics['AUC'] = roc_auc_score(y_test, y_prob)
metrics['Precision'] = precision_score(y_test, y_pred, **weighted)
metrics['Recall'] = recall_score(y_test, y_pred, **weighted)
metrics['F1'] = f1_score(y_test, y_pred, **weighted)
metrics['MCC'] = matthews_corrcoef(y_test, y_pred)

# Print a small banner followed by one "name: value" line per metric.
banner = "=" * 50
print(banner)
print("LOGISTIC REGRESSION - EVALUATION METRICS")
print(banner)
for name, score in metrics.items():
    print(f"{name}: {score:.4f}")

In [None]:
# Plot the confusion matrix of the test-set predictions as an annotated
# heatmap (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, y_pred)
class_labels = ['No Disease', 'Disease']

plt.figure(figsize=(8, 6))
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Logistic Regression')
plt.show()

In [None]:
# Print the per-class precision / recall / F1 / support table for the test set.
print("\nClassification Report:")
report = classification_report(
    y_test,
    y_pred,
    target_names=['No Disease', 'Disease'],
)
print(report)

In [None]:
# Draw the test-set ROC curve together with the dashed diagonal reference line.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_label = f'ROC Curve (AUC = {metrics["AUC"]:.4f})'

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', label=roc_label)
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Logistic Regression')
ax.legend()
ax.grid(True)
plt.show()

### 6. Feature Importance (Coefficients)

In [None]:
# Rank the features by the magnitude of their fitted logistic-regression
# coefficient, print the ranking, and plot it as a horizontal bar chart.
# The model was fitted on standardised features, so the coefficients are on
# that standardised scale.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'coefficient': model.coef_[0]
}).sort_values('coefficient', key=abs, ascending=False)

print("Feature Coefficients (sorted by absolute value):")
print(feature_importance)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['feature'], feature_importance['coefficient'])
# barh places the first row at the bottom of the axis; flip it so the
# largest-magnitude coefficient is at the top, matching the printed table.
plt.gca().invert_yaxis()
plt.xlabel('Coefficient')
plt.ylabel('Feature')
plt.title('Logistic Regression - Feature Coefficients')
plt.tight_layout()
plt.show()