# Exploratory Data Analysis - Coronary Artery Disease Patient Survival

This notebook contains a comprehensive analysis of the coronary artery disease patient dataset, including data loading, preprocessing, visualization, and statistical analysis.

In [None]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Set plot style.
# NOTE: sns.set() / sns.set_theme() re-apply seaborn's default "darkgrid" style
# unless a style is passed explicitly, which would override the matplotlib
# style sheet selected on the line below. Passing style='whitegrid' keeps the
# intended white-grid look while still scaling the fonts.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_theme(style='whitegrid', font_scale=1.2)

# Display all columns when rendering DataFrames (no column truncation)
pd.set_option('display.max_columns', None)

## 1. Data Loading and Initial Inspection

In [None]:
# Read the raw records from the CSV file (path is relative to the working directory)
df = pd.read_csv('data/data.csv')

# Report the (rows, columns) shape, then preview the first rows as the cell output
print(f"Dataset Shape: {df.shape}")
print("\nFirst few rows:")
df.head(n=5)

In [None]:
# Display dataset info: column names, non-null counts and dtypes.
# df.info() writes its report itself, so it is called bare rather than wrapped in print().
print("Dataset Info:")
df.info()

In [None]:
# Display basic statistics.
# include='all' requests a summary for every column, not only the numeric ones;
# the resulting frame is the cell's last expression, so the notebook renders it.
print("Basic Statistics:")
df.describe(include='all')

## 2. Data Type Checking and Conversion

In [None]:
# Summarise every column: its name, its dtype, and the value held in the first row
example_values = []
for column_name in df.columns:
    column_series = df[column_name]
    # An empty column has no first row, so fall back to None
    example_values.append(None if column_series.empty else column_series.iloc[0])

dtype_df = pd.DataFrame({
    'Column': df.columns,
    'Data Type': df.dtypes.values,
    'Example Value': example_values
})

dtype_df

In [None]:
# Convert data types on a copy so the conversion can be inspected before it replaces df
df_converted = df.copy()

# Columns expected to hold numeric measurements
numerical_cols = ['Age', 'Ejection Fraction', 'Sodium', 'Creatinine', 'Platelets', 
                   'CPK', 'Blood Pressure', 'Hemoglobin', 'Height', 'Weight']

# Convert numerical columns to a numeric dtype.
# errors='coerce' turns every unparseable entry into NaN without raising, so the
# number of values lost that way is counted and reported instead of being
# discarded silently (a column of text labels would otherwise become all-NaN).
for col in numerical_cols:
    if col in df_converted.columns:
        non_null_before = df_converted[col].notna().sum()
        df_converted[col] = pd.to_numeric(df_converted[col], errors='coerce')
        print(f"Converted {col} to numeric type")

        coerced_to_nan = non_null_before - df_converted[col].notna().sum()
        if coerced_to_nan > 0:
            print(f"  WARNING: {coerced_to_nan} non-numeric value(s) in {col} were coerced to NaN")
    else:
        # Later cells iterate over numerical_cols, so make a missing column visible here
        print(f"WARNING: expected numerical column {col} not found in the dataset")

# Convert categorical columns to category type
categorical_cols = ['Gender', 'Smoke', 'Diabetes']
for col in categorical_cols:
    if col in df_converted.columns:
        df_converted[col] = df_converted[col].astype('category')
        print(f"Converted {col} to category type")
    else:
        print(f"WARNING: expected categorical column {col} not found in the dataset")

# Convert target column to integer
# NOTE(review): astype(int) raises if Survive contains missing or non-numeric values — confirm the raw encoding
if 'Survive' in df_converted.columns:
    df_converted['Survive'] = df_converted['Survive'].astype(int)
    print("Converted Survive to integer type")

# Replace the working frame with the converted copy
df = df_converted

# Check data types after conversion
df.dtypes

## 3. Handling Duplicate Rows

In [None]:
# Flag every row that repeats an earlier row (the first occurrence is not flagged)
duplicate_mask = df.duplicated(keep='first')
dup_count = duplicate_mask.sum()
print(f"Number of duplicate rows: {dup_count}")

# Collect the flagged rows; the frame is simply empty when there are none
duplicates = df[duplicate_mask]
if dup_count > 0:
    print("Examples of duplicate rows:")

# A notebook only renders the last top-level expression of a cell, so the preview
# must live here rather than inside the `if` block, where it would be discarded.
duplicates.head()

In [None]:
# Remove repeated rows, keeping a note of how many rows the frame had beforehand
rows_before = len(df)
df_no_dups = df.drop_duplicates()
rows_removed = rows_before - len(df_no_dups)
print(f"Number of rows removed: {rows_removed}")

# Continue the analysis on the de-duplicated frame
df = df_no_dups

## 4. Missing Values Analysis

In [None]:
# Per-column count of null entries and the same figure as a share of all rows
missing_values = df.isnull().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

# Assemble both measures side by side, indexed by column name
missing_summary = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage': missing_percentage
})

# Keep only the features that actually have gaps, worst first
has_missing = missing_summary['Missing Values'] > 0
missing_df = missing_summary[has_missing].sort_values('Percentage', ascending=False)

# Display missing values summary
missing_df

In [None]:
# Visualize missing values (skipped entirely when no feature has missing data)
if not missing_df.empty:
    plt.figure(figsize=(12, 6))
    # missing_df is indexed by feature name, so the index supplies the x categories
    sns.barplot(x=missing_df.index, y='Percentage', data=missing_df)
    plt.title('Percentage of Missing Values by Feature')
    plt.xlabel('Features')
    plt.ylabel('Missing Values (%)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 5. Outlier Analysis

In [None]:
# Collect one summary record per numerical feature
outlier_info = []

# Column order of the summary table; fixed up front so the table keeps its
# columns even if no feature qualifies and the record list stays empty
outlier_columns = ['Feature', 'Lower Bound', 'Upper Bound', 'Lower Outliers',
                   'Upper Outliers', 'Total Outliers', 'Percentage']

# For each numerical column, calculate outliers using the IQR method
for col in numerical_cols:
    # Skip features that are absent or hold no observed values: indexing a missing
    # column raises KeyError, and quartiles of an all-NaN column are NaN.
    # This mirrors the guard used by the numerical-feature and correlation cells.
    if col not in df.columns or df[col].isnull().all():
        continue

    # Calculate Q1, Q3, and IQR
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    # Define outlier bounds (1.5 * IQR beyond the quartiles)
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Count outliers on each side; comparisons with NaN are False, so missing
    # values are never counted as outliers
    outliers_lower = (df[col] < lower_bound).sum()
    outliers_upper = (df[col] > upper_bound).sum()
    total_outliers = outliers_lower + outliers_upper

    # Share of all rows (including rows where this feature is missing)
    percentage = (total_outliers / len(df)) * 100

    # Add to outlier info
    outlier_info.append({
        'Feature': col,
        'Lower Bound': lower_bound,
        'Upper Bound': upper_bound,
        'Lower Outliers': outliers_lower,
        'Upper Outliers': outliers_upper,
        'Total Outliers': total_outliers,
        'Percentage': percentage
    })

# Create a DataFrame with outlier information
outlier_df = pd.DataFrame(outlier_info, columns=outlier_columns)
outlier_df

In [None]:
# Visualize outlier percentages: one bar per feature listed in outlier_df
plt.figure(figsize=(12, 6))
sns.barplot(x='Feature', y='Percentage', data=outlier_df)
plt.title('Percentage of Outliers by Feature')
plt.xlabel('Features')
plt.ylabel('Outliers (%)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 6. Target Variable Analysis

In [None]:
# Analyze target variable distribution.
# value_counts() orders classes by frequency; sort_index() re-orders them by class
# value so the counts line up with the bar order requested from countplot below.
target_counts = df['Survive'].value_counts().sort_index()
target_percent = (target_counts / len(df)) * 100

print("Target Variable Distribution:")
print(f"Survive = 1 (Survived): {target_counts.get(1, 0)} ({target_percent.get(1, 0):.2f}%)")
print(f"Survive = 0 (Not Survived): {target_counts.get(0, 0)} ({target_percent.get(0, 0):.2f}%)")

# Visualize target distribution; `order` pins the bars to the same class order
# as target_counts so bar i always corresponds to the i-th entry of the counts
plt.figure(figsize=(10, 6))
sns.countplot(x=df['Survive'], order=list(target_counts.index))
plt.title('Target Variable Distribution (Survive)')
plt.xlabel('Survive (0 = No, 1 = Yes)')
plt.ylabel('Count')

# Add count labels just above each bar. Anchoring the text's bottom edge at the
# bar height avoids a hard-coded vertical offset that only suits one data scale.
for position, (label, count) in enumerate(target_counts.items()):
    plt.text(position, count, f"{count} ({target_percent.loc[label]:.2f}%)",
             ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 7. Categorical Features Analysis

In [None]:
# Analyze categorical features: distribution, relation to the target, independence test
for col in categorical_cols:
    # Skip features missing from the frame (the conversion cell tolerates their absence too)
    if col not in df.columns:
        continue

    # Count values and express each count as a share of all rows
    value_counts = df[col].value_counts()
    value_percent = (value_counts / len(df)) * 100

    print(f"\nDistribution of {col}:")
    for value, count in value_counts.items():
        # .loc makes the lookup explicitly label-based
        print(f"{value}: {count} ({value_percent.loc[value]:.2f}%)")

    # Visualize distribution
    plt.figure(figsize=(10, 6))
    sns.countplot(x=df[col])
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Analyze relationship with target variable
    if 'Survive' in df.columns:
        # Cross-tabulate category vs. outcome, then normalise each row to percentages
        crosstab = pd.crosstab(df[col], df['Survive'])
        crosstab_percent = crosstab.div(crosstab.sum(axis=1), axis=0) * 100

        # DataFrame.plot opens its own figure unless given an Axes, which would
        # leave a blank figure behind; create the Axes first and pass it in.
        fig, ax = plt.subplots(figsize=(10, 6))
        crosstab_percent.plot(kind='bar', stacked=True, ax=ax)
        ax.set_title(f'Relationship between {col} and Survive')
        ax.set_xlabel(col)
        ax.set_ylabel('Percentage (%)')
        # Labels follow the crosstab column order — assumes both classes 0 and 1 are present
        ax.legend(['Not Survived (0)', 'Survived (1)'])
        fig.tight_layout()
        plt.show()

        # Chi-square test for independence on the raw counts (not the percentages)
        chi2, p, dof, expected = stats.chi2_contingency(crosstab)
        print(f"Chi-square test for {col} vs Survive: chi2={chi2:.4f}, p-value={p:.4f}")

## 8. Numerical Features Analysis

In [None]:
# Profile each numerical feature: summary statistics, distribution, relation to the target
for feature in numerical_cols:
    # Ignore features that are absent from the frame or hold no observed values
    if feature not in df.columns or df[feature].isnull().all():
        continue

    values = df[feature]

    # Descriptive statistics, keyed by the label printed for each one
    stats_dict = {
        'Mean': values.mean(),
        'Median': values.median(),
        'Std Dev': values.std(),
        'Min': values.min(),
        'Max': values.max(),
        'Skewness': values.skew(),
        'Kurtosis': values.kurt()
    }

    print(f"\nStatistics for {feature}:")
    for stat_name, stat_value in stats_dict.items():
        print(f"{stat_name}: {stat_value:.4f}")

    # Histogram with a kernel density estimate overlaid
    plt.figure(figsize=(10, 6))
    sns.histplot(values, kde=True)
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()

    # Everything below compares the feature across outcome groups
    if 'Survive' not in df.columns:
        continue

    # Box plot of the feature for each outcome
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=df, x='Survive', y=feature)
    plt.title(f'Relationship between {feature} and Survive')
    plt.xlabel('Survive (0 = No, 1 = Yes)')
    plt.ylabel(feature)
    plt.tight_layout()
    plt.show()

    # Observed values of the feature within each outcome group
    survived = df.loc[df['Survive'] == 1, feature].dropna()
    not_survived = df.loc[df['Survive'] == 0, feature].dropna()

    # The test needs observations in both groups
    if len(survived) == 0 or len(not_survived) == 0:
        continue

    # equal_var=False selects the unequal-variance (Welch) form of the t-test
    t_stat, p_value = stats.ttest_ind(survived, not_survived, equal_var=False)
    print(f"T-test for {feature} vs Survive: t={t_stat:.4f}, p-value={p_value:.4f}")

## 9. Correlation Analysis

In [None]:
# Numerical features that exist in the frame and have at least one observed value
valid_cols = [
    name for name in numerical_cols
    if name in df.columns and df[name].notna().any()
]

# Include the target so its correlation with every feature is computed as well
if 'Survive' in df.columns:
    valid_cols += ['Survive']

# Pairwise correlations between the selected columns
correlation_matrix = df[valid_cols].corr()

# Annotated heatmap, colour scale centred on zero correlation
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
)
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

In [None]:
# Rank features by the strength of their correlation with the target (when it exists)
if 'Survive' in df.columns:
    # Correlation of every feature with Survive, excluding Survive's self-correlation
    target_corr = correlation_matrix['Survive'].drop('Survive')

    # Same values ordered by magnitude, strongest first
    target_corr_abs = target_corr.abs().sort_values(ascending=False)
    ranked_features = target_corr_abs.index
    ranked_corr = target_corr[ranked_features]

    # One row per feature: signed correlation next to its magnitude
    importance_df = pd.DataFrame({
        'Feature': ranked_features,
        'Correlation': ranked_corr,
        'Absolute Correlation': target_corr_abs.values
    })

    # Bar chart of the magnitudes in ranked order
    plt.figure(figsize=(12, 6))
    sns.barplot(data=importance_df, x='Feature', y='Absolute Correlation')
    plt.title('Feature Importance Based on Correlation with Survival')
    plt.xlabel('Features')
    plt.ylabel('Absolute Correlation')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 10. Summary and Recommendations

Based on our exploratory data analysis, here are the key findings and recommendations:

### Key Findings:
1. The dataset contains 15,000 records and 16 columns; this notebook analyzes 10 of them as numerical features and 3 as categorical features, alongside the `Survive` target.
2. After numeric conversion, missing values are present in 'Ejection Fraction' (100%) and 'Creatinine' (3.33%).
3. Several numerical features show outliers under the 1.5 × IQR rule, particularly 'Creatinine' (10.09%) and 'CPK' (creatine phosphokinase, 9.89%).
4. The target variable (`Survive`) is imbalanced: 67.90% of patients did not survive and 32.10% survived.
5. The features most strongly correlated with survival are Weight (0.41), Creatinine (0.30), and Sodium (-0.22).

### Recommendations:
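1. Handle missing values, especially in 'Ejection Fraction', which is completely missing; first check whether the raw column holds non-numeric values that were turned into NaN by the numeric conversion in Section 2.
2. Address outliers in numerical features, particularly in 'Creatinine' and 'CPK' (creatine phosphokinase).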
3. Consider class imbalance techniques during model training.
4. Focus on the most important features identified in the correlation analysis.
5. Try multiple classification models and compare their performance.