In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.decomposition import PCA
import warnings
# Suppress every warning for the remainder of the run.
warnings.filterwarnings(action='ignore')

# Plot appearance: apply the ggplot style sheet first, then seaborn's "husl"
# palette. The order is deliberate -- both calls touch the colour cycle, and
# the palette has to be set last to take effect.
plt.style.use('ggplot')
sns.set_palette(palette="husl")

# Read the feature table from the current working directory.
print("Loading dataset...")
df = pd.read_csv(filepath_or_buffer='mitbih_arrhythmia_features.csv')

# 1. Data Exploration
# Console overview of the loaded table: shape, sample rows, column names,
# dtypes, summary statistics, missing-value counts and class balance.
print("\n1. DATASET EXPLORATION")
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())

print("\nColumn names:")
print(df.columns.tolist())

print("\nBasic info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
df.info()

print("\nSummary statistics:")
print(df.describe())

# Check for missing values (per-column count of nulls)
print("\nMissing values:")
print(df.isnull().sum())

# Check class distribution for each of the three label columns
print("\nClass distribution:")
print(df['beat_type'].value_counts())
print(df['beat_type_full'].value_counts())
print(f"Label encoded values: {df['label_encoded'].value_counts()}")

# 2. Data Visualization
print("\n2. DATA VISUALIZATION")

# Bar chart of row counts per 'beat_type' value, written to a PNG and then
# closed so the figure does not linger in memory.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='beat_type', ax=ax)
ax.set_title('Distribution of Beat Types')
ax.set_xlabel('Beat Type')
ax.set_ylabel('Count')
fig.savefig('beat_type_distribution.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# Pairwise correlation heatmap of the numeric feature columns listed below.
numerical_features = [
    'amplitude_max', 'amplitude_min', 'amplitude_mean', 'amplitude_std',
    'r_peak_amplitude', 'signal_energy', 'signal_power', 'skewness',
    'kurtosis', 'zero_crossing_rate', 'rms', 'dominant_frequency',
    'spectral_centroid', 'spectral_bandwidth', 'spectral_rolloff',
    'qrs_duration',
]
corr_matrix = df[numerical_features].corr()

# Diverging colour map centred on zero; each cell is annotated with its value.
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
fig.savefig('correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# Box plots of six selected features, grouped by beat type, on a 2x3 grid.
key_features = [
    'amplitude_max', 'amplitude_mean', 'r_peak_amplitude',
    'signal_energy', 'qrs_duration', 'spectral_centroid',
]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 12))

# Walk the grid row by row, pairing each panel with one feature.
for ax, feature in zip(axes.flat, key_features):
    sns.boxplot(data=df, x='beat_type', y=feature, ax=ax)
    ax.set_title(f'{feature} by Beat Type')

fig.tight_layout()
fig.savefig('feature_distribution_by_beat_type.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 3. Data Preprocessing
print("\n3. DATA PREPROCESSING")

# Model inputs (18 feature columns) and the encoded class label as target.
features = [
    'amplitude_max', 'amplitude_min', 'amplitude_mean', 'amplitude_std',
    'amplitude_range', 'r_peak_amplitude', 'signal_energy', 'signal_power',
    'skewness', 'kurtosis', 'zero_crossing_rate', 'rms', 'dominant_frequency',
    'spectral_centroid', 'spectral_bandwidth', 'spectral_rolloff',
    'qrs_duration', 't_wave_presence',
]
X = df[features]
y = df['label_encoded']

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

# Standardize: the scaler is fitted on the training rows only and the same
# fitted transform is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Dimensionality Reduction with PCA
print("\n4. DIMENSIONALITY REDUCTION WITH PCA")

# First pass: keep every component so the full variance curve is available.
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Cumulative explained variance against component count, with the 95% target
# drawn as a dashed reference line.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('PCA Explained Variance Ratio')
ax.axhline(y=0.95, color='r', linestyle='--', label='95% Explained Variance')
ax.legend()
ax.grid(True)
fig.savefig('pca_explained_variance.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# Smallest component count whose cumulative variance reaches 95%
# (first qualifying zero-based position, converted to a one-based count).
n_components = np.flatnonzero(cumulative_variance >= 0.95)[0] + 1
print(f"Number of components needed to explain 95% of variance: {n_components}")

# Second pass: refit with only the selected number of components, then
# project both partitions with the training-set fit.
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"Training set after PCA: {X_train_pca.shape}")
print(f"Testing set after PCA: {X_test_pca.shape}")

# 5. Model Training
print("\n5. MODEL TRAINING")

# Random forest (100 trees, fixed seed, n_jobs=-1) fitted on the
# PCA-reduced training data.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train_pca, y_train)

# 6. Model Evaluation
print("\n6. MODEL EVALUATION")

# Predict the held-out rows and report overall accuracy.
y_pred = rf_model.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1.
print("\nClassification Report:")
report_text = classification_report(
    y_test,
    y_pred,
    target_names=['Atrial Premature Beat', 'Normal'],
)
print(report_text)

# Confusion matrix rendered as an annotated heatmap; rows are the actual
# class, columns the predicted class.
cm = confusion_matrix(y_test, y_pred)
tick_labels = ['Atrial Premature', 'Normal']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
fig.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 7. Feature Importance
print("\n7. FEATURE IMPORTANCE")

# The forest was trained on principal components, so each importance value
# belongs to a component (PC1, PC2, ...) rather than to an original column.
feature_importance = rf_model.feature_importances_
component_labels = [f'PC{k}' for k in range(1, n_components + 1)]

# Tabulate and order from most to least important for plotting.
importance_df = pd.DataFrame(
    {'Feature': component_labels, 'Importance': feature_importance}
).sort_values('Importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=importance_df, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importance (Principal Components)')
fig.tight_layout()
fig.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 8. Advanced Visualization - PCA Projection
print("\n8. ADVANCED VISUALIZATION")

# Apply PCA for 2D visualization (separate two-component fit on the
# standardized training data, used for plotting only)
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_train_scaled)

# Map encoded labels to display names by sorted label order. This is the same
# convention classification_report(target_names=...) and the confusion-matrix
# tick labels rely on, so the legend agrees with the evaluation output whether
# the codes are 0/1 or 1/2. For codes 1/2 the mapping is exactly
# {1: 'Atrial Premature', 2: 'Normal'}.
class_names = ['Atrial Premature', 'Normal']
label_to_name = dict(zip(sorted(y_train.unique()), class_names))

# Create a DataFrame for plotting. The label column is passed as a plain
# array so all three columns are paired positionally, row for row, instead of
# relying on pandas index alignment with y_train's shuffled index.
pca_df = pd.DataFrame({
    'PC1': X_pca_2d[:, 0],
    'PC2': X_pca_2d[:, 1],
    'Beat Type': y_train.map(label_to_name).to_numpy(),
})

plt.figure(figsize=(10, 8))
sns.scatterplot(x='PC1', y='PC2', hue='Beat Type', data=pca_df, alpha=0.7)
plt.title('PCA Projection of ECG Features')
plt.savefig('pca_projection.png', dpi=300, bbox_inches='tight')
plt.close()

print("\nAnalysis complete! Check the generated plots for insights.")