# In [None]:
# 1. Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.impute import SimpleImputer

# 2. Read the raw dataset from disk
df = pd.read_csv('data.csv')

# 3. First look at the data: size, missing values and target balance
bankrupt_flag = df['Bankrupt?']
missing_per_column = df.isnull().sum()
print("Dataset Shape:", df.shape)
print("\nMissing Values:\n", missing_per_column)
print("\nClass Distribution:\n", bankrupt_flag.value_counts())
print("\nClass Distribution Percentage:\n", bankrupt_flag.value_counts(normalize=True))

# 4. Bar chart of how many rows fall into each target class
_, class_ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='Bankrupt?', ax=class_ax)
class_ax.set_title('Class Distribution')
plt.show()

# 5. Handle class imbalance using SMOTE
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

# 6. Split features and target
X = df.drop(columns='Bankrupt?')
y = df['Bankrupt?']

# 7. Hold out the test set before applying SMOTE (to prevent data leakage).
# stratify=y keeps the class proportions of the full dataset in both the
# training and the test partition; without it the share of the minority class
# in the 20% hold-out is left to chance, which makes the per-class metrics
# reported later unstable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 8. Preprocessing pipeline: standardize the features, then oversample with SMOTE
scaler_step = ('scaler', StandardScaler())
smote_step = ('smote', SMOTE(random_state=42))
pipeline = Pipeline(steps=[scaler_step, smote_step])

# 9. Fit the pipeline on the training partition only and keep the resampled
# training features and labels it returns
X_train_processed, y_train_processed = pipeline.fit_resample(X_train, y_train)

# 10. Train multiple models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Candidate classifiers keyed by display name; every one is seeded with the
# same random_state so repeated runs give the same results.
models = {
    display_name: estimator_cls(random_state=42)
    for display_name, estimator_cls in (
        ('Logistic Regression', LogisticRegression),
        ('Decision Tree', DecisionTreeClassifier),
        ('Random Forest', RandomForestClassifier),
        ('Gradient Boosting', GradientBoostingClassifier),
    )
}

# 11. Train and evaluate each model
# The models are fitted on standardized features, so the held-out rows must go
# through the same, already fitted scaler before they are scored. They are
# only scaled, never resampled: SMOTE applies to training data alone.
X_test_processed = pipeline.named_steps['scaler'].transform(X_test)

results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")

    # Cross-validation runs on the original training rows with scaling and
    # SMOTE refitted inside every fold. Cross-validating the pre-resampled
    # matrix would place synthetic neighbours of the same minority samples on
    # both sides of a fold boundary and inflate the scores. cross_val_score
    # clones the pipeline, so `model` itself stays unfitted here.
    cv_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('model', model),
    ])
    # F1 matches the metric used for the grid search further down; accuracy
    # says little once the validation folds keep their original class balance.
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='f1')
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Average CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

    # Train the final model on the full resampled training set
    model.fit(X_train_processed, y_train_processed)
    y_pred = model.predict(X_test_processed)

    print(f"\nClassification Report for {name}:")
    print(classification_report(y_test, y_pred))

    # Test-set accuracy, kept for the model comparison printed at the end
    results[name] = model.score(X_test_processed, y_test)

# 12. Rank the input columns by the importance the fitted Random Forest assigns them
rf_model = models['Random Forest']
importance_table = pd.DataFrame(
    {
        'feature': X.columns,
        'importance': rf_model.feature_importances_,
    }
)
feature_importance = importance_table.sort_values(by='importance', ascending=False)

# 13. Horizontal bar chart of the 20 highest-ranked features
top_features = feature_importance.head(20)
importance_fig, importance_ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_features, x='importance', y='feature', ax=importance_ax)
importance_ax.set_title('Top 20 Most Important Features')
importance_ax.set_xlabel('Importance Score')
importance_fig.tight_layout()
plt.show()

# 14. ROC Curves
# The fitted models expect standardized inputs, so the held-out rows are passed
# through the scaler that was fitted on the training data before scoring.
# Feeding raw X_test would put the features on a different scale from the one
# the models were trained on and make the curves meaningless.
X_test_processed = pipeline.named_steps['scaler'].transform(X_test)

plt.figure(figsize=(10, 8))
for name, model in models.items():
    # Probability of the positive class (second column of predict_proba)
    y_pred_proba = model.predict_proba(X_test_processed)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

# Diagonal reference line: performance of a random classifier
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Different Models')
plt.legend()
plt.show()

# 15. Hyperparameter tuning for best model
# Assuming Random Forest performed best
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Scaling and SMOTE live inside the tuned estimator so they are refitted on
# the training part of every fold. Searching over the pre-resampled matrix
# would leak synthetic copies of minority samples into the validation folds
# and report an over-optimistic F1.
tuning_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
])

grid_search = GridSearchCV(
    tuning_pipeline,
    # Pipeline parameters are addressed as "<step name>__<parameter>"
    {f'rf__{param}': values for param, values in param_grid.items()},
    cv=5,
    scoring='f1',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# Strip the step prefix so the report shows plain Random Forest parameter names
best_rf_params = {
    param.split('__', 1)[1]: value
    for param, value in grid_search.best_params_.items()
}
print("\nBest parameters:", best_rf_params)
print("Best cross-validation score:", grid_search.best_score_)

# 16. Final model evaluation with best parameters
# best_model is the whole refitted pipeline (scaler -> SMOTE -> forest). At
# prediction time the sampler is skipped and the scaler is applied, so the raw
# X_test is the correct input here.
best_model = grid_search.best_estimator_
y_pred_final = best_model.predict(X_test)
print("\nFinal Model Classification Report:")
print(classification_report(y_test, y_pred_final))

# 17. Print a short summary of the run: best-scoring model, leading features,
# and the test-set score of every model
print("\nKey Findings:")
top_model_name = max(results, key=results.get)
print("1. Best performing model:", top_model_name)
print("\n2. Top 10 most important features:")
top_ten_features = feature_importance.head(10)
print(top_ten_features.to_string())
print("\n3. Model comparison:")
for model_name, test_score in results.items():
    print(f"{model_name}: {test_score:.3f}")