# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

class BankMarketingAnalysis:
    """Random Forest workflow for a tabular dataset with a target column ``y``.

    Intended call order: ``perform_eda`` (optional) -> ``preprocess_data`` ->
    ``train_model`` -> ``evaluate_model`` / ``get_insights``. Methods that
    depend on an earlier step raise ``RuntimeError`` when that step was skipped.

    Attributes:
        data: The loaded DataFrame. ``preprocess_data`` label-encodes its
            object-typed columns in place.
        X, y: Feature frame and target series (set by ``preprocess_data``).
        X_train, X_test: Standard-scaled feature arrays.
        y_train, y_test: Target splits matching ``X_train`` / ``X_test``.
        model: The fitted ``RandomForestClassifier`` (set by ``train_model``).
        feature_importance: DataFrame of features sorted by importance.
        label_encoders: ``{column name: fitted LabelEncoder}`` for every
            column that ``preprocess_data`` encoded.
        scaler: The ``StandardScaler`` fitted on the training features.
    """

    def __init__(self, file_path, sep=';'):
        """Load the dataset and reset all pipeline state.

        Args:
            file_path: Path or file-like object passed to ``pd.read_csv``.
            sep: Field delimiter of the file. Defaults to ``';'``.
        """
        self.data = pd.read_csv(file_path, sep=sep)
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.model = None
        self.feature_importance = None
        # Fitted transformers are kept so the encoding can be reversed and
        # unseen data can be transformed the same way as the training data.
        self.label_encoders = {}
        self.scaler = None

    def perform_eda(self):
        """Print summary statistics and show exploratory plots.

        Requires the columns ``age`` and ``y`` to be present in ``self.data``.
        """
        print("Dataset Shape:", self.data.shape)
        print("\nFeature Information:")
        # DataFrame.info() writes its own report and returns None, so wrapping
        # it in print() would emit a stray "None" line.
        self.data.info()

        print("\nClass Distribution:")
        print(self.data['y'].value_counts(normalize=True))

        # Side-by-side: age histogram and target class counts.
        plt.figure(figsize=(15, 6))

        plt.subplot(1, 2, 1)
        sns.histplot(self.data['age'], bins=30)
        plt.title('Age Distribution')

        plt.subplot(1, 2, 2)
        sns.countplot(data=self.data, x='y')
        plt.title('Campaign Outcome Distribution')
        plt.tight_layout()
        plt.show()

        # Correlation heatmap over every numeric column, whatever its width.
        numeric_data = self.data.select_dtypes(include='number')
        plt.figure(figsize=(10, 8))
        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm')
        plt.title('Correlation Matrix')
        plt.show()

    def preprocess_data(self):
        """Encode, split and scale the data for modeling.

        Every object-typed column of ``self.data`` (including ``y`` when it is
        object-typed) is label-encoded in place; the fitted encoders are stored
        in ``self.label_encoders`` so the original labels can be recovered with
        ``inverse_transform``. The data is then split 80/20, stratified on the
        target, and the features are standard-scaled with a scaler fitted on
        the training split only (stored in ``self.scaler``).
        """
        categorical_columns = self.data.select_dtypes(include=['object']).columns

        for column in categorical_columns:
            encoder = LabelEncoder()
            self.data[column] = encoder.fit_transform(self.data[column])
            self.label_encoders[column] = encoder

        # NOTE(review): if this is the UCI Bank Marketing data, its
        # documentation advises excluding `duration` from realistic predictive
        # models -- confirm whether it should stay in the feature set.
        self.X = self.data.drop('y', axis=1)
        self.y = self.data['y']

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42, stratify=self.y
        )

        # Fit on the training split only so test statistics do not leak in.
        self.scaler = StandardScaler()
        self.X_train = self.scaler.fit_transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)

    def train_model(self):
        """Fit the Random Forest and compute the feature-importance table.

        Raises:
            RuntimeError: If ``preprocess_data`` has not been called yet.
        """
        if self.X is None or self.X_train is None or self.y_train is None:
            raise RuntimeError("Call preprocess_data() before train_model().")

        self.model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42,
            class_weight='balanced'
        )
        self.model.fit(self.X_train, self.y_train)

        # X_train is a bare array after scaling, so the column names are taken
        # from the unscaled feature frame, which has the same column order.
        self.feature_importance = pd.DataFrame({
            'feature': self.X.columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)

    def evaluate_model(self):
        """Print a classification report and plot diagnostics on the test split.

        Raises:
            RuntimeError: If ``train_model`` has not been called yet.
        """
        if self.model is None or self.feature_importance is None or self.X_test is None:
            raise RuntimeError("Call train_model() before evaluate_model().")

        y_pred = self.model.predict(self.X_test)

        print("Classification Report:")
        print(classification_report(self.y_test, y_pred))

        # When the target was label-encoded, show the original class names on
        # the axes and pin the label order so ticks and matrix rows line up.
        target_encoder = self.label_encoders.get('y')
        if target_encoder is not None:
            tick_labels = list(target_encoder.classes_)
            cm = confusion_matrix(
                self.y_test, y_pred, labels=list(range(len(tick_labels)))
            )
        else:
            tick_labels = 'auto'
            cm = confusion_matrix(self.y_test, y_pred)

        plt.figure(figsize=(8, 6))
        sns.heatmap(
            cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels, yticklabels=tick_labels
        )
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()

        plt.figure(figsize=(10, 6))
        sns.barplot(data=self.feature_importance.head(10), x='importance', y='feature')
        plt.title('Top 10 Most Important Features')
        plt.show()

    def get_insights(self):
        """Summarize the analysis as a dictionary.

        Returns:
            dict with keys ``class_distribution`` (normalized value counts of
            ``self.data['y']``; the keys are the encoded values once
            ``preprocess_data`` has run), ``top_features`` (names of the five
            most important features) and ``model_performance`` (a dict holding
            the mean 5-fold cross-validation score on the training split under
            ``accuracy``).

        Raises:
            RuntimeError: If ``train_model`` has not been called yet.
        """
        if self.model is None or self.feature_importance is None:
            raise RuntimeError("Call train_model() before get_insights().")

        insights = {
            'class_distribution': self.data['y'].value_counts(normalize=True).to_dict(),
            'top_features': self.feature_importance.head(5)['feature'].tolist(),
            'model_performance': {
                'accuracy': cross_val_score(self.model, self.X_train, self.y_train, cv=5).mean()
            }
        }
        return insights

# Script entry point: run the whole pipeline against the full dataset.
if __name__ == "__main__":
    analysis = BankMarketingAnalysis('data/bank-additional-full.csv')

    # Each stage is announced with its banner and then executed, in order.
    pipeline = (
        ("Starting Exploratory Data Analysis...", analysis.perform_eda),
        ("\nPreprocessing Data...", analysis.preprocess_data),
        ("\nTraining Model...", analysis.train_model),
        ("\nEvaluating Model Performance...", analysis.evaluate_model),
    )
    for banner, run_step in pipeline:
        print(banner)
        run_step()

    print("\nGenerating Insights...")
    insights = analysis.get_insights()
    cv_accuracy = insights['model_performance']['accuracy']

    # Report the headline findings.
    print("\nKey Insights:")
    print(f"- Class Distribution: {insights['class_distribution']}")
    print(f"- Top 5 Most Important Features: {insights['top_features']}")
    print(f"- Model Cross-validation Accuracy: {cv_accuracy:.3f}")