In [5]:
!pip install category_encoders
!pip install imblearn

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer # Corrected import
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder # Make sure you have this installed: pip install category_encoders
from imblearn.over_sampling import SMOTE # Make sure you have this installed: pip install imblearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve



In [7]:
# Load the bank dataset from the working directory (comma is read_csv's default
# delimiter) and preview the first rows as the cell's output.
DATASET_PATH = 'dataset_banco.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143.0,yes,no,unknown,5,may,261.0,1,-1.0,0,unknown,no
1,44,technician,single,secondary,no,29.0,yes,no,unknown,5,may,151.0,1,-1.0,0,unknown,no
2,33,entrepreneur,married,secondary,no,2.0,yes,yes,unknown,5,may,76.0,1,-1.0,0,unknown,no
3,47,blue-collar,married,unknown,no,1506.0,yes,no,unknown,5,may,92.0,1,-1.0,0,unknown,no
4,33,unknown,single,unknown,no,1.0,no,no,unknown,5,may,198.0,1,-1.0,0,unknown,no


In [ ]:
# @title 2. EDA (Exploratory Data Analysis) - WITH CLEAN DATA
# Exploratory pass over `df`: structure and summary statistics, a missing-value
# check, the distribution of the 'default' target, and plots for the numerical
# and categorical columns.
print("--- Starting EDA with CLEANED Dataset ---")
print("🎯 NOTE: Dataset has been cleaned - marketing columns removed, data quality fixed")
print("=" * 60)

# Structure and summary statistics.
# df.info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
print("📊 CLEAN DATASET INFORMATION:")
df.info()
display(df.describe(include='all'))

# Missing values per column; the result is reused for the closing summary.
print("\nMissing values per column (post-cleaning):")
missing_check = df.isnull().sum()
display(missing_check)

has_missing_values = missing_check.sum() > 0
if not has_missing_values:
    print("✅ Excellent! No missing values in clean dataset")
else:
    print("⚠️ Warning: Missing values found after cleaning")

# Target distribution ('default' is the target, not 'y').
print("\n🎯 TARGET VARIABLE ANALYSIS (CORRECTED):")
print("Distribution of 'default' (our target for credit risk assessment):")
print(df['default'].value_counts())
print(f"Default rate: {(df['default'] == 'yes').mean()*100:.2f}%")

# Histograms of every numerical column (df.hist creates its own figure).
print("\n📊 Visualizing CLEAN numerical variables...")
df.hist(figsize=(15, 10))
plt.suptitle('Distribution of Numerical Variables (Clean Dataset)', fontsize=16)
plt.tight_layout()
plt.show()

# One count plot per categorical column, split by the target.
# The target itself is skipped so it is not plotted against itself.
print("\n📊 Visualizing CLEAN categorical variables vs Default Risk...")
categorical_columns = [
    col for col in df.select_dtypes(include='object').columns if col != 'default'
]
for col in categorical_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x=col, hue='default', ax=ax)
    ax.set_title(f'Distribution of {col} by Default Risk (Clean Data)')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure; many are created in this loop

# Correlation heatmap of the numerical columns.
print("\n🔥 Correlation analysis of CLEAN numerical variables...")
numeric_corr = df.select_dtypes(include=np.number).corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', fmt=".2f", center=0, ax=ax)
ax.set_title('Correlation Heatmap - Clean Numerical Variables')
plt.show()

# Box plot of 'balance' to expose outliers.
print("\n📦 Outlier analysis for Balance (clean data)...")
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x=df['balance'], ax=ax)
ax.set_title('Boxplot of Balance (Clean Dataset)')
plt.show()

# Closing summary. The missing-value line reflects the check above instead of
# being asserted unconditionally.
print("\n" + "=" * 60)
print("--- EDA Finished with CLEANED Dataset ---")
print("🎯 Key Benefits of Clean Data:")
print("   • ✅ No irrelevant marketing variables")
print("   • ✅ Fixed data quality issues (ages, marital, education)")
if has_missing_values:
    print("   • ⚠️ Missing values still present - see the per-column counts above")
else:
    print("   • ✅ No missing or empty values")
print("   • ✅ Ready for accurate machine learning modeling")
print("=" * 60)

In [ ]:
# @title 3. Preprocessing (CORRECTED - Clean Data, 'default' target, No Data Leakage)
# Builds the modelling inputs from `df`: separates features from the 'default'
# target, makes a stratified train/test split, fits the preprocessing
# (imputation + scaling for numerical columns, one-hot / target encoding for
# categorical columns) on the training split only, and finally oversamples the
# training split with SMOTE.
print("\n--- Starting Preprocessing with CLEANED Dataset ---")
print("🎯 Using clean dataset with proper data flow")
# NOTE(review): neither name is used in this cell; presumably later cells rely
# on them - confirm before moving this import to the notebook's import cell.
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Features: every column except the target.
X = df.drop(columns=['default'])

# Binary target: 1 = default ('yes'), 0 = anything else.
# Labels other than 'yes'/'no' (NaN, typos, different casing) would be counted
# as "no default" without notice, so report them instead of hiding them.
unexpected_labels = set(df['default'].dropna().unique()) - {'yes', 'no'}
missing_label_count = int(df['default'].isna().sum())
if unexpected_labels or missing_label_count:
    print(
        "⚠️ Warning: 'default' contains values other than 'yes'/'no' that are encoded as 0: "
        f"unexpected labels={sorted(unexpected_labels, key=str)}, missing={missing_label_count}"
    )
y = (df['default'] == 'yes').astype('int64')

print("🎯 CORRECTED TARGET: Using 'default' for credit risk assessment")
print(f"Features from CLEAN dataset: {list(X.columns)}")
print(f"Target distribution: {y.value_counts().to_dict()}")
print(f"Default rate: {y.mean()*100:.2f}%")

# Split BEFORE fitting any transformer so no test-set statistics leak into training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Training set class distribution: {Counter(y_train)}")
print(f"Test set class distribution: {Counter(y_test)}")

# Column groups are derived from the training split's dtypes.
categorical_features = X_train.select_dtypes(include='object').columns
numerical_features = X_train.select_dtypes(include=np.number).columns

print("📊 Clean data feature types:")
print(f"   Categorical: {list(categorical_features)}")
print(f"   Numerical: {list(numerical_features)}")

# Numerical columns: median imputation followed by standardisation.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Encoding strategy by cardinality (measured on the training split only):
# up to 10 distinct values -> one-hot, more than 10 -> target encoding.
category_cardinality = X_train[categorical_features].nunique()
onehot_cols = [col for col in categorical_features if category_cardinality[col] <= 10]
target_cols = [col for col in categorical_features if category_cardinality[col] > 10]

print("📊 Encoding strategy for clean data:")
print(f"   One-Hot Encoding: {list(onehot_cols)}")
print(f"   Target Encoding: {list(target_cols)}")

categorical_transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), onehot_cols),
        ('target', TargetEncoder(), target_cols)
    ],
    # This transformer only ever receives the categorical columns (see the
    # 'cat' entry below), and each of them is in onehot_cols or target_cols,
    # so the remainder is empty; numerical columns never pass through here.
    remainder='passthrough'
)

# Full preprocessor: numerical pipeline + categorical encoders.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit on the training split only (y_train is needed by the target encoder),
# then apply the already-fitted transformers to the test split.
X_train_processed = preprocessor.fit_transform(X_train, y_train)
X_test_processed = preprocessor.transform(X_test)

# Oversample the minority class on the TRAINING split only; the test split
# keeps its original class distribution.
print(f"Original training dataset shape: {Counter(y_train)}")
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_processed, y_train)
print(f"Balanced training dataset shape: {Counter(y_train_balanced)}")

print("--- Preprocessing Finished with Clean Data ---")

In [ ]:
# Export df_predict to Google Drive (CORRECTED)
# Attaches the predicted probabilities to a copy of the dataset and writes it
# out as CSV without the index column.
# NOTE(review): `default_probability` is not defined in this cell; presumably it
# holds one value per row of `df`, produced by an earlier cell - confirm.
# NOTE(review): the target path assumes Google Drive is already mounted at
# /content/drive - confirm.
EXPORT_PATH = '/content/drive/MyDrive/df_predict.csv'

df_predict = df.copy()
df_predict['default_probability'] = default_probability

df_predict.to_csv(EXPORT_PATH, index=False)
print("✅ df_predict exported to Google Drive successfully with default_probability column!")