# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# Load data
# dev_data carries the bad_flag target used for training further down;
# val_data is the set that gets scored at the end of the script.
dev_data_path = 'dev_data_to_be_shared.csv'
val_data_path = 'validation_data_to_be_shared.csv'

dev_data, val_data = (
    pd.read_csv(csv_path) for csv_path in (dev_data_path, val_data_path)
)

# Quick overview of data: (rows, columns) of each frame, then the absolute
# number of missing cells per development column.
dev_shape, val_shape = dev_data.shape, val_data.shape
print("Development Data Shape:", dev_shape)
print("Validation Data Shape:", val_shape)

null_counts = dev_data.isnull().sum()
print("Missing values in development data:\n", null_counts)

# 1. Exploratory Data Analysis (EDA)
# Distribution of target variable: one bar per distinct bad_flag value.
sns.countplot(x=dev_data['bad_flag'])
plt.title("Distribution of Target Variable (bad_flag)")
plt.show()

# Pairwise correlations between all numeric columns (the target included).
# Restrict to numeric/boolean columns before calling corr(): since pandas 2.0
# DataFrame.corr() defaults to numeric_only=False and raises on any
# non-numeric column, whereas older versions silently skipped such columns.
numeric_dev_data = dev_data.select_dtypes(include=['number', 'bool'])
correlations = numeric_dev_data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlations, cmap="coolwarm", annot=False, fmt=".2f")
plt.title("Feature Correlations")
plt.show()

# Handling missing values
# Fraction (0..1) of missing entries in each development column.
missing_values = dev_data.isnull().mean()
print("Missing Value Ratio per Column:\n", missing_values[missing_values > 0])

# Drop columns with too many missing values (threshold 50%)
threshold = 0.5
columns_to_drop = missing_values[missing_values > threshold].index
dev_data.drop(columns=columns_to_drop, inplace=True)

# 2. Feature Engineering
# NOTE(review): the imputer, scaler and selector below are fit on the whole
# development set, i.e. before the hold-out split performed on X_dev_selected
# later in this script, so hold-out scores are somewhat optimistic.
X_dev = dev_data.drop(columns=['bad_flag', 'account_number'])
y_dev = dev_data['bad_flag']

# Build the validation matrix from exactly the development feature columns,
# in the same order. Columns dropped above for missingness were only removed
# from dev_data, so taking every validation column would hand the fitted
# transformers a different feature set and make transform() fail.
missing_in_val = X_dev.columns.difference(val_data.columns)
if len(missing_in_val) > 0:
    raise KeyError(
        f"Validation data is missing feature columns: {list(missing_in_val)}"
    )
X_val = val_data[X_dev.columns]

# Impute missing values with the per-column median learned on development data.
# The original indexes are passed through so rows stay aligned with y_dev.
imputer = SimpleImputer(strategy='median')
X_dev_imputed = pd.DataFrame(
    imputer.fit_transform(X_dev), columns=X_dev.columns, index=X_dev.index
)
X_val_imputed = pd.DataFrame(
    imputer.transform(X_val), columns=X_val.columns, index=X_val.index
)

# Standardize features using statistics learned on development data only.
scaler = StandardScaler()
X_dev_scaled = pd.DataFrame(
    scaler.fit_transform(X_dev_imputed),
    columns=X_dev_imputed.columns,
    index=X_dev_imputed.index,
)
X_val_scaled = pd.DataFrame(
    scaler.transform(X_val_imputed),
    columns=X_val_imputed.columns,
    index=X_val_imputed.index,
)

# Feature Selection using SelectKBest
# Cap k at the number of available features so the selector stays valid when
# fewer than 20 columns survive the missing-value filter. Adjust 20 as necessary.
k_features = min(20, X_dev_scaled.shape[1])
select_k_best = SelectKBest(f_classif, k=k_features)
X_dev_selected = select_k_best.fit_transform(X_dev_scaled, y_dev)
selected_features = X_dev.columns[select_k_best.get_support()]
print("Selected Features:", list(selected_features))

# Apply the fitted selector to the validation data so it has the same type and
# column order as X_dev_selected (a plain array under scikit-learn's default
# output configuration); models fit on X_dev_selected are then scored on
# identically shaped input.
X_val_selected = select_k_best.transform(X_val_scaled)

# 3. Model Comparison
# Hold out 20% of the development rows, stratified on the target, to rank the
# candidate models by ROC-AUC.
X_train, X_test, y_train, y_test = train_test_split(
    X_dev_selected,
    y_dev,
    test_size=0.2,
    random_state=42,
    stratify=y_dev,
)

# Candidate models, all seeded for reproducibility. Insertion order matters:
# on an exact AUC tie the earliest entry wins the selection below.
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Random Forest'] = RandomForestClassifier(random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(random_state=42)
models['XGBoost'] = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)

# Fit each candidate on the training split and score it on the hold-out split.
model_scores = {}
for model_name in models:
    candidate = models[model_name]
    candidate.fit(X_train, y_train)
    # Second predict_proba column is used as the ranking score for ROC-AUC.
    holdout_scores = candidate.predict_proba(X_test)[:, 1]
    holdout_auc = roc_auc_score(y_test, holdout_scores)
    model_scores[model_name] = holdout_auc
    print(f"{model_name} ROC-AUC: {holdout_auc:.4f}")

# Select best model: highest hold-out ROC-AUC.
best_model_name, _best_auc = max(model_scores.items(), key=lambda item: item[1])
print(f"Best Model: {best_model_name} with ROC-AUC: {model_scores[best_model_name]:.4f}")

# The winner is the instance already fitted on the training split above.
best_model = models[best_model_name]

# Predict on validation data
# Keep only the second predict_proba column, i.e. the probability of the
# second entry in best_model.classes_ (bad_flag == 1 if the target is 0/1).
val_proba_matrix = best_model.predict_proba(X_val_selected)
val_predictions = val_proba_matrix[:, 1]

# Prepare submission: one row per validation account with its predicted score.
submission = val_data[['account_number']].assign(
    predicted_probability=val_predictions
)
submission.to_csv('validation_predictions.csv', index=False)

# Save model and artifacts
# Persist the fitted model together with every fitted preprocessing step.
import joblib

artifacts = [
    (best_model, f'{best_model_name}_model.pkl'),
    (scaler, 'scaler.pkl'),
    (imputer, 'imputer.pkl'),
    (select_k_best, 'feature_selector.pkl'),
]
for fitted_object, target_file in artifacts:
    joblib.dump(fitted_object, target_file)

print("Validation predictions saved to 'validation_predictions.csv'.")
