In [18]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC  # Using LinearSVC instead of SVC for speed
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from joblib import Parallel, delayed
import warnings
warnings.filterwarnings('ignore')

In [19]:
# Load the datasets
train_df = pd.read_csv('/content/RevisedHomesiteTrain1.csv')
test_df = pd.read_csv('/content/RevisedHomesiteTest1.csv')

In [20]:
# Separate features and target
X = train_df.drop(['QuoteConversion_Flag', 'QuoteNumber'], axis=1)
y = train_df['QuoteConversion_Flag']
test_ids = test_df['QuoteNumber']
X_test = test_df.drop('QuoteNumber', axis=1)


In [21]:
# Ensure column alignment
common_columns = list(set(X.columns) & set(X_test.columns))
X = X[common_columns]
X_test = X_test[common_columns]

print(f"Number of features: {len(common_columns)}")

Number of features: 594


In [22]:
# Split the data
print("Splitting data...")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
print("Scaling features...")
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

Splitting data...
Scaling features...


In [23]:
# Feature selection with fewer features
print("Performing feature selection...")
k = 100  # Reduced number of features
selector = SelectKBest(f_classif, k=k)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_val_selected = selector.transform(X_val_scaled)
X_test_selected = selector.transform(X_test_scaled)

Performing feature selection...


In [24]:
# Convert to DataFrame with selected feature names
selected_features = X_train_scaled.columns[selector.get_support()].tolist()
X_train_selected = pd.DataFrame(X_train_selected, columns=selected_features)
X_val_selected = pd.DataFrame(X_val_selected, columns=selected_features)
X_test_selected = pd.DataFrame(X_test_selected, columns=selected_features)



In [25]:
# Initialize optimized base models
base_models = {
    'mlp': MLPClassifier(hidden_layer_sizes=(50,), max_iter=100, random_state=42),
    'svm': LinearSVC(random_state=42),  # Faster than SVC
    'dt': DecisionTreeClassifier(max_depth=5, random_state=42),
    'rf': RandomForestClassifier(n_estimators=50, max_depth=5, n_jobs=-1, random_state=42),
    'knn': KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
}

In [29]:
# Function to train a single model
def train_model(name, model, X_train, y_train, X_val):
    model.fit(X_train, y_train)
    if hasattr(model, 'predict_proba'):
        pred_proba = model.predict_proba(X_val)[:, 1]
    else:
        # For LinearSVC, convert decision function to pseudo-probabilities
        decision_values = model.decision_function(X_val)
        pred_proba = 1 / (1 + np.exp(-decision_values))
    return name, model, pred_proba


In [30]:
# Function to train models with multiple SMOTE ratios
def train_with_multiple_smote_ratios(X_train, y_train, X_val, y_val, smote_ratios=[0.5, 0.75, 1.0]):
    all_results = {}
    best_score = 0
    best_ratio = None
    best_predictions = None
    best_models = None

    for ratio in smote_ratios:
        print(f"\nTraining with SMOTE ratio: {ratio}")
        smote = SMOTE(sampling_strategy=ratio, random_state=42, n_jobs=-1)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

        # Train models in parallel
        results = Parallel(n_jobs=-1)(
            delayed(train_model)(name, model, X_train_resampled, y_train_resampled, X_val)
            for name, model in base_models.items()
        )

        predictions = {}
        trained_models = {}
        ratio_scores = []

        for name, model, pred_proba in results:
            predictions[name] = pred_proba
            trained_models[name] = model
            score = roc_auc_score(y_val, pred_proba)
            ratio_scores.append(score)
            print(f"{name} ROC AUC: {score:.4f}")
            # Store results for this ratio
        all_results[ratio] = {
            'predictions': predictions,
            'models': trained_models,
            'scores': ratio_scores,
            'mean_score': np.mean(ratio_scores)
        }

        # Update best results if current ratio performs better
        if all_results[ratio]['mean_score'] > best_score:
            best_score = all_results[ratio]['mean_score']
            best_ratio = ratio
            best_predictions = predictions
            best_models = trained_models

    print(f"\nResults for all SMOTE ratios:")
    for ratio in smote_ratios:
        print(f"SMOTE ratio {ratio}: Mean ROC AUC = {all_results[ratio]['mean_score']:.4f}")

    print(f"\nBest SMOTE ratio: {best_ratio} (Mean ROC AUC: {best_score:.4f})")

    return best_predictions, best_models, all_results, best_ratio

In [31]:
# Train models with different SMOTE ratios
print("Training models with different SMOTE ratios...")
smote_ratios = [0.5, 0.75, 1.0]
predictions, trained_models, all_smote_results, best_ratio = train_with_multiple_smote_ratios(
    X_train_selected, y_train,
    X_val_selected, y_val,
    smote_ratios
)

Training models with different SMOTE ratios...

Training with SMOTE ratio: 0.5
mlp ROC AUC: 0.9383
svm ROC AUC: 0.9354
dt ROC AUC: 0.9111
rf ROC AUC: 0.9013
knn ROC AUC: 0.8845

Training with SMOTE ratio: 0.75
mlp ROC AUC: 0.9354
svm ROC AUC: 0.9355
dt ROC AUC: 0.9108
rf ROC AUC: 0.9035
knn ROC AUC: 0.8802

Training with SMOTE ratio: 1.0
mlp ROC AUC: 0.9345
svm ROC AUC: 0.9355
dt ROC AUC: 0.9097
rf ROC AUC: 0.9012
knn ROC AUC: 0.8784

Results for all SMOTE ratios:
SMOTE ratio 0.5: Mean ROC AUC = 0.9141
SMOTE ratio 0.75: Mean ROC AUC = 0.9131
SMOTE ratio 1.0: Mean ROC AUC = 0.9119

Best SMOTE ratio: 0.5 (Mean ROC AUC: 0.9141)


In [39]:
# Create meta-features using predictions from best models
meta_features = np.column_stack([predictions[name] for name in base_models.keys()])

# Simple hyperparameter optimization using a smaller set of parameters
print("\nPerforming simplified hyperparameter optimization...")

# List of configurations to try
configs = [
    {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'},
    {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'},
    {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
]

# Try each configuration and keep track of the best one
best_score = 0
best_model = None
best_params = None

for params in configs:
    print(f"Trying parameters: {params}")
    meta_model = LogisticRegression(**params, max_iter=200)
    meta_model.fit(meta_features, y_val)
    score = meta_model.score(meta_features, y_val)

    if score > best_score:
        best_score = score
        best_model = meta_model
        best_params = params

print("\nBest meta-model parameters:", best_params)
print("Best meta-model score:", best_score)

# Generate test predictions using best models
print("\nGenerating final predictions...")
meta_features_test = np.zeros((X_test_selected.shape[0], len(base_models)))

# Use trained_models from the previous step instead of accessing smote_results
for i, (name, model) in enumerate(trained_models.items()):
    # Check if model has predict_proba method
    if hasattr(model, 'predict_proba'):
        meta_features_test[:, i] = model.predict_proba(X_test_selected)[:, 1]
    else:
        # For models like LinearSVC, use decision_function and convert to pseudo-probabilities
        decision_values = model.decision_function(X_test_selected)
        # Convert to probabilities using sigmoid function
        meta_features_test[:, i] = 1 / (1 + np.exp(-decision_values))

# Final predictions using best meta-model
final_predictions = best_model.predict_proba(meta_features_test)[:, 1]

# Create submission file
submission = pd.DataFrame({
    'QuoteNumber': test_ids,
    'QuoteConversion_Flag': final_predictions
})
submission.to_csv('stacked_submission.csv', index=False)

# Create final performance summary
final_performance = pd.DataFrame({
    'Model': list(base_models.keys()) + ['Stacked Model'],
    'ROC AUC Score': [roc_auc_score(y_val, predictions[name]) for name in base_models.keys()] +
                    [best_score],
    'SMOTE Ratio': [best_ratio] * (len(base_models) + 1)
})

# Save final performance summary
final_performance.to_excel('final_model_performance.xlsx', index=False)

print("\nFinal Performance Summary:")
print(final_performance)


Performing simplified hyperparameter optimization...
Trying parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Trying parameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
Trying parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}

Best meta-model parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best meta-model score: 0.9143846153846154

Generating final predictions...

Final Performance Summary:
           Model  ROC AUC Score  SMOTE Ratio
0            mlp       0.938334          0.5
1            svm       0.935449          0.5
2             dt       0.911051          0.5
3             rf       0.901272          0.5
4            knn       0.884477          0.5
5  Stacked Model       0.914385          0.5


In [40]:
# Generate test predictions
print("Generating test predictions...")
meta_features_test = np.zeros((X_test_selected.shape[0], len(base_models)))
for i, (name, model) in enumerate(trained_models.items()):
    if hasattr(model, 'predict_proba'):
        meta_features_test[:, i] = model.predict_proba(X_test_selected)[:, 1]
    else:
        decision_values = model.decision_function(X_test_selected)
        meta_features_test[:, i] = 1 / (1 + np.exp(-decision_values))

# Make final predictions
final_predictions = meta_model.predict_proba(meta_features_test)[:, 1]

Generating test predictions...


In [41]:
# Create submission file
print("Creating submission file...")
submission = pd.DataFrame({
    'QuoteNumber': test_ids,
    'QuoteConversion_Flag': final_predictions
})
submission.to_csv('stacked_submission.csv', index=False)

Creating submission file...


In [42]:
# Create performance summary
print("Creating performance summary...")
performance_summary = pd.DataFrame({
    'Model': list(base_models.keys()) + ['Stacked Model'],
    'ROC AUC Score': [roc_auc_score(y_val, predictions[name]) for name in base_models.keys()] +
                    [roc_auc_score(y_val, meta_model.predict_proba(meta_features)[:, 1])],
    'SMOTE Strategy': [0.75] * (len(base_models) + 1)
})

# Save performance summary
performance_summary.to_excel('model_performance.xlsx', index=False)

print("\nPerformance Summary:")
print(performance_summary)

print("\nProcess completed successfully!")

Creating performance summary...

Performance Summary:
           Model  ROC AUC Score  SMOTE Strategy
0            mlp       0.938334            0.75
1            svm       0.935449            0.75
2             dt       0.911051            0.75
3             rf       0.901272            0.75
4            knn       0.884477            0.75
5  Stacked Model       0.947056            0.75

Process completed successfully!
