In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle
import warnings
warnings.filterwarnings('ignore')

def comprehensive_feature_engineering(data):
    """
    Fixed feature engineering that handles single-row predictions
    """
    df = data.copy()
    
    # Extract titles
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\\\\.', expand=False)
    title_mapping = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Rev': 'Officer', 'Dr': 'Officer', 'Col': 'Officer', 'Major': 'Officer',
        'Capt': 'Officer', 'Don': 'Royalty', 'Dona': 'Royalty', 'Sir': 'Royalty',
        'Lady': 'Royalty', 'Countess': 'Royalty', 'Jonkheer': 'Royalty'
    }
    df['Title_Grouped'] = df['Title'].map(title_mapping).fillna('Other')
    
    # Handle missing values
    age_by_title = df.groupby('Title_Grouped')['Age'].median()
    for title in age_by_title.index:
        mask = (df['Age'].isnull()) & (df['Title_Grouped'] == title)
        df.loc[mask, 'Age'] = age_by_title[title]
    
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    
    # Create features
    df['Family_Size'] = df['SibSp'] + df['Parch'] + 1
    df['Is_Alone'] = (df['Family_Size'] == 1).astype(int)
    df['Small_Family'] = (df['Family_Size'].between(2, 4)).astype(int)
    df['Large_Family'] = (df['Family_Size'] > 4).astype(int)
    df['Fare_Per_Person'] = df['Fare'] / df['Family_Size']
    
    # 🔧 FIXED: Handle fare binning for single predictions
    try:
        # Try quantile-based binning first
        df['Fare_Bin'] = pd.qcut(df['Fare'], 4, labels=['Low', 'Medium', 'High', 'Very_High'])
    except ValueError:
        # If that fails (single value), use manual bins based on historical data
        df['Fare_Bin'] = pd.cut(df['Fare'], 
                               bins=[0, 7.91, 14.454, 31, np.inf], 
                               labels=['Low', 'Medium', 'High', 'Very_High'])
    
    # Age bins
    df['Age_Bin'] = pd.cut(df['Age'], 
                          bins=[0, 12, 18, 35, 60, 100], 
                          labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])
    df['Is_Child'] = (df['Age'] < 16).astype(int)
    
    # 🔧 FIXED: Handle cabin extraction properly
    df['Has_Cabin'] = df['Cabin'].notna().astype(int)
    df['Cabin'] = df['Cabin'].fillna('Unknown')
    df['Deck'] = df['Cabin'].astype(str).str[0]
    df['Ticket_Len'] = df['Ticket'].astype(str).apply(len)
    
    # Interaction features
    df['Age_Class'] = df['Age'] * df['Pclass']
    df['Fare_Class'] = df['Fare'] / df['Pclass']
    df['Family_Fare'] = df['Family_Size'] * df['Fare']
    
    # Encoding
    df['Sex_binary'] = df['Sex'].map({'male': 0, 'female': 1})
    embarked_mapping = {'S': 0, 'Q': 1, 'C': 2}
    df['Embarked_ordinal'] = df['Embarked'].map(embarked_mapping)
    
    # One-hot encoding
    df = pd.get_dummies(df, columns=['Title_Grouped', 'Fare_Bin', 'Age_Bin', 'Deck'], prefix_sep='_')
    
    return df

# Train and save models
def train_and_save_models():
    """Train and save all three models"""
    
    print("🚢 Training Titanic Survival Prediction Models...")
    
    # Load data
    df = pd.read_csv('C:\\Users\\prana\\Desktop\\Uni\\Sem 5\\AI_Hackathon\\threeD_Titanic_Classification\\data\\Titanic-Dataset.csv')
    
    # Apply feature engineering
    df_engineered = comprehensive_feature_engineering(df)
    
    # Define features
    numeric_base_features = ['Pclass', 'Sex_binary', 'Age', 'Fare', 'Family_Size', 'Is_Alone', 
                            'Has_Cabin', 'Embarked_ordinal', 'Age_Class', 'Fare_Class', 
                            'Family_Fare', 'Is_Child', 'Small_Family', 'Large_Family', 
                            'Fare_Per_Person', 'Ticket_Len']
    
    onehot_features = [col for col in df_engineered.columns if col.startswith(
        ('Title_Grouped_', 'Fare_Bin_', 'Age_Bin_', 'Deck_'))]
    
    feature_columns = numeric_base_features + onehot_features
    
    X = df_engineered[feature_columns]
    y = df_engineered['Survived']
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # 1. Random Forest
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    rf_data = {
        'model': rf_model,
        'feature_columns': feature_columns
    }
    
    with open('titanic_rf_model.pkl', 'wb') as f:
        pickle.dump(rf_data, f)
    
    # 2. Logistic Regression (with scaling)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    lr_model = LogisticRegression(max_iter=1000, random_state=42)
    lr_model.fit(X_train_scaled, y_train)
    
    lr_data = {
        'model': lr_model,
        'scaler': scaler,
        'feature_columns': feature_columns
    }
    
    with open('titanic_lr_model.pkl', 'wb') as f:
        pickle.dump(lr_data, f)
    
    # 3. Gradient Boosting
    gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
    gb_model.fit(X_train, y_train)
    
    gb_data = {
        'model': gb_model,
        'feature_columns': feature_columns
    }
    
    with open('titanic_gb_model.pkl', 'wb') as f:
        pickle.dump(gb_data, f)
    
    print(f"✅ All models trained and saved!")
    print(f"📊 Random Forest Test Accuracy: {rf_model.score(X_test, y_test):.4f}")
    print(f"📊 Logistic Regression Test Accuracy: {lr_model.score(X_test_scaled, y_test):.4f}")
    print(f"📊 Gradient Boosting Test Accuracy: {gb_model.score(X_test, y_test):.4f}")

def predict_from_form(pclass, gender, age, sibsp, parch, fare, embarked, model_type='Random Forest'):
    """
    🎯 MAIN PREDICTION FUNCTION FOR YOUR FORM
    
    Parameters (exactly matching your form):
    - pclass: 1, 2, or 3
    - gender: 'Male' or 'Female'
    - age: float
    - sibsp: int (Number of Siblings/Spouses)
    - parch: int (Number of Parents/Children)
    - fare: float (Fare Paid)
    - embarked: 'Southampton', 'Cherbourg', or 'Queenstown'
    - model_type: 'Random Forest', 'Logistic Regression', or 'Gradient Boosting'
    """
    
    try:
        # Load the selected model
        model_files = {
            'Random Forest': 'titanic_rf_model.pkl',
            'Logistic Regression': 'titanic_lr_model.pkl', 
            'Gradient Boosting': 'titanic_gb_model.pkl'
        }
        
        with open(model_files[model_type], 'rb') as f:
            model_data = pickle.load(f)
        
        # Convert form inputs
        embarked_map = {'Southampton': 'S', 'Cherbourg': 'C', 'Queenstown': 'Q'}
        embarked_code = embarked_map[embarked]
        
        # Create input dataframe
        input_data = pd.DataFrame({
            'Pclass': [pclass],
            'Sex': [gender.lower()],
            'Age': [age],
            'SibSp': [sibsp],
            'Parch': [parch],
            'Fare': [fare],
            'Embarked': [embarked_code],
            'Name': ['User, Mr. Test'],
            'Cabin': [np.nan],
            'Ticket': ['12345']
        })
        
        # Apply feature engineering
        processed_data = comprehensive_feature_engineering(input_data)
        
        # Get model components
        model = model_data['model']
        feature_columns = model_data['feature_columns']
        
        # Ensure all feature columns exist
        for col in feature_columns:
            if col not in processed_data.columns:
                processed_data[col] = 0
        
        # Select features in correct order
        X_input = processed_data[feature_columns].astype(float)
        
        # Apply scaling for Logistic Regression
        if model_type == 'Logistic Regression':
            scaler = model_data['scaler']
            X_input = scaler.transform(X_input)
        
        # Make prediction
        prediction = model.predict(X_input)[0]
        probability = model.predict_proba(X_input)[0][1]  # Probability of survival
        
        return {
            'model_used': model_type,
            'survival_prediction': 'Survived' if prediction == 1 else 'Not Survived',
            'survival_probability': round(probability, 3),
            'death_probability': round(1 - probability, 3),
            'emoji': '✅' if prediction == 1 else '☠️'
        }
        
    except Exception as e:
        return {
            'error': f"Prediction failed: {str(e)}",
            'survival_prediction': 'Error',
            'survival_probability': 0.0
        }

# Train models first (run this once)
train_and_save_models()


🚢 Training Titanic Survival Prediction Models...
✅ All models trained and saved!
📊 Random Forest Test Accuracy: 0.7821
📊 Logistic Regression Test Accuracy: 0.8045
📊 Gradient Boosting Test Accuracy: 0.7709
