<a href="https://colab.research.google.com/github/Rakesh-kumar-s/Hackathon/blob/main/Nutrition%20Health%20Survey-%20Age%20Prediction%20Summer%20Analytics%202025%2C%20IIT%20Guwahati.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ======================
# 📦 IMPORTS
# ======================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_recall_curve
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# ======================
# 🔍 EDA & PREPROCESSING
# ======================
def load_and_preprocess(data_path):
    # Load data
    df = pd.read_csv(data_path)

    # Example: Create BMI if not present
    if 'weight_kg' in df and 'height_m' in df:
        df['bmi'] = df['weight_kg'] / (df['height_m']**2)

    # Handle missing values
    imputer = KNNImputer(n_neighbors=5)
    num_cols = df.select_dtypes(include=np.number).columns.tolist()
    df[num_cols] = imputer.fit_transform(df[num_cols])

    # Scale numeric features
    scaler = RobustScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])

    return df

# ======================
# 🤖 MODEL TRAINING
# ======================
def train_model(X, y, n_splits=5):
    models = []
    cv_scores = []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Model (XGBoost optimized for F1)
        model = XGBClassifier(
            scale_pos_weight=len(y_train[y_train==0])/len(y_train[y_train==1]),
            max_depth=4,
            subsample=0.8,
            eval_metric='logloss',
            use_label_encoder=False
        )
        model.fit(X_train, y_train)

        # Validate
        y_pred = model.predict(X_val)
        score = f1_score(y_val, y_pred)
        cv_scores.append(score)
        models.append(model)

    print(f'🏆 Mean CV F1: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})')
    return models

# ======================
# 📊 THRESHOLD OPTIMIZATION
# ======================
def optimize_threshold(models, X, y):
    # Get pooled out-of-fold probabilities
    probas = np.zeros(len(X))
    for model in models:
        probas += model.predict_proba(X)[:, 1] / len(models)

    # Find threshold maximizing F1
    precision, recall, thresholds = precision_recall_curve(y, probas)
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
    best_threshold = thresholds[np.argmax(f1_scores)]
    print(f'🎯 Best Threshold: {best_threshold:.4f}')
    return best_threshold

# ======================
# 📤 SUBMISSION
# ======================
def make_submission(models, test_df, threshold, file_name='submission.csv'):
    # Average predictions across models
    probas = np.zeros(len(test_df))
    for model in models:
        probas += model.predict_proba(test_df)[:, 1] / len(models)

    # Apply optimized threshold
    predictions = (probas >= threshold).astype(int)

    # Save
    pd.DataFrame({'ID': test_df.index, 'PREDICTION': predictions}).to_csv(file_name, index=False)
    print(f'✅ Submission saved to {file_name}')

# ======================
# 📌 MAIN EXECUTION
# ======================
if __name__ == '__main__':
    # Load data (replace paths)
    train_df = load_and_preprocess('train.csv')
    test_df = load_and_preprocess('test.csv')

    # Split features/target
    X = train_df.drop('target', axis=1)
    y = train_df['target']

    # Train
    models = train_model(X, y)

    # Optimize threshold (use validation set)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)
    val_models = train_model(X_train, y_train)
    threshold = optimize_threshold(val_models, X_val, y_val)

    # Predict & submit
    make_submission(models, test_df, threshold)

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [2]:
importances = model.feature_importances_
feat_names = X.columns
pd.Series(importances, index=feat_names).sort_values(ascending=False).plot(kind='bar')


NameError: name 'model' is not defined

In [3]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(class_weight='balanced')


In [4]:
ID,PREDICTION
1,0
2,1
...


NameError: name 'ID' is not defined