<a href="https://colab.research.google.com/github/Syedshahaba/Syedshahaba/blob/main/Age%20prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

#=============================================================================

# FIXED NUTRITION AGE PREDICTION MODEL - AI PLANET SUBMISSION

#=============================================================================

class NutritionAgePredictor:
    """Predict a person's age from nutrition and lifestyle features.

    The class can generate synthetic training/test data, train several
    scikit-learn regressors, remember the one with the lowest hold-out MAE
    and use it to predict ages (clipped to the 18-75 range).

    Attributes:
        models: Mapping of model name to fitted estimator from the most
            recent ``train_models`` call.
        scaler: ``StandardScaler`` fitted on the training split; only the
            linear models are fed scaled inputs.
        base_feature_columns: The eight raw input column names.
        engineered_features: Names of derived columns currently in use
            (empty when feature engineering is disabled).
        all_feature_columns: Base plus engineered column names, in the
            order the models were trained on.
        feature_columns: Copy of ``all_feature_columns``.
        best_model: Name of the selected model, or ``None`` before training.
    """

    # Models trained on standardised inputs; the others see raw values.
    _SCALED_MODELS = ('Ridge', 'LinearRegression')
    # Models that expose ``feature_importances_``.
    _TREE_MODELS = ('RandomForest', 'GradientBoosting')
    # Derived columns, in the order they are appended to a frame.
    _ENGINEERED_FEATURES = (
        'protein_ratio', 'carb_ratio', 'fat_ratio', 'exercise_sleep_ratio',
        'fiber_per_1000_cal', 'sugar_per_1000_cal', 'calorie_density', 'activity_level'
    )

    def __init__(self):
        """Create an untrained predictor with the default base feature set."""
        self.models = {}
        self.scaler = StandardScaler()
        self.base_feature_columns = [
            'calories', 'protein', 'carbs', 'fat', 'fiber',
            'sugar', 'exercise_hours', 'sleep_hours'
        ]
        print(f"Initialized base_feature_columns: {self.base_feature_columns}")  # Debugging
        self.feature_columns = self.base_feature_columns.copy()
        self.engineered_features = []
        self.all_feature_columns = self.base_feature_columns.copy()
        self.best_model = None

    @staticmethod
    def _add_engineered_features(frame):
        """Append the derived feature columns to ``frame`` in place and return it.

        Single source of truth for the formulas so that training
        (``create_engineered_features``) and inference (``prepare_features``)
        can never drift apart.
        """
        # Guard the denominators against zero.
        safe_calories = np.maximum(frame['calories'], 1)
        safe_sleep = np.maximum(frame['sleep_hours'], 1)

        # Share of calories from each macronutrient (4/4/9 kcal per gram), in percent.
        frame['protein_ratio'] = (frame['protein'] * 4) / safe_calories * 100
        frame['carb_ratio'] = (frame['carbs'] * 4) / safe_calories * 100
        frame['fat_ratio'] = (frame['fat'] * 9) / safe_calories * 100

        # Lifestyle balance features
        frame['exercise_sleep_ratio'] = frame['exercise_hours'] / safe_sleep
        frame['fiber_per_1000_cal'] = frame['fiber'] / (safe_calories / 1000)
        frame['sugar_per_1000_cal'] = frame['sugar'] / (safe_calories / 1000)

        # Health indicators
        frame['calorie_density'] = frame['calories'] / 2000  # Normalized to 2000 cal baseline
        frame['activity_level'] = frame['exercise_hours'] * frame['sleep_hours'] / 35  # 5h exercise * 7h sleep

        return frame

    def generate_training_data(self, n_samples=2000):
        """Generate synthetic nutrition data whose features correlate with age.

        Args:
            n_samples: Number of rows to generate.

        Returns:
            DataFrame with the eight base feature columns plus ``age``.
            The output is deterministic (fixed seed 42).
        """
        print(f"🔄 Generating {n_samples} training samples...")

        # A private legacy generator yields the same stream as
        # ``np.random.seed(42)`` followed by module-level draws, but leaves the
        # process-wide NumPy RNG state untouched for other callers.
        rng = np.random.RandomState(42)
        data = []

        # The per-sample loop is kept (rather than vectorised) because the
        # interleaved draw order defines the reproducible data set.
        for _ in range(n_samples):
            # Generate realistic age first (18-75)
            age = rng.normal(42, 15)
            age = np.clip(age, 18, 75)

            # Normalized age 0-1; drives the mean of every feature below
            age_factor = (age - 18) / 57

            # Calories decrease with age
            calories = rng.normal(2200 - age_factor * 400, 300)
            calories = np.clip(calories, 1200, 3500)

            # Protein mean rises with age
            protein = rng.normal(80 + (age_factor * 20), 25)
            protein = np.clip(protein, 30, 200)

            # Carbs - younger people eat more
            carbs = rng.normal(250 - age_factor * 50, 60)
            carbs = np.clip(carbs, 100, 400)

            # Fat intake rises slightly with age
            fat = rng.normal(70 + age_factor * 10, 20)
            fat = np.clip(fat, 20, 150)

            # Fiber - increases with age
            fiber = rng.normal(15 + age_factor * 15, 8)
            fiber = np.clip(fiber, 5, 50)

            # Sugar - decreases with age
            sugar = rng.normal(60 - age_factor * 20, 25)
            sugar = np.clip(sugar, 10, 120)

            # Exercise - decreases with age
            exercise = rng.normal(4 - age_factor * 2, 2)
            exercise = np.clip(exercise, 0, 10)

            # Sleep - mean shifts from 7.25h to 7.75h across the age range
            sleep = rng.normal(7.5 + (age_factor - 0.5) * 0.5, 1)
            sleep = np.clip(sleep, 4, 12)

            data.append({
                'calories': round(calories),
                'protein': round(protein),
                'carbs': round(carbs),
                'fat': round(fat),
                'fiber': round(fiber),
                'sugar': round(sugar),
                'exercise_hours': round(exercise, 1),
                'sleep_hours': round(sleep, 1),
                'age': round(age, 1)
            })

        return pd.DataFrame(data)

    def create_engineered_features(self, df):
        """Return a copy of ``df`` with the engineered feature columns added.

        Side effect: switches the predictor to the engineered feature set by
        updating ``engineered_features``, ``all_feature_columns`` and
        ``feature_columns``.

        Args:
            df: DataFrame containing all base feature columns.

        Returns:
            A new DataFrame; ``df`` itself is not modified.
        """
        df_enhanced = self._add_engineered_features(df.copy())

        # Update engineered feature names
        self.engineered_features = list(self._ENGINEERED_FEATURES)

        # Update all feature columns
        self.all_feature_columns = self.base_feature_columns + self.engineered_features
        self.feature_columns = self.all_feature_columns.copy()

        return df_enhanced

    def train_models(self, df, use_feature_engineering=True):
        """Train all candidate models and select the one with the lowest test MAE.

        Args:
            df: Training DataFrame with the base feature columns and ``age``.
            use_feature_engineering: Whether to add the engineered features
                before training.

        Returns:
            The best hold-out MAE (``float('inf')`` if no model produced a
            comparable MAE but at least one was fitted).

        Raises:
            ValueError: If no model could be trained at all.
        """
        print("🤖 Training multiple ML models...")

        # Start from a clean slate so a retrain never leaves models that were
        # fitted on a different feature set (or a stale best-model name) behind.
        self.models = {}
        self.best_model = None

        if use_feature_engineering:
            df_enhanced = self.create_engineered_features(df)
            feature_cols = self.all_feature_columns
            print(f"📈 Using {len(feature_cols)} features (including {len(self.engineered_features)} engineered)")
        else:
            df_enhanced = df.copy()
            feature_cols = self.base_feature_columns
            self.all_feature_columns = self.base_feature_columns.copy()
            self.feature_columns = self.base_feature_columns.copy()
            self.engineered_features = []
            print(f"📊 Using {len(feature_cols)} base features")

        X = df_enhanced[feature_cols]
        y = df_enhanced['age']

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Scale features (fit on the training split only)
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Define models
        models_to_train = {
            'RandomForest': RandomForestRegressor(
                n_estimators=200,
                max_depth=15,
                min_samples_split=5,
                random_state=42,
                n_jobs=-1
            ),
            'GradientBoosting': GradientBoostingRegressor(
                n_estimators=150,
                learning_rate=0.1,
                max_depth=8,
                random_state=42
            ),
            'Ridge': Ridge(alpha=1.0),
            'LinearRegression': LinearRegression()
        }

        best_score = float('inf')
        best_model_name = None

        print("\n📊 Model Performance:")
        print("-" * 60)

        for name, model in models_to_train.items():
            # Best-effort: one failing model must not abort the others.
            try:
                # Linear models get standardised inputs, tree models raw ones
                if name in self._SCALED_MODELS:
                    X_train_model = X_train_scaled
                    X_test_model = X_test_scaled
                else:
                    X_train_model = X_train
                    X_test_model = X_test

                # Train model
                model.fit(X_train_model, y_train)

                # Make predictions
                y_pred = model.predict(X_test_model)

                # Calculate metrics
                mae = mean_absolute_error(y_test, y_pred)
                rmse = np.sqrt(mean_squared_error(y_test, y_pred))
                r2 = r2_score(y_test, y_pred)

                # Cross-validation on the training split (reported only)
                cv_scores = cross_val_score(model, X_train_model, y_train,
                                          cv=5, scoring='neg_mean_absolute_error')
                cv_mae = -cv_scores.mean()

                # Store model
                self.models[name] = model

                print(f"{name:15s} | MAE: {mae:5.2f} | RMSE: {rmse:5.2f} | R²: {r2:5.3f} | CV MAE: {cv_mae:5.2f}")

                # Track best model
                if mae < best_score:
                    best_score = mae
                    best_model_name = name

            except Exception as e:
                print(f"❌ Error training {name}: {str(e)}")

        # Set best model
        if best_model_name:
            print(f"\n🏆 Best model: {best_model_name} (MAE: {best_score:.2f})")
            self.best_model = best_model_name
        else:
            print("❌ No models trained successfully!")
            # Fallback to first available model
            if self.models:
                self.best_model = list(self.models.keys())[0]
            else:
                raise ValueError("No models could be trained!")

        return best_score

    def prepare_features(self, df):
        """Build the model input frame for prediction.

        Args:
            df: DataFrame containing at least the base feature columns;
                extra columns (e.g. ``id``) are dropped.

        Returns:
            DataFrame with exactly ``all_feature_columns``, in that order.

        Raises:
            ValueError: If any base feature column is missing.
        """
        df_prepared = df.copy()

        # Ensure base features exist
        missing_cols = [col for col in self.base_feature_columns if col not in df_prepared.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        # Add engineered features if they were used during training
        if self.engineered_features:
            self._add_engineered_features(df_prepared)

        return df_prepared[self.all_feature_columns]

    def predict(self, X):
        """Predict ages with the best model.

        Args:
            X: A DataFrame with the base feature columns (engineered features
                are derived automatically), or any other array-like that is
                passed to the model as-is.

        Returns:
            Array of predicted ages clipped to the 18-75 range.

        Raises:
            ValueError: If no model has been trained, required columns are
                missing, or the underlying model fails (the original error is
                attached as ``__cause__``).
        """
        if self.best_model is None:
            raise ValueError("No model has been trained yet!")

        # Prepare features
        if isinstance(X, pd.DataFrame):
            X_prepared = self.prepare_features(X)
        else:
            X_prepared = X

        # Make predictions based on model type
        try:
            if self.best_model in self._SCALED_MODELS:
                X_scaled = self.scaler.transform(X_prepared)
                predictions = self.models[self.best_model].predict(X_scaled)
            else:
                predictions = self.models[self.best_model].predict(X_prepared)

            # Ensure predictions are within realistic range
            return np.clip(predictions, 18, 75)
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise ValueError(f"Prediction failed: {str(e)}") from e

    def generate_submission_data(self, n_samples=1000):
        """Generate random, age-independent test rows for the submission file.

        Args:
            n_samples: Number of rows to generate.

        Returns:
            DataFrame with an ``id`` column (1..n_samples) and the eight base
            feature columns. The output is deterministic (fixed seed 123).
        """
        print(f"📋 Generating {n_samples} test samples for submission...")

        # Private generator: same stream as ``np.random.seed(123)`` without
        # mutating the global NumPy RNG state.
        rng = np.random.RandomState(123)  # Different seed for test data
        data = []

        for i in range(1, n_samples + 1):
            # Generate diverse nutrition profiles
            calories = rng.randint(1200, 3500)
            protein = rng.randint(30, 200)
            carbs = rng.randint(100, 400)
            fat = rng.randint(20, 150)
            fiber = rng.randint(5, 50)
            sugar = rng.randint(10, 120)
            exercise = round(rng.uniform(0, 10), 1)
            sleep = round(rng.uniform(4, 12), 1)

            data.append({
                'id': i,
                'calories': calories,
                'protein': protein,
                'carbs': carbs,
                'fat': fat,
                'fiber': fiber,
                'sugar': sugar,
                'exercise_hours': exercise,
                'sleep_hours': sleep
            })

        return pd.DataFrame(data)

    def get_feature_importance(self):
        """Return feature importances of the best model, most important first.

        Returns:
            DataFrame with ``feature`` and ``importance`` columns, or ``None``
            (after printing a message) when no model is trained or the best
            model is not tree-based.
        """
        if self.best_model is None:
            print("No model has been trained yet!")
            return None

        if self.best_model in self._TREE_MODELS:
            importance = self.models[self.best_model].feature_importances_
            feature_importance_df = pd.DataFrame({
                'feature': self.all_feature_columns,
                'importance': importance
            }).sort_values('importance', ascending=False)

            return feature_importance_df
        else:
            print(f"Feature importance not available for {self.best_model}")
            return None

#=============================================================================

# MAIN EXECUTION - RUN THIS TO CREATE SUBMISSION FILE

#=============================================================================

def main():
    """Run the full pipeline and write the submission CSV.

    The feature-engineered pipeline is attempted first. If any step of it
    raises, a second attempt is made with the base features only and a
    different output file name.

    Returns:
        The submission DataFrame (``id``, ``predicted_age``), or ``None``
        when both attempts fail.
    """
    print("🚀 AI Planet Nutrition Age Prediction Model (FULLY FIXED)")
    print("=" * 65)

    def build_submission(samples, ages):
        # Pair each sample id with its prediction rounded to one decimal.
        return pd.DataFrame({
            'id': samples['id'],
            'predicted_age': np.round(ages, 1)
        })

    # --- Primary attempt: engineered features -------------------------------
    try:
        model = NutritionAgePredictor()
        print("✅ Model initialized successfully")
        print(f"Debug: base_feature_columns exists: {hasattr(model, 'base_feature_columns')}")  # New debugging

        training_frame = model.generate_training_data(2000)
        print(f"📊 Training data shape: {training_frame.shape}")

        print("\n🎯 Training with feature engineering...")
        validation_mae = model.train_models(training_frame, use_feature_engineering=True)

        samples = model.generate_submission_data(1000)
        print("\n🔮 Making predictions on test data...")
        ages = model.predict(samples)

        result = build_submission(samples, ages)
        out_path = 'nutrition_age_predictions_submission.csv'
        result.to_csv(out_path, index=False)

        print(f"\n✅ SUCCESS! Submission file created: {out_path}")
        print(f"📁 File contains {len(result)} predictions")
        print(f"🎯 Model MAE on validation: {validation_mae:.2f} years")
        print(f"📈 Age prediction range: {ages.min():.1f} - {ages.max():.1f} years")
        print("\n📋 Sample predictions:")
        print(result.head(10))

        ranking = model.get_feature_importance()
        if ranking is not None:
            print("\n🔍 Top 10 Feature Importance:")
            print(ranking.head(10))
        return result
    except Exception as primary_error:
        print(f"❌ Primary training failed: {str(primary_error)}")
        print("🔧 Trying fallback with basic features only...")

    # --- Fallback attempt: base features only -------------------------------
    try:
        model = NutritionAgePredictor()
        training_frame = model.generate_training_data(2000)
        validation_mae = model.train_models(training_frame, use_feature_engineering=False)

        samples = model.generate_submission_data(1000)
        ages = model.predict(samples)

        result = build_submission(samples, ages)
        out_path = 'nutrition_age_predictions_basic.csv'
        result.to_csv(out_path, index=False)
        print(f"✅ Fallback successful! Basic model submission file created: {out_path}")
        return result
    except Exception as fallback_error:
        print(f"❌ Fallback also failed: {str(fallback_error)}")
        print("🚨 Please check your environment and dependencies")
        return None

#=============================================================================

# UTILITY FUNCTIONS

#=============================================================================

def load_your_data_and_predict(csv_file_path):
    """Predict ages for the rows of a user-supplied nutrition CSV.

    The CSV must provide the columns calories, protein, carbs, fat, fiber,
    sugar, exercise_hours and sleep_hours. An optional ``id`` column is
    carried over; otherwise ids 1..N are generated. The result is also
    written to 'your_data_predictions.csv'.

    Args:
        csv_file_path: Path of the CSV file to load.

    Returns:
        DataFrame with ``id`` and ``predicted_age``, or ``None`` if any step
        fails (the error is printed).
    """
    try:
        user_rows = pd.read_csv(csv_file_path)
        print(f"📁 Loaded data shape: {user_rows.shape}")

        # Train a fresh model on synthetic data, then score the user's rows.
        model = NutritionAgePredictor()
        synthetic = model.generate_training_data(2000)
        model.train_models(synthetic, use_feature_engineering=True)
        ages = model.predict(user_rows)

        # Reuse the caller's ids when present, else number the rows from 1.
        if 'id' in user_rows.columns:
            row_ids = user_rows['id']
        else:
            row_ids = range(1, len(user_rows) + 1)

        result = pd.DataFrame({
            'id': row_ids,
            'predicted_age': np.round(ages, 1)
        })
        result.to_csv('your_data_predictions.csv', index=False)
        print("✅ Predictions saved to 'your_data_predictions.csv'")
        return result
    except Exception as error:
        print(f"❌ Error processing your data: {str(error)}")
        return None

def test_model():
    """Smoke-test the pipeline on three hand-written sample rows.

    Trains on 500 synthetic samples with feature engineering, predicts the
    three rows and prints the result.

    Returns:
        DataFrame with ``id`` and ``predicted_age``, or ``None`` if training
        or prediction raises (the error is printed).
    """
    print("🧪 Testing model with sample data...")

    # Column-oriented sample: three plausible daily profiles.
    sample_columns = {
        'id': [1, 2, 3],
        'calories': [2000, 1800, 2500],
        'protein': [80, 70, 100],
        'carbs': [250, 200, 300],
        'fat': [70, 60, 80],
        'fiber': [25, 30, 20],
        'sugar': [50, 40, 60],
        'exercise_hours': [3.0, 5.0, 2.0],
        'sleep_hours': [7.5, 8.0, 7.0]
    }
    samples = pd.DataFrame(sample_columns)

    try:
        model = NutritionAgePredictor()
        model.train_models(model.generate_training_data(500), use_feature_engineering=True)
        ages = model.predict(samples)
        outcome = pd.DataFrame({
            'id': samples['id'],
            'predicted_age': np.round(ages, 1)
        })
    except Exception as error:
        print(f"❌ Test failed: {str(error)}")
        return None

    print("✅ Test successful!")
    print(outcome)
    return outcome

#=============================================================================

# RUN THE MODEL

#=============================================================================

# Script entry point: run the full pipeline only when executed directly
# (not when this module is imported).
if __name__ == "__main__":
    print("🌟 Starting Nutrition Age Prediction Model...")
    # main() returns the submission DataFrame, or None when both the primary
    # and the fallback attempt failed. The result is kept in a module-level
    # name so it stays available after the run.
    submission_df = main()
    if submission_df is not None:
        print("\n🎉 SUCCESS! Ready for AI Planet submission!")
        # NOTE(review): the message below looks truncated ("Uplo") — confirm
        # the intended text.
        print("📤 Uplo")

🌟 Starting Nutrition Age Prediction Model...
🚀 AI Planet Nutrition Age Prediction Model (FULLY FIXED)
Initialized base_feature_columns: ['calories', 'protein', 'carbs', 'fat', 'fiber', 'sugar', 'exercise_hours', 'sleep_hours']
✅ Model initialized successfully
Debug: base_feature_columns exists: True
🔄 Generating 2000 training samples...
📊 Training data shape: (2000, 9)

🎯 Training with feature engineering...
🤖 Training multiple ML models...
📈 Using 16 features (including 8 engineered)

📊 Model Performance:
------------------------------------------------------------
RandomForest    | MAE:  9.75 | RMSE: 11.97 | R²: 0.283 | CV MAE:  9.61
GradientBoosting | MAE: 10.23 | RMSE: 12.54 | R²: 0.214 | CV MAE: 10.04
Ridge           | MAE:  9.48 | RMSE: 11.74 | R²: 0.311 | CV MAE:  9.25
LinearRegression | MAE:  9.49 | RMSE: 11.75 | R²: 0.310 | CV MAE:  9.26

🏆 Best model: Ridge (MAE: 9.48)
📋 Generating 1000 test samples for submission...

🔮 Making predictions on test data...

✅ SUCCESS! Submissio