# Evolver Loop 1 Analysis

Analyze data patterns and identify feature engineering opportunities based on evaluator feedback.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Read the train/test splits from their fixed locations on disk
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Report (rows, columns) for each split
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Relative frequency of every target class, rounded to 4 decimals
print("\nTarget distribution:")
target_share = train['NObeyesdad'].value_counts(normalize=True)
print(target_share.round(4))

In [None]:
# Basic feature engineering to explore patterns
def engineer_features(df):
    """Return a copy of ``df`` with exploratory engineered columns added.

    The input frame is not modified. The columns ``Age``, ``Height``,
    ``Weight``, ``FCVC``, ``NCP``, ``CH2O``, ``FAF`` and ``TUE`` are read.

    Added columns:
        BMI, Weight_Height_Ratio, Age_Group (categorical), Age_Height,
        Age_Weight, Height_Weight, FCVC_NCP, CH2O_FAF, FAF_TUE.
    """
    df = df.copy()

    # BMI - critical for obesity prediction
    # NOTE(review): assumes Weight is in kg and Height in metres - confirm.
    df['BMI'] = df['Weight'] / (df['Height'] ** 2)

    # Weight/Height ratio
    df['Weight_Height_Ratio'] = df['Weight'] / df['Height']

    # Age groups. The last bin is open-ended so that it matches its '60+'
    # label (a finite upper edge of 100 turned older ages into NaN), and
    # include_lowest keeps Age == 0 inside the '0-18' bin.
    df['Age_Group'] = pd.cut(
        df['Age'],
        bins=[0, 18, 30, 45, 60, float('inf')],
        labels=['0-18', '19-30', '31-45', '46-60', '60+'],
        include_lowest=True,
    )

    # Simple pairwise interactions between the basic body measurements
    df['Age_Height'] = df['Age'] * df['Height']
    df['Age_Weight'] = df['Age'] * df['Weight']
    df['Height_Weight'] = df['Height'] * df['Weight']

    # Lifestyle combinations
    # NOTE(review): column meanings below follow the public data dictionary
    # for this dataset (FCVC = vegetable consumption frequency) - confirm.
    df['FCVC_NCP'] = df['FCVC'] * df['NCP']  # Vegetable consumption frequency * number of main meals
    df['CH2O_FAF'] = df['CH2O'] * df['FAF']  # Water consumption * physical activity
    df['FAF_TUE'] = df['FAF'] * df['TUE']    # Physical activity * time using tech

    return df

# Feature-augmented copy of the training frame used by the rest of the analysis
train_fe = engineer_features(train)

# Summarise BMI (mean / std / min / max / count) within each target class
print("BMI statistics by target class:")
bmi_by_class = train_fe.groupby('NObeyesdad')['BMI']
bmi_stats = bmi_by_class.agg(['mean', 'std', 'min', 'max', 'count']).round(2)
print(bmi_stats)

# BMI categories (WHO standards)
def categorize_bmi(bmi):
    """Map a single BMI value to a weight-category label.

    Bands are half-open on the right: ``< 18.5`` Underweight, ``< 25`` Normal,
    ``< 30`` Overweight, ``< 35`` Obese_I, ``< 40`` Obese_II, otherwise
    Obese_III.

    Args:
        bmi: Scalar BMI value; may be missing (NaN / None).

    Returns:
        The category label as a string, or NaN when ``bmi`` is missing.
    """
    # NaN fails every `<` comparison, so without this guard a missing BMI
    # would fall through all the bands and be labelled 'Obese_III'.
    if pd.isna(bmi):
        return np.nan

    bands = (
        (18.5, 'Underweight'),
        (25, 'Normal'),
        (30, 'Overweight'),
        (35, 'Obese_I'),
        (40, 'Obese_II'),
    )
    for upper_bound, label in bands:
        if bmi < upper_bound:
            return label
    return 'Obese_III'

# Label every training row with its BMI band
bmi_values = train_fe['BMI']
train_fe['BMI_Category'] = bmi_values.apply(categorize_bmi)

# How many rows fall into each band
print("\nBMI Category distribution:")
category_counts = train_fe['BMI_Category'].value_counts()
print(category_counts)

# Row-normalised table: share of each target class within every BMI band
print("\nTarget vs BMI Category cross-tab:")
bmi_target = pd.crosstab(
    index=train_fe['BMI_Category'],
    columns=train_fe['NObeyesdad'],
    normalize='index',
).round(3)
print(bmi_target)

In [None]:
# Analyze feature correlations with target
from sklearn.preprocessing import LabelEncoder

# LabelEncoder numbers the classes in sorted (alphabetical) label order.
# These codes are kept unchanged so `le_target` / `target_encoded` stay
# available to any other cell that relies on them.
le_target = LabelEncoder()
target_encoded = le_target.fit_transform(train_fe['NObeyesdad'])

# Pearson correlation only makes sense against codes that follow the severity
# of the classes; alphabetical codes put "Obesity_*" before "Overweight_*".
# NOTE(review): the class names below are assumed from the public dataset
# description - confirm against the value_counts() output of the first cell.
severity_order = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III',
]
if set(train_fe['NObeyesdad'].unique()) <= set(severity_order):
    severity_rank = {label: rank for rank, label in enumerate(severity_order)}
    target_for_corr = train_fe['NObeyesdad'].map(severity_rank).to_numpy()
else:
    # Unexpected labels: fall back to the alphabetical codes rather than fail.
    print("WARNING: unexpected target labels; correlating against alphabetical codes")
    target_for_corr = target_encoded

# Select numerical features for correlation
numerical_features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
                      'BMI', 'Weight_Height_Ratio', 'Age_Height', 'Age_Weight', 'Height_Weight',
                      'FCVC_NCP', 'CH2O_FAF', 'FAF_TUE']

# Absolute Pearson correlation of each feature with the encoded target
correlations = {}
for feature in numerical_features:
    corr = np.corrcoef(train_fe[feature], target_for_corr)[0, 1]
    correlations[feature] = abs(corr)

# Sort by absolute correlation
corr_series = pd.Series(correlations).sort_values(ascending=False)
print("Feature correlations with target (absolute):")
print(corr_series.round(4))

In [None]:
# Analyze categorical features
from scipy.stats import chi2_contingency

categorical_features = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC',
                        'SMOKE', 'SCC', 'CALC', 'MTRANS', 'Age_Group']

# Chi-square test of independence between each categorical feature and the
# target; results are kept in `chi2_results` keyed by feature name.
chi2_results = {}
for feature in categorical_features:
    contingency = pd.crosstab(train_fe[feature], train_fe['NObeyesdad'])
    chi2, p_value, dof, expected = chi2_contingency(contingency)
    chi2_results[feature] = {'chi2': chi2, 'p_value': p_value}
    # General format instead of fixed 4 decimals: very small p-values would
    # otherwise all print as 0.0000 and be indistinguishable.
    print(f"{feature}: chi2={chi2:.2f}, p_value={p_value:.3g}")

# Show most predictive categorical patterns
print("\n=== family_history_with_overweight vs Target ===")
print(pd.crosstab(train_fe['family_history_with_overweight'], train_fe['NObeyesdad'], normalize='index').round(3))

# NOTE(review): per the public data dictionary CAEC is eating between meals
# (high-caloric food is FAVC) - header corrected accordingly; confirm.
print("\n=== CAEC (Consumption of food between meals) vs Target ===")
print(pd.crosstab(train_fe['CAEC'], train_fe['NObeyesdad'], normalize='index').round(3))

print("\n=== MTRANS (Transportation) vs Target ===")
print(pd.crosstab(train_fe['MTRANS'], train_fe['NObeyesdad'], normalize='index').round(3))

In [None]:
# Two-level interaction tables: for each pair of row keys, print the
# row-normalised share of every target class. BMI-based pairs come first
# because BMI is treated as the key feature, then the age-group pair.
interaction_views = [
    ("=== BMI Category vs Family History ===",
     ['BMI_Category', 'family_history_with_overweight']),
    ("\n=== BMI Category vs CAEC ===",
     ['BMI_Category', 'CAEC']),
    ("\n=== Age Group vs Family History ===",
     ['Age_Group', 'family_history_with_overweight']),
]

for heading, row_columns in interaction_views:
    print(heading)
    row_keys = [train_fe[column] for column in row_columns]
    shares = pd.crosstab(row_keys, train_fe['NObeyesdad'], normalize='index')
    print(shares.round(3))

In [None]:
# Identify high-value feature engineering opportunities
print("=== KEY FINDINGS FOR FEATURE ENGINEERING ===\n")

# `corr_series` is sorted by descending |correlation|, so a feature's position
# in its index is its rank. Reporting the computed rank keeps the summary in
# line with the data instead of hard-coding "most important".
n_numeric = len(corr_series)
bmi_rank = corr_series.index.get_loc('BMI') + 1
ratio_rank = corr_series.index.get_loc('Weight_Height_Ratio') + 1

print(f"1. BMI ranks #{bmi_rank} of {n_numeric} numerical features by |correlation| ({corr_series['BMI']:.4f})")
print("   - WHO categories (Underweight, Normal, Overweight, Obese_I, Obese_II, Obese_III) are highly predictive")
print("   - BMI alone can distinguish many obesity classes\n")

print(f"2. Weight_Height_Ratio ranks #{ratio_rank} of {n_numeric} by |correlation| ({corr_series['Weight_Height_Ratio']:.4f})")
print("   - Alternative to BMI, captures similar signal\n")

# p-values use general format: fixed 4 decimals would print tiny values as 0.0000.
print(f"3. Family history is highly predictive (p_value: {chi2_results['family_history_with_overweight']['p_value']:.3g})")
print("   - Strong interaction with BMI categories\n")

# NOTE(review): per the public data dictionary CAEC is eating between meals
# and FCVC is vegetable consumption frequency - labels corrected; confirm.
print(f"4. CAEC (food consumption between meals) is significant (p_value: {chi2_results['CAEC']['p_value']:.3g})")
print("   - Interacts with BMI and age\n")

print(f"5. Transportation mode (MTRANS) is significant (p_value: {chi2_results['MTRANS']['p_value']:.3g})")
print("   - Public/Automobile vs Walking/Bike shows different patterns\n")

print("6. Lifestyle interactions show promise:")
print(f"   - FCVC_NCP (vegetable consumption * meals) correlation: {corr_series['FCVC_NCP']:.4f}")
print(f"   - CH2O_FAF (water * activity) correlation: {corr_series['CH2O_FAF']:.4f}")
print(f"   - FAF_TUE (activity * tech use) correlation: {corr_series['FAF_TUE']:.4f}\n")

print("7. Age groups show different obesity patterns, especially with family history")

# Recommend specific features to engineer
print("\n=== RECOMMENDED FEATURES FOR NEXT EXPERIMENT ===")
recommended_features = [
    "BMI_Category (WHO standards) - categorical",
    "Weight_Height_Ratio - numerical",
    "BMI_FamilyHistory interaction",
    "BMI_CAEC interaction",
    "Age_BMI interaction",
    "FCVC_NCP (vegetable consumption frequency * meals)",
    "CH2O_FAF (water consumption * physical activity)",
    "FAF_TUE (physical activity * tech use)",
    "FamilyHistory_AgeGroup interaction",
    "More granular age bins (e.g., 10-year intervals)",
]
for position, description in enumerate(recommended_features, start=1):
    print(f"{position}. {description}")