In [5]:
import pandas as pd
from sklearn.datasets import fetch_openml
import xgboost as xgb
import joblib
import os
import warnings

# Suppress warnings for a cleaner output
warnings.simplefilter(action='ignore', category=FutureWarning)

def train_and_save_model(country_name: str, config: dict) -> None:
    """
    Load one country's housing data, train an XGBoost regressor, and save it.

    Args:
        country_name: Label used in log messages and as the output
            sub-directory name. The value 'USA' is special-cased: its data is
            fetched from OpenML instead of being read from
            ``config['filepath']``.
        config: Mapping with the keys 'filepath' (CSV path; unused for
            'USA'), 'features' (list of feature column names) and 'target'
            (name of the column to predict).

    Returns:
        None. Progress and errors are reported with ``print``. On a load
        failure, a missing column, or when no row has a usable target value,
        the function returns early without training or writing anything.

    Side effects:
        Writes 'xgb_house_price_model.pkl' and 'model_columns.pkl' under
        '../backend/models/<country_name>', creating the directory if needed.
    """
    print(f"--- Starting process for {country_name} ---")

    # --- 1. Load Data ---
    try:
        # Special handling for the Ames/USA dataset
        if country_name == 'USA':
            dataset = fetch_openml(name="house_prices", as_frame=True)
            df = dataset.frame
        else:
            # Use 'latin1' encoding to prevent UnicodeDecodeError
            df = pd.read_csv(config['filepath'], encoding='latin1')
        print(f"Loaded {country_name} dataset successfully.")
    except Exception as e:
        # Best-effort boundary: report the failure and skip this country so
        # the remaining countries can still be processed by the caller.
        print(f"ERROR: Could not load the dataset for {country_name}. Error: {e}")
        return

    # --- 2. Preprocessing ---
    features = config['features']
    target = config['target']

    # Validate the schema up front, before touching any data.
    for col in features + [target]:
        if col not in df.columns:
            print(f"ERROR: Column '{col}' not found in the {country_name} CSV. Please check the file.")
            return

    # Feature columns that contain gaps are coerced to numeric (unparseable
    # values become NaN) and then imputed with the column median. Assign the
    # result back instead of using a chained ``inplace=True`` call, which does
    # not modify ``df`` when pandas Copy-on-Write is active.
    for col in features:
        if df[col].isnull().any():
            df[col] = pd.to_numeric(df[col], errors='coerce')
            df[col] = df[col].fillna(df[col].median())

    # The target is coerced the same way but never imputed: rows without a
    # real label are dropped rather than trained on a made-up median price.
    if df[target].isnull().any():
        df[target] = pd.to_numeric(df[target], errors='coerce')
    df = df.dropna(subset=[target])

    if df.empty:
        print(f"ERROR: No rows with a valid '{target}' value found for {country_name}. Skipping training.")
        return
    print("Preprocessing complete.")

    # --- 3. Define Features (X) and Target (y) ---
    X = df[features]
    y = df[target]

    # --- 4. Train Model ---
    print("Training XGBoost model...")
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
    xgb_model.fit(X, y)
    print("Model training complete.")

    # --- 5. Save Model and Columns ---
    models_dir = f"../backend/models/{country_name}"
    os.makedirs(models_dir, exist_ok=True)

    joblib.dump(xgb_model, os.path.join(models_dir, 'xgb_house_price_model.pkl'))
    # The column order is stored alongside the model so that a consumer can
    # rebuild inputs in the order used for training.
    joblib.dump(list(X.columns), os.path.join(models_dir, 'model_columns.pkl'))

    print(f"✅ Model for {country_name} saved successfully to '{models_dir}'")
    print("-" * 30)


# --- Per-country training configuration ---
# Each entry supplies the CSV path, the feature columns and the target column
# consumed by train_and_save_model. 'USA' has no file path because that
# dataset is fetched from OpenML rather than read from disk.
country_configs = {
    'USA': dict(
        filepath=None,
        features=[
            'GrLivArea',
            'OverallQual',
            'YearBuilt',
            'TotalBsmtSF',
            'GarageCars',
            'FullBath',
        ],
        target='SalePrice',
    ),
    'Canada': dict(
        filepath='../data/Canada/Canada_House_Prices.csv',
        features=[
            'Number_Beds',
            'Number_Baths',
            'Population',
            'Median_Family_Income',
        ],
        target='Price',
    ),
    'India': dict(
        filepath='../data/India/India House Price.csv',
        features=['bhk', 'sqft'],
        target='totalprice',
    ),
}

# --- Train and save one model per configured country, in declaration order ---
for country in country_configs:
    config = country_configs[country]
    train_and_save_model(country, config)



--- Starting process for USA ---
Loaded USA dataset successfully.
Preprocessing complete.
Training XGBoost model...
Model training complete.
✅ Model for USA saved successfully to '../backend/models/USA'
------------------------------
--- Starting process for Canada ---
Loaded Canada dataset successfully.
Preprocessing complete.
Training XGBoost model...
Model training complete.
✅ Model for Canada saved successfully to '../backend/models/Canada'
------------------------------
--- Starting process for India ---
Loaded India dataset successfully.
Preprocessing complete.
Training XGBoost model...
Model training complete.
✅ Model for India saved successfully to '../backend/models/India'
------------------------------
