# Script to train a single model(mainly for testing purpose)

In [None]:
# Single-Model Training Script: Train on Kaggle, Upload to Hugging Face Hub
# This script is designed to be run in a Kaggle Notebook. It trains a model
# for a single, specified city and uploads the artifacts to a Hugging Face repo.
#This is mainly for testing purpose

import os
import json
import shutil
import pandas as pd
import xgboost as xgb
from huggingface_hub import HfApi, HfFolder
from kaggle_secrets import UserSecretsClient
import logging
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# --- Configuration ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

try:
    user_secrets = UserSecretsClient()
    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
except Exception as e:
    raise ValueError("Could not retrieve HF_TOKEN from Kaggle secrets.") from e

# --- Define the specific city and repositories ---
CITY_FILE_PATH = "w_d_1/Abbigeri.csv" 

CITY_NAME = os.path.basename(CITY_FILE_PATH).replace('.csv', '').split('_')[0].lower()

KAGGLE_DATASET_ID = "mukeshdevrath007/indian-5000-cities-weather-data"
HF_MODEL_REPO_ID = "Shreyansh1718/Weather-prediction-model"

KAGGLE_INPUT_PATH = f"/kaggle/input/{KAGGLE_DATASET_ID.split('/')[1]}"
TEMP_ARTIFACTS_DIR = f"/kaggle/working/{CITY_NAME}_artifacts"

# --- Setup Hugging Face API Login ---
logger.info("Configuring Hugging Face credentials...")
api = HfApi()
HfFolder.save_token(HF_TOKEN)
api.create_repo(repo_id=HF_MODEL_REPO_ID, repo_type="model", exist_ok=True)
logger.info(f"Credentials configured. Model repository '{HF_MODEL_REPO_ID}' is ready.")

# --- Helper Functions ---
def create_features(df):
    """Create time-based and ML features for model training."""
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df = df.set_index('date')
    df = df.rename(columns={'tavg': 'temperature_2m', 'prcp': 'precipitation'})
    df = df.ffill()
    df['month'] = df.index.month
    df['day_of_year'] = df.index.dayofyear
    df['day_of_week'] = df.index.dayofweek
    df['year'] = df.index.year
    df['temp_lag_1'] = df['temperature_2m'].shift(1)
    df['temp_lag_2'] = df['temperature_2m'].shift(2)
    if 'precipitation' in df.columns:
        df['precip_lag_1'] = df['precipitation'].shift(1)
    df['temp_rolling_mean_7'] = df['temperature_2m'].shift(1).rolling(window=7).mean()
    return df.dropna()

# --- Main Execution ---
def main():
    
    full_data_path = os.path.join(KAGGLE_INPUT_PATH, CITY_FILE_PATH)
    
    if not os.path.exists(full_data_path):
        raise FileNotFoundError(f"The specified city file was not found at: {full_data_path}")

    try:
        logger.info(f"Reading data from '{full_data_path}'...")
        df = pd.read_csv(full_data_path)
        
        logger.info("Data loaded successfully. Starting feature engineering...")
        df_processed = create_features(df)
        
        if len(df_processed) < 100:
            raise ValueError("Insufficient data after processing.")

        y = df_processed['temperature_2m']
        X = df_processed.drop('temperature_2m', axis=1)
        X_numeric = X.select_dtypes(include=['number'])

        X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42, shuffle=False)
        
        logger.info(f"Training model for {CITY_NAME} on {len(X_train)} samples...")
        model = xgb.XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6, random_state=42, verbosity=0)
        model.fit(X_train, y_train)
        
        mae = mean_absolute_error(y_test, model.predict(X_test))
        logger.info(f"Model trained. Validation MAE: {mae:.2f}°C")
        
        os.makedirs(TEMP_ARTIFACTS_DIR, exist_ok=True)
        model.save_model(os.path.join(TEMP_ARTIFACTS_DIR, "model.json"))
        with open(os.path.join(TEMP_ARTIFACTS_DIR, "metadata.json"), 'w') as f:
            json.dump({'validation_mae': mae, 'city_name': CITY_NAME, 'features': list(X_numeric.columns)}, f)
        
        logger.info(f"Uploading artifacts to '{HF_MODEL_REPO_ID}' under path '{CITY_NAME}'...")
        api.upload_folder(
            folder_path=TEMP_ARTIFACTS_DIR,
            path_in_repo=CITY_NAME,
            repo_id=HF_MODEL_REPO_ID,
            repo_type="model"
        )
        
        logger.info(f"\n✅ Successfully trained and uploaded model for {CITY_NAME}.")

    except Exception as e:
        logger.error(f"❌ Pipeline failed for {CITY_NAME}. Error: {e}", exc_info=True)
    finally:
        if os.path.exists(TEMP_ARTIFACTS_DIR):
            shutil.rmtree(TEMP_ARTIFACTS_DIR)
        logger.info("Cleanup complete.")

if __name__ == "__main__":
    main()

# Script to upload trained models(large size)

In [None]:
from huggingface_hub import HfApi
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

api = HfApi(token=HF_TOKEN)
api.upload_large_folder(
    folder_path="/kaggle/working/upload_staging",
    repo_id="Shreyansh1718/Weather-prediction-model",
    repo_type="model",
)


# Script to check number of models in model hub

In [7]:
import os
from huggingface_hub import HfApi
import logging
from kaggle_secrets import UserSecretsClient

# --- Configuration ---
# Setup logging to show the script's progress
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

# The Hugging Face Model Hub repository you want to check
HF_MODEL_REPO_ID = "Shreyansh1718/Weather-prediction-model"

# Get your Hugging Face token from an environment variable. A 'read' token is sufficient.
try:
    user_secrets = UserSecretsClient()
    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
except Exception as e:
    raise ValueError("Could not retrieve HF_TOKEN from Kaggle secrets. Please ensure it is set.") from e

# --- Main Verification Logic ---
def main():
    """
    Connects to a Hugging Face Model Hub repository and lists all the
    subdirectories (trained models) it contains.
    """
    logger.info(f"Connecting to repository: {HF_MODEL_REPO_ID}...")

    try:
        # Instantiate the HfApi client with your token
        api = HfApi(token=HF_TOKEN)

        # Get a list of all files and folders in the repository
        repo_contents = api.list_repo_tree(repo_id=HF_MODEL_REPO_ID, repo_type="model")
        
        # Identify directories by checking for items that do NOT have a 'size' attribute
        model_folders = sorted([item.path for item in repo_contents if not hasattr(item, 'size')])

        logger.info("\n" + "="*50)
        logger.info(f"✅ Verification Complete")
        logger.info(f"Found {len(model_folders)} model folders in the repository.")
        logger.info("="*50)

        if model_folders:
            logger.info("Sample of models found (first 20):")
            for i, folder_name in enumerate(model_folders[:20]):
                logger.info(f"  {i+1}. {folder_name}")
        else:
            logger.warning("No model folders were found in the repository.")

    except Exception as e:
        logger.error(f"❌ An error occurred: {e}", exc_info=True)

if __name__ == "__main__":
    main()

# Script to generate training report

In [None]:
"""Verify trained models against dataset CSVs.

This script is designed to run after the training pipeline on Kaggle. It:
- Scans the Kaggle input dataset for city CSV files
- Lists top-level uploaded model folders in the specified HF model repository
- Produces a `training_report.json` in the upload staging directory with lists:
  - `dataset_cities`: all city names found in data files
  - `uploaded_models`: model names found in HF repo
  - `missing_models`: cities present in dataset but not in repo (need training)
  - `extra_models`: models in repo that don't match dataset cities
"""

import os
import json
import logging
from typing import List, Set
from kaggle_secrets import UserSecretsClient
from huggingface_hub import HfApi

logger = logging.getLogger("verify_trained_models")
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
logger.addHandler(handler)


user_secrets = UserSecretsClient()
KAGGLE_DATASET_ID = os.getenv("KAGGLE_DATASET_ID", "mukeshdevrath007/indian-5000-cities-weather-data")
KAGGLE_INPUT_PATH = os.getenv("KAGGLE_INPUT_PATH", f"/kaggle/input/{KAGGLE_DATASET_ID.split('/')[1]}")
UPLOAD_STAGING_DIR = os.getenv("UPLOAD_STAGING_DIR", "/kaggle/working/upload_staging")
HF_MODEL_REPO_ID = os.getenv("HF_MODEL_REPO_ID", "Shreyansh1718/Weather-prediction-model")
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")


def discover_dataset_city_names(input_root: str, subfolders: List[str]) -> Set[str]:
    cities = set()
    for sub in subfolders:
        full = os.path.join(input_root, sub)
        if not os.path.isdir(full):
            continue
        for fname in os.listdir(full):
            if fname.lower().endswith('.csv'):
                name = fname.replace('.csv', '')
                city = name.split('_')[0].split('-')[0].lower()
                cities.add(city)
    return cities


def list_repo_top_level(api: HfApi, repo_id: str) -> Set[str]:
    try:
        tree = api.list_repo_tree(repo_id=repo_id, repo_type='model')
        top = set(item.path.split('/')[0].lower() for item in tree)
        return top
    except Exception as e:
        logger.error(f"Could not list repo tree: {e}")
        return set()


if __name__ == '__main__':
    if not HF_TOKEN:
        logger.warning('HF_TOKEN not provided; API calls may be rate-limited or unauthenticated')

    api = HfApi(token=HF_TOKEN) if HF_TOKEN else HfApi()

    candidate_subfolders = ['.', 'w_d_1', 'w_d_2', 'w_d_3/w_d_3', 'dn']
    dataset_cities = discover_dataset_city_names(KAGGLE_INPUT_PATH, candidate_subfolders)
    logger.info(f"Found {len(dataset_cities)} cities in dataset")

    uploaded_models = list_repo_top_level(api, HF_MODEL_REPO_ID)
    logger.info(f"Found {len(uploaded_models)} top-level items in HF repo")

    missing = sorted(list(dataset_cities - uploaded_models))
    extra = sorted(list(uploaded_models - dataset_cities))

    os.makedirs(UPLOAD_STAGING_DIR, exist_ok=True)
    report_path = os.path.join(UPLOAD_STAGING_DIR, 'training_report.json')
    report = {
        'dataset_cities': sorted(list(dataset_cities)),
        'uploaded_models': sorted(list(uploaded_models)),
        'missing_models': missing,
        'extra_models': extra,
    }

    with open(report_path, 'w') as f:
        json.dump(report, f, indent=2)

    logger.info(f"Wrote training report to {report_path}")


Found 4408 cities in dataset
Found 4408 cities in dataset
Found 4596 top-level items in HF repo
Found 4596 top-level items in HF repo
Wrote training report to /kaggle/working/upload_staging/training_report.json
Wrote training report to /kaggle/working/upload_staging/training_report.json


# Script to check uploaded models and train remaining models and upload them

In [None]:
"""Automated Multi-City Training Pipeline

This script discovers city CSVs in the Kaggle dataset input path, trains a separate XGBoost model per city,
and uploads trained artifacts to a Hugging Face model repo. It fixes the bug where previously-detected
models in the repo were not correctly compared against city names, which caused retraining or skipping errors.

Designed to run in a Kaggle environment but also runnable locally if HF_TOKEN is provided as an env var.
"""

import os
import json
import shutil
import logging
from typing import List, Set

import pandas as pd
import xgboost as xgb
from huggingface_hub import HfApi
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
from kaggle_secrets import UserSecretsClient

# --- TQDM-friendly logging handler ---
class TqdmLoggingHandler(logging.Handler):
    def emit(self, record):
        try:
            msg = self.format(record)
            tqdm.write(msg)
            self.flush()
        except Exception:
            self.handleError(record)

logger = logging.getLogger("train_and_upload_pipeline")
logger.setLevel(logging.INFO)
if logger.hasHandlers():
    logger.handlers.clear()
logger.addHandler(TqdmLoggingHandler())

# --- Config ---
TARGET_COLUMN = "temperature_2m"
MAX_CITIES_TO_PROCESS = 5000


user_secrets = UserSecretsClient()
KAGGLE_DATASET_ID = os.getenv("KAGGLE_DATASET_ID", "mukeshdevrath007/indian-5000-cities-weather-data")
KAGGLE_INPUT_PATH = os.getenv("KAGGLE_INPUT_PATH", f"/kaggle/input/{KAGGLE_DATASET_ID.split('/')[1]}")
UPLOAD_STAGING_DIR = os.getenv("UPLOAD_STAGING_DIR", "/kaggle/working/upload_staging")

HF_MODEL_REPO_ID = os.getenv("HF_MODEL_REPO_ID", "Shreyansh1718/Weather-prediction-model")
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

# --- Helper functions ---

def create_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df = df.set_index("date")
    df = df.rename(columns={"tavg": "temperature_2m", "prcp": "precipitation"})
    df = df.ffill()
    df["month"] = df.index.month
    df["day_of_year"] = df.index.dayofyear
    df["day_of_week"] = df.index.dayofweek
    df["year"] = df.index.year
    df["temp_lag_1"] = df["temperature_2m"].shift(1)
    df["temp_lag_2"] = df["temperature_2m"].shift(2)
    if "precipitation" in df.columns:
        df["precip_lag_1"] = df["precipitation"].shift(1)
    df["temp_rolling_mean_7"] = df["temperature_2m"].shift(1).rolling(window=7).mean()
    return df.dropna()


def list_csv_files_in_input(root: str, subfolders: List[str]) -> List[str]:
    """Search given subfolders (relative to root) and return relative CSV file paths found."""
    results = []
    for sub in subfolders:
        full = os.path.join(root, sub)
        if not os.path.isdir(full):
            logger.warning(f"Data subfolder not found: {full}")
            continue
        for fname in os.listdir(full):
            if fname.lower().endswith(".csv"):
                results.append(os.path.join(sub, fname))
    return sorted(results)


def normalize_city_name_from_path(path: str) -> str:
    """Derive a canonical city id/name from a filename like 'Jaipur_daily_...csv' -> 'jaipur'."""
    base = os.path.basename(path)
    name = base.replace(".csv", "")
    city = name.split("_")[0].split("-")[0].lower()
    return city


def get_existing_repo_models(api: HfApi, repo_id: str) -> Set[str]:
    """Return a set of model names already present in the HF model repo.

    We expect models uploaded under top-level folders with the city name. If repo is empty
    or the HF API call fails, return an empty set.
    """
    try:
        tree = api.list_repo_tree(repo_id=repo_id, repo_type="model")
        top_level = set()
        for item in tree:
            parts = item.path.split("/")
            top_level.add(parts[0].lower())
        return top_level
    except Exception as e:
        logger.warning(f"Could not list repo tree: {e}")
        return set()


def train_and_save_city_model(city_name: str, city_df: pd.DataFrame) -> bool:
    try:
        df_processed = create_features(city_df)
        if len(df_processed) < 100:
            logger.warning(f"Insufficient data for {city_name}: {len(df_processed)} rows")
            return False

        y = df_processed[TARGET_COLUMN]
        X = df_processed.drop(TARGET_COLUMN, axis=1)
        X_numeric = X.select_dtypes(include=["number"]).fillna(0)

        X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42, shuffle=False)

        model = xgb.XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6, random_state=42, verbosity=0)
        model.fit(X_train, y_train)

        mae = mean_absolute_error(y_test, model.predict(X_test))
        logger.info(f"Model trained for {city_name}. Validation MAE: {mae:.2f}°C")

        city_artifact_path = os.path.join(UPLOAD_STAGING_DIR, city_name)
        os.makedirs(city_artifact_path, exist_ok=True)

        model.save_model(os.path.join(city_artifact_path, "model.json"))
        with open(os.path.join(city_artifact_path, "metadata.json"), "w") as f:
            json.dump({"validation_mae": mae, "city_name": city_name, "features": list(X_numeric.columns)}, f)

        return True
    except Exception as e:
        logger.error(f"Failed to train {city_name}: {e}")
        return False


# --- Main ---
if __name__ == "__main__":
    # locate HF token if running on Kaggle
    if not HF_TOKEN:
        try:
            # Kaggle secrets are only available in Kaggle kernels; attempt to import locally if present
            from kaggle_secrets import UserSecretsClient
            user_secrets = UserSecretsClient()
            HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
        except Exception:
            pass

    if not HF_TOKEN:
        logger.warning("HF_TOKEN not found in env or Kaggle secrets. The script will still train locally but cannot upload.")

    api = HfApi(token=HF_TOKEN) if HF_TOKEN else HfApi()

    # prepare staging dir
    if os.path.exists(UPLOAD_STAGING_DIR):
        shutil.rmtree(UPLOAD_STAGING_DIR)
    os.makedirs(UPLOAD_STAGING_DIR, exist_ok=True)

    # determine existing models in repo
    existing_models = get_existing_repo_models(api, HF_MODEL_REPO_ID)
    logger.info(f"Found {len(existing_models)} top-level entries in repo (assumed existing city models)")

    # Load a training report that lists missing models to train.
    training_report_path = '/kaggle/working/upload_staging/training_report.json'
    city_items = []  

    if os.path.exists(training_report_path):
        try:
            logger.info(f"Loading training report from {training_report_path}")
            with open(training_report_path, 'r') as f:
                report = json.load(f)
            missing = set([c.lower() for c in report.get('missing_models', [])])

            # discover CSVs and filter by missing list
            candidate_subfolders = ['.', 'w_d_1', 'w_d_2', 'w_d_3/w_d_3', 'dn']
            csv_paths = list_csv_files_in_input(KAGGLE_INPUT_PATH, candidate_subfolders)
            logger.info(f"Discovered {len(csv_paths)} CSV files in input.")

            for rel in csv_paths:
                city = normalize_city_name_from_path(rel)
                if city in missing:
                    city_items.append((city, rel))

            if not city_items:
                logger.info("Training report present but no matching CSVs found for missing models. Falling back to discovery of all new cities.")
        except Exception as e:
            logger.warning(f"Failed to load or parse training report ({training_report_path}): {e}. Falling back to discovery.")

    # Fallback
    if not city_items:
        candidate_subfolders = ['.', 'w_d_1', 'w_d_2', 'w_d_3/w_d_3', 'dn']
        csv_paths = list_csv_files_in_input(KAGGLE_INPUT_PATH, candidate_subfolders)
        all_city_filepaths = csv_paths
        logger.info(f"Discovered {len(all_city_filepaths)} CSV files in input.")

        for rel in all_city_filepaths:
            city = normalize_city_name_from_path(rel)
            if city in existing_models:
                logger.info(f"Skipping {city} - already present in repo")
                continue
            city_items.append((city, rel))

    files_to_run = city_items[:MAX_CITIES_TO_PROCESS]
    logger.info(f"{len(files_to_run)} cities scheduled for training in this run.")

    successful = 0
    for city_name, rel_path in tqdm(files_to_run, desc="Training"):
        full_path = os.path.join(KAGGLE_INPUT_PATH, rel_path)
        try:
            df = pd.read_csv(full_path)
            if train_and_save_city_model(city_name, df):
                successful += 1
        except Exception as e:
            logger.error(f"Error processing {city_name} from {full_path}: {e}")

    if successful > 0 and HF_TOKEN:
        logger.info(f"Uploading {successful} trained city artifacts to {HF_MODEL_REPO_ID}...")
        try:
            api.upload_large_folder(folder_path=UPLOAD_STAGING_DIR, repo_id=HF_MODEL_REPO_ID, repo_type="model")
            logger.info("Upload complete.")
        except Exception as e:
            logger.error(f"Upload failed: {e}")
    elif successful > 0:
        logger.warning("Trained models present locally in staging directory but HF_TOKEN missing, skipping upload.")

    logger.info(f"Done. {successful}/{len(files_to_run)} trained successfully.")
