In [None]:
### MLJAR & MLflow: One-Time Model Building

## Preprocessing Steps

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

def preprocess_data(
    input_path='/Users/I353375/Downloads/MLOps/airflow/day.csv',
    output_dir='/Users/I353375/Downloads/MLOps/airflow',
):
    """Prepare the bike-sharing CSV for modelling and write train/test splits.

    Steps: read the CSV, replace the integer codes in ``season``,
    ``weathersit``, ``mnth`` and ``weekday`` with labels, drop ``dteday``,
    one-hot encode the four labelled columns, split 70/30 with a fixed seed,
    min-max scale the numeric columns (fitted on the training split only) and
    save ``X_train``, ``X_test``, ``y_train`` and ``y_test`` as CSV files.

    Args:
        input_path: Path of the CSV file to read.
        output_dir: Directory that receives the four output CSV files. It is
            created if it does not exist.

    Returns:
        None. The results are written to ``output_dir``.
    """
    # Local import: this cell's import block does not bring in ``os``.
    import os

    # Load data
    df = pd.read_csv(
        input_path,
        dtype={'season': 'int64', 'mnth': 'int64', 'weekday': 'int64', 'weathersit': 'int64'},
    )

    # Integer code -> label mappings for the categorical columns
    season_map = {1: "spring", 2: "summer", 3: "fall", 4: "winter"}
    weather_map = {1: 'good', 2: 'moderate', 3: 'bad', 4: 'severe'}
    month_map = {1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'jun',
                 7: 'jul', 8: 'aug', 9: 'sept', 10: 'oct', 11: 'nov', 12: 'dec'}
    weekday_map = {0: 'sun', 1: 'mon', 2: 'tue', 3: 'wed', 4: 'thu', 5: 'fri', 6: 'sat'}

    # Apply the mappings; a code missing from a map becomes NaN
    df['season'] = df['season'].map(season_map)
    df['weathersit'] = df['weathersit'].map(weather_map)
    df['mnth'] = df['mnth'].map(month_map)
    df['weekday'] = df['weekday'].map(weekday_map)

    # Drop unwanted column
    # NOTE(review): only 'dteday' is removed. If the input follows the UCI
    # Bike Sharing layout, 'casual' + 'registered' add up to 'cnt' and
    # 'instant' is a row id, so they would leak the target into the
    # features - confirm against the actual CSV.
    df = df.drop('dteday', axis=1)

    # One-hot encode the labelled columns as 0/1 integers
    df_new = pd.get_dummies(data=df, columns=['weathersit', 'season', 'mnth', 'weekday'], dtype=int)

    # Train/test split (fixed seed for reproducibility)
    df_train, df_test = train_test_split(df_new, train_size=0.7, random_state=100)

    # Work on explicit copies: the split results are derived from df_new, and
    # assigning into them below would otherwise be a chained assignment
    # (the resulting warning is silenced by the module-level filter).
    df_train = df_train.copy()
    df_test = df_test.copy()

    # Fit the scaler on the training split only, then reuse it for the test split
    scaler = MinMaxScaler()
    numerical_vars = ['temp', 'atemp', 'hum', 'windspeed', 'cnt']
    df_train[numerical_vars] = scaler.fit_transform(df_train[numerical_vars])
    df_test[numerical_vars] = scaler.transform(df_test[numerical_vars])

    # Split features and target (note: 'cnt' is saved in its scaled form)
    X_train = df_train.drop(columns=['cnt'])
    X_test = df_test.drop(columns=['cnt'])
    y_train = df_train['cnt']
    y_test = df_test['cnt']

    # Save the splits as plain (uncompressed) CSV files without the index
    os.makedirs(output_dir, exist_ok=True)
    X_train.to_csv(os.path.join(output_dir, 'X_train.csv'), index=False)
    X_test.to_csv(os.path.join(output_dir, 'X_test.csv'), index=False)
    y_train.to_csv(os.path.join(output_dir, 'y_train.csv'), index=False)
    y_test.to_csv(os.path.join(output_dir, 'y_test.csv'), index=False)

    print("Preprocessing completed efficiently")

# Entry point: run the preprocessing when this file is executed as a script.
if __name__ == "__main__":
    preprocess_data()


In [None]:
## MLJAR Model Building

import os
import shutil
import mlflow
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from supervised.automl import AutoML
from mlflow.models.signature import infer_signature

# --- Step 1: MLflow Setup ---
# Point the client at the tracking server and pick the experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("MLJAR_Validation_Tracking_Bike")
mlflow.autolog(disable=True)  # autologging off: params/metrics/model are logged by hand

# Working directories used by this script
experiment_path = "/Users/I353375/Downloads/MLOps/mlflow_server/mljar/experiments"
model_path = "/Users/I353375/Downloads/MLOps/mlflow_server/mljar/models"
results_path = "/Users/I353375/Downloads/MLOps/mlflow_server/mljar/automl_results"

# Make sure every working directory exists
for _folder in (experiment_path, model_path, results_path):
    os.makedirs(_folder, exist_ok=True)

# --- Step 2: Load Data ---
# Read the four split files from disk
X_train = pd.read_csv('/Users/I353375/Downloads/MLOps/airflow/X_train.csv')
X_test = pd.read_csv('/Users/I353375/Downloads/MLOps/airflow/X_test.csv')
y_train = pd.read_csv('/Users/I353375/Downloads/MLOps/airflow/y_train.csv')
y_test = pd.read_csv('/Users/I353375/Downloads/MLOps/airflow/y_test.csv')

# Name the single target column explicitly on both splits
for _target in (y_train, y_test):
    _target.columns = ['cnt']

# --- Step 3: Train AutoML ---
automl = AutoML(
    results_path=results_path,  # AutoML writes its artifacts here
    ml_task="regression",
    mode="Compete",
    eval_metric="rmse",
    total_time_limit=600,  # seconds (10 minutes)
    algorithms=["Xgboost", "LightGBM", "CatBoost", "Linear", "Decision Tree", "Extra Trees"],
    explain_level=0,
)

automl.fit(X_train, y_train)

# --- Step 4: Predictions ---
y_pred_test = automl.predict(X_test)
y_pred_train = automl.predict(X_train)


# --- Step 5: Metrics ---
def _score(y_true, y_hat, split):
    """Return RMSE, MAE, MAPE and R2 for one split, keyed as ``<metric>_<split>``."""
    return {
        f"rmse_{split}": mean_squared_error(y_true, y_hat) ** 0.5,
        f"mae_{split}": mean_absolute_error(y_true, y_hat),
        f"mape_{split}": mean_absolute_percentage_error(y_true, y_hat),
        f"r2_{split}": r2_score(y_true, y_hat),
    }


# Test metrics first, then train metrics
metrics = {
    **_score(y_test, y_pred_test, "test"),
    **_score(y_train, y_pred_train, "train"),
}

# --- Step 6: Define MLflow Pyfunc Wrapper ---
class MLJARWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper that serves predictions from a saved MLJAR AutoML run."""

    # Name of the optional MLflow artifact holding the AutoML results directory.
    ARTIFACT_KEY = "mljar_results"

    def load_context(self, context):
        """Create the AutoML object that ``predict`` delegates to.

        If the model was logged with an artifact named ``mljar_results``, the
        AutoML directory is read from that artifact so the model does not
        depend on a path on the training machine. Otherwise the module-level
        ``results_path`` is used, as before.

        Args:
            context: The MLflow ``PythonModelContext`` passed in at load time.
        """
        from supervised.automl import AutoML

        artifacts = getattr(context, "artifacts", None) or {}
        if self.ARTIFACT_KEY in artifacts:
            model_dir = artifacts[self.ARTIFACT_KEY]
        else:
            # Fallback: absolute path defined at module level by the training script.
            model_dir = results_path
        self.model = AutoML(results_path=model_dir)

    def predict(self, context, model_input):
        """Return the AutoML predictions for ``model_input``."""
        return self.model.predict(model_input)

# --- Step 7: Save Model Folder ---
# Copy the AutoML results directory into model_path. dirs_exist_ok=True lets
# the copy proceed when model_path already exists.
shutil.copytree(
    src=results_path,
    dst=model_path,
    dirs_exist_ok=True
)

# --- Step 8: Log into MLflow ---
with mlflow.start_run(run_name="MLJAR_Best_Model"):
    # Params
    mlflow.log_params({
        "mode": "Compete",
        # Must match the total_time_limit given to AutoML (600 s); this was
        # previously logged as 300.
        "time_limit_secs": 600,
        "eval_metric": "rmse",
        "task": "regression",
        "framework": "MLJAR",
        # NOTE(review): hard-coded; not derived from the trained AutoML run -
        # confirm it reflects the actual number of models.
        "models_count": 10
    })

    # Metrics
    mlflow.log_metrics(metrics)

    # Signature inferred from the training features and the training predictions
    signature = infer_signature(X_train, y_pred_train)

    # Log model
    mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=MLJARWrapper(),
        # Ship the copied AutoML directory with the model; at load time it is
        # exposed to the pyfunc model as context.artifacts["mljar_results"].
        artifacts={"mljar_results": model_path},
        signature=signature,
        code_path=[],  # No custom code needed because we import AutoML from package
        registered_model_name="mljar_bike_sharing_model"
    )

print("✅ Model training complete and logged to MLflow successfully!")
