In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import joblib
from typing import Tuple, List, Union
import logging

# Configure root logging once for the module: INFO and above, with each record
# rendered as "timestamp - level - message".
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-scoped logger used by everything defined below.
logger = logging.getLogger(__name__)

class MLModel:
    """Logistic-regression classifier bundled with its preprocessing pipeline.

    The model is trained from a CSV file whose label column is
    ``TARGET_COLUMN``. Numeric columns are median-imputed and standardised,
    categorical columns are imputed with the constant ``'missing'`` and
    one-hot encoded. The fitted pipeline is persisted with ``joblib`` at
    ``model_path`` and lazily reloaded by :meth:`predict`.
    """

    # Name of the label column expected in the training data.
    TARGET_COLUMN = 'Growth_Milestone'

    def __init__(self, model_path: str = 'ml_model.pkl'):
        """Initialize the ML model with configuration.

        Args:
            model_path: File the fitted pipeline is saved to / loaded from.
        """
        self.model_path = model_path
        self.model = None
        self.preprocessor = None
        # Populated by load_and_preprocess_data(), or recovered from a loaded
        # pipeline by predict().
        self.numeric_features = None
        self.categorical_features = None

    def identify_feature_types(self, df: pd.DataFrame) -> Tuple[List[str], List[str]]:
        """Identify numeric and categorical feature columns.

        Only ``int64``/``float64`` columns count as numeric and only
        ``object``/``category`` columns count as categorical; columns of any
        other dtype appear in neither list. The target column is excluded.

        Args:
            df: Frame to inspect.

        Returns:
            ``(numeric_features, categorical_features)`` in column order.
        """
        target = self.TARGET_COLUMN
        numeric_features = [
            col for col in df.select_dtypes(include=['int64', 'float64']).columns
            if col != target
        ]
        categorical_features = [
            col for col in df.select_dtypes(include=['object', 'category']).columns
            if col != target
        ]
        return numeric_features, categorical_features

    def create_preprocessor(self) -> "ColumnTransformer":
        """Create a preprocessing pipeline for both numeric and categorical features.

        Uses ``self.numeric_features`` and ``self.categorical_features`` as the
        column selections, so those must be set before the result is fitted.

        Returns:
            An unfitted ``ColumnTransformer`` with a ``'num'`` and a ``'cat'``
            branch.
        """
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4
        # removed the old keyword; try the new spelling first and fall back so
        # older installations keep working.
        try:
            encoder = OneHotEncoder(
                drop='first', sparse_output=False, handle_unknown='ignore'
            )
        except TypeError:
            encoder = OneHotEncoder(
                drop='first', sparse=False, handle_unknown='ignore'
            )

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', encoder)
        ])

        return ColumnTransformer(transformers=[
            ('num', numeric_transformer, self.numeric_features),
            ('cat', categorical_transformer, self.categorical_features)
        ])

    def load_and_preprocess_data(self, file_path: str) -> Tuple[pd.DataFrame, pd.Series]:
        """Load the CSV at ``file_path`` and split it into features and target.

        Also records the numeric/categorical feature names on the instance.

        Args:
            file_path: Path of a CSV file readable with ISO-8859-1 encoding.

        Returns:
            ``(X, y)`` where ``y`` is the ``TARGET_COLUMN`` column and ``X`` is
            every other column.

        Raises:
            Exception: Any error from reading or splitting the data (for
                example ``KeyError`` when the target column is absent) is
                logged and re-raised unchanged.
        """
        try:
            df = pd.read_csv(file_path, encoding="ISO-8859-1")
            logger.info(f"Data loaded successfully from {file_path}")

            # Identify features
            self.numeric_features, self.categorical_features = self.identify_feature_types(df)

            # Split features and target
            X = df.drop(self.TARGET_COLUMN, axis=1)
            y = df[self.TARGET_COLUMN]

            return X, y

        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            raise

    def train(self, file_path: str) -> None:
        """Train the model on the CSV at ``file_path`` and save it.

        Fits the pipeline on a stratified 80% split, logs train/test accuracy
        and 5-fold cross-validation accuracy, then writes the fitted pipeline
        to ``self.model_path``.

        Args:
            file_path: Path of the training CSV.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        try:
            # Load and preprocess data
            X, y = self.load_and_preprocess_data(file_path)

            # Create train-test split
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y
            )

            # Create preprocessing pipeline
            self.preprocessor = self.create_preprocessor()

            # Create and train pipeline
            self.model = Pipeline([
                ('preprocessor', self.preprocessor),
                ('classifier', LogisticRegression(
                    max_iter=1000,
                    class_weight='balanced',
                    random_state=42
                ))
            ])

            # Fit the model
            self.model.fit(X_train, y_train)

            # Evaluate model
            train_score = self.model.score(X_train, y_train)
            test_score = self.model.score(X_test, y_test)
            # cross_val_score is called on the full data set, not just the
            # training split.
            cv_scores = cross_val_score(self.model, X, y, cv=5)

            logger.info(f"Train accuracy: {train_score:.4f}")
            logger.info(f"Test accuracy: {test_score:.4f}")
            logger.info(f"Cross-validation scores: mean={cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

            # Save the model
            joblib.dump(self.model, self.model_path)
            logger.info(f"Model saved to {self.model_path}")

        except Exception as e:
            logger.error(f"Error during training: {str(e)}")
            raise

    def _restore_feature_names(self) -> None:
        """Recover the feature lists from the pipeline's preprocessor spec."""
        preprocessor = self.model.named_steps['preprocessor']
        # ``transformers`` is the constructor argument, persisted as given:
        # (name, transformer, columns) triples.
        columns_by_branch = {
            name: list(columns) for name, _, columns in preprocessor.transformers
        }
        self.numeric_features = columns_by_branch.get('num', [])
        self.categorical_features = columns_by_branch.get('cat', [])

    def predict(self, data_array: Union[List, np.ndarray]) -> int:
        """Make a prediction for a single sample using the trained model.

        The model is loaded from ``self.model_path`` when it is not already in
        memory, and the feature names are recovered from it when this instance
        has not seen the training data.

        Args:
            data_array: One sample's values, ordered as all numeric features
                followed by all categorical features.

        Returns:
            The predicted label for the sample, as a native Python value when
            the pipeline yields a NumPy scalar.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        try:
            if self.model is None:
                # NOTE: joblib.load unpickles arbitrary objects; only load
                # model files from a trusted source.
                self.model = joblib.load(self.model_path)
                logger.info("Model loaded successfully")

            # A fresh instance that only loaded the model has no feature lists
            # yet; without this, ``None + None`` below raises TypeError.
            if self.numeric_features is None or self.categorical_features is None:
                self._restore_feature_names()

            # Convert input to DataFrame with feature names
            feature_names = list(self.numeric_features) + list(self.categorical_features)
            data_df = pd.DataFrame([data_array], columns=feature_names)

            # Make prediction
            prediction = self.model.predict(data_df)[0]
            if isinstance(prediction, np.generic):
                # Unwrap NumPy scalars so the result matches the annotation.
                prediction = prediction.item()
            logger.info(f"Prediction made successfully: {prediction}")

            return prediction

        except Exception as e:
            logger.error(f"Error during prediction: {str(e)}")
            raise

Prediction: 1


