In [1]:
import pandas as pd

In [4]:
# Read the source Excel workbook into `df`; the path is relative to the
# notebook's working directory and pandas reads the first sheet by default.
df = pd.read_excel(io='Data_Model_IoTMLCQ_2024.xlsx')

In [21]:
# Keep only the first occurrence of each fully identical row. The original
# index labels are preserved (not reset) and `.copy()` detaches the result
# from `df` so later edits cannot touch the raw frame.
df_1 = df.loc[~df.duplicated()].copy()

In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
import joblib
import numpy as np

# Train, evaluate and persist the disease-occurrence classifier.
#
# Reads `df_1` (the de-duplicated frame built earlier), fits a StandardScaler,
# a LabelEncoder and a RandomForestClassifier, prints hold-out / OOB /
# cross-validation metrics and writes the three fitted objects to disk with
# joblib.

# Imported here because this cell is the only place that needs it (used for
# leakage-free cross-validation in STEP 5).
from sklearn.pipeline import make_pipeline

# Tunable constants
TRAIN_FRACTION = 0.8   # leading share of rows used for training
N_CV_SPLITS = 5        # number of expanding-window CV folds
RANDOM_STATE = 42      # seed for the random forest

# Define features and target
features = ['Temperature (°C)', 'pH', 'Dissolved Oxygen (mg/L)', 'Turbidity (NTU)']
target = 'Disease Occurrence (Cases)'

X = df_1[features]
y = df_1[target]

# STEP 1: Temporal Split (prevents data leakage)
# NOTE(review): this is only a temporal split if the rows of df_1 are already
# in chronological order; nothing in this cell sorts by a timestamp - confirm.
split_idx = int(len(df_1) * TRAIN_FRACTION)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

print(f"   Training: {len(X_train)} samples | Test: {len(X_test)} samples")

# STEP 2: Feature Preprocessing
# Scaling statistics come from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the encoder on the complete label set: with a positional split a class
# can first appear after `split_idx`, and `le.transform(y_test)` would then
# raise ValueError for the unseen label. Only label *names* are learned here,
# so this does not leak feature information.
le = LabelEncoder()
le.fit(y)
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)


# STEP 3: Model Configuration
# Shallow trees with large leaves act as regularisation; oob_score gives an
# extra generalisation estimate from the bootstrap samples.
model = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    min_samples_split=20,
    min_samples_leaf=10,
    max_features='sqrt',
    bootstrap=True,
    oob_score=True,
    random_state=RANDOM_STATE
)

model.fit(X_train_scaled, y_train_encoded)

# STEP 4: Model Evaluation
print("\nMODEL EVALUATION")
train_score = model.score(X_train_scaled, y_train_encoded)
test_score = model.score(X_test_scaled, y_test_encoded)
overfitting_gap = train_score - test_score

print(f"   Training Accuracy: {train_score:.4f}")
print(f"   Test Accuracy:     {test_score:.4f}")
print(f"   Overfitting Gap:   {overfitting_gap:.4f}")
print(f"   OOB Score:         {model.oob_score_:.4f}")
# NOTE(review): the recorded run shows 1.0000 train/test/OOB accuracy while the
# early CV folds score 0.25-0.75. Verify row ordering and possible target
# leakage before trusting these numbers.

# STEP 5: Cross-Validation with Time Series Split
# The scaler is re-fitted inside every fold via a pipeline, so validation rows
# never contribute to the scaling statistics. cross_val_score clones the
# pipeline, leaving the fitted `model` above untouched.
print("\nTIME SERIES CROSS-VALIDATION")
tscv = TimeSeriesSplit(n_splits=N_CV_SPLITS)
cv_pipeline = make_pipeline(StandardScaler(), model)
cv_scores = cross_val_score(cv_pipeline, X_train, y_train_encoded, cv=tscv)
print(f"   CV Scores: {cv_scores.round(4)}")
print(f"   CV Mean:   {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")

# STEP 6: Feature Importance Analysis
print("\nFEATURE IMPORTANCE")
importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)

print(importance_df.round(4))

# STEP 7: Save Production Model
# The API needs all three artefacts: scaler -> model -> label decoder.
print("\n7. SAVING PRODUCTION MODEL")
joblib.dump(model, 'production_model.pkl')
joblib.dump(scaler, 'production_scaler.pkl')
joblib.dump(le, 'production_label_encoder.pkl')
print("✅ Production model components saved")



   Training: 29827 samples | Test: 7457 samples

MODEL EVALUATION
   Training Accuracy: 1.0000
   Test Accuracy:     1.0000
   Overfitting Gap:   0.0000
   OOB Score:         1.0000

TIME SERIES CROSS-VALIDATION
   CV Scores: [0.2498 0.4999 0.7499 1.     1.    ]
   CV Mean:   0.6999 (±0.2916)

FEATURE IMPORTANCE
                   Feature  Importance
2  Dissolved Oxygen (mg/L)      0.4525
1                       pH      0.4326
3          Turbidity (NTU)      0.0771
0         Temperature (°C)      0.0377

7. SAVING PRODUCTION MODEL
✅ Production model components saved


In [35]:
# Production API Endpoint
import joblib
import numpy as np
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel

# FastAPI application object; route handlers below register themselves on it.
app = FastAPI(title="AquaNova Water Quality Predictor", version="1.0")

# Load production model components
# Start from None so that a failed load can never leave these names undefined
# or silently pointing at objects left over from an earlier notebook cell.
model = scaler = le = None
try:
    # NOTE: joblib.load unpickles arbitrary Python objects - only load
    # artefacts produced by a trusted training run.
    model = joblib.load('production_model.pkl')
    scaler = joblib.load('production_scaler.pkl')
    le = joblib.load('production_label_encoder.pkl')
    print("✅ Production model loaded successfully")
except FileNotFoundError:
    # A later file may be missing after an earlier one loaded; discard the
    # partial state so all three components are consistently unavailable.
    model = scaler = le = None
    print("❌ Model files not found. Please train the model first.")

class WaterQualityInput(BaseModel):
    """Request body for ``POST /predict``: a single water-quality reading.

    All four fields are required floats. The ``/predict`` handler passes
    them to the scaler in the order temperature, pH, dissolved oxygen,
    turbidity.
    """

    # NOTE(review): units are presumably those of the training columns
    # (°C, pH units, mg/L, NTU) - confirm with the data owner.
    temperature: float
    ph: float
    dissolved_oxygen: float
    turbidity: float

    # Pydantic v2 configuration. The nested `class Config` form is deprecated
    # in v2 (which `json_schema_extra` already implies is in use); a
    # `model_config` mapping carries the same OpenAPI example payload.
    model_config = {
        "json_schema_extra": {
            "example": {
                "temperature": 25.5,
                "ph": 7.2,
                "dissolved_oxygen": 6.8,
                "turbidity": 1.2
            }
        }
    }

@app.post("/predict")
async def predict_disease_risk(data: WaterQualityInput):
    """
    Predict disease occurrence based on water quality parameters

    Args:
        data: Validated sensor reading (temperature, pH, dissolved oxygen,
            turbidity).

    Returns:
        On success, a dict with the decoded ``disease_level``, a
        ``risk_status`` of ``"HIGH"`` or ``"LOW"``, the top class probability
        as a percentage in ``confidence``, a ``recommendation`` string and an
        echo of the ``input_values``.
        On failure, a JSON response with HTTP status 500 whose body is
        ``{"error": "Prediction failed: ..."}``.
    """
    # NOTE(review): only a decoded level of exactly 2 is reported as HIGH; any
    # other level, including larger ones if they exist, maps to LOW. Confirm
    # against the label set.
    high_risk_level = 2

    try:
        # Prepare input features (single row, same column order the handler
        # has always used: temperature, pH, dissolved oxygen, turbidity)
        features_array = np.array([[
            data.temperature,
            data.ph,
            data.dissolved_oxygen,
            data.turbidity
        ]])

        # Apply preprocessing
        features_scaled = scaler.transform(features_array)

        # Make prediction
        prediction = model.predict(features_scaled)[0]
        probabilities = model.predict_proba(features_scaled)[0]

        # Convert prediction back to original label
        disease_level = le.inverse_transform([prediction])[0]
        # Cast to a builtin float so the JSON payload never carries a NumPy scalar.
        confidence = round(float(max(probabilities)) * 100, 2)

        # Generate recommendation
        if disease_level == high_risk_level:
            recommendation = "High disease risk - Check oxygen levels and pH balance"
            risk_status = "HIGH"
        else:
            recommendation = "Water quality within acceptable range"
            risk_status = "LOW"

        return {
            "disease_level": int(disease_level),
            "risk_status": risk_status,
            "confidence": confidence,
            "recommendation": recommendation,
            "input_values": {
                "temperature": data.temperature,
                "ph": data.ph,
                "dissolved_oxygen": data.dissolved_oxygen,
                "turbidity": data.turbidity
            }
        }

    except Exception as e:
        # Keep the original error payload, but report it with a 5xx status so
        # clients and monitoring do not mistake a failed prediction for success.
        return JSONResponse(
            status_code=500,
            content={"error": f"Prediction failed: {str(e)}"},
        )

@app.get("/")
async def root():
    """Return a static banner identifying the service and reporting it as active."""
    banner = dict(
        message="AquaNova Water Quality Predictor API",
        status="active",
    )
    return banner

✅ Production model loaded successfully
