In [1]:
# ====================================
# ✅ FULL WATER QUALITY MODEL TRAINING
# ====================================

# Step 1: Install dependencies
!pip install -q numpy pandas scikit-learn==1.3.2 joblib

# Step 2: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import joblib, os

# Step 3: Load dataset
# Locations and the target column name are gathered here so the script can be
# repointed without editing the pipeline steps below.
DATA_PATH = "/Users/viki/Desktop/waterrrrrrrrr/balanced_dataset.csv"
MODEL_DIR = "model"
TARGET_COL = "Target"

df = pd.read_csv(DATA_PATH)

print("✅ Data loaded successfully!")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# Fail fast: verify the target column exists before spending time on cleaning
# and encoding a dataset that cannot be used for training anyway.
if TARGET_COL not in df.columns:
    raise KeyError("❌ 'Target' column not found. Please check your CSV column names.")

# Step 4: Clean dataset
# "Index" is dropped when present (errors="ignore" tolerates its absence);
# any row containing a missing value is removed.
df = df.drop(columns=["Index"], errors="ignore")
df = df.dropna()

# Step 5: Encode non-numeric columns
# Every column that is not numeric is replaced by integer category codes.
# The ordered list of categories is recorded per column (code i corresponds to
# category_maps[col][i]) so the identical encoding can be re-applied to new
# data at prediction time. Checking "not numeric" rather than
# dtype == "object" also covers pandas' dedicated string dtypes.
category_maps = {}
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    as_category = df[col].astype("category")
    category_maps[col] = as_category.cat.categories.tolist()
    df[col] = as_category.cat.codes

# Step 6: Define features and target
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Step 7: Split data (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Scale features
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 9: Train model
# NOTE(review): a regressor is fitted on "Target"; if that column is a class
# label rather than a continuous quantity, a classifier may be the better
# choice — confirm against the dataset before changing.
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train_scaled, y_train)

# Step 10: Evaluate model
# For a regressor, score() returns the R² coefficient of determination.
r2 = model.score(X_test_scaled, y_test)
print("\n✅ Model trained successfully!")
print(f"📊 R² Score on test data: {r2:.4f}")

# Step 11: Save model, scaler and preprocessing metadata
# feature_names.pkl keeps the column order used for training;
# category_maps.pkl keeps the category -> code encoding from Step 5.
os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(model, os.path.join(MODEL_DIR, "salt_model.pkl"))
joblib.dump(scaler, os.path.join(MODEL_DIR, "scaler.pkl"))
joblib.dump(X.columns.tolist(), os.path.join(MODEL_DIR, "feature_names.pkl"))
joblib.dump(category_maps, os.path.join(MODEL_DIR, "category_maps.pkl"))

print("\n📁 Model files saved inside 'model/' folder.")


✅ Data loaded successfully!
Shape: (100000, 24)
Columns: ['Index', 'pH', 'Iron', 'Nitrate', 'Chloride', 'Lead', 'Zinc', 'Color', 'Turbidity', 'Fluoride', 'Copper', 'Odor', 'Sulfate', 'Conductivity', 'Chlorine', 'Manganese', 'Total Dissolved Solids', 'Source', 'Water Temperature', 'Air Temperature', 'Month', 'Day', 'Time of Day', 'Target']

✅ Model trained successfully!
📊 R² Score on test data: 0.6788

📁 Model files saved inside 'model/' folder.
