In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv("/content/sample_data/water_potability.csv")

In [3]:
features = ["ph", "Hardness", "Solids"]
targets = ["Chloramines", "Organic_carbon", "Sulfate", "Potability"]


In [4]:
df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1


In [15]:

# Drop rows with missing values
df.dropna(subset=features + targets, inplace=True)

In [16]:
df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
3,0.777454,0.563402,0.000371,2,2,363.266516,2,100.341674,4.628771,0
4,1.266434,-0.452437,-0.464758,1,0,398.410813,0,31.997993,4.075075,0
5,-0.945687,-0.232250,0.775344,1,1,280.467916,0,54.917862,2.559708,0
6,1.980009,1.592261,0.775462,1,2,283.651634,1,84.603556,2.672989,0
7,0.978658,0.227194,-0.960685,0,0,474.607645,0,62.798309,4.401425,0
...,...,...,...,...,...,...,...,...,...,...
3267,1.201912,0.583979,-0.701682,0,0,390.410231,0,55.069304,4.613843,1
3268,-0.240422,0.348085,-0.549054,1,0,329.266002,2,28.878601,3.442983,1
3269,2.779034,-3.086963,1.747202,2,0,439.893618,2,41.558501,4.369264,1
3270,-0.639528,-0.282757,0.474820,2,1,415.886955,0,60.419921,3.669712,1


In [18]:

# Convert continuous targets to categorical using fixed bins
for target in ["Chloramines", "Organic_carbon", "Sulfate"]:
    df[target] = pd.cut(df[target], bins=3, labels=[0, 1, 2])  # Convert to 3 categories

In [19]:
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

In [20]:
X = df[features]

In [21]:
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='rbf', probability=True, random_state=42)
}

In [22]:
results = {}

In [23]:
for target in targets:
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    results[target] = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results[target][name] = acc
        print(f"\nModel: {name} | Target: {target}")
        print(classification_report(y_test, y_pred))


Model: RandomForest | Target: Chloramines
              precision    recall  f1-score   support

           0       0.40      0.39      0.39       149
           1       0.33      0.37      0.35       141
           2       0.37      0.34      0.36       134

    accuracy                           0.37       424
   macro avg       0.37      0.37      0.37       424
weighted avg       0.37      0.37      0.37       424


Model: GradientBoosting | Target: Chloramines
              precision    recall  f1-score   support

           0       0.39      0.32      0.35       149
           1       0.36      0.41      0.39       141
           2       0.36      0.38      0.37       134

    accuracy                           0.37       424
   macro avg       0.37      0.37      0.37       424
weighted avg       0.37      0.37      0.37       424


Model: SVM | Target: Chloramines
              precision    recall  f1-score   support

           0       0.50      0.29      0.37       149
     

In [26]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [32]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [40]:
for target in targets:
    print(f"Unique values in {target}: {df[target].unique()}")

Unique values in Chloramines: [2, 1, 0]
Categories (3, int64): [0 < 1 < 2]
Unique values in Organic_carbon: [2, 0, 1]
Categories (3, int64): [0 < 1 < 2]
Unique values in Sulfate: [2, 0, 1]
Categories (3, int64): [0 < 1 < 2]
Unique values in Potability: [0 1]


In [38]:
for target in targets:
    y = df[target]
    num_classes = len(y.unique())  # Get the number of unique categories

    models = {
        "XGBoost": XGBClassifier(n_estimators=200, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss', objective="multi:softmax", num_class=3, random_state=42),
        "LightGBM": LGBMClassifier(n_estimators=200, learning_rate=0.1, random_state=42),
        "CatBoost": CatBoostClassifier(n_estimators=200, learning_rate=0.1, verbose=0, random_state=42)
    }

In [41]:
results = {}

# Train and evaluate models for each target variable
for target in targets:
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    results[target] = {}
    for name, model in models.items(): # Changed models1 to models
        # Dynamically set num_class based on unique target values for XGBClassifier
        if name == "XGBoost":
            num_classes = len(y.unique())
            model.set_params(num_class=num_classes) # set num_class

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results[target][name] = acc
        print(f"\nModel: {name} | Target: {target}")
        print(classification_report(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.




Model: XGBoost | Target: Chloramines
              precision    recall  f1-score   support

           0       0.36      0.33      0.34       149
           1       0.34      0.36      0.35       141
           2       0.34      0.36      0.35       134

    accuracy                           0.35       424
   macro avg       0.35      0.35      0.35       424
weighted avg       0.35      0.35      0.35       424

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000142 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 1692, number of used features: 3
[LightGBM] [Info] Start training from score -1.111101
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.086277

Model: LightGBM | Target: Chloramines
              precision    recall  f1-score   support

           0       0.40      0.37      

Parameters: { "use_label_encoder" } are not used.




Model: XGBoost | Target: Organic_carbon
              precision    recall  f1-score   support

           0       0.29      0.37      0.32       122
           1       0.39      0.32      0.35       151
           2       0.41      0.38      0.39       151

    accuracy                           0.36       424
   macro avg       0.36      0.36      0.36       424
weighted avg       0.36      0.36      0.36       424

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 1692, number of used features: 3
[LightGBM] [Info] Start training from score -1.063766
[LightGBM] [Info] Start training from score -1.118309
[LightGBM] [Info] Start training from score -1.114698

Model: LightGBM | Target: Organic_carbon
              precision    recall  f1-score   support

           0       0.28      0.34

Parameters: { "use_label_encoder" } are not used.




Model: XGBoost | Target: Sulfate
              precision    recall  f1-score   support

           0       0.29      0.41      0.34       122
           1       0.36      0.31      0.33       157
           2       0.37      0.30      0.33       145

    accuracy                           0.33       424
   macro avg       0.34      0.34      0.33       424
weighted avg       0.34      0.33      0.33       424

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 1692, number of used features: 3
[LightGBM] [Info] Start training from score -1.063766
[LightGBM] [Info] Start training from score -1.127391
[LightGBM] [Info] Start training from score -1.105730

Model: LightGBM | Target: Sulfate
              precision    recall  f1-score   support

           0       0.31      0.44      0.36    

Parameters: { "use_label_encoder" } are not used.




Model: XGBoost | Target: Potability
              precision    recall  f1-score   support

           0       0.61      0.66      0.63       261
           1       0.38      0.34      0.36       163

    accuracy                           0.53       424
   macro avg       0.50      0.50      0.50       424
weighted avg       0.52      0.53      0.53       424

[LightGBM] [Info] Number of positive: 684, number of negative: 1008
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 1692, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.404255 -> initscore=-0.387766
[LightGBM] [Info] Start training from score -0.387766

Model: LightGBM | Target: Potability
              precision    recall  f1-score   support

           0       0.60      0.64      0.62       261
 

In [44]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Randomized search for CatBoost
param_dist = {
    'iterations': [200, 500, 1000],
    'depth': [6, 8, 10],
    'learning_rate': uniform(0.01, 0.1),
    'l2_leaf_reg': [1, 3, 5],
    'subsample': uniform(0.7, 0.3),
    'border_count': [32, 64, 128]
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=catboost, param_distributions=param_dist, n_iter=100, cv=3, n_jobs=-1, verbose=2)

# Fit the model
random_search.fit(X_train, y_train)

print(f"Best Hyperparameters: {random_search.best_params_}")
print(f"Best RandomizedSearchCV Accuracy: {random_search.best_score_}")

# Retrain with the best model
best_catboost = random_search.best_estimator_

# Test the model
y_pred = best_catboost.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))



Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best Hyperparameters: {'border_count': 32, 'depth': 6, 'iterations': 500, 'l2_leaf_reg': 3, 'learning_rate': 0.01809007706594509, 'subsample': 0.7585585111121306}
Best RandomizedSearchCV Accuracy: 0.6052009456264775
Test Accuracy: 0.6014150943396226
              precision    recall  f1-score   support

           0       0.64      0.82      0.72       261
           1       0.47      0.26      0.33       163

    accuracy                           0.60       424
   macro avg       0.55      0.54      0.52       424
weighted avg       0.57      0.60      0.57       424



In [51]:
# Example of demo input with the correct feature names
demo_input = pd.DataFrame({
    'ph': [7.5],          # Example value for pH
    'Hardness': [150],    # Example value for Hardness (matching the expected feature name)
    'Solids': [1.0],      # Example value for Solids (or other feature names)
})

# Predict using the trained model for each target
chloramines_prediction = best_catboost.predict(demo_input)  # Chloramines target
organic_carbon_prediction = best_catboost.predict(demo_input)  # Organic Carbon target
sulfate_prediction = best_catboost.predict(demo_input)  # Sulfate target

# Output the predictions as numeric values
print(f"Chloramines Prediction: {chloramines_prediction[0]}")
print(f"Organic Carbon Prediction: {organic_carbon_prediction[0]}")
print(f"Sulfate Prediction: {sulfate_prediction[0]}")

# You can also directly output the predicted values as a dictionary or a dataframe:
prediction_output = {
    "Chloramines": chloramines_prediction[0],
    "Organic Carbon": organic_carbon_prediction[0],
    "Sulfate": sulfate_prediction[0],
}

print(prediction_output)



Chloramines Prediction: 0
Organic Carbon Prediction: 0
Sulfate Prediction: 0
{'Chloramines': 0, 'Organic Carbon': 0, 'Sulfate': 0}


In [53]:
df.info()



<class 'pandas.core.frame.DataFrame'>
Index: 2116 entries, 3 to 3271
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   ph               2116 non-null   float64 
 1   Hardness         2116 non-null   float64 
 2   Solids           2116 non-null   float64 
 3   Chloramines      2116 non-null   category
 4   Sulfate          2116 non-null   category
 5   Conductivity     2116 non-null   float64 
 6   Organic_carbon   2116 non-null   category
 7   Trihalomethanes  2011 non-null   float64 
 8   Turbidity        2116 non-null   float64 
 9   Potability       2116 non-null   int64   
dtypes: category(3), float64(6), int64(1)
memory usage: 138.8 KB


In [58]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Load your dataset
df = pd.read_csv('/content/sample_data/water_potability.csv')  # Replace with your actual dataset file path

# Convert categorical features to category type
categorical_cols = ['Chloramines', 'Sulfate', 'Organic_carbon']
df[categorical_cols] = df[categorical_cols].apply(lambda x: x.astype('category'))

# Encode categorical features as integers
df['Chloramines'] = df['Chloramines'].cat.codes
df['Sulfate'] = df['Sulfate'].cat.codes
df['Organic_carbon'] = df['Organic_carbon'].cat.codes

# Split the data into features and targets
X = df[['ph', 'Hardness', 'Solids']]  # Input features (continuous)
y_chloramines = df['Chloramines']  # Target for Chloramines (encoded)
y_organic_carbon = df['Organic_carbon']  # Target for Organic Carbon (encoded)
y_sulfate = df['Sulfate']  # Target for Sulfate (encoded)
y_potability = df['Potability']  # Target for Potability (binary: 0 or 1)

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train_chloramines, y_test_chloramines = train_test_split(X, y_chloramines, test_size=0.2, random_state=42)
_, _, y_train_organic_carbon, y_test_organic_carbon = train_test_split(X, y_organic_carbon, test_size=0.2, random_state=42)
_, _, y_train_sulfate, y_test_sulfate = train_test_split(X, y_sulfate, test_size=0.2, random_state=42)
_, _, y_train_potability, y_test_potability = train_test_split(X, y_potability, test_size=0.2, random_state=42)

# Initialize CatBoostRegressor models
catboost_chloramines = CatBoostRegressor(iterations=500, depth=6, learning_rate=0.1, loss_function='RMSE')
catboost_organic_carbon = CatBoostRegressor(iterations=500, depth=6, learning_rate=0.1, loss_function='RMSE')
catboost_sulfate = CatBoostRegressor(iterations=500, depth=6, learning_rate=0.1, loss_function='RMSE')
catboost_potability = CatBoostRegressor(iterations=500, depth=6, learning_rate=0.1, loss_function='RMSE')

# Fit models
catboost_chloramines.fit(X_train, y_train_chloramines)
catboost_organic_carbon.fit(X_train, y_train_organic_carbon)
catboost_sulfate.fit(X_train, y_train_sulfate)
catboost_potability.fit(X_train, y_train_potability)

# Predict on the test set
y_pred_chloramines = catboost_chloramines.predict(X_test)
y_pred_organic_carbon = catboost_organic_carbon.predict(X_test)
y_pred_sulfate = catboost_sulfate.predict(X_test)
y_pred_potability = catboost_potability.predict(X_test)

# Evaluate models using RMSE (for regression tasks)
rmse_chloramines = np.sqrt(mean_squared_error(y_test_chloramines, y_pred_chloramines))
rmse_organic_carbon = np.sqrt(mean_squared_error(y_test_organic_carbon, y_pred_organic_carbon))
rmse_sulfate = np.sqrt(mean_squared_error(y_test_sulfate, y_pred_sulfate))
rmse_potability = np.sqrt(mean_squared_error(y_test_potability, y_pred_potability))

print(f'Chloramines RMSE: {rmse_chloramines}')
print(f'Organic Carbon RMSE: {rmse_organic_carbon}')
print(f'Sulfate RMSE: {rmse_sulfate}')
print(f'Potability RMSE: {rmse_potability}')

# Example prediction for new input data
demo_input = pd.DataFrame({
    'ph': [7.5],          # Example value for pH
    'Hardness': [150],    # Example value for Hardness
    'Solids': [1.0],      # Example value for Solids
})

# Predict using the trained models
chloramines_prediction = catboost_chloramines.predict(demo_input)
organic_carbon_prediction = catboost_organic_carbon.predict(demo_input)
sulfate_prediction = catboost_sulfate.predict(demo_input)
potability_prediction = catboost_potability.predict(demo_input)

# Output predictions for Chloramines, Organic Carbon, Sulfate, and Potability
print(f"Chloramines Prediction: {chloramines_prediction[0]}")
print(f"Organic Carbon Prediction: {organic_carbon_prediction[0]}")
print(f"Sulfate Prediction: {sulfate_prediction[0]}")
print(f"Potability Prediction: {potability_prediction[0]}")




0:	learn: 945.9703379	total: 8.31ms	remaining: 4.15s
1:	learn: 944.0638437	total: 9.79ms	remaining: 2.44s
2:	learn: 942.4479872	total: 11.1ms	remaining: 1.84s
3:	learn: 940.6716222	total: 12.4ms	remaining: 1.54s
4:	learn: 939.2429042	total: 14ms	remaining: 1.38s
5:	learn: 937.8589607	total: 15.6ms	remaining: 1.28s
6:	learn: 937.0203796	total: 17.1ms	remaining: 1.21s
7:	learn: 935.6159770	total: 18.7ms	remaining: 1.15s
8:	learn: 934.5453023	total: 20.3ms	remaining: 1.1s
9:	learn: 933.2868351	total: 21.8ms	remaining: 1.07s
10:	learn: 932.5311602	total: 23.3ms	remaining: 1.04s
11:	learn: 931.0947056	total: 25ms	remaining: 1.02s
12:	learn: 929.3890290	total: 26.5ms	remaining: 994ms
13:	learn: 928.2228133	total: 28.2ms	remaining: 978ms
14:	learn: 927.4557700	total: 29.7ms	remaining: 961ms
15:	learn: 926.4513429	total: 31.3ms	remaining: 946ms
16:	learn: 925.0499546	total: 32.8ms	remaining: 932ms
17:	learn: 924.0021399	total: 34.4ms	remaining: 920ms
18:	learn: 923.0249372	total: 36ms	remainin

In [59]:
# Predicted values
chloramines_prediction = 2701.249742321558
organic_carbon_prediction = 1880.4992034511588
sulfate_prediction = 1846.6893482961445
potability_prediction = 0.6108408149660143

# Convert predictions to more readable values (e.g., round off Chloramines, Sulfate, Organic Carbon)
chloramines_prediction_rounded = round(chloramines_prediction, 2)
organic_carbon_prediction_rounded = round(organic_carbon_prediction, 2)
sulfate_prediction_rounded = round(sulfate_prediction, 2)

# Potability decision (round to 0 or 1)
potability_status = "Safe for consumption" if round(potability_prediction) == 1 else "Not safe for consumption"

# Format and display the output as a Water Quality Report
print("Water Quality Report")
print(f"Chloramines: {chloramines_prediction_rounded} mg/L")
print(f"Sulfate: {sulfate_prediction_rounded} mg/L")
print(f"Organic Carbon: {organic_carbon_prediction_rounded} mg/L")
print(f"Potability Status: {potability_status}")

Water Quality Report
Chloramines: 2701.25 mg/L
Sulfate: 1846.69 mg/L
Organic Carbon: 1880.5 mg/L
Potability Status: Safe for consumption


In [82]:
# Example of a new test sample
demo_input = pd.DataFrame({
    'ph': [6.3],          # Example value for pH
    'Hardness': [45],     # Example value for Hardness
    'Solids': [20],      # Example value for Solids
})

# Predict using the trained models
chloramines_prediction = catboost_chloramines.predict(demo_input)[0]
organic_carbon_prediction = catboost_organic_carbon.predict(demo_input)[0]
sulfate_prediction = catboost_sulfate.predict(demo_input)[0]
potability_prediction = catboost_potability.predict(demo_input)[0]

# Divide Chloramines and Sulfate predictions by 1000 for unit conversion
chloramines_prediction_rounded = round(chloramines_prediction / 1000, 2)  # Divide by 1000 and round
organic_carbon_prediction_rounded = round(organic_carbon_prediction/1000, 2)  # No change needed, just rounding
sulfate_prediction_rounded = round(sulfate_prediction , 2)  # Divide by 1000 and round

# Potability decision (round to 0 or 1)
potability_status = "Safe for consumption" if round(potability_prediction) == 1 else "Not safe for consumption"

# Format and display the output as a Water Quality Report
print("Water Quality Report")
print(f"Chloramines: {chloramines_prediction_rounded} mg/L")
print(f"Sulfate: {sulfate_prediction_rounded} mg/L")
print(f"Organic Carbon: {organic_carbon_prediction_rounded} mg/L")
print(f"Potability Status: {potability_status}")



Water Quality Report
Chloramines: 0.6 mg/L
Sulfate: 697.47 mg/L
Organic Carbon: 2.23 mg/L
Potability Status: Safe for consumption


In [83]:
import pickle

# Create a dictionary to store the models
models = {
    'catboost_chloramines': catboost_chloramines,
    'catboost_organic_carbon': catboost_organic_carbon,
    'catboost_sulfate': catboost_sulfate,
    'catboost_potability': catboost_potability
}

# Save the models to a single .pkl file
with open('catboost_models.pkl', 'wb') as f:
    pickle.dump(models, f)

print("All models have been saved to 'catboost_models.pkl' successfully!")

All models have been saved to 'catboost_models.pkl' successfully!
