In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

In [2]:
df = pd.read_excel('/content/aquaattributes.xlsx')
#print (df.head)

In [3]:
df.isnull().sum()

Unnamed: 0,0
State,0
Temperature,34
D.O,9
pH,1
Conductivity,37
B.O.D,68
Nitrate,209
Fecalcaliform,188
Totalcaliform,135
class,0


In [4]:
# Convert all numerical columns to numeric
num_cols = ["Temperature", "D.O", "pH", "Conductivity", "B.O.D", "Nitrate", "Fecalcaliform", "Totalcaliform"]
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')  # Convert to numeric, setting errors to NaN

In [5]:
# As the values of temp, D.O, pH and conductivty is below 5% we shall use median to fill missing values
df["Temperature"].fillna(df["Temperature"].median(), inplace=True)
df["D.O"].fillna(df["D.O"].median(), inplace=True)
df["pH"].fillna(df["pH"].median(), inplace=True)
df["Conductivity"].fillna(df["Conductivity"].median(), inplace=True)
df["B.O.D"].fillna(df["B.O.D"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Temperature"].fillna(df["Temperature"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["D.O"].fillna(df["D.O"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi

In [6]:
#When missing values are high (10-30%), when data has patterns that simple mean/median can't capture we use KNN imputer
imputer = KNNImputer(n_neighbors=5)

# Select only numeric columns for imputation
numeric_cols = ["Nitrate", "Fecalcaliform", "Totalcaliform"]
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

In [7]:
df = pd.get_dummies(df, columns=["State"], drop_first=True)

In [8]:
print("missing values are :")
print(df.isnull().sum())

missing values are :
Temperature                             0
D.O                                     0
pH                                      0
Conductivity                            0
B.O.D                                   0
Nitrate                                 0
Fecalcaliform                           0
Totalcaliform                           0
class                                   0
State_ANDHRA PRADESH                    0
State_ASSAM                             0
State_BIHAR                             0
State_CHANDIGARH                        0
State_CHHATTISGARH                      0
State_DAMAN & DIU                       0
State_DAMAN, DIU, DADRA NAGAR HAVELI    0
State_DELHI                             0
State_GOA                               0
State_GUJARAT                           0
State_HARYANA                           0
State_HIMACHAL PRADESH                  0
State_JAMMU & KASHMIR                   0
State_JHARKHAND                         0
State_KARNATA

In [9]:
# Combine duplicate Maharashtra columns
df['State_MAHARASHTRA'] = df['State_MAHARASHTRA'] + df['State_MAHARASHTRA ']
df.drop(columns=['State_MAHARASHTRA '], inplace=True)

# Combine duplicate ORISSA columns
df['State_ORISSA'] = df['State_ORISSA'] + df['State_ORISSA ']
df.drop(columns=['State_ORISSA '], inplace=True)

In [10]:
df.to_csv("wqa_OHE.csv", index=False)

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Load your cleaned dataset
df = pd.read_csv('/content/wqa_OHE.csv')

# Step 1: Separate features and target
X = df.drop(columns=['class'])  # features
y = df['class']                 # target


In [12]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.preprocessing import StandardScaler
# Normalize numeric features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
# Define base models
svm = SVC(probability=True, random_state=42)
rf = RandomForestClassifier(random_state=42)
lr = LogisticRegression(max_iter=1000, random_state=42)
gb = GradientBoostingClassifier(random_state=42)

In [14]:
# Create the Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('svm', svm),
        ('rf', rf),
        ('lr', lr),
        ('gb', gb)
    ],
    voting='soft'  # Use 'soft' voting for probability-based decision
)

In [15]:
voting_clf.fit(X_train, y_train)

In [16]:
y_pred = voting_clf.predict(X_test)

print("Classification Report:\n")
print(classification_report(y_test, y_pred))

Classification Report:

              precision    recall  f1-score   support

          no       0.98      0.93      0.96        59
         yes       0.98      1.00      0.99       214

    accuracy                           0.98       273
   macro avg       0.98      0.96      0.97       273
weighted avg       0.98      0.98      0.98       273



In [17]:
import pickle

# Save model to a file
with open('voting_model.pkl', 'wb') as f:
    pickle.dump(voting_clf, f)

In [18]:
with open("scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)

print("Model and scaler saved successfully!")

Model and scaler saved successfully!
