In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import accuracy_score


In [43]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
data = pd.read_csv("/content/drive/MyDrive/ML Lab/Week 6/water_potability.csv")
data

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1


In [45]:
data.isnull().sum()

Unnamed: 0,0
ph,491
Hardness,0
Solids,0
Chloramines,0
Sulfate,781
Conductivity,0
Organic_carbon,0
Trihalomethanes,162
Turbidity,0
Potability,0


In [46]:
#data = data.dropna()
# Imputing into null values
from sklearn.impute import SimpleImputer


imputer = SimpleImputer(strategy='mean')
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

In [58]:
#imbalance
data['Potability'].value_counts()


Unnamed: 0_level_0,count
Potability,Unnamed: 1_level_1
0.0,1998
1.0,1278


In [59]:
zero  = data[data['Potability']==0]   #zero values in Potability column
one = data[data['Potability']==1]  # one values in Potability column


from sklearn.utils import resample
# minority class that  is 1, we need to upsample/increase that class so that there is no bias
# n_samples = 1998 means we want 1998 sample of class 1, since there are 1998 samples of class 0
data_minority_upsampled = resample(one, replace = True, n_samples = 1998)
#concatenate
data = pd.concat([zero, data_minority_upsampled])
data['Potability'].value_counts()



Unnamed: 0_level_0,count
Potability,Unnamed: 1_level_1
0.0,1998
1.0,1998


In [64]:
X = data[['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']]
y = data['Potability']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
#new
from sklearn.preprocessing import StandardScaler, MinMaxScaler


scaler = StandardScaler()
#scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [66]:
#Random forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

mismatch_indices_rf = np.where(y_pred != y_test)[0]
print(mismatch_indices_rf)

Accuracy: 0.82
[  5  12  14  19  21  22  24  25  26  33  34  38  43  52  54  61  66  74
  81  91  93  96  98 100 122 125 126 130 133 134 135 136 143 147 156 157
 161 164 175 183 199 202 211 218 223 225 232 239 248 249 250 272 275 282
 288 294 300 306 311 314 317 322 325 331 336 338 347 353 355 361 365 370
 377 380 397 403 404 413 417 419 441 446 448 455 457 467 479 481 487 506
 523 525 529 532 535 547 561 564 568 569 575 580 583 586 590 591 597 600
 604 607 608 609 610 615 620 632 633 634 635 639 656 659 661 663 667 676
 685 687 696 704 706 709 710 715 716 726 727 730 731 738 741 748 749 758
 760 770 789]


In [56]:
#svm
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

mismatch_indices_svm = np.where(y_pred != y_test)[0]
print(mismatch_indices_svm)

Accuracy: 0.70
[  1   4   7  11  13  19  20  27  45  50  51  53  61  62  64  66  67  68
  71  76  77  81  84  87  90  92  99 100 102 107 108 115 116 118 127 130
 134 137 139 142 144 146 147 166 167 173 177 180 181 186 188 189 193 194
 197 200 201 217 219 221 224 227 233 235 237 242 244 246 247 251 254 259
 261 264 267 272 276 285 287 290 291 292 293 296 298 303 307 308 310 315
 316 317 322 325 327 329 334 337 342 343 346 354 355 359 367 369 371 373
 374 375 378 379 381 382 383 384 389 398 407 408 411 412 413 417 418 419
 421 422 424 427 430 433 439 440 444 449 458 460 461 467 470 473 477 480
 481 489 490 491 492 494 496 501 504 510 511 520 522 523 524 525 526 531
 533 535 543 551 552 553 554 559 562 566 572 573 575 577 579 582 583 585
 586 595 598 600 602 607 609 610 612 614 615 624 628 631 633 636 637 638
 641 652]


In [50]:
# Feature Selection - Forward Selection
from sklearn.model_selection import cross_val_score

selected_features = []
model = SVC()
#model = RandomForestClassifier() #using random forest

num_features_to_select = 5

while len(selected_features) < num_features_to_select:
    best_score = -1
    best_feature = None

    for feature_idx in range(X.shape[1]):
        if feature_idx in selected_features:
            continue

        # Try adding the feature to the selected set
        candidate_features = selected_features + [feature_idx]

        # Evaluate the model's performance using cross-validation
        scores = cross_val_score(model, X.iloc[:, candidate_features], y, cv=5, scoring='accuracy')
        mean_score = np.mean(scores)

        # Keep track of the best-performing feature
        if mean_score > best_score:
            best_score = mean_score
            best_feature = feature_idx

    if best_feature is not None:
        selected_features.append(best_feature)
        print(f"Selected Feature {len(selected_features)}: {best_feature}, Mean Accuracy: {best_score:.4f}")

Selected Feature 1: 4, Mean Accuracy: 0.6172
Selected Feature 2: 5, Mean Accuracy: 0.6200
Selected Feature 3: 1, Mean Accuracy: 0.6108
Selected Feature 4: 0, Mean Accuracy: 0.6099
Selected Feature 5: 3, Mean Accuracy: 0.6102
