In [1]:
import pandas as pd
df = pd.read_csv("/kaggle/input/water-potability/water_potability.csv")

In [2]:
df.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [3]:
print(len(df))

3276


In [4]:
df['Potability'].value_counts()

Potability
0    1998
1    1278
Name: count, dtype: int64

In [5]:
df.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [6]:
df.fillna(df.median(), inplace=True)

In [7]:
df.isnull().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [8]:
# Apply Min-Max normalization to scale values between 0 and 1
df_normalized = (df - df.min()) / (df.max() - df.min())

# Display the normalized DataFrame
print(df_normalized)

            ph  Hardness    Solids  Chloramines   Sulfate  Conductivity  \
0     0.502625  0.571139  0.336096     0.543891  0.680385      0.669439   
1     0.265434  0.297400  0.300611     0.491839  0.579704      0.719411   
2     0.578509  0.641311  0.321619     0.698543  0.579704      0.414652   
3     0.594055  0.605536  0.356244     0.603314  0.647347      0.317880   
4     0.649445  0.484851  0.289922     0.484900  0.514545      0.379337   
...        ...       ...       ...          ...       ...           ...   
3271  0.333436  0.530482  0.775947     0.533436  0.656047      0.603192   
3272  0.557775  0.530016  0.279263     0.603473  0.579704      0.368912   
3273  0.672822  0.465486  0.539101     0.547807  0.579704      0.438152   
3274  0.366197  0.664407  0.191490     0.465860  0.579704      0.387157   
3275  0.562477  0.535635  0.280484     0.560259  0.579704      0.255266   

      Organic_carbon  Trihalomethanes  Turbidity  Potability  
0           0.313402         0.69975

In [9]:
# Splitting into X and Y
Y = df["Potability"]  # Target variable
X = df.drop("Potability", axis=1)  # Predictor variables, dropping the target column

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Display the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)
print("X_test shape:", X_test.shape)
print("Y_test shape:", Y_test.shape)

X_train shape: (2620, 9)
Y_train shape: (2620,)
X_test shape: (656, 9)
Y_test shape: (656,)


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [12]:
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

# Train and evaluate each model
results = {}
for model_name, model in models.items():
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)  
    results[model_name] = accuracy

# Display results
print("Accuracy Scores:")
for model_name, accuracy in results.items():
    print(f"{model_name}: {accuracy:.4f}")

Accuracy Scores:
Logistic Regression: 0.6280
Random Forest: 0.6738
SVM: 0.6280
Decision Tree: 0.5762


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [14]:

# Define models with their respective parameter grids for GridSearchCV
models = {
    'Logistic Regression': (LogisticRegression(random_state=42), {'C': [0.1, 1.0, 10.0]}),
    'Random Forest': (RandomForestClassifier(random_state=42), {'n_estimators': [50, 100, 200]}),
    'SVM': (SVC(random_state=42), {'C': [0.1, 1.0, 10.0], 'gamma': ['scale', 'auto']}),
    'Decision Tree': (DecisionTreeClassifier(random_state=42), {'max_depth': [None, 10, 20, 30]})
}

# Perform GridSearchCV for each model
results = {}
for model_name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, Y_train)
    best_model = grid_search.best_estimator_
    Y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)
    results[model_name] = {
        'best_params': grid_search.best_params_,
        'accuracy': accuracy
    }

# Display results
print("Grid Search CV Results:")
for model_name, result in results.items():
    print(f"Model: {model_name}")
    print(f"Best Parameters: {result['best_params']}")
    print(f"Accuracy: {result['accuracy']:.4f}")
    print()

Grid Search CV Results:
Model: Logistic Regression
Best Parameters: {'C': 10.0}
Accuracy: 0.6280

Model: Random Forest
Best Parameters: {'n_estimators': 200}
Accuracy: 0.6814

Model: SVM
Best Parameters: {'C': 10.0, 'gamma': 'scale'}
Accuracy: 0.6265

Model: Decision Tree
Best Parameters: {'max_depth': 10}
Accuracy: 0.6143



In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import randint, uniform

models = {
    'Logistic Regression': (LogisticRegression(random_state=42), {'C': uniform(0.1, 10.0)}),
    'Random Forest': (RandomForestClassifier(random_state=42), {'n_estimators': randint(50, 200)}),
    'SVM': (SVC(random_state=42), {'C': uniform(0.1, 10.0), 'gamma': ['scale', 'auto']}),
    'Decision Tree': (DecisionTreeClassifier(random_state=42), {'max_depth': [None, 10, 20, 30]})
}

# Perform RandomizedSearchCV for each model
results = {}
for model_name, (model, param_dist) in models.items():
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42)
    random_search.fit(X_train, Y_train)
    best_model = random_search.best_estimator_
    Y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)
    results[model_name] = {
        'best_params': random_search.best_params_,
        'accuracy': accuracy
    }

# Display results
print("Randomized Search CV Results:")
for model_name, result in results.items():
    print(f"Model: {model_name}")
    print(f"Best Parameters: {result['best_params']}")
    print(f"Accuracy: {result['accuracy']:.4f}")
    print()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Randomized Search CV Results:
Model: Logistic Regression
Best Parameters: {'C': 3.845401188473625}
Accuracy: 0.6280

Model: Random Forest
Best Parameters: {'n_estimators': 70}
Accuracy: 0.6860

Model: SVM
Best Parameters: {'C': 6.086584841970366, 'gamma': 'scale'}
Accuracy: 0.6265

Model: Decision Tree
Best Parameters: {'max_depth': 10}
Accuracy: 0.6143



In [16]:
from sklearn.model_selection import train_test_split, cross_val_score
from numpy import mean

models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

# Perform k-fold cross-validation for each model
results = {}
for model_name, model in models.items():
    cv_scores = cross_val_score(model, X, Y, cv=5, scoring='accuracy')
    results[model_name] = cv_scores

# Display cross-validation results
print("Cross-validation Scores:")
for model_name, scores in results.items():
    print(f"Model: {model_name}")
    print(f"Mean Accuracy: {mean(scores):.4f}")
    print(f"Individual Fold Accuracies: {scores}")
    print()

Cross-validation Scores:
Model: Logistic Regression
Mean Accuracy: 0.6099
Individual Fold Accuracies: [0.6097561  0.61068702 0.61068702 0.60916031 0.60916031]

Model: Random Forest
Mean Accuracy: 0.6407
Individual Fold Accuracies: [0.61737805 0.65496183 0.64580153 0.59541985 0.69007634]

Model: SVM
Mean Accuracy: 0.6099
Individual Fold Accuracies: [0.6097561  0.61068702 0.61068702 0.60916031 0.60916031]

Model: Decision Tree
Mean Accuracy: 0.5751
Individual Fold Accuracies: [0.54573171 0.59541985 0.61374046 0.53435115 0.58625954]

