In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import libraries

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv("/content/drive/My Drive/water_potability.csv")

## Understand the data

In [5]:
# Initial data overview
print("INITIAL DATA OVERVIEW:")
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst 5 rows:")
print(df.head())
print("\nMissing values:")
print(df.isnull().sum())

INITIAL DATA OVERVIEW:
Dataset shape: (3276, 10)
Columns: ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability']

First 5 rows:
         ph    Hardness        Solids  Chloramines     Sulfate  Conductivity  \
0       NaN  204.890455  20791.318981     7.300212  368.516441    564.308654   
1  3.716080  129.422921  18630.057858     6.635246         NaN    592.885359   
2  8.099124  224.236259  19909.541732     9.275884         NaN    418.606213   
3  8.316766  214.373394  22018.417441     8.059332  356.886136    363.266516   
4  9.092223  181.101509  17978.986339     6.546600  310.135738    398.410813   

   Organic_carbon  Trihalomethanes  Turbidity  Potability  
0       10.379783        86.990970   2.963135           0  
1       15.180013        56.329076   4.500656           0  
2       16.868637        66.420093   3.055934           0  
3       18.436524       100.341674   4.628771           0  
4       11.

## Cleaning Data

In [6]:
print("\nDATA CLEANING:")
numerical_cols = df.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].median(), inplace=True)
        print(f"Filled {col} missing values with median")
print(f"Missing values after cleaning: {df.isnull().sum().sum()}")


DATA CLEANING:
Filled ph missing values with median
Filled Sulfate missing values with median
Filled Trihalomethanes missing values with median
Missing values after cleaning: 0


## Work

In [7]:
# Basic EDA for understanding
print("\nBASIC EDA:")
print("Target variable distribution:")
print(df['Potability'].value_counts())
print(f"Class balance: {df['Potability'].value_counts(normalize=True)}")


BASIC EDA:
Target variable distribution:
Potability
0    1998
1    1278
Name: count, dtype: int64
Class balance: Potability
0    0.60989
1    0.39011
Name: proportion, dtype: float64


In [8]:
# Correlation analysis for feature engineering insights
correlation_matrix = df.corr()
print("\nTop correlations with target:")
target_corr = abs(correlation_matrix['Potability']).sort_values(ascending=False)
print(target_corr.head(6))


Top correlations with target:
Potability        1.000000
Solids            0.033743
Organic_carbon    0.030001
Chloramines       0.023779
Sulfate           0.020476
Hardness          0.013837
Name: Potability, dtype: float64


## REQUIREMENT 1: DEFINE 3-5 CLASS LABELS & BALANCED DISTRIBUTION

In [9]:
# Check if target variable is numerical or categorical
print(f"Target variable 'Potability' type: {df['Potability'].dtype}")
print(f"Unique values: {df['Potability'].unique()}")
print(f"Current distribution: {df['Potability'].value_counts().sort_index().to_dict()}")

# According to requirement: if categorical, leave as is. If numerical, create 3-5 classes
if df['Potability'].dtype in ['int64', 'float64'] and len(df['Potability'].unique()) > 10:
    print("\nTarget is numerical with many values - creating balanced classes...")
    df['potability_classes'] = pd.qcut(df['Potability'], q=4, labels=['Class_1', 'Class_2', 'Class_3', 'Class_4'])
    df['target_variable'] = pd.Categorical(df['potability_classes']).codes
    target_col = 'target_variable'
    print("Created 4 balanced classes from numerical target")
else:
    print("\nTarget is categorical (binary) - keeping as per requirement")
    target_col = 'Potability'
    df['target_variable'] = df['Potability']
    print("Using original binary classification")

# DEMONSTRATION: Create 4-class alternative from pH to show understanding
print("\nDEMONSTRATION - Creating 4 balanced classes from pH (numerical feature):")
df['pH_quartiles'] = pd.qcut(df['ph'], q=4, labels=['Very_Acidic', 'Acidic', 'Basic', 'Very_Basic'])
df['pH_4class'] = pd.Categorical(df['pH_quartiles']).codes

print("pH-based 4-class distribution:")
ph_class_dist = df['pH_4class'].value_counts().sort_index()
print(ph_class_dist.to_dict())
print(f"Balanced distribution: {df['pH_4class'].value_counts(normalize=True).round(3).to_dict()}")


Target variable 'Potability' type: int64
Unique values: [0 1]
Current distribution: {0: 1998, 1: 1278}

Target is categorical (binary) - keeping as per requirement
Using original binary classification

DEMONSTRATION - Creating 4 balanced classes from pH (numerical feature):
pH-based 4-class distribution:
{0: 819, 1: 1065, 2: 573, 3: 819}
Balanced distribution: {1: 0.325, 0: 0.25, 3: 0.25, 2: 0.175}


## REQUIREMENT 2: NORMALIZATION & INTEGER CATEGORIZATION

STEP 2A: INTEGER CATEGORIZATION

In [10]:
# Categorize Turbidity into 3 levels
df['turbidity_category'] = pd.cut(df['Turbidity'], bins=3, labels=['Low', 'Medium', 'High'])
df['turbidity_cat_num'] = pd.Categorical(df['turbidity_category']).codes

# Categorize Hardness into 4 levels
df['hardness_category'] = pd.cut(df['Hardness'], bins=4, labels=['Soft', 'Moderate', 'Hard', 'Very_Hard'])
df['hardness_cat_num'] = pd.Categorical(df['hardness_category']).codes

# Categorize pH into acid/neutral/basic levels
df['ph_category'] = pd.cut(df['ph'], bins=[0, 6.5, 7.5, 14], labels=['Acidic', 'Neutral', 'Basic'])
df['ph_cat_num'] = pd.Categorical(df['ph_category']).codes

# Categorize Chloramines levels
df['chloramines_category'] = pd.cut(df['Chloramines'], bins=3, labels=['Low', 'Medium', 'High'])
df['chloramines_cat_num'] = pd.Categorical(df['chloramines_category']).codes

print("Integer categorization completed:")
print(f"- Turbidity categories: {df['turbidity_cat_num'].value_counts().sort_index()}")
print(f"- Hardness categories: {df['hardness_cat_num'].value_counts().sort_index()}")
print(f"- pH categories: {df['ph_cat_num'].value_counts().sort_index().to_dict()}")
print(f"- Chloramines categories: {df['chloramines_cat_num'].value_counts().sort_index().to_dict()}")

Integer categorization completed:
- Turbidity categories: turbidity_cat_num
0     554
1    2392
2     330
Name: count, dtype: int64
- Hardness categories: hardness_cat_num
0      41
1    1100
2    2001
3     134
Name: count, dtype: int64
- pH categories: {-1: 1, 0: 967, 1: 1249, 2: 1059}
- Chloramines categories: {0: 181, 1: 2671, 2: 424}


STEP 2B: NORMALIZATION


In [11]:
# Normalization of numerical features
scaler = StandardScaler()
exclude_from_scaling = ['Potability', 'target_variable', 'turbidity_category', 'pH_quartiles',
                       'hardness_category', 'turbidity_cat_num', 'hardness_cat_num',
                       'ph_category', 'ph_cat_num', 'chloramines_category', 'chloramines_cat_num']
feature_cols = [col for col in df.columns if col not in exclude_from_scaling]
df_scaled = df.copy()
df_scaled[feature_cols] = scaler.fit_transform(df[feature_cols])

print("Features normalized using StandardScaler:")
print(f"Normalized features: {feature_cols}")

Features normalized using StandardScaler:
Normalized features: ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'pH_4class']


## REQUIREMENT 3: FEATURE ENGINEERING BASED ON EDA

In [12]:
# Create composite features based on EDA correlations and domain knowledge
print("Creating composite features based on EDA insights:")

# Ratio features (based on correlation analysis)
df_scaled['ph_hardness_ratio'] = df_scaled['ph'] / (df_scaled['Hardness'] + 0.001)
df_scaled['solids_conductivity_ratio'] = df_scaled['Solids'] / (df_scaled['Conductivity'] + 0.001)
df_scaled['chloramines_sulfate_ratio'] = df_scaled['Chloramines'] / (df_scaled['Sulfate'] + 0.001)

# Product features (interaction effects)
df_scaled['organic_carbon_trihalomethanes_product'] = df_scaled['Organic_carbon'] * df_scaled['Trihalomethanes']
df_scaled['ph_conductivity_interaction'] = df_scaled['ph'] * df_scaled['Conductivity']

# Categorical features as integers
df_scaled['turbidity_cat_num'] = df['turbidity_cat_num']
df_scaled['hardness_cat_num'] = df['hardness_cat_num']
# Add the new categorical features to scaled dataframe
df_scaled['ph_cat_num'] = df['ph_cat_num']
df_scaled['chloramines_cat_num'] = df['chloramines_cat_num']

engineered_features = ['ph_hardness_ratio', 'solids_conductivity_ratio', 'chloramines_sulfate_ratio',
                      'organic_carbon_trihalomethanes_product', 'ph_conductivity_interaction',
                      'turbidity_cat_num', 'hardness_cat_num', 'ph_cat_num', 'chloramines_cat_num']

print("New engineered features created:")
for i, feature in enumerate(engineered_features, 1):
    print(f"{i}. {feature}")

Creating composite features based on EDA insights:
New engineered features created:
1. ph_hardness_ratio
2. solids_conductivity_ratio
3. chloramines_sulfate_ratio
4. organic_carbon_trihalomethanes_product
5. ph_conductivity_interaction
6. turbidity_cat_num
7. hardness_cat_num
8. ph_cat_num
9. chloramines_cat_num


# REQUIREMENT 4: CREATE 5 FEATURE SETS & DECISION TREE COMPARISON

In [13]:
# Define original features
original_features = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate',
                    'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']

# Get top correlated features with target
available_features = [col for col in df_scaled.columns if col not in ['Potability', 'target_variable',
                     'turbidity_category', 'pH_quartiles', 'hardness_category', 'ph_category', 'chloramines_category']]
target_corr = abs(df_scaled[available_features + [target_col]].corr()[target_col]).sort_values(ascending=False)
top_corr_features = target_corr.drop(target_col).head(5).index.tolist()

STEP 4A: DEFINE 5 FEATURE SETS

In [14]:
# Updated feature_sets with more comprehensive sets
feature_sets = {
    'Set1_Original_9Features': original_features,
    'Set2_Top5Correlated': top_corr_features,
    'Set3_Original_Plus_Ratios': original_features + ['ph_hardness_ratio', 'solids_conductivity_ratio', 'chloramines_sulfate_ratio'],
    'Set4_Engineered_Features': ['ph_hardness_ratio', 'organic_carbon_trihalomethanes_product', 'turbidity_cat_num', 'hardness_cat_num', 'ph_conductivity_interaction'],
    'Set5_Best_Mixed': ['Sulfate', 'Conductivity', 'Organic_carbon', 'ph_hardness_ratio', 'organic_carbon_trihalomethanes_product', 'turbidity_cat_num'],
    'Set6_All_Categorical': ['turbidity_cat_num', 'hardness_cat_num', 'ph_cat_num', 'chloramines_cat_num']  # NEW SET
}

print("Feature sets defined:")
for i, (name, features) in enumerate(feature_sets.items(), 1):
    print(f"{i}. {name}: {len(features)} features")
    print(f"   Features: {features}")

Feature sets defined:
1. Set1_Original_9Features: 9 features
   Features: ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']
2. Set2_Top5Correlated: 5 features
   Features: ['Solids', 'Organic_carbon', 'chloramines_cat_num', 'Chloramines', 'ph_hardness_ratio']
3. Set3_Original_Plus_Ratios: 12 features
   Features: ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'ph_hardness_ratio', 'solids_conductivity_ratio', 'chloramines_sulfate_ratio']
4. Set4_Engineered_Features: 5 features
   Features: ['ph_hardness_ratio', 'organic_carbon_trihalomethanes_product', 'turbidity_cat_num', 'hardness_cat_num', 'ph_conductivity_interaction']
5. Set5_Best_Mixed: 6 features
   Features: ['Sulfate', 'Conductivity', 'Organic_carbon', 'ph_hardness_ratio', 'organic_carbon_trihalomethanes_product', 'turbidity_cat_num']
6. Set6_All_Categorical: 4 features
   Features: ['

STEP 4B: TRAIN DECISION TREE MODELS

In [15]:
# Prepare data for modeling
X_all = df_scaled[available_features]
y = df_scaled[target_col]

print(f"Using '{target_col}' as target variable")
print(f"Target distribution: {y.value_counts().sort_index().to_dict()}")

Using 'Potability' as target variable
Target distribution: {0: 1998, 1: 1278}


In [16]:
# Train and compare models
results = []
for set_name, features in feature_sets.items():
    print(f"\nTraining {set_name}...")

    # Select available features
    available_set_features = [f for f in features if f in X_all.columns]
    X_subset = X_all[available_set_features]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X_subset, y, test_size=0.3, random_state=42, stratify=y
    )

    # Train decision tree
    dt = DecisionTreeClassifier(
        random_state=42,
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=5
    )
    dt.fit(X_train, y_train)

    # Make predictions
    y_pred = dt.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Store results (enhanced version)
    results.append({
        'Feature_Set': set_name,
        'Num_Features': len(available_set_features),
        'Features_Used': ', '.join(available_set_features[:3]) + ('...' if len(available_set_features) > 3 else ''),
        'Accuracy': round(accuracy, 4),
        'Tree_Depth': dt.get_depth(),
        'Num_Leaves': dt.get_n_leaves(),
        'Top_Feature': available_set_features[np.argmax(dt.feature_importances_)],  # NEW
        'Feature_Importance': round(max(dt.feature_importances_), 4)  # NEW
    })

    print(f" {set_name}: Accuracy = {accuracy:.4f}")


Training Set1_Original_9Features...
 Set1_Original_9Features: Accuracy = 0.6338

Training Set2_Top5Correlated...
 Set2_Top5Correlated: Accuracy = 0.5788

Training Set3_Original_Plus_Ratios...
 Set3_Original_Plus_Ratios: Accuracy = 0.6419

Training Set4_Engineered_Features...
 Set4_Engineered_Features: Accuracy = 0.5921

Training Set5_Best_Mixed...
 Set5_Best_Mixed: Accuracy = 0.5972

Training Set6_All_Categorical...
 Set6_All_Categorical: Accuracy = 0.6073


STEP 4C: COMPARISON RESULTS TABLE

In [17]:
# Display results table
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('Accuracy', ascending=False)
print("FINAL COMPARISON TABLE:")
print(results_df.to_string(index=False))

FINAL COMPARISON TABLE:
              Feature_Set  Num_Features                                                                   Features_Used  Accuracy  Tree_Depth  Num_Leaves       Top_Feature  Feature_Importance
Set3_Original_Plus_Ratios            12                                                         ph, Hardness, Solids...    0.6419          10         114           Sulfate              0.1623
  Set1_Original_9Features             9                                                         ph, Hardness, Solids...    0.6338          10         125           Sulfate              0.1987
     Set6_All_Categorical             4                              turbidity_cat_num, hardness_cat_num, ph_cat_num...    0.6073           9          57        ph_cat_num              0.4643
          Set5_Best_Mixed             6                                        Sulfate, Conductivity, Organic_carbon...    0.5972          10          94 ph_hardness_ratio              0.2986
 Set4_Engineered

In [18]:
# Best model analysis
best_model = results_df.iloc[0]
print(f"\nBEST PERFORMING MODEL:")
print(f"Feature Set: {best_model['Feature_Set']}")
print(f"Accuracy: {best_model['Accuracy']}")
print(f"Number of Features: {best_model['Num_Features']}")
print(f"Most Important Feature: {best_model['Top_Feature']} ({best_model['Feature_Importance']})")


BEST PERFORMING MODEL:
Feature Set: Set3_Original_Plus_Ratios
Accuracy: 0.6419
Number of Features: 12
Most Important Feature: Sulfate (0.1623)


## SUMMARY OF OBSERVATIONS
Key findings from the model comparison:
1. Feature engineering impact on model performance
2. Original vs engineered features effectiveness
3. Optimal number of features for this dataset
4. Class balancing effect on model accuracy
5. Decision tree complexity vs performance trade-off