In [None]:
# Utilities
import warnings
warnings.filterwarnings('ignore')

# Core Python Libraries
import numpy as np          
import pandas as pd            

# Statistics & Scientific Computing
import scipy.stats as stats
import statsmodels.api as sm

# Machine Learning - Preprocessing
from sklearn.preprocessing import (
    OneHotEncoder,
    LabelEncoder,
    MinMaxScaler,
    StandardScaler
)
from sklearn.impute import SimpleImputer

# Machine Learning - Model Selection & Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_curve,
    auc,
    roc_auc_score
)

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier

# Advanced Models
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

# Handling Class Imbalance
from imblearn.over_sampling import SMOTE, SMOTEN

# MODELING

With exploratory data analysis complete and a clear understanding of the dataset, we now transition into building predictive models.  
Our target variable is **status_group**, which categorizes water points as *functional*, *functional needs repair*, or *non functional*.  
The predictor variables used for modeling are:

- **gps_height**  
- **installer**  
- **longitude, latitude**  
- **basin**  
- **region**  
- **population**  
- **permit**  
- **construction_year**  
- **extraction_type_group**  
- **management_group**  
- **payment_type**  
- **water_quality**  
- **quantity**  
- **source**  
- **waterpoint_type_group**

## Preprocessing
Before modeling, we will:
- Handle missing values and ensure data completeness.  
- Encode categorical features (e.g installer, basin, region).  
- Scale numerical variables where appropriate (e.g gps_height, population).  
- Address class imbalance in status_group using techniques such as **SMOTE/SMOTEN**.  

## Baseline Model
We begin with a **Dummy Classifier** to establish a baseline accuracy. This provides a benchmark to measure the value added by more sophisticated models.

## Candidate Models
We then train and evaluate several classifiers:  
- **Logistic Regression** – interpretable baseline linear model.  
- **Decision Tree Classifier** – captures nonlinear relationships and feature interactions.  
- **Random Forest Classifier** – ensemble approach to reduce variance and improve stability.  
- **K-Nearest Neighbors (KNN)** – instance-based learning for comparison.  
- **XGBoost Classifier** – gradient boosting ensemble, often highly effective in structured data.  

## Model Evaluation
Models will be compared on:  
- **Accuracy** – overall correctness.  
- **Precision, Recall, F1-score** – to account for imbalanced classes.  
- **Confusion Matrix** – to visualize class-level performance.  
- **ROC Curve and AUC** – to assess discriminative power.  

## Model Selection
Based on evaluation metrics, we will select the best-performing model. The winning model should not only achieve high predictive performance but also generalize well to unseen data. If multiple models perform competitively, additional considerations such as interpretability and computational efficiency will guide the final choice.

## DATA PREPROCESSING

In [6]:
# Training values (features)
train_values_df = pd.read_csv("C:\\Users\\lenovo\\OneDrive\\Desktop\\DS\\PROJECTS\\tanzania-water-well-status-prediction\\Data\\Training set values.csv")

# Training labels (targets)
train_labels_df = pd.read_csv("C:\\Users\\lenovo\\OneDrive\\Desktop\\DS\\PROJECTS\\tanzania-water-well-status-prediction\\Data\\Training set labels.csv")

# Load training data
train_df = train_values_df.merge(train_labels_df, on = 'id')
train_df.shape

(59400, 41)

In [5]:
# ===============================
# 1. Separate features and target
# ===============================
X = train_df.drop('status_group', axis=1)
y = train_df['status_group']

# ===============================
# 2. Encode categorical variables
# ===============================
from sklearn.preprocessing import LabelEncoder

# Encode target
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# One-hot encode categorical features
cat_cols = X.select_dtypes(include=['object']).columns
X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# ===============================
# 3. Scale numeric variables
# ===============================
from sklearn.preprocessing import StandardScaler

num_cols = ['gps_height', 'longitude', 'latitude', 'population', 'construction_year']
scaler = StandardScaler()
X_encoded[num_cols] = scaler.fit_transform(X_encoded[num_cols])

# ===============================
# 4. Train-test split
# ===============================
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# ===============================
# 5. Handle class imbalance
# ===============================
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Original training set shape:", X_train.shape)
print("Resampled training set shape:", X_train_res.shape)

MemoryError: Unable to allocate 1.13 MiB for an array with shape (20, 59400) and data type bool