In [18]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)



In [5]:
# Load the preprocessed dataset
df_encoded = pd.read_csv('../data/processed/preprocessed_data.csv')

### Dummy Classifier

In [6]:
df_encoded['status_group'].value_counts(normalize=True)

status_group
2    0.543081
0    0.384242
1    0.072677
Name: proportion, dtype: float64

If a dummy classifier that always returns 1 (funcitonal, functional needs repair), it would be accurate 54.3% of the time

## 1) Linear Models

###  1.1) Logistic Regression

In [7]:
X = df_encoded.drop('status_group', axis=1)
y = df_encoded['status_group']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

# Standardize the features (mean = 0, variance = 1)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the logistic regression model
model = LogisticRegression(max_iter = 1000)
model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)


Accuracy: 0.8624579124579125

Confusion Matrix:
 [[4061   46  458]
 [ 187  172  504]
 [ 378   61 6013]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.89      0.88      4565
           1       0.62      0.20      0.30       863
           2       0.86      0.93      0.90      6452

    accuracy                           0.86     11880
   macro avg       0.79      0.67      0.69     11880
weighted avg       0.85      0.86      0.85     11880



**Observation**:<br>
Predicted |   0    |    1    |    2    |<br>
Actual <br>
------- 0 |  4061  |   46    |   458   |<br>
------- 1 |   187  |   172   |   504   |<br>
------- 2 |   378  |    61   |   6013  |<br>

---

### Cross Validation (Logistic Regression)

In [17]:
# Define the logistic regression model with desired hyperparameters
model = LogisticRegression(max_iter=1000)

# Define the cross-validation method, e.g., Stratified K-Fold with 5 folds
cv_method = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the scoring metric you want to use, e.g., accuracy
scoring_metric = make_scorer(accuracy_score)

# Perform cross-validation and get the accuracy scores
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=cv_method, scoring=scoring_metric)

# Print the accuracy scores for each fold
print("Cross-Validation Scores:", cv_scores)

# Calculate the mean accuracy across all folds
mean_accuracy = cv_scores.mean()
print("Mean Accuracy:", mean_accuracy)


Cross-Validation Scores: [0.85585017 0.85721801 0.85900673 0.86069024 0.86058502]
Mean Accuracy: 0.8586700336700337


**Observation:**
- CV scores are consistent
- Mean accuracy looks fair
- No significant difference between the training accuracy (0.8624) and CV mean accuracy (0.8587)

### HyperParameter Tuning (Logistic Regression)

In [None]:
preprocessing = Pipeline([
    ('scaler', StandardScaler()),
    # You can add other preprocessing steps here, if needed
])
model = LogisticRegression()

In [20]:
# Define the hyperparameters and their possible values to search:
param_grid = {
    'penalty': ['l1', 'l2'],  # Regularization penalty (L1 or L2)
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Inverse of regularization strength
    'solver': ['liblinear', 'lbfgs', 'saga'],  # Optimization algorithm
    'max_iter': [100, 500, 1000],  # Maximum number of iterations for optimization
}

In [21]:
grid_search = GridSearchCV(
    estimator=Pipeline([
        ('preprocessing', preprocessing),
        ('model', model)
    ]),
    param_grid=param_grid,
    scoring='accuracy',  # Use the accuracy metric for evaluation
    cv=cv_method,         # Use the same cross-validation method as before
    verbose=1,            # Increase verbosity for progress updates
    n_jobs=-1             # Use all available CPU cores
)


In [22]:
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 126 candidates, totalling 630 fits


In [None]:
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)

---

## 2) Ensemble Models

### 2.1) Random Forest

In [8]:
# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", confusion)
print("\nClassification Report:\n", classification_report_str)


Accuracy: 0.9651515151515152

Confusion Matrix:
 [[4453   12  100]
 [  37  659  167]
 [  69   29 6354]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      4565
           1       0.94      0.76      0.84       863
           2       0.96      0.98      0.97      6452

    accuracy                           0.97     11880
   macro avg       0.96      0.91      0.93     11880
weighted avg       0.96      0.97      0.96     11880



### Cross Validation (Random Forest Model)

In [None]:
# Define the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Define the cross-validation method (Stratified K-Fold with 5 folds)
cv_method = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the scoring metric (accuracy)
scoring_metric = make_scorer(accuracy_score)

# Perform cross-validation and get the accuracy scores
cv_scores_rf = cross_val_score(rf_model, X_train, y_train, cv=cv_method, scoring=scoring_metric)

# Print the accuracy scores for each fold
print("Cross-Validation Scores (Random Forest):", cv_scores_rf)

# Calculate the mean accuracy across all folds
mean_accuracy_rf = cv_scores_rf.mean()
print("Mean Accuracy (Random Forest):", mean_accuracy_rf)


### Hyperparameter Tuning (Random Forest)

In [None]:
# Define the parameter grid with hyperparameters to search
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Create a Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Create the Grid Search object
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, 
                               scoring='accuracy', cv=cv_method, n_jobs=-1)

# Perform the Grid Search
grid_search_rf.fit(X_train, y_train)

# Get the best hyperparameters from the search
best_params_rf = grid_search_rf.best_params_
print("Best Hyperparameters (Random Forest):", best_params_rf)

# Get the best cross-validated accuracy score
best_score_rf = grid_search_rf.best_score_
print("Best Cross-Validated Accuracy Score (Random Forest):", best_score_rf)

---

### 2.2) Gradient Boosting Classifier

In [9]:
# # Create an instance of the Gradient Boosting Classifier
# gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)

# # Fit the model to your training data
# gb_classifier.fit(X_train, y_train)

# # Make predictions on your test data
# y_pred = gb_classifier.predict(X_test)

# # Evaluate the model's performance
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# accuracy = accuracy_score(y_test, y_pred)
# conf_matrix = confusion_matrix(y_test, y_pred)
# classification_rep = classification_report(y_test, y_pred)

# # Print the results
# print("Accuracy:", accuracy)
# print("Confusion Matrix:\n", conf_matrix)
# print("Classification Report:\n", classification_rep)


Accuracy: 0.9741582491582491
Confusion Matrix:
 [[4442   18  105]
 [  23  762   78]
 [  68   15 6369]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      4565
           1       0.96      0.88      0.92       863
           2       0.97      0.99      0.98      6452

    accuracy                           0.97     11880
   macro avg       0.97      0.95      0.96     11880
weighted avg       0.97      0.97      0.97     11880



Lets try a few more ensembles

In [12]:
# AdaBoostClassifier
adaboost_model = AdaBoostClassifier(random_state=42)
adaboost_model.fit(X_train, y_train)
adaboost_predictions = adaboost_model.predict(X_test)

# Evaluate AdaBoostClassifier
print("\nAdaBoostClassifier Results:")
print("Accuracy:", accuracy_score(y_test, adaboost_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, adaboost_predictions))
print("Classification Report:\n", classification_report(y_test, adaboost_predictions))


AdaBoostClassifier Results:
Accuracy: 0.9606902356902357
Confusion Matrix:
 [[4411   22  132]
 [  66  700   97]
 [ 122   28 6302]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.96      4565
           1       0.93      0.81      0.87       863
           2       0.96      0.98      0.97      6452

    accuracy                           0.96     11880
   macro avg       0.95      0.92      0.93     11880
weighted avg       0.96      0.96      0.96     11880



In [15]:
# XGBClassifier
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)

# Evaluate XGBClassifier
print("\nXGBClassifier Results:")
print("Accuracy:", accuracy_score(y_test, xgb_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, xgb_predictions))
print("Classification Report:\n", classification_report(y_test, xgb_predictions))


XGBClassifier Results:
Accuracy: 0.9792929292929293
Confusion Matrix:
 [[4474   10   81]
 [  14  776   73]
 [  50   18 6384]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      4565
           1       0.97      0.90      0.93       863
           2       0.98      0.99      0.98      6452

    accuracy                           0.98     11880
   macro avg       0.98      0.96      0.97     11880
weighted avg       0.98      0.98      0.98     11880

