# Baseline model

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder,StandardScaler , RobustScaler , LabelEncoder
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.metrics import ConfusionMatrixDisplay, classification_report ,plot_confusion_matrix , confusion_matrix
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline



# set seaborn display
sns.set_theme(context='talk', style='white')

#### load clean data set

    load test and clean feature data set

In [2]:
features =pd.read_csv('../Data/clean_data.csv' , index_col=0)
test_data = pd.read_csv('../Data/Test_set_values.csv')

In [3]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50142 entries, 0 to 59399
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             16558 non-null  float64
 1   funder                 50142 non-null  object 
 2   gps_height             34219 non-null  float64
 3   longitude              50142 non-null  float64
 4   latitude               50142 non-null  float64
 5   num_private            718 non-null    float64
 6   basin                  50142 non-null  object 
 7   region_code            50142 non-null  int64  
 8   district_code          50142 non-null  int64  
 9   lga                    50142 non-null  object 
 10  population             33264 non-null  float64
 11  public_meeting         50142 non-null  bool   
 12  permit                 50142 non-null  bool   
 13  extraction_type        50142 non-null  object 
 14  extraction_type_class  50142 non-null  object 
 15  ma

In [4]:
features.status_group.value_counts(normalize=True)

functional                 0.547146
non functional             0.386622
functional needs repair    0.066232
Name: status_group, dtype: float64

### Target
our target column is status_group
- 0 Funcional
- 1 Functional , needs repair
- 2 non functional

Class imbalance is observed, where 'functional' and 'non-functional' are balanced, but the 'functional needs repair' category constitutes only 6.5%

In [5]:
#casting the data type of region and district code to object , they needed to be catagorical
features['region_code'] = features['region_code'].astype('O')
features['district_code'] = features['district_code'].astype('O')

In [6]:
#numerical and catagorical columns
cat_columns=list(features.select_dtypes(include='O').columns)
num_columns=list(features.select_dtypes(exclude='O').columns)

In [7]:
cat_columns.remove('status_group')

In [8]:
len(cat_columns)

14

In [9]:
len(num_columns)

9

In [10]:
# create features and target
X = features.drop('status_group', axis=1)
y = features.status_group

encoder = LabelEncoder()
y_dummies = encoder.fit_transform(y)


# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_dummies, test_size=0.2, random_state=42)

NameError: name 'LabelEncoder' is not defined

In [None]:
y_dummies

### Column Transform

In [None]:
num_pipeline = Pipeline(steps=[
    ('num_impute', SimpleImputer(strategy='median' ,add_indicator=True)),
    ('ss', StandardScaler())
])

cat_pipeline = Pipeline(steps=[
    ('impute_cat', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore',sparse=True))
])

In [None]:
col_trans=ColumnTransformer(transformers=[
    ('nums', num_pipeline, num_columns),
    ('cats', cat_pipeline, cat_columns)
], remainder='passthrough', n_jobs=-1)

In [None]:
np.bincount(y_train)

In [None]:
dummy_pipeline = Pipeline([
    ('col_trans', col_trans),
    ('dummyC', DummyClassifier())
])

dummy_pipeline.fit(X_train, y_train)

In [None]:
dummy_pipeline.score(X_train , y_train) , dummy_pipeline.score(X_test , y_test)

In [None]:
y_pred_train = dummy_pipeline.predict(X_train)
cm = confusion_matrix(y_train, y_pred_train)

# Display the confusion matrix using ConfusionMatrixDisplay
fig, ax = plt.subplots(figsize=(10, 10))
plt.title('Dummy Model Confusion Matrix', fontsize=20)

ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=dummy_pipeline.classes_).plot(ax=ax)
plt.show()

### Logistic regression

In [None]:
# first simple model
logreg_pipe = Pipeline([
    ('colt', col_trans),
    ('logreg', LogisticRegression(C=0.1 ,n_jobs=-1))
])

logreg_pipe.fit(X_train, y_train)

In [None]:
logreg_pipe.score(X_train , y_train)

In [None]:
y_pred_train = logreg_pipe.predict(X_train)
cm = confusion_matrix(y_train, y_pred_train)

# Display the confusion matrix using ConfusionMatrixDisplay
fig, ax = plt.subplots(figsize=(10, 10))
plt.title('Dummy Model Confusion Matrix', fontsize=20)

ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=dummy_pipeline.classes_).plot(ax=ax)
plt.show()

#### handling imbalance

In [345]:
smote_pipeline = Pipeline([
    ('colt', col_trans),
    ('logreg', LogisticRegression(C=0.1,n_jobs=-1))
])

smote_pipeline.fit(X_train, y_train)

Pipeline(steps=[('colt',
                 ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                   transformers=[('nums',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer(add_indicator=True,
                                                                                 strategy='median')),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'longitude', 'latitude',
                                                   'num_private', 'population',
                                                   'public_meeting', 'permit',
                                                   'age_waterpoint']),
                

In [344]:
smote_pipe.score(X_train, y_train), smote_pipe.score(X_test, y_test)

(0.6772866651708922, 0.6762389071692093)

In [352]:
logreg_pipe2 = Pipeline([
    ('colt', col_trans),
    ('logreg', LogisticRegression(class_weight='balanced' ,C=0.1 ,n_jobs=-1))
])

logreg_pipe2.fit(X_train, y_train)

Pipeline(steps=[('colt',
                 ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                   transformers=[('nums',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer(add_indicator=True,
                                                                                 strategy='median')),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'longitude', 'latitude',
                                                   'num_private', 'population',
                                                   'public_meeting', 'permit',
                                                   'age_waterpoint']),
                

In [357]:
scores = cross_val_score(logreg_pipe2, X_train, y_train, cv=5, scoring='accuracy')

In [355]:
scores.mean()

0.6636006248925389

In [360]:
logreg_pipe2.score(X_train , y_train)

0.6744197641662304

In [359]:
scores2

array([nan, nan, nan, nan, nan])

In [362]:
logreg_pipe3 = Pipeline([
    ('colt', col_trans),
    ('logreg', RandomForestClassifier(n_estimators=100, random_state=42))
])

logreg_pipe3.fit(X_train, y_train)

Pipeline(steps=[('colt',
                 ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                   transformers=[('nums',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer(add_indicator=True,
                                                                                 strategy='median')),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'longitude', 'latitude',
                                                   'num_private', 'population',
                                                   'public_meeting', 'permit',
                                                   'age_waterpoint']),
                

In [366]:
logreg_pipe3.score(X_train , y_train) , logreg_pipe3.score(X_test , y_test)

(0.999875352130232, 0.8005783228636952)

In [369]:
colt = logreg_pipe3.named_steps['colt']
feature_names = colt.transformers_[0][2]

In [370]:
rf_classifier = logreg_pipe3.named_steps['logreg']
feature_importances = rf_classifier.feature_importances_

In [373]:
print("Feature Importances:")
for feature, importance in zip(feature_names, feature_importances):
    print(f"{feature}: {importance*100}")

Feature Importances:
amount_tsh: 1.6583314040544004
gps_height: 6.565780833400662
longitude: 12.257763464523139
latitude: 12.108806641256724
num_private: 0.11178770513310299
population: 4.622998427546373
public_meeting: 0.6064478655131131
permit: 0.7122565230129779
age_waterpoint: 4.716826568605567


In [374]:
scores = cross_val_score(logreg_pipe3, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validated Accuracy Scores:", scores)
print("Mean Accuracy:", scores.mean())

Cross-validated Accuracy Scores: [0.79882837 0.80381403 0.80294154 0.80204438 0.80354026]
Mean Accuracy: 0.8022337176777323
