# Tanzanian Ministry of Water Dataset Modeling

**Import libraries**

In [778]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import plotly.express as px


**Import datasets**

In [761]:
y = pd.read_csv('dependent_vars.csv')
X = pd.read_csv('independent_vars.csv')
X_test = pd.read_csv('independent_test.csv')
SF = pd.read_csv('SubmissionFormat.csv')

### Creating a Baseline

In [762]:
y['status_group'].value_counts(normalize=True)

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

### Beginning modeling pipline

**Create training, validation, and final test datasets**

In [763]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42)

**Creating a function to handel preprocessing because sklearn is annoying me**

In [764]:
def data_preprocesser(X, y):
    # Transforming Target
    y.drop('id', axis=1, inplace=True)
    le = LabelEncoder()
    y = le.fit_transform(y)
    y = pd.DataFrame(y, columns=['status_group'])

    # Transfroming Features
    drop_features = ['extraction_type', 'extraction_type_group', 'waterpoint_type_group', 
                     'source', 'source_type', 'quantity_group', 'water_quality', 'payment_type', 
                     'management', 'region', 'district_code', 'num_private', 'wpt_name', 'ward', 
                     'recorded_by', 'funder', 'installer', 'subvillage', 'scheme_management', 'scheme_name']
    X.drop(drop_features, axis=1, inplace=True)

    # revealing the nan values
    X.replace(0, np.nan, inplace=True)
    X.replace(-2.000000e-08, np.nan, inplace=True)
    X.replace('unknown', np.nan, inplace=True)

    # Impoting numeric features
    numeric_features = ['amount_tsh', 'gps_height', 'longitude', 'latitude', 
                    'region_code', 'population', 'public_meeting', 'permit', 'construction_year']

    imputer = KNNImputer(n_neighbors=2)
    X[numeric_features] = imputer.fit_transform(X[numeric_features])


    # Imputing Categorical variables
    categorical_features = ['basin', 'lga', 'extraction_type_class', 'management_group', 
                            'payment', 'quality_group', 'quantity', 'source_class', 'waterpoint_type']

    # Label encoding with a trick to keep nan values
    X[categorical_features] = X[categorical_features].apply(lambda series: pd.Series(
            LabelEncoder().fit_transform(series[series.notnull()]),
            index=series[series.notnull()].index
            ))


    imputer = IterativeImputer()
    X[categorical_features] = imputer.fit_transform(X[categorical_features])


    # Feature Engineering DateTime
    X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
    X['year_recorded'] = X['date_recorded'].dt.year
    X['month_rec'] = X['date_recorded'].dt.month
    X['day_rec'] = X['date_recorded'].dt.day

    days_in_a_month = 31 # can potentially be done better (28, 30, 31)
    months_in_a_year = 12

    # Sin
    X['sin_day'] = np.sin((X.day_rec-1)*(2*np.pi/days_in_a_month))
    X['sin_month'] = np.sin((X.month_rec-1)*(2*np.pi/months_in_a_year))

    # Cosine
    X['cos_day'] = np.cos((X.day_rec-1)*(2*np.pi/days_in_a_month))
    X['cos_month'] = np.cos((X.month_rec-1)*(2*np.pi/months_in_a_year))

    # Engineering years in service
    X['years_in_service'] = X['year_recorded'] - X['construction_year']

    # Dropping unneeded features
    X.drop(['id'], axis=1, inplace=True)
    X.drop('date_recorded', axis=1, inplace=True)
    X.drop('construction_year', axis=1, inplace=True)
    X.drop('year_recorded', axis=1, inplace=True)
    X.drop('month_rec', axis=1, inplace=True)
    X.drop('day_rec', axis=1, inplace=True)


    return X, y

**Creating pipeline**

**Processing Data for modeling**

In [None]:
X_train, y_train = data_preprocesser(X_train, y_train)
X_val, y_val = data_preprocesser(X_val, y_val)

In [782]:
trees_rand = RandomForestClassifier(n_estimators=100)
trees_rand.fit(X_train, y_train)
print("Random forest model score: %.3f" % trees_rand.score(X_val, y_val))


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



Random forest model score: 0.804


In [774]:
rfdtmportances = pd.DataFrame(trees_rand.feature_importances_, X_train.columns, columns=['value'])

In [776]:
rfdtmportances = pd.DataFrame(trees_rand.feature_importances_, X_train.columns, columns=['value'])

rf_importances = pd.DataFrame(trees_rand.feature_importances_, X_train.columns, columns=['value'])

rf_importances.reset_index(inplace=True)

rf_importances = rf_importances.sort_values(by='value', ascending=True)

fig = px.bar(y=rf_importances['index'], x=rf_importances['value'], width=600, height=1000, title="Random Forest Feature Importance")
fig.update_xaxes(range=[0, 0.5])
fig.show()

In [781]:
xg_classifier = XGBClassifier(n_estimators=1000, random_state=42, eval_metric='merror')

xg_classifier.fit(X_train,y_train)

print('Validation Accuracy: Adaboost', xg_classifier.score(X_val, y_val))

Validation Accuracy: Adaboost 0.7901683501683502
