# Importing Libraries

In [1]:
### Importing Libraries ###
import pandas as pd                                    # data science essentials
import numpy as np
import matplotlib.pyplot as plt                        # data visualization
import seaborn as sns                                  # enhanced data visualization
import statsmodels.formula.api as smf   
import statsmodels.api as sm                           # linear regression (statsmodels)
from sklearn.model_selection import train_test_split   # train/test split
import sklearn.linear_model                            # different linear models
import random as rand                                  # random number generation
from sklearn.neighbors import KNeighborsRegressor      # KNN for Regression
from sklearn.neighbors import KNeighborsClassifier     # KNN for classification
from sklearn.preprocessing import StandardScaler       # standard scaler
from sklearn.model_selection import RandomizedSearchCV # hyperparameter tuning
from sklearn.linear_model import LogisticRegression    # logistic regression
from sklearn.metrics import roc_auc_score              # auc score
from sklearn.metrics import confusion_matrix           # confusion matrix
from sklearn.metrics import make_scorer                # customizable scorer
from sklearn.tree import DecisionTreeClassifier        # classification trees
from sklearn.tree import export_graphviz               # exports graphics
from sklearn.ensemble import RandomForestClassifier    # random forest
from sklearn.ensemble import GradientBoostingClassifier# gbm
import time



In [2]:
### Pandas display settings and data loading ###

# Wall-clock reference point for timing the notebook run
start_time = time.time()

# Widen the pandas display limits so wide/long frames are shown in full
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 100)

# Column dtypes to enforce while reading the CSV files.
# Feature files (train and test): identifiers and codes are read as strings
dtypes_01 = dict(id            = str,
                 region_code   = str,
                 district_code = str)
# Outcome file (training labels)
dtypes_02 = dict(id = str)

# Locations of the CSV files
raw_train_data_path = 'https://raw.githubusercontent.com/RLanderosR/Pump_it_UP_Data_Mining_the_Water_Table_DrivenData/main/Train_Data.csv'
raw_train_outp_path = 'https://raw.githubusercontent.com/RLanderosR/Pump_it_UP_Data_Mining_the_Water_Table_DrivenData/main/Train_Outcome.csv'
raw_test_data_path  = 'https://raw.githubusercontent.com/RLanderosR/Pump_it_UP_Data_Mining_the_Water_Table_DrivenData/main/Test_Data.csv'

# Read the three files
t_data   = pd.read_csv(raw_train_data_path, dtype = dtypes_01)
out_data = pd.read_csv(raw_train_outp_path, dtype = dtypes_02)
test     = pd.read_csv(raw_test_data_path,  dtype = dtypes_01)

# Attach the expected outcome to the TRAINING features (joined on 'id')
train = t_data.merge(out_data, on = 'id')

# Parse 'date_recorded' as datetime in both frames
train['date_recorded'] = pd.to_datetime(train['date_recorded'])
test['date_recorded']  = pd.to_datetime(test['date_recorded'])

# Keep the test ids so they can be attached to the predictions later
test_id = test['id']


In [3]:
# # preparing explanatory variable data
# train_base_data   = train.drop(['id', 'amount_tsh', 'date_recorded', 'funder', 
#                            'gps_height', 'installer', 'wpt_name', 
#                            'num_private', 'basin', 'subvillage', 'region', 
#                            'region_code', 'district_code', 'lga', 'ward', 
#                            'recorded_by', 'scheme_management', 'scheme_name', 
#                            'permit', 'construction_year'],
#                            axis = 1)

# # preparing explanatory variable data
# train_data   = train_base_data.drop(['status_group'],
#                            axis = 1)

# Base modelling frame: the merged training data without the columns
# excluded from modelling (the status label is still present here)
train_base_data = train.drop(columns = ['id', 'date_recorded', 'funder',
                                        'gps_height', 'installer',
                                        'longitude', 'latitude',
                                        'wpt_name', 'subvillage', 'ward'])

# Explanatory variables only: same frame with the label column removed
train_data = train_base_data.drop(columns = ['status_group'])


# Non Functional vs Rest

In [4]:
# Data Preparation
# Binary response: 1 when the pump is 'non functional', 0 for every other
# status. A vectorized comparison replaces the row-by-row iterrows() loop;
# it yields the same 0/1 integer column without a Python-level pass over
# every row.
train['predict'] = (train['status_group'] == 'non functional').astype('int64')

# preparing response variables
train_target = train.loc[ : , 'predict']

# preparing training and testing sets (all letters are lowercase)
# NOTE: train_data is still the raw (non-dummified) feature frame here
x_train, x_test, y_train, y_test = train_test_split(
            train_data,
            train_target,
            test_size = 0.33,
            random_state = 219)

In [5]:
# List every explanatory column followed by '+', ready to paste into a
# statsmodels formula
for feature_name in train_data.columns:
    print(feature_name, '+')

amount_tsh +
num_private +
basin +
region +
region_code +
district_code +
lga +
population +
public_meeting +
recorded_by +
scheme_management +
scheme_name +
permit +
construction_year +
extraction_type +
extraction_type_group +
extraction_type_class +
management +
management_group +
payment +
payment_type +
water_quality +
quality_group +
quantity +
quantity_group +
source +
source_type +
source_class +
waterpoint_type +
waterpoint_type_group +


In [6]:
# # merging X_train and y_train so that they can be used in statsmodels
# train_t = pd.concat([x_train, y_train], axis = 1)


# # Step 1: build a model
# lm_best = smf.ols(formula =  """predict ~ longitude +
#                                     latitude +
#                                     population +
#                                     public_meeting +
#                                     extraction_type +
#                                     extraction_type_group +
#                                     extraction_type_class +
#                                     management +
#                                     management_group +
#                                     payment +
#                                     payment_type +
#                                     water_quality +
#                                     quality_group +
#                                     quantity +
#                                     quantity_group +
#                                     source +
#                                     source_type +
#                                     source_class +
#                                     waterpoint_type +
#                                     waterpoint_type_group""",
#                                 data = train_t)


# # Step 2: fit the model based on the data
# results = lm_best.fit()


# # Step 3: analyze the summary output
# print(results.summary())

# statsmodels' formula API needs predictors and response in one frame,
# so the training features and the training target are joined column-wise
train_t = pd.concat(objs = [x_train, y_train], axis = 'columns')


# Step 1: specify an OLS model of 'predict' on every explanatory column
lm_best = smf.ols(data    = train_t,
                  formula = """predict ~ amount_tsh +
                                    num_private +
                                    basin +
                                    region +
                                    region_code +
                                    district_code +
                                    lga +
                                    population +
                                    public_meeting +
                                    recorded_by +
                                    scheme_management +
                                    scheme_name +
                                    permit +
                                    construction_year +
                                    extraction_type +
                                    extraction_type_group +
                                    extraction_type_class +
                                    management +
                                    management_group +
                                    payment +
                                    payment_type +
                                    water_quality +
                                    quality_group +
                                    quantity +
                                    quantity_group +
                                    source +
                                    source_type +
                                    source_class +
                                    waterpoint_type +
                                    waterpoint_type_group""")


# Step 2: estimate the coefficients
results = lm_best.fit()


# Step 3: show the regression summary table
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                predict   R-squared:                       0.628
Model:                            OLS   Adj. R-squared:                  0.573
Method:                 Least Squares   F-statistic:                     11.27
Date:                Thu, 06 May 2021   Prob (F-statistic):               0.00
Time:                        20:28:13   Log-Likelihood:                -3511.6
No. Observations:               18945   AIC:                         1.197e+04
Df Residuals:                   16474   BIC:                         3.136e+04
Df Model:                        2470                                         
Covariance Type:            nonrobust                                         
                                                                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------

In [None]:
# One-hot encode the training features (every dummy column is prefixed 'd_')
train_cols = pd.get_dummies(data = train_data, prefix = 'd_')

# (rows, columns) after encoding
train_cols.shape

In [None]:
# One-hot encode the test features (all columns except 'id'), using the
# same 'd_' prefix as the training encoding
test_cols = pd.get_dummies(test.drop(columns = 'id'), prefix = 'd_')

# (rows, columns) after encoding
test_cols.shape

In [None]:
# Number of distinct values in each training feature column
train_data.nunique()

In [None]:
# Number of distinct values in each column of the test file
test.nunique()

In [None]:
# One-hot encode the explanatory variables, then remove one specific dummy
# column by name.
# NOTE(review): the reason this particular column is removed is not stated
# in the notebook - confirm before changing.
NF_data = (pd.get_dummies(train_data, prefix = 'd_')
             .drop(columns = 'd__other - mkulima/shinyanga'))

# Response variable: the binary 'predict' flag stored on the train frame
NF_target = train['predict']

# Hold out 25% of the rows for testing (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(
            NF_data,
            NF_target,
            test_size    = 0.25,
            random_state = 219)

In [None]:
# Tune, fit and evaluate the gradient boosting model for the
# "non functional vs rest" target.

# declaring a hyperparameter space
learn_space      = np.arange(0.1, 1.3, 0.1)
estimator_space  = np.arange(20, 100, 10)
depth_space      = np.arange(5, 9 , 1)
warm_start_space = [True, False]

# creating a hyperparameter grid
param_grid = {'learning_rate'    : learn_space,
              'max_depth'        : depth_space,
              'n_estimators'     : estimator_space,
              'warm_start'       : warm_start_space}

# INSTANTIATING the model object without hyperparameters
NF_full_gbm_grid = GradientBoostingClassifier(random_state = 219)


# RandomizedSearchCV object (10 random draws from the grid, 3-fold CV).
# scoring = 'roc_auc' ranks candidates on predicted scores. The previous
# make_scorer(roc_auc_score, needs_threshold = False) computed AUC from hard
# 0/1 labels, and the needs_threshold argument is no longer accepted by
# recent scikit-learn releases.
NF_full_gbm_cv = RandomizedSearchCV(estimator  = NF_full_gbm_grid,
                           param_distributions = param_grid,
                           cv                  = 3,
                           n_iter              = 10,
                           random_state        = 219,
                           scoring             = 'roc_auc')


# FITTING on the TRAINING split only. x_test / y_test are a subset of the
# full dataset, so fitting on the full dataset would leak the hold-out rows
# into the model and make the "testing" metrics below training metrics.
NF_full_gbm_cv.fit(x_train, y_train)


# printing the optimal parameters and best (cross-validated) score
print("Tuned Parameters  :", NF_full_gbm_cv.best_params_)
print("Tuned Training AUC:", round(NF_full_gbm_cv.best_score_, 4))

# the search refits the best candidate on the data it was given, so no
# extra FIT step is needed
NF_gbm_tuned = NF_full_gbm_cv.best_estimator_


# PREDICTING based on the testing set: hard labels for accuracy and the
# confusion matrix, class-1 probabilities for AUC
NF_gbm_tuned_pred  = NF_gbm_tuned.predict(x_test)
NF_gbm_tuned_proba = NF_gbm_tuned.predict_proba(x_test)[:, 1]

# SCORING the results
# built-in round() works for both numpy and plain Python floats
NF_gbm_tuned_train_acc = round(NF_gbm_tuned.score(x_train, y_train), 4)
NF_gbm_tuned_test_acc  = round(NF_gbm_tuned.score(x_test, y_test), 4)
NF_gbm_tuned_auc       = round(roc_auc_score(y_true  = y_test,
                                             y_score = NF_gbm_tuned_proba), 4)
NF_gbm_tuned_test_gap  = round(abs(NF_gbm_tuned_train_acc - NF_gbm_tuned_test_acc), 4)

# SCORING the results
print('Training ACCURACY:', NF_gbm_tuned_train_acc)
print('Testing  ACCURACY:', NF_gbm_tuned_test_acc)
print('Train-Test Gap   :', NF_gbm_tuned_test_gap)
print('AUC Score        :', NF_gbm_tuned_auc)

# unpacking the confusion matrix
# labels = [0, 1] guarantees a 2x2 matrix (four values to unpack) even if
# one class is absent from the hold-out labels and predictions
NF_gbm_tuned_tn, \
NF_gbm_tuned_fp, \
NF_gbm_tuned_fn, \
NF_gbm_tuned_tp = confusion_matrix(y_true = y_test,
                                   y_pred = NF_gbm_tuned_pred,
                                   labels = [0, 1]).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {NF_gbm_tuned_tn}
False Positives: {NF_gbm_tuned_fp}
False Negatives: {NF_gbm_tuned_fn}
True Positives : {NF_gbm_tuned_tp}
""")


# Functional Needs Repair vs Rest

In [None]:
# Data Preparation
# Binary response: 1 when the pump is 'functional needs repair', 0 for every
# other status. A vectorized comparison replaces the row-by-row iterrows()
# loop and produces the same 0/1 integer column.
train_base_data['predict'] = (
    train_base_data['status_group'] == 'functional needs repair'
).astype('int64')

# preparing explanatory variable data (label and response removed)
FNR_data = train_base_data.drop(columns = ['status_group', 'predict'])

# one-hot encode; every dummy column gets the 'd_' prefix
FNR_data = pd.get_dummies(FNR_data, prefix = 'd_')

# NOTE(review): the reason this particular dummy column is removed is not
# stated in the notebook - confirm before changing.
FNR_data = FNR_data.drop(columns = 'd__other - mkulima/shinyanga')

# preparing response variables
FNR_target = train_base_data.loc[ : , 'predict']

# preparing training and testing sets (all letters are lowercase)
x_train, x_test, y_train, y_test = train_test_split(
            FNR_data,
            FNR_target,
            test_size = 0.25,
            random_state = 219)

In [None]:
# Tune, fit and evaluate the gradient boosting model for the
# "functional needs repair vs rest" target.

# declaring a hyperparameter space
learn_space      = np.arange(0.1, 1.3, 0.1)
estimator_space  = np.arange(20, 100, 10)
depth_space      = np.arange(5, 9 , 1)
warm_start_space = [True, False]

# creating a hyperparameter grid
param_grid = {'learning_rate'    : learn_space,
              'max_depth'        : depth_space,
              'n_estimators'     : estimator_space,
              'warm_start'       : warm_start_space}

# INSTANTIATING the model object without hyperparameters
FNR_full_gbm_grid = GradientBoostingClassifier(random_state = 219)


# RandomizedSearchCV object (10 random draws from the grid, 3-fold CV).
# scoring = 'roc_auc' ranks candidates on predicted scores. The previous
# make_scorer(roc_auc_score, needs_threshold = False) computed AUC from hard
# 0/1 labels, and the needs_threshold argument is no longer accepted by
# recent scikit-learn releases.
FNR_full_gbm_cv = RandomizedSearchCV(estimator = FNR_full_gbm_grid,
                           param_distributions = param_grid,
                           cv                  = 3,
                           n_iter              = 10,
                           random_state        = 219,
                           scoring             = 'roc_auc')


# FITTING on the TRAINING split only. x_test / y_test are a subset of the
# full dataset, so fitting on the full dataset would leak the hold-out rows
# into the model and make the "testing" metrics below training metrics.
FNR_full_gbm_cv.fit(x_train, y_train)


# printing the optimal parameters and best (cross-validated) score
print("Tuned Parameters  :", FNR_full_gbm_cv.best_params_)
print("Tuned Training AUC:", round(FNR_full_gbm_cv.best_score_, 4))

# the search refits the best candidate on the data it was given, so no
# extra FIT step is needed
FNR_gbm_tuned = FNR_full_gbm_cv.best_estimator_


# PREDICTING based on the testing set: hard labels for accuracy and the
# confusion matrix, class-1 probabilities for AUC
FNR_gbm_tuned_pred  = FNR_gbm_tuned.predict(x_test)
FNR_gbm_tuned_proba = FNR_gbm_tuned.predict_proba(x_test)[:, 1]

# SCORING the results
# built-in round() works for both numpy and plain Python floats
FNR_gbm_tuned_train_acc = round(FNR_gbm_tuned.score(x_train, y_train), 4)
FNR_gbm_tuned_test_acc  = round(FNR_gbm_tuned.score(x_test, y_test), 4)
FNR_gbm_tuned_auc       = round(roc_auc_score(y_true  = y_test,
                                              y_score = FNR_gbm_tuned_proba), 4)
FNR_gbm_tuned_test_gap  = round(abs(FNR_gbm_tuned_train_acc - FNR_gbm_tuned_test_acc), 4)

# SCORING the results
print('Training ACCURACY:', FNR_gbm_tuned_train_acc)
print('Testing  ACCURACY:', FNR_gbm_tuned_test_acc)
print('Train-Test Gap   :', FNR_gbm_tuned_test_gap)
print('AUC Score        :', FNR_gbm_tuned_auc)

# unpacking the confusion matrix
# labels = [0, 1] guarantees a 2x2 matrix (four values to unpack) even if
# one class is absent from the hold-out labels and predictions
FNR_gbm_tuned_tn, \
FNR_gbm_tuned_fp, \
FNR_gbm_tuned_fn, \
FNR_gbm_tuned_tp = confusion_matrix(y_true = y_test,
                                    y_pred = FNR_gbm_tuned_pred,
                                    labels = [0, 1]).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {FNR_gbm_tuned_tn}
False Positives: {FNR_gbm_tuned_fp}
False Negatives: {FNR_gbm_tuned_fn}
True Positives : {FNR_gbm_tuned_tp}
""")



In [None]:
# Data Preparation
# Binary response: 1 when the pump is 'functional', 0 for every other
# status. This overwrites any 'predict' column already present on
# train_base_data. A vectorized comparison replaces the row-by-row
# iterrows() loop and produces the same 0/1 integer column.
train_base_data['predict'] = (
    train_base_data['status_group'] == 'functional'
).astype('int64')

# preparing explanatory variable data (label and response removed)
F_data = train_base_data.drop(columns = ['status_group', 'predict'])

# one-hot encode; every dummy column gets the 'd_' prefix
F_data = pd.get_dummies(F_data, prefix = 'd_')

# NOTE(review): the reason this particular dummy column is removed is not
# stated in the notebook - confirm before changing.
F_data = F_data.drop(columns = 'd__other - mkulima/shinyanga')

# preparing response variables
F_target = train_base_data.loc[ : , 'predict']

# preparing training and testing sets (all letters are lowercase)
x_train, x_test, y_train, y_test = train_test_split(
            F_data,
            F_target,
            test_size = 0.25,
            random_state = 219)

In [None]:
# Tune, fit and evaluate the gradient boosting model for the
# "functional vs rest" target.

# declaring a hyperparameter space
learn_space      = np.arange(0.1, 1.3, 0.1)
estimator_space  = np.arange(20, 100, 10)
depth_space      = np.arange(5, 9 , 1)
warm_start_space = [True, False]

# creating a hyperparameter grid
param_grid = {'learning_rate'    : learn_space,
              'max_depth'        : depth_space,
              'n_estimators'     : estimator_space,
              'warm_start'       : warm_start_space}

# INSTANTIATING the model object without hyperparameters
F_full_gbm_grid = GradientBoostingClassifier(random_state = 219)


# RandomizedSearchCV object (10 random draws from the grid, 3-fold CV).
# scoring = 'roc_auc' ranks candidates on predicted scores. The previous
# make_scorer(roc_auc_score, needs_threshold = False) computed AUC from hard
# 0/1 labels, and the needs_threshold argument is no longer accepted by
# recent scikit-learn releases.
F_full_gbm_cv = RandomizedSearchCV(estimator = F_full_gbm_grid,
                           param_distributions = param_grid,
                           cv                  = 3,
                           n_iter              = 10,
                           random_state        = 219,
                           scoring             = 'roc_auc')


# FITTING on the TRAINING split only. x_test / y_test are a subset of the
# full dataset, so fitting on the full dataset would leak the hold-out rows
# into the model and make the "testing" metrics below training metrics.
F_full_gbm_cv.fit(x_train, y_train)


# printing the optimal parameters and best (cross-validated) score
print("Tuned Parameters  :", F_full_gbm_cv.best_params_)
print("Tuned Training AUC:", round(F_full_gbm_cv.best_score_, 4))

# the search refits the best candidate on the data it was given, so no
# extra FIT step is needed
F_gbm_tuned = F_full_gbm_cv.best_estimator_


# PREDICTING based on the testing set: hard labels for accuracy and the
# confusion matrix, class-1 probabilities for AUC
F_gbm_tuned_pred  = F_gbm_tuned.predict(x_test)
F_gbm_tuned_proba = F_gbm_tuned.predict_proba(x_test)[:, 1]

# SCORING the results
# built-in round() works for both numpy and plain Python floats
F_gbm_tuned_train_acc = round(F_gbm_tuned.score(x_train, y_train), 4)
F_gbm_tuned_test_acc  = round(F_gbm_tuned.score(x_test, y_test), 4)
F_gbm_tuned_auc       = round(roc_auc_score(y_true  = y_test,
                                            y_score = F_gbm_tuned_proba), 4)
F_gbm_tuned_test_gap  = round(abs(F_gbm_tuned_train_acc - F_gbm_tuned_test_acc), 4)

# SCORING the results
print('Training ACCURACY:', F_gbm_tuned_train_acc)
print('Testing  ACCURACY:', F_gbm_tuned_test_acc)
print('Train-Test Gap   :', F_gbm_tuned_test_gap)
print('AUC Score        :', F_gbm_tuned_auc)

# unpacking the confusion matrix
# labels = [0, 1] guarantees a 2x2 matrix (four values to unpack) even if
# one class is absent from the hold-out labels and predictions
F_gbm_tuned_tn, \
F_gbm_tuned_fp, \
F_gbm_tuned_fn, \
F_gbm_tuned_tp = confusion_matrix(y_true = y_test,
                                  y_pred = F_gbm_tuned_pred,
                                  labels = [0, 1]).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {F_gbm_tuned_tn}
False Positives: {F_gbm_tuned_fp}
False Negatives: {F_gbm_tuned_fn}
True Positives : {F_gbm_tuned_tp}
""")




# Results

In [None]:
# Side-by-side view of the three models' hold-out predictions next to the
# true status of the same pumps.
#
# The prediction arrays come from .predict(x_test) and carry no row labels.
# Wrapping them in DataFrames with a default 0..n-1 index and concatenating
# with the full train frame would pair each prediction with one of the
# FIRST n training rows instead of the row it was made for. Every split in
# this notebook that produced these arrays uses the same test_size and
# random_state on frames with the same rows, so the current x_test.index
# identifies the hold-out rows for all three arrays.
holdout_index = x_test.index

# Guard against the arrays and the current split drifting apart
# (e.g. cells run out of order)
assert len(holdout_index) == len(NF_gbm_tuned_pred) \
                          == len(FNR_gbm_tuned_pred) \
                          == len(F_gbm_tuned_pred), \
       "hold-out predictions do not match the current x_test split"

NF_predictions = pd.DataFrame({'Non Funtional' : NF_gbm_tuned_pred},
                              index = holdout_index)

FNR_predictions = pd.DataFrame({'Funtional Needs Repair' : FNR_gbm_tuned_pred},
                               index = holdout_index)

F_predictions = pd.DataFrame({'Funtional' : F_gbm_tuned_pred},
                             index = holdout_index)

# Only the hold-out rows are shown; all pieces share the same index, so
# the column-wise concat lines up row for row
all_predictions = pd.concat([train.loc[holdout_index, 'id'],
                             train.loc[holdout_index, 'status_group'],
                             NF_predictions,
                             FNR_predictions,
                             F_predictions],
                               axis = 1)

all_predictions

# Actual Predictions on the Competition Test Set

In [None]:
# Preparing the data to be forecasted
#
# The fitted models expect exactly the feature matrix they were trained on:
# the dummy encoding of train_data minus one dropped dummy column. The test
# frame therefore has to (1) start from the same raw columns as train_data
# and (2) be encoded with the TRAINING category levels, so that it yields
# the same dummy columns in the same order. Encoding the test frame on its
# own would create columns only for the levels that happen to occur in the
# test file.

# Columns pd.get_dummies encodes on the training side (text-like dtypes)
encoded_cols = train_data.select_dtypes(
    include = ['object', 'string', 'category']).columns

# Pin each encoded column to the training levels. Values that never occurred
# in training become missing and get all-zero dummies; training levels that
# are absent from the test file still get their (all-zero) column.
training_levels = {
    col : pd.CategoricalDtype(pd.Categorical(train_data[col]).categories)
    for col in encoded_cols
}

# Same raw columns, same order as the training features
df_test = test.loc[:, train_data.columns].astype(training_levels)

df_test = pd.get_dummies(df_test, prefix = 'd_')

# Mirror the dummy column removed from the training matrices
df_test = df_test.drop(columns = 'd__other - mkulima/shinyanga')

# Fail early with a clear message instead of a feature-mismatch error (or a
# silent misalignment) inside predict()
if list(df_test.columns) != list(NF_data.columns):
    raise ValueError("encoded test features do not match the training "
                     "feature matrix")

NF_pred  = NF_gbm_tuned.predict(df_test)

FNR_pred = FNR_gbm_tuned.predict(df_test)

F_pred   = F_gbm_tuned.predict(df_test)

# One row per test id with the three binary model outputs
data = {'id' : test_id,
        'NF_pred'  : NF_pred,
        'FNR_pred' : FNR_pred,
        'F_pred'   : F_pred,
       }

df_response = pd.DataFrame(data)

In [None]:
# Display the frame holding the test ids and the three models' predictions
df_response

In [None]:
# Creating flags to enable data exploration
#
# Each flag is 1 when every listed model predicted 1 for that row, else 0.
# The comparisons are done column-wise. The previous row loops used
# `a == 1 & b == 1`, which Python parses as the chained comparison
# `a == (1 & b) == 1`; that only gave the intended answer because the
# predictions are 0/1 integers.

# all three models fired
df_response['all_true'] = (df_response[['NF_pred', 'FNR_pred', 'F_pred']]
                           .eq(1).all(axis = 1).astype('int64'))

# "non functional" and "functional needs repair" both fired
df_response['NF_FNR'] = (df_response[['NF_pred', 'FNR_pred']]
                         .eq(1).all(axis = 1).astype('int64'))

# "non functional" and "functional" both fired
df_response['NF_F'] = (df_response[['NF_pred', 'F_pred']]
                       .eq(1).all(axis = 1).astype('int64'))

# "functional needs repair" and "functional" both fired
df_response['FNR_F'] = (df_response[['FNR_pred', 'F_pred']]
                        .eq(1).all(axis = 1).astype('int64'))

In [None]:
# Rows where all three models predicted 1 (single .loc lookup instead of
# chained indexing)
df_response.loc[df_response['all_true'] == 1, 'all_true']

In [None]:
## 0.7902 Score Submission

# Creating a copy to explore the data
explore = df_response.copy()

# Translate the binary model outputs into a single status per pump.
# np.select takes the FIRST matching condition, so the list is ordered by
# priority. The overlap rules come first because they override the
# single-model rules; rows matching nothing are 'functional'. This
# replaces two row-by-row iterrows() passes with the same outcome.
explore['status_group'] = np.select(
    [
        explore['all_true'] == 1,   # all three models fired
        explore['NF_FNR']   == 1,   # non functional + needs repair
        explore['NF_F']     == 1,   # non functional + functional
        explore['FNR_F']    == 1,   # needs repair + functional
        explore['NF_pred']  == 1,   # only the non-functional model fired
        explore['FNR_pred'] == 1,   # only the needs-repair model fired
    ],
    [
        'functional',
        'functional needs repair',
        'non functional',
        'functional',
        'non functional',
        'functional needs repair',
    ],
    default = 'functional')


sumbission_test = explore.copy()

# Keep only 'id' and 'status_group' for the submission file
sumbission_test = sumbission_test.drop(columns = ['NF_pred', 'FNR_pred',
                                                  'F_pred', 'all_true',
                                                  'NF_FNR', 'NF_F', 'FNR_F'])

sumbission_test.to_csv('pump_it_RLR_logic.csv', index=False)

In [None]:
## 0.7861 Score Submission


## Creating a column to translate the binaries into the actual statuses
# df_response['status_group'] = 'functional'

# for index, value in df_response.iterrows():    

#     # loop to get the unique choice
#     if   df_response.loc[index, 'NF_pred'] == 1:
#         df_response.loc[index, 'status_group'] = 'non functional'
        
#     elif df_response.loc[index, 'FNR_pred'] == 1:
#         df_response.loc[index, 'status_group'] = 'functional needs repair'
        
# sumbission = df_response.copy()

# sumbission = sumbission.drop(columns = ['NF_pred', 'FNR_pred', 'F_pred'],
#                              axis = 1)

# sumbission.to_csv('pump_it_RLR.csv', index=False)

In [None]:
Overall_Score = 