In [1]:
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge/master/data/'
    !pip install category_encoders==2.*
    !pip install pandas-profiling==2.*

# If you're working locally:
else:
    DATA_PATH = '../data/'

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

train = pd.merge(pd.read_csv(DATA_PATH+'../waterpumps/train_features.csv'), 
                 pd.read_csv(DATA_PATH+'waterpumps/train_labels.csv'))
test = pd.read_csv(DATA_PATH+'waterpumps/test_features.csv')
sample_submission = pd.read_csv(DATA_PATH+'waterpumps/sample_submission.csv')
#Split train into train and val
train,val = train_test_split(train, train_size = 0.80, test_size = 0.20,stratify=train['status_group'],random_state=45)

train.shape, val.shape,test.shape

FileNotFoundError: [Errno 2] File b'../data/../waterpumps/train_features.csv' does not exist: b'../data/../waterpumps/train_features.csv'

###Data Wrangling

In [None]:
import numpy as np

def wrangle(X):
    """Wrangle train, validate, and test sets in the same way"""
    #Prevent SettingWithCopyWarning
    X = X.copy()
    
    
    About 3% of the time, latitude has small values near zero,
    outside Tanzania, so we'll treat these values like zero.
    X['latitude'] = X['latitude'].replace(-2e-08, 0)
    
    # When columns have zeros and shouldn't, they are like null values.
    # So we will replace the zeros with nulls, and impute missing values later.
    cols_with_zeros = ['longitude', 'latitude']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
            
  
    #creating column year 
    X['year'] = pd.DatetimeIndex(X['date_recorded']).year
    
    #creating a new new column years between inspection
    X['construction_year'] = X['construction_year'].replace({0:2010})
    X['yrs_bw_insp'] = X['year'] - X['construction_year']  
    
    
    #to avoid error later,otherwise later when creating new features a positive will be divided by 0
    X['population'] = X['population'].replace({0:1}) 
    X['amount_tsh'] = X['amount_tsh'].replace({0:1})
    X['yrs_bw_insp'] = X['yrs_bw_insp'].replace({0:1})
    X['popu_per_year'] = X['population'] / X['yrs_bw_insp']
    X['water_per_person'] = X['amount_tsh']/X['population']
    
    
    # quantity & quantity_group are duplicates, so drop one
    X = X.drop(columns=['quantity_group'])
    
    #frequency encoding of imp features
    ae = X.groupby('lga').size()/len(X)
    X['lga'] = X['lga'].map(ae)

    be = X.groupby('waterpoint_type').size()/len(X)
    X['waterpoint_type'] = X['waterpoint_type'].map(ae)

    ce = X.groupby('extraction_type_class').size()/len(X)
    X['extraction_type_class'] = X['extraction_type_class'].map(ce)

    de =X.groupby('scheme_name').size()/len(X)
    X['scheme_name'] = X['scheme_name'].map(de)

    ee = X.groupby('funder').size()/len(X)
    X['funder'] = X['funder'].map(ee)

    fe =X.groupby('installer').size()/len(X)
    X['installer'] = X['installer'].map(fe)

    
    # return the wrangled dataframe
    return X

train = wrangle(train)
val = wrangle(val)
test = wrangle(test)

    

    

In [None]:
# The status_group column is the target
target = 'status_group'

X_train = train.drop(columns=target)
y_train = train[target]
X_val = val.drop(columns=target)
y_val = val[target]
X_test = test

In [None]:
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder


pipeline = make_pipeline(
    ce.ordinal.OrdinalEncoder(),
    SimpleImputer(),
    RandomForestClassifier(random_state = 42,n_estimators=100,max_depth=25,min_samples_leaf=6,n_jobs=-1)
)

pipeline.fit(X_train, y_train)
print('Train Accuracy', pipeline.score(X_train, y_train))

#score on val

print('Validation Accuracy', pipeline.score(X_val, y_val))

Train Accuracy 0.8635311447811448
Validation Accuracy 0.8034511784511784


In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

pipeline = make_pipeline(
      
       ce.ordinal.OrdinalEncoder(),
       SimpleImputer(strategy='median'),
       StandardScaler(),
       xgb.XGBClassifier(random_state=42,max_depth=10,learning_rate=0.1,colsample_bytree=0.9,n_estimators=100,n_jobs=-1)
      )

#fit on train
pipeline.fit(X_train,y_train)

#score on train
#should have been only X_train
print('Train Accuracy', pipeline.score(X_train, y_train))

#score on val
#should have been only X_val
print('Validation Accuracy', pipeline.score(X_val, y_val))

Train Accuracy 0.8945917508417508
Validation Accuracy 0.8031986531986532


In [None]:
#from sklearn.preprocessing import OrdinalEncoder
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn. ensemble import  VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

encoder =  ce.OneHotEncoder(use_cat_names=True)
imputer = SimpleImputer()
scaler = StandardScaler()


xb = xgb.XGBClassifier(random_state=42,max_depth=10,learning_rate=0.1,colsample_bytree=0.9,n_estimators=100,n_jobs=-1)
gb = GradientBoostingClassifier(random_state=42,n_estimators=100,min_samples_leaf=10, max_depth=15)
rf = RandomForestClassifier(random_state = 42,n_estimators=100,max_depth=25,min_samples_leaf=6,n_jobs=-1)
evc = VotingClassifier( estimators= [('xb',xb),('gb',gb),('rf',rf)], voting = 'soft')

X_train_encoded = encoder.fit_transform(X_train)
X_train_imputed = imputer.fit_transform(X_train_encoded)
X_train_scaled = scaler.fit_transform(X_train_imputed)

evc.fit(X_train_scaled, y_train)

X_val_encoded = encoder.transform(X_val)
X_val_imputed = imputer.transform(X_val_encoded)
X_val_scaled = scaler.transform(X_val_imputed)
print('Train Accuracy',evc.score(X_train_scaled,y_train))
print('Validation Accuracy', evc.score(X_val_scaled, y_val))

X_test_encoded = encoder.transform(X_test)
X_test_imputed = imputer.transform(X_test_encoded)
X_test_scaled = scaler.transform(X_test_imputed)
#y_pred = model.predict(X_test_scaled)