In [1]:
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge/master/data/'
    !pip install category_encoders==2.*
    !pip install pandas-profiling==2.*

# If you're working locally:
else:
    DATA_PATH = '../data/'

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Join the training features with their labels; with no explicit key,
# pandas merges on the columns the two files have in common.
train = pd.read_csv(DATA_PATH + 'waterpumps/train_features.csv').merge(
    pd.read_csv(DATA_PATH + 'waterpumps/train_labels.csv')
)
test = pd.read_csv(DATA_PATH + 'waterpumps/test_features.csv')
sample_submission = pd.read_csv(DATA_PATH + 'waterpumps/sample_submission.csv')

# Hold out 20% of the labelled rows for validation, stratified on
# status_group; random_state makes the split repeatable.
train, val = train_test_split(
    train,
    train_size=0.80,
    test_size=0.20,
    stratify=train['status_group'],
    random_state=45,
)

train.shape, val.shape, test.shape

FileNotFoundError: [Errno 2] File b'../data/waterpumps/train_features.csv' does not exist: b'../data/waterpumps/train_features.csv'

### Data Wrangling

In [None]:
import numpy as np

def wrangle(X):
    """Wrangle train, validate, and test sets in the same way.

    Works on a copy, so the frame passed in is left untouched.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw features. Must contain the columns ``longitude``, ``latitude``,
        ``construction_year``, ``gps_height``, ``population``,
        ``quantity_group``, ``payment_type``, ``recorded_by``, ``id`` and
        ``date_recorded``. Any other columns pass through unchanged.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy in which:

        * zeros in the five numeric columns listed in ``cols_with_zeros``
          are replaced by NaN, each with a boolean ``<col>_MISSING`` flag;
        * ``quantity_group``, ``payment_type``, ``recorded_by`` and ``id``
          are dropped;
        * ``date_recorded`` is replaced by ``year_recorded``,
          ``month_recorded`` and ``day_recorded``;
        * ``years`` (``year_recorded - construction_year``) and its
          ``years_MISSING`` flag are added.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """

    # Prevent SettingWithCopyWarning
    X = X.copy()

    # About 3% of the time, latitude has small values near zero,
    # outside Tanzania, so we'll treat these values like zero
    # (they are then nulled together with the real zeros below).
    X['latitude'] = X['latitude'].replace(-2e-08, 0)

    # When columns have zeros and shouldn't, they are like null values.
    # So we will replace the zeros with nulls, and impute missing values later.
    # Also create a "missing indicator" column, because the fact that
    # values are missing may be a predictive signal.
    cols_with_zeros = ['longitude', 'latitude', 'construction_year',
                       'gps_height', 'population']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
        X[col+'_MISSING'] = X[col].isnull()

    # Drop duplicate columns
    duplicates = ['quantity_group', 'payment_type']
    X = X.drop(columns=duplicates)

    # Drop recorded_by (never varies) and id (always varies, random)
    unusable_variance = ['recorded_by', 'id']
    X = X.drop(columns=unusable_variance)

    # Convert date_recorded to datetime. infer_datetime_format is deliberately
    # not passed: it is deprecated since pandas 2.0 (format inference is the
    # default there) and only affected parsing speed in older versions.
    X['date_recorded'] = pd.to_datetime(X['date_recorded'])

    # Extract components from date_recorded, then drop the original column
    X['year_recorded'] = X['date_recorded'].dt.year
    X['month_recorded'] = X['date_recorded'].dt.month
    X['day_recorded'] = X['date_recorded'].dt.day
    X = X.drop(columns='date_recorded')

    # Engineer feature: how many years from construction_year to date_recorded.
    # NaN wherever construction_year was missing, hence the extra flag.
    X['years'] = X['year_recorded'] - X['construction_year']
    X['years_MISSING'] = X['years'].isnull()

    # Return the wrangled dataframe
    return X

# Run every split through the same wrangling step.
train, val, test = [wrangle(frame) for frame in (train, val, test)]
    

In [None]:
# Preview the first five rows of the wrangled training frame.
train.head(n=5)

Unnamed: 0,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,water_quality,quality_group,quantity,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,longitude_MISSING,latitude_MISSING,construction_year_MISSING,gps_height_MISSING,population_MISSING,year_recorded,month_recorded,day_recorded,years,years_MISSING
49821,20.0,Wua,313.0,WU,38.329813,-6.148961,Mzee Malongwe,0,Wami / Ruvu,Komkwazu,Pwani,6,1,Bagamoyo,Mbwewe,80.0,True,WUA,Chalinze wate,True,2008.0,ksb,submersible,submersible,wua,user-group,pay per bucket,soft,good,enough,river,river/lake,surface,communal standpipe,communal standpipe,functional,False,False,False,False,False,2011,3,7,3.0,False
1158,0.0,World Vision,,DWE,31.816212,-1.216423,Karobora,0,Lake Victoria,Kaija,Kagera,18,2,Bukoba Rural,Buhendangabo,,True,VWC,,True,,gravity,gravity,gravity,vwc,user-group,never pay,soft,good,seasonal,spring,spring,groundwater,improved spring,improved spring,non functional,False,False,True,True,True,2011,7,29,,True
34468,0.0,Government Of Tanzania,1325.0,District council,37.623486,-3.134127,Kwa Damian,0,Pangani,Umarini,Kilimanjaro,3,1,Rombo,Kisale Msangara,1.0,True,Company,Kisale Kitale water supply,True,1977.0,gravity,gravity,gravity,company,commercial,pay monthly,soft,good,enough,spring,spring,groundwater,communal standpipe multiple,communal standpipe,non functional,False,False,False,False,False,2013,2,18,36.0,False
12617,250.0,Dwe,1130.0,TCRS Kibondo,30.595831,-4.502873,Kwa Johnbosco,0,Lake Tanganyika,Songambele,Kigoma,16,2,Kasulu,Kagera Nkanda,750.0,True,Water authority,,False,2009.0,india mark ii,india mark ii,handpump,vwc,user-group,pay monthly,salty,salty,enough,machine dbh,borehole,groundwater,hand pump,hand pump,non functional,False,False,False,False,False,2013,1,19,4.0,False
36123,0.0,Si,,SI,31.601649,-1.757962,Kwa Revelian,0,Lake Victoria,Kashenshero,Kagera,18,3,Muleba,Kashasha,,True,VWC,,True,,ksb,submersible,submersible,vwc,user-group,never pay,soft,good,enough,spring,spring,groundwater,communal standpipe,communal standpipe,non functional,False,False,True,True,True,2011,7,31,,True


In [None]:
# status_group is the column the models learn to predict.
target = 'status_group'

# Features are every column except the target; labels are the target alone.
X_train, y_train = train.drop(columns=target), train[target]
X_val, y_val = val.drop(columns=target), val[target]

# The test frame is used as-is.
X_test = test

In [None]:
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder


# Ordinal-encode the categorical columns, fill missing values with
# SimpleImputer's default strategy, then fit a random forest.
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(),
    RandomForestClassifier(
        random_state=42,
        n_estimators=100,
        max_depth=25,
        min_samples_leaf=6,
        n_jobs=-1,
    ),
)

pipeline.fit(X_train, y_train)

# Score on the data the pipeline was fitted on, then on the held-out
# validation split.
print('Train Accuracy', pipeline.score(X_train, y_train))
print('Validation Accuracy', pipeline.score(X_val, y_val))

Train Accuracy 0.8635311447811448
Validation Accuracy 0.8034511784511784


In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode, median-impute and standardise the features, then fit a
# gradient-boosted tree classifier.
# NOTE(review): recent xgboost releases appear to require integer-encoded
# class labels; y_train holds strings here - confirm against the installed version.
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    StandardScaler(),
    xgb.XGBClassifier(
        random_state=42,
        max_depth=10,
        learning_rate=0.1,
        colsample_bytree=0.9,
        n_estimators=100,
        n_jobs=-1,
    ),
)

# Fit on the training split.
pipeline.fit(X_train, y_train)

# score() takes both the features and the true labels to compare against.
print('Train Accuracy', pipeline.score(X_train, y_train))
print('Validation Accuracy', pipeline.score(X_val, y_val))

Train Accuracy 0.8945917508417508
Validation Accuracy 0.8031986531986532


In [None]:
#from sklearn.preprocessing import OrdinalEncoder
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn. ensemble import  VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

# Preprocessing steps, fitted on the training split only and then re-used
# for the validation and test splits.
# NOTE(review): one-hot encoding every categorical column may yield a very
# wide matrix if free-text columns (e.g. wpt_name, subvillage) are still
# present - confirm memory use or restrict the encoded columns.
encoder = ce.OneHotEncoder(use_cat_names=True)
imputer = SimpleImputer()
scaler = StandardScaler()

# Base learners for the ensemble.
xb = xgb.XGBClassifier(
    random_state=42,
    max_depth=10,
    learning_rate=0.1,
    colsample_bytree=0.9,
    n_estimators=100,
    n_jobs=-1,
)
gb = GradientBoostingClassifier(
    random_state=42,
    n_estimators=100,
    min_samples_leaf=10,
    max_depth=15,
)
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=25,
    min_samples_leaf=6,
    n_jobs=-1,
)

# Combine the three models with soft voting.
evc = VotingClassifier(
    estimators=[('xb', xb), ('gb', gb), ('rf', rf)],
    voting='soft',
)

# Train split: fit each preprocessing step and feed its output to the next.
X_train_encoded = encoder.fit_transform(X_train)
X_train_imputed = imputer.fit_transform(X_train_encoded)
X_train_scaled = scaler.fit_transform(X_train_imputed)

# Validation split: transform only, using the steps fitted above.
X_val_encoded = encoder.transform(X_val)
X_val_imputed = imputer.transform(X_val_encoded)
X_val_scaled = scaler.transform(X_val_imputed)

# Test split: transform only, using the steps fitted above.
X_test_encoded = encoder.transform(X_test)
X_test_imputed = imputer.transform(X_test_encoded)
X_test_scaled = scaler.transform(X_test_imputed)

evc.fit(X_train_scaled, y_train)

print('Train Accuracy', evc.score(X_train_scaled, y_train))
print('Validation Accuracy', evc.score(X_val_scaled, y_val))

# TODO: predict on the test split, e.g. y_pred = evc.predict(X_test_scaled)