<a href="https://colab.research.google.com/github/StevenWestmoreland/DS-Unit-2-Kaggle-Challenge/blob/master/DSPT6_U2_S2_KaggleContest_DecisionTreeAssignment_StevenWestmoreland.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# setup
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge/master/data/'
    !pip install category_encoders==2.*
    !pip install pandas-profiling==2.*

# If you're working locally:
else:
    DATA_PATH = '../data/'



In [68]:
# Load data

import pandas as pd
from sklearn.model_selection import train_test_split

train = pd.merge(pd.read_csv(DATA_PATH+'waterpumps/train_features.csv'), 
                 pd.read_csv(DATA_PATH+'waterpumps/train_labels.csv'))
test = pd.read_csv(DATA_PATH+'waterpumps/test_features.csv')
sample_submission = pd.read_csv(DATA_PATH+'waterpumps/sample_submission.csv')

# Split train into train & val
train, val = train_test_split(train, train_size=0.80, test_size=0.20, 
                              stratify=train['status_group'], random_state=55)

train.shape, val.shape, test.shape

((47520, 41), (11880, 41), (14358, 40))

In [69]:
train['status_group'].value_counts(normalize=True)

functional                 0.543077
non functional             0.384238
functional needs repair    0.072685
Name: status_group, dtype: float64

In [70]:
train[['longitude', 'latitude']].describe()

Unnamed: 0,longitude,latitude
count,47520.0,47520.0
mean,34.088321,-5.708571
std,6.550276,2.946063
min,0.0,-11.64944
25%,33.095164,-8.540622
50%,34.911251,-5.015265
75%,37.173991,-3.327294
max,40.345193,-2e-08


In [0]:
import numpy as np

def wrangle(X):
    """Wrangle train, validate, and test sets in the same way"""
    
    # Prevent SettingWithCopyWarning
    X = X.copy()
    
    # Latitude has small values near zero, treat these values like zero.
    X['latitude'] = X['latitude'].replace(-2e-08, 0)
    
    # When columns have zeros and shouldn't, they are like null values.
    # So we will replace the zeros with nulls, and impute missing values later.
    cols_with_zeros = ['longitude', 'latitude']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
            
    # quantity & quantity_group are duplicates, so drop one.
    # same with extraction_type and extration_type_group, and payment/payment_type.
    # source and source_type are close enough that we can also choose only one of them.
    # recorded_by is constant, so drop it as well.
    # region and region_code represent the same thing, just categorically or
    # numerically respectively. We can lose one.
    # num_private is 98.8% zeros. Let's drop that too
    X = X.drop(columns=['quantity_group','extraction_type_group',
                        'payment_type','source_type','recorded_by','region_code',
                        'num_private'])
    
    # Extract year from date_recorded
    X['year_recorded'] = pd.to_datetime(X['date_recorded']).dt.year

    # Determine age of water pump
    X['age'] = X['year_recorded'] - X['construction_year']

    # Return the wrangled dataframe
    return X


train = wrangle(train)
val = wrangle(val)
test = wrangle(test)

In [73]:
train.shape, val.shape, test.shape

((47520, 36), (11880, 36), (14358, 35))

In [74]:
# The status_group column is the target
target = 'status_group'

# Get a dataframe with all train columns except the target & id
train_features = train.drop(columns=[target, 'id'])

# Get a list of the numeric features
numeric_features = train_features.select_dtypes(include='number').columns.tolist()

# Get a series with the cardinality of the nonnumeric features
cardinality = train_features.select_dtypes(exclude='number').nunique()

# Get a list of all categorical features with cardinality <= 50
categorical_features = cardinality[cardinality <= 50].index.tolist()

# Combine the lists 
features = numeric_features + categorical_features
print(features)

['amount_tsh', 'gps_height', 'longitude', 'latitude', 'district_code', 'population', 'construction_year', 'year_recorded', 'age', 'basin', 'region', 'public_meeting', 'scheme_management', 'permit', 'extraction_type', 'extraction_type_class', 'management', 'management_group', 'payment', 'water_quality', 'quality_group', 'quantity', 'source', 'source_class', 'waterpoint_type', 'waterpoint_type_group']


In [0]:
# Set data as features matrices and target vectors 
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]

In [76]:
# Create a pipeline for encoding/imputing/scaling/fitting Decision Tree

import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline

tree_pipe = make_pipeline(ce.OneHotEncoder(use_cat_names=True),
                         SimpleImputer(), StandardScaler(),
                         DecisionTreeClassifier(min_samples_leaf=4,random_state=55)) # Submissions: 1st:leaf=4, 2nd:leaf=6, 3rd:leaf=2, 4th:leaf=5, returned to leaf=4

tree_pipe.fit(X_train, y_train)
print(f'Train accuracy: {tree_pipe.score(X_train, y_train)}')
print(f'Validation accuracy: {tree_pipe.score(X_val, y_val)}')

Train accuracy: 0.8903198653198653
Validation accuracy: 0.765993265993266


In [77]:
# predict target of test features

y_pred = tree_pipe.predict(X_test)
y_pred

array(['functional', 'functional needs repair', 'functional', ...,
       'functional', 'functional', 'non functional'], dtype=object)

In [0]:
# arrange submission dataframe and .csv file

sample_submission = pd.read_csv(DATA_PATH+'waterpumps/sample_submission.csv')
submission = sample_submission.copy()
submission['status_group'] = y_pred
submission.to_csv('SubmissionStevenWestmoreland7.csv', index=False)

from google.colab import files
files.download('SubmissionStevenWestmoreland7.csv')