<a href="https://colab.research.google.com/github/StephenTGibson/data-projects/blob/main/DrivenData_Pump_it_Up_Data_Mining_the_Water_Table_v2_random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install sklearn &> /dev/null

In [2]:
import numpy as np
import pandas as pd

from time import perf_counter
from datetime import date

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler

from sklearn.ensemble import RandomForestClassifier

# use colab gpu if enabled via runtime menu
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [3]:
trainingFeaturesURL = 'https://drivendata-prod.s3.amazonaws.com/data/7/public/4910797b-ee55-40a7-8668-10efd5c1b960.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCYQTZTLQOS%2F20221117%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221117T093406Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=2e50418b20828bad0f007e69dcf4cba2997bee4998e6fde27bfdebbc221cba1b'
trainingLabelsURL = 'https://drivendata-prod.s3.amazonaws.com/data/7/public/0bf8bc6e-30d0-4c50-956a-603fc693d966.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCYQTZTLQOS%2F20221117%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221117T093406Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=72211e1ae74c04f3d036072c82b42a52f17b2852db74153c2101bd56fd3ac0d9'

In [4]:
df = pd.read_csv(trainingFeaturesURL)
dfLabels = pd.read_csv(trainingLabelsURL)

labelMapStrToNum = {
    'functional': 0,
    'non functional': 1,
    'functional needs repair': 2,
}
labelMapNumToStr = {
    0: 'functional',
    1: 'non functional',
    2: 'functional needs repair',
}

dfLabels['labelNum'] = dfLabels['status_group'].apply(lambda x: labelMapStrToNum[x])

df['labels'] = dfLabels['labelNum']

# Preparation

In [5]:
print(df.shape)
df.head()

(59400, 41)


Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,labels
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,0
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,0
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,0
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,1
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,0


Prepare norm process

In [6]:
unusedFeatures = ['id', 'labels', 'recorded_by', 'wpt_name', 'scheme_name', 'num_private']
equivalentFeatures = []

featuresFillNa = [
    'permit',
    'public_meeting',
]

featuresFill0 = [
    'construction_year',
    'population',
]

# featuresLogTrans = [
#     'construction_year',
# ]

featuresMinMax = [
    'construction_year',
    'population',
    'amount_tsh',
    'gps_height',
]
minMaxScaler = MinMaxScaler()

featuresStandard = [
    'construction_year',
    'population',
    'amount_tsh',
    'gps_height',
]
standardScaler = StandardScaler()

featuresOneHot = [
    'water_quality',
    'region',
    'basin',
    'extraction_type',
    'scheme_management',
    'quantity',
    'waterpoint_type',
    'source',
    'payment',
]
equivalentFeatures += ['waterpoint_type_group', 'payment_type', 
                       'management_group', 'management', 'source_type', 
                       'source_class', 'quality_group', 'extraction_type_group',
                       'extraction_type_class', 'quantity_group', 'lga',
                       'region_code', 'district_code', 'ward', 'subvillage',
                       ]
oneHotEncoder = OneHotEncoder(handle_unknown='ignore')

featuresOrdinal = [
    'permit',
    'public_meeting',
]
ordinalEncoder = OrdinalEncoder()

usedFeatures = featuresMinMax + featuresStandard + featuresOneHot + featuresOrdinal
usedEquivalentFeatures = unusedFeatures + usedFeatures + equivalentFeatures

print([name for name in df.columns if name not in usedEquivalentFeatures])

['date_recorded', 'funder', 'installer', 'longitude', 'latitude']


In [7]:
def prepareData(data, train=False, test=False):

    data[featuresFillNa] = data[featuresFillNa].fillna(method='ffill')
    data[featuresFillNa] = data[featuresFillNa].fillna(method='bfill')

    data[featuresFill0] = data[featuresFill0].replace(to_replace=0, method='ffill')
    data[featuresFill0] = data[featuresFill0].replace(to_replace=0, method='bfill')

    if train:
        oneHotEncoder.fit(data[featuresOneHot])
        ordinalEncoder.fit(data[featuresOrdinal])
        minMaxScaler.fit(data[featuresMinMax])
        # standardScaler.fit(data[featuresStandard])

    oneHotArr = oneHotEncoder.transform(data[featuresOneHot]).toarray()
    ordinalEncodArr = ordinalEncoder.transform(data[featuresOrdinal])
    minMaxScaledArr = minMaxScaler.transform(data[featuresMinMax])
    # standardScaledArr = standardScaler.transform(data[featuresStandard])

    if len(ordinalEncodArr.shape) == 1:
        ordinalEncodArr = np.reshape(ordinalEncodArr, (ordinalEncodArr.shape[0], 1))
    if len(minMaxScaledArr.shape) == 1:
        minMaxScaledArr = np.reshape(minMaxScaledArr, (minMaxScaledArr.shape[0], 1))
    # if len(standardScaledArr.shape) == 1:
    #     standardScaledArr = np.reshape(standardScaledArr, (standardScaledArr.shape[0], 1))

    xArr = np.append(oneHotArr, ordinalEncodArr, axis=1)
    xArr = np.append(xArr, minMaxScaledArr, axis=1)
    # xArr = np.append(xArr, standardScaledArr, axis=1)

    if test:
        return xArr
    else:
        yArr = data['labels'].to_numpy()
        return xArr, yArr

In [11]:
df = df.sample(frac=1).reset_index(drop=True) # , random_state=123

validPerc = 0.2
testPerc = 0.2

dfValid = df.iloc[:int(df.shape[0] * validPerc)]
dfTest = df.iloc[int(df.shape[0] * validPerc):int(df.shape[0] * (validPerc + testPerc))]
dfTrain = df.iloc[int(df.shape[0] * (validPerc + testPerc)):]

In [12]:
xTrainArr, yTrainArr = prepareData(dfTrain, train=True)
xValidArr, yValidArr = prepareData(dfValid)
xTestArr, yTestArr = prepareData(dfTest)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [26]:
numFeature = xTrainArr.shape[1]
numClass = dfLabels['status_group'].nunique()
print(numFeature)
print(numClass)

103
3


In [None]:
trainClassWeights = list(1 / np.unique(yTrainArr, return_counts=True)[1])
samplerWeights = [trainClassWeights[classLabel] for classLabel in yTrainArr]

# Random forest classifier

In [9]:
rf = RandomForestClassifier(criterion='gini',
                                n_estimators=1000,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

In [14]:
rf.fit(xTrainArr, yTrainArr)

RandomForestClassifier(n_estimators=500, n_jobs=-1, oob_score=True,
                       random_state=1)

In [16]:
yValidPred = rf.predict(xValidArr)

In [19]:
np.sum(yValidPred == yValidArr) / yValidArr.shape[0]

0.7732323232323233

In [20]:
yTestPred = rf.predict(xTestArr)

In [21]:
np.sum(yTestPred == yTestArr) / yTestArr.shape[0]

0.7763468013468013

# Generate submission predictions

In [23]:
testFeaturesURL = 'https://drivendata-prod.s3.amazonaws.com/data/7/public/702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCYQTZTLQOS%2F20221117%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221117T101948Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=b9fe079f8a2b245fd996f00079dc8a212d2e18422306988f92451d49b1e968bc'
dfTest = pd.read_csv(testFeaturesURL)
xTestSubmArr = prepareData(dfTest, test=True)

In [24]:
dfTest['labelNum'] = rf.predict(xTestSubmArr)
dfTest['status_group'] = dfTest['labelNum'].apply(lambda num: labelMapNumToStr[num])

In [25]:
dfTest[['id', 'status_group']].to_csv(f'submission{date.today()}.csv', index=False)