In [1]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 100)

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
features_df = pd.read_csv('data/training_set_features.csv', index_col='id')
labels_df = pd.read_csv('data/training_set_labels.csv', index_col='id')
np.testing.assert_array_equal(features_df.index.values, labels_df.index.values)

In [3]:
import math

def new_features(df):
    df["polar"] = (df["longitude"]**2) + (df["latitude"]**2)
    df["polar"] = np.sqrt(df["polar"])
    
    df["angle"] = np.arctan(df["latitude"]/df["longitude"])
    
    df["date_recorded"] = pd.to_datetime(df["date_recorded"])
    df["year_recorded"] = df["date_recorded"].dt.year
    df["year_difference"] = df["year_recorded"] - df["construction_year"]
    
new_features(features_df)

In [4]:
features_df.dtypes

amount_tsh                      float64
date_recorded            datetime64[ns]
funder                           object
gps_height                        int64
installer                        object
longitude                       float64
latitude                        float64
wpt_name                         object
num_private                       int64
basin                            object
subvillage                       object
region                           object
region_code                       int64
district_code                     int64
lga                              object
ward                             object
population                        int64
public_meeting                   object
recorded_by                      object
scheme_management                object
scheme_name                      object
permit                           object
construction_year                 int64
extraction_type                  object
extraction_type_group            object


In [5]:
numeric_cols = [
    'amount_tsh', 
    'gps_height', 
    'longitude', 
    'latitude', 
    'num_private',
    'region_code', 
    'district_code', 
    'population', 
    'construction_year',
    'polar',
    'angle',
    'year_recorded',
    'year_difference'
]

categorical_cols_one_hot = [
    'basin',
    'extraction_type', 
    'extraction_type_group',
    'extraction_type_class', 
    'management', 
    'management_group',
    'payment', 
    'payment_type',
    'water_quality', 
    'quality_group',
    'source', 
    'source_type',
    'source_class', 
    'waterpoint_type', 
    'waterpoint_type_group'
]

categorical_cols_ordinal = [
    'date_recorded',
    'funder',
    'installer',
    'wpt_name',
    'subvillage',
    'scheme_management',
    'region',
    'lga',
    'ward',
    'public_meeting',
    'recorded_by',
    'scheme_name', 
    'permit', 
    'quantity', 
    'quantity_group', 
]

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from feature_engine.imputation import RandomSampleImputer
from sklearn.preprocessing import OneHotEncoder
from feature_engine.encoding import RareLabelEncoder
from sklearn.impute import MissingIndicator
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PowerTransformer
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline
from sklearn.decomposition import PCA

RANDOM_SEED = 6    # Set a random seed for reproducibility!

In [7]:
missing_category_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value="Missing")
random_sample_imputer = RandomSampleImputer(random_state = 42)
scheme_management_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value="Company")
missing_indicator_imputer = MissingIndicator(error_on_new=True, features='missing-only')

imputer_categorical = ColumnTransformer(transformers=[
    ('missing_category', missing_category_imputer, ["funder", "installer", "scheme_name"]),
    ('random_sample', random_sample_imputer, ["public_meeting", "subvillage"]),
    ('scheme_management_imputer', scheme_management_imputer, ["scheme_management"]),
    ('missing_indicator_imputer', missing_indicator_imputer, ["permit"]),
], remainder="passthrough")

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('standard_scaler', StandardScaler()), # 0.8101
    #('minmax_scaler', MinMaxScaler()) # 0.8097
])

# Preprocessing for categorical data
categorical_transformer_one_hot = Pipeline(steps=[
    ('rare_one_hot', RareLabelEncoder(tol=0.01, n_categories=2, replace_with='RareCategory')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

categorical_transformer_ordinal = Pipeline(steps=[
    ('cat_imputer', imputer_categorical),
    #('rare_ordinal', RareLabelEncoder(tol=0.001, n_categories=2, replace_with='RareCategory')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numeric_cols),
    ('cat_rare', categorical_transformer_one_hot, categorical_cols_one_hot),
    ('cat_not_rare', categorical_transformer_ordinal, categorical_cols_ordinal)
], remainder="drop")

over_sampler = ADASYN(sampling_strategy="minority", random_state=42)

# model = CatBoostClassifier(n_estimators=1000, learning_rate=0.05, rsm=1, random_strength=1, silent=False, loss_function='MultiClass',  random_state=42)

# model = RandomForestClassifier(random_state=42, n_estimators = 500)
model = RandomForestClassifier(random_state=42, n_estimators=1500, max_features='auto', max_depth=60, min_samples_split=6, min_samples_leaf=3, bootstrap=False)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('sampler', over_sampler),
    #('pca', PCA()), # score reduced
    ('model', model)
])

In [8]:
%%time

scores = cross_val_score(pipeline, features_df, labels_df["status_group"], cv=5, scoring='accuracy')
print(scores)

[0.82045455 0.81186869 0.81473064 0.80934343 0.81279461]
Wall time: 35min 13s


In [9]:
scores.mean()

0.8138383838383838

In [18]:
test_features_df = pd.read_csv("data/test_set_features.csv", index_col="id")

In [19]:
new_features(test_features_df)

In [20]:
test_features_df

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,polar,angle,year_recorded,year_difference
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,Internal,Magoma,Manyara,21,3,Mbulu,Bashay,321,True,GeoData Consultants Ltd,Parastatal,,True,2012,other,other,other,parastatal,parastatal,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other,35.523536,-0.114532,2013,1
51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,Pangani,Kimnyak,Arusha,2,2,Arusha Rural,Kimnyaki,300,True,GeoData Consultants Ltd,VWC,TPRI pipe line,True,2000,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,36.805777,-0.090032,2013,13
17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,Internal,Msatu,Singida,13,2,Singida Rural,Puma,500,True,GeoData Consultants Ltd,VWC,P,,2010,other,other,other,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other,35.126169,-0.142954,2013,3
45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,Ruvuma / Southern Coast,Kipindimbi,Lindi,80,43,Liwale,Mkutano,250,,GeoData Consultants Ltd,VWC,,True,1987,other,other,other,vwc,user-group,unknown,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other,39.206202,-0.242607,2013,26
49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,Ruvuma / Southern Coast,Losonga,Ruvuma,10,3,Mbinga,Mbinga Urban,60,,GeoData Consultants Ltd,Water Board,BRUDER,True,2000,gravity,gravity,gravity,water board,user-group,pay monthly,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,36.678880,-0.303171,2013,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39307,0.0,2011-02-24,Danida,34,Da,38.852669,-6.582841,Kwambwezi,0,Wami / Ruvu,Yombo,Pwani,6,1,Bagamoyo,Yombo,20,True,GeoData Consultants Ltd,VWC,Bagamoyo wate,True,1988,mono,mono,motorpump,vwc,user-group,never pay,never pay,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,39.406391,-0.167837,2011,23
18990,1000.0,2011-03-21,Hiap,0,HIAP,37.451633,-5.350428,Bonde La Mkondoa,0,Pangani,Mkondoa,Tanga,4,7,Kilindi,Mvungwe,2960,True,GeoData Consultants Ltd,VWC,,False,1994,nira/tanira,nira/tanira,handpump,vwc,user-group,pay annually,annually,salty,salty,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,37.831890,-0.141902,2011,17
28749,0.0,2013-03-04,,1476,,34.739804,-4.585587,Bwawani,0,Internal,Juhudi,Singida,13,2,Singida Rural,Ughandi,200,True,GeoData Consultants Ltd,VWC,,,2010,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,dam,dam,surface,communal standpipe,communal standpipe,35.041142,-0.131239,2013,3
33492,0.0,2013-02-18,Germany,998,DWE,35.432732,-10.584159,Kwa John,0,Lake Nyasa,Namakinga B,Ruvuma,10,2,Songea Rural,Maposeni,150,True,GeoData Consultants Ltd,VWC,Mradi wa maji wa maposeni,True,2009,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,river,river/lake,surface,communal standpipe,communal standpipe,36.979763,-0.290274,2013,4


In [21]:
%%time

# Train model
pipeline.fit(features_df, labels_df["status_group"])

test_preds = pipeline.predict(test_features_df)

Wall time: 10min 34s


In [22]:
test_preds

array(['functional', 'functional', 'functional', ..., 'functional',
       'functional', 'non functional'], dtype=object)

In [23]:
submission_df = pd.read_csv("data/submission_format.csv", index_col="id")

In [24]:
np.testing.assert_array_equal(test_features_df.index.values, submission_df.index.values)

In [25]:
submission_df["status_group"] = test_preds

In [26]:
submission_df.head()

Unnamed: 0_level_0,status_group
id,Unnamed: 1_level_1
50785,functional
51630,functional
17168,functional
45559,non functional
49871,functional


In [27]:
submission_df.to_csv('submissions/V7.csv', index=True)