In [1]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 100)

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
features_df = pd.read_csv('data/training_set_features.csv', index_col='id')
labels_df = pd.read_csv('data/training_set_labels.csv', index_col='id')
np.testing.assert_array_equal(features_df.index.values, labels_df.index.values)

In [3]:
numeric_cols = [
    'amount_tsh', 
    'gps_height', 
    'longitude', 
    'latitude', 
    'num_private',
    'region_code', 
    'district_code', 
    'population', 
    'construction_year'
]

categorical_cols_with_rare_label = [
    'basin',
    'extraction_type', 
    'extraction_type_group',
    'extraction_type_class', 
    'management', 
    'management_group',
    'payment', 
    'payment_type',
    'water_quality', 
    'quality_group',
    'source', 
    'source_type',
    'source_class', 
    'waterpoint_type', 
    'waterpoint_type_group'
]

categorical_cols_without_rare_label = [
    'date_recorded',
    'funder',
    'installer',
    'wpt_name',
    'subvillage',
    'scheme_management',
    'region',
    'lga',
    'ward',
    'public_meeting',
    'recorded_by',
    'scheme_name', 
    'permit', 
    'quantity', 
    'quantity_group', 
]

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from feature_engine.imputation import RandomSampleImputer
from sklearn.preprocessing import OneHotEncoder
from feature_engine.encoding import RareLabelEncoder
from sklearn.impute import MissingIndicator
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline 

RANDOM_SEED = 6    # Set a random seed for reproducibility!

In [5]:
missing_category_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value="Missing")
random_sample_imputer = RandomSampleImputer(random_state = 42)
scheme_management_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value="Company")
missing_indicator_imputer = MissingIndicator(error_on_new=True, features='missing-only')

imputer_categorical = ColumnTransformer(transformers=[
    ('missing_category', missing_category_imputer, ["funder", "installer", "scheme_name"]),
    ('random_sample', random_sample_imputer, ["public_meeting", "subvillage"]),
    ('scheme_management_imputer', scheme_management_imputer, ["scheme_management"]),
    ('missing_indicator_imputer', missing_indicator_imputer, ["permit"]),
], remainder="passthrough")

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('standard_scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer_with_rare_label = Pipeline(steps=[
    ('rare', RareLabelEncoder(tol=0.01, n_categories=2, replace_with='RareCategory')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

categorical_transformer_without_rare_label = Pipeline(steps=[
    ('cat_imputer', imputer_categorical),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numeric_cols),
    ('cat_rare', categorical_transformer_with_rare_label, categorical_cols_with_rare_label),
    ('cat_not_rare', categorical_transformer_without_rare_label, categorical_cols_without_rare_label)
], remainder="drop")

over_sampler = ADASYN(sampling_strategy="minority", random_state=42)

# model = CatBoostClassifier(n_estimators=1000, learning_rate=0.05, rsm=1, random_strength=1, silent=False, loss_function='MultiClass',  random_state=42)

model = RandomForestClassifier(random_state=42, n_estimators = 500)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('sampler', over_sampler),
    ('model', model)
])

In [6]:
%%time

scores = cross_val_score(pipeline, features_df, labels_df["status_group"], cv=5, scoring='accuracy')
print(scores)

[0.81304714 0.80639731 0.80942761 0.80976431 0.80639731]
Wall time: 9min


In [7]:
scores.mean()

0.809006734006734

In [8]:
test_features_df = pd.read_csv("data/test_set_features.csv", index_col="id")

In [9]:
%%time

# Train model
pipeline.fit(features_df, labels_df["status_group"])

test_preds = pipeline.predict(test_features_df)

Wall time: 2min 19s


In [10]:
test_preds

array(['non functional', 'functional', 'functional', ..., 'functional',
       'functional', 'non functional'], dtype=object)

In [11]:
submission_df = pd.read_csv("data/submission_format.csv", index_col="id")

In [12]:
np.testing.assert_array_equal(test_features_df.index.values, submission_df.index.values)

In [13]:
submission_df["status_group"] = test_preds

In [14]:
submission_df.head()

Unnamed: 0_level_0,status_group
id,Unnamed: 1_level_1
50785,non functional
51630,functional
17168,functional
45559,non functional
49871,functional


In [15]:
submission_df.to_csv('submissions/V6.csv', index=True)