In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Let pandas display up to 100 columns before truncating wide DataFrames.
pd.set_option("display.max_columns", 100)

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Read the training features and the training labels; both files are indexed
# by their `id` column.
features_df, labels_df = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        'data/training_set_features.csv',
        'data/training_set_labels.csv',
    )
)

# Fail fast if the two files do not list the same ids in the same order.
np.testing.assert_array_equal(
    features_df.index.values,
    labels_df.index.values,
)

In [3]:
import math

def new_features(df):
    """Add engineered location and date columns to ``df`` in place.

    Columns written (an existing column of the same name is overwritten):

    * ``polar`` -- ``sqrt(longitude**2 + latitude**2)``.
    * ``angle`` -- angle, in radians, of the (longitude, latitude) point.
    * ``date_recorded`` -- the same column re-parsed with ``pd.to_datetime``.
    * ``year_recorded`` -- calendar year taken from ``date_recorded``.
    * ``year_difference`` -- ``year_recorded - construction_year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``longitude``, ``latitude``, ``date_recorded`` and
        ``construction_year``. The frame is modified in place.

    Returns
    -------
    None
        The new columns are assigned directly on ``df``.
    """
    df["polar"] = np.sqrt(df["longitude"] ** 2 + df["latitude"] ** 2)

    # arctan2 replaces arctan(latitude / longitude): the division produced
    # NaN for a 0/0 row (and a divide-by-zero for any longitude of 0).
    # For positive longitudes the result is the same angle as before;
    # arctan2 is additionally quadrant-aware for negative longitudes.
    df["angle"] = np.arctan2(df["latitude"], df["longitude"])

    df["date_recorded"] = pd.to_datetime(df["date_recorded"])
    df["year_recorded"] = df["date_recorded"].dt.year
    # NOTE(review): if construction_year uses a placeholder such as 0 for
    # "unknown", this difference will be meaningless for those rows -- confirm.
    df["year_difference"] = df["year_recorded"] - df["construction_year"]
    
# Add the engineered columns to the training features; new_features assigns
# them directly on the frame it receives, so features_df itself is changed.
new_features(features_df)

In [4]:
# Columns routed to the numeric branch of the preprocessor.
# Keep the order: later cells pick preprocessor output columns by integer position.
numeric_cols = [
    # columns read from the CSV
    'amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private',
    'region_code', 'district_code', 'population', 'construction_year',
    # columns added by new_features()
    'polar', 'angle', 'year_recorded', 'year_difference',
]

# Categorical columns routed to the rare-label + one-hot branch of the preprocessor.
# Keep the order: later cells pick preprocessor output columns by integer position.
categorical_cols_one_hot = [
    'basin',
    'extraction_type', 'extraction_type_group', 'extraction_type_class',
    'management', 'management_group',
    'payment', 'payment_type',
    'water_quality', 'quality_group',
    'source', 'source_type', 'source_class',
    'waterpoint_type', 'waterpoint_type_group',
]

# Categorical columns routed to the impute + ordinal-encode branch of the preprocessor.
# Keep the order: later cells pick preprocessor output columns by integer position.
categorical_cols_ordinal = [
    'date_recorded', 'funder', 'installer', 'wpt_name', 'subvillage',
    'scheme_management', 'region', 'lga', 'ward', 'public_meeting',
    'recorded_by', 'scheme_name', 'permit', 'quantity', 'quantity_group',
]

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from feature_engine.imputation import RandomSampleImputer
from sklearn.preprocessing import OneHotEncoder
from feature_engine.encoding import RareLabelEncoder
from sklearn.impute import MissingIndicator
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PowerTransformer
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# NOTE(review): no visible cell references RANDOM_SEED; the random-sample
# imputer, the ADASYN sampler and the random forest each hard-code
# random_state=42 instead -- confirm which seed is intended.
RANDOM_SEED = 6    # Set a random seed for reproducibility!

In [6]:
# --- Imputers for the ordinal-encoded categorical columns ---

# Replace NaN with the literal label "Missing".
missing_category_imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value="Missing",
)

# feature_engine imputer that fills gaps by random sampling, with a fixed seed.
random_sample_imputer = RandomSampleImputer(random_state=42)

# Replace NaN with the literal label "Company".
scheme_management_imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value="Company",
)

# NOTE(review): MissingIndicator outputs missing/not-missing flags rather than
# imputed values, so the column it is applied to ("permit" below) appears to be
# replaced by its missingness flag -- confirm that this is intended.
missing_indicator_imputer = MissingIndicator(
    error_on_new=True,
    features='missing-only',
)

# Apply each imputer to its own columns; every other incoming column is
# passed through unchanged.
imputer_categorical = ColumnTransformer(
    transformers=[
        (
            'missing_category',
            missing_category_imputer,
            ["funder", "installer", "scheme_name"],
        ),
        (
            'random_sample',
            random_sample_imputer,
            ["public_meeting", "subvillage"],
        ),
        (
            'scheme_management_imputer',
            scheme_management_imputer,
            ["scheme_management"],
        ),
        (
            'missing_indicator_imputer',
            missing_indicator_imputer,
            ["permit"],
        ),
    ],
    remainder="passthrough",
)

# --- Numeric branch: scale with StandardScaler ---
# The trailing numbers are presumably the scores recorded with each scaler.
numerical_transformer = Pipeline(
    steps=[
        ('standard_scaler', StandardScaler()), # 0.8101
        #('minmax_scaler', MinMaxScaler()) # 0.8097
    ]
)

# --- One-hot branch: group infrequent labels under 'RareCategory', then
# one-hot encode (categories unseen during fit are ignored) ---
categorical_transformer_one_hot = Pipeline(
    steps=[
        (
            'rare_one_hot',
            RareLabelEncoder(
                tol=0.01,
                n_categories=2,
                replace_with='RareCategory',
            ),
        ),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# --- Ordinal branch: impute first, then map categories to integer codes
# (categories unseen during fit are encoded as -1) ---
categorical_transformer_ordinal = Pipeline(
    steps=[
        ('cat_imputer', imputer_categorical),
        #('rare_ordinal', RareLabelEncoder(tol=0.001, n_categories=2, replace_with='RareCategory')),
        (
            'ordinal',
            OrdinalEncoder(
                handle_unknown='use_encoded_value',
                unknown_value=-1,
            ),
        ),
    ]
)

# Route each column group to its branch; columns in none of the three lists
# are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_cols),
        ('cat_rare', categorical_transformer_one_hot, categorical_cols_one_hot),
        ('cat_not_rare', categorical_transformer_ordinal, categorical_cols_ordinal),
    ],
    remainder="drop",
)

# ADASYN oversampler, asked to resample the minority class only.
over_sampler = ADASYN(sampling_strategy="minority", random_state=42)

#model = RandomForestClassifier(random_state=42, n_estimators=500, bootstrap=False)
# max_features='sqrt' is exactly what the old 'auto' alias meant for
# classifiers. 'auto' was deprecated in scikit-learn 1.1 and is rejected by
# newer releases, so spelling it out keeps the notebook runnable with
# unchanged behaviour.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=1500,
    max_features='sqrt',
    max_depth=60,
    min_samples_split=6,
    min_samples_leaf=3,
    bootstrap=False,
)

# Preprocessing is kept in its own pipeline so it can be fitted once and its
# output columns selected by position before modelling.
pipeline_preprocessing = Pipeline(steps=[
    ('preprocessor', preprocessor),
])

# Modelling pipeline applied to the already-preprocessed, column-selected data.
# NOTE(review): the inline comment says PCA reduced the score, yet the step is
# still active -- confirm whether it should remain.
pipeline = Pipeline(steps=[
    ('sampler', over_sampler),
    ('pca', PCA()), # score reduced
    ('model', model)
])

In [7]:
# importance = model.feature_importances_
# indices = np.argsort(importance)[-20:]
# indices

# The 20 most important feature indices obtained this way (np.argsort order, i.e. ascending importance) are [115, 110,   1, 106,  71,  62,  24, 103,  34, 104,  42,   8, 123, 12,   3,   2,  10,   9, 129, 130]

In [8]:
# Fit the preprocessing pipeline on the training data and wrap the resulting
# matrix in a DataFrame, whose columns are then plain integer positions.
transformed_df = pd.DataFrame(
    pipeline_preprocessing.fit_transform(
        features_df,
        labels_df["status_group"],
    )
)

In [9]:
# Keep only the 20 preprocessor output columns previously identified as the
# most important, as an independent copy.
new_df = transformed_df.loc[
    :,
    [115, 110, 1, 106, 71, 62, 24, 103, 34, 104,
     42, 8, 123, 12, 3, 2, 10, 9, 129, 130],
].copy()
new_df

Unnamed: 0,115,110,1,106,71,62,24,103,34,104,42,8,123,12,3,2,10,9,129,130
0,0.0,0.0,1.041252,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.733857,47.0,-0.735088,-1.408791,0.131052,-0.268732,0.246153,1.0,1.0
1,0.0,0.0,1.054237,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.745416,309.0,-0.744549,1.207934,0.094610,0.573740,0.017266,2.0,2.0
2,0.0,0.0,0.025541,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.744365,300.0,-0.743498,0.639751,0.515158,0.416245,0.447737,1.0,1.0
3,0.0,0.0,-0.584751,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.720196,272.0,-0.719320,-1.849720,0.671308,-0.297026,0.807496,0.0,0.0
4,0.0,0.0,-0.964200,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,-1.366788,104.0,1.366307,1.317271,-0.448669,0.586554,-0.516124,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,0.0,0.0,0.781553,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.733857,338.0,-0.732986,0.832379,0.470870,0.472923,0.396629,1.0,1.0
59396,0.0,0.0,0.784439,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.730704,90.0,-0.731934,-1.142092,0.178544,-0.177416,0.260538,1.0,1.0
59397,0.0,0.0,-0.964200,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.366788,75.0,1.366307,-1.033404,-0.009188,-0.177096,0.070797,1.0,1.0
59398,0.0,0.0,-0.964200,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,-1.366788,41.0,1.366307,-0.228290,0.271629,0.122305,0.264387,2.0,2.0


In [10]:
# %%time

# scores = cross_val_score(pipeline, new_df, labels_df["status_group"], cv=2, scoring='accuracy')
# print(scores)

In [11]:
# scores.mean()

In [12]:
# Load the test-set features, indexed by the `id` column.
test_features_df = pd.read_csv("data/test_set_features.csv", index_col="id")

In [13]:
# Add the same engineered columns to the test features (assigned directly on
# test_features_df by new_features).
new_features(test_features_df)

In [14]:
# Apply the preprocessing that was already fitted on the training data
# (transform only, no refit) and wrap the result in a DataFrame.
transformed_test_features_df = pd.DataFrame(
    pipeline_preprocessing.transform(test_features_df)
)

In [15]:
# Select exactly the columns kept for the training matrix. Reusing
# new_df.columns instead of repeating the 20-index list means the test
# features cannot drift out of sync with what the model is trained on.
new_test_features = transformed_test_features_df[list(new_df.columns)].copy()
new_test_features

Unnamed: 0,115,110,1,106,71,62,24,103,34,104,42,8,123,12,3,2,10,9,129,130
0,1.0,1.0,1.915572,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.747518,279.0,-0.746651,0.558839,0.184758,0.365359,0.130232,3.0,3.0
1,0.0,0.0,1.299508,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.734908,279.0,-0.734037,0.813586,0.392742,0.462195,0.321230,2.0,2.0
2,1.0,1.0,1.296623,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.745416,276.0,-0.744549,0.238184,0.105131,0.253023,0.071042,2.0,2.0
3,1.0,1.0,-0.578980,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.721247,266.0,-0.720371,-1.260233,0.606120,-0.140849,0.678790,0.0,0.0
4,0.0,0.0,0.853692,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.734908,330.0,-0.734037,-1.780173,0.141411,-0.380225,0.302328,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14845,0.0,0.0,-0.915146,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.722298,29.0,-0.723525,-0.297627,0.727116,0.154675,0.708609,1.0,1.0
14846,0.0,0.0,-0.964200,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.728603,54.0,-0.729832,0.120708,0.513783,0.257181,0.474077,2.0,2.0
14847,0.0,0.0,1.165330,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.745416,307.0,-0.744549,0.380329,0.100859,0.299325,0.058376,2.0,2.0
14848,0.0,0.0,0.475686,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.744365,293.0,-0.743498,-1.655850,0.206369,-0.329250,0.347147,2.0,2.0


In [16]:
%%time

# Train the modelling pipeline (sampler, PCA, random forest) on the selected
# training columns, then predict a label for every selected test row.
test_preds = (
    pipeline
    .fit(new_df, labels_df["status_group"])
    .predict(new_test_features)
)

Wall time: 11min 45s


In [17]:
test_preds

array(['non functional', 'functional needs repair', 'non functional', ...,
       'functional', 'functional', 'non functional'], dtype=object)

In [18]:
# Load the submission format file, indexed by the `id` column.
submission_df = pd.read_csv("data/submission_format.csv", index_col="id")

In [19]:
# Check that the submission rows list the same ids, in the same order, as the
# test features the predictions were made from.
np.testing.assert_array_equal(test_features_df.index.values, submission_df.index.values)

In [20]:
# test_preds is a plain array, so this assignment is positional; it is only
# correct because submission_df and test_features_df share the same id order.
submission_df["status_group"] = test_preds

In [21]:
submission_df.head()

Unnamed: 0_level_0,status_group
id,Unnamed: 1_level_1
50785,non functional
51630,functional needs repair
17168,non functional
45559,non functional
49871,functional


In [22]:
# Write the predictions, including the id index, to the submission file.
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists first instead of failing after the long training run.
submission_path = Path('submissions/V8.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=True)