In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Let wide feature tables render up to 100 columns before pandas elides them.
pd.options.display.max_columns = 100

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Read the training features and their labels; both files are indexed by the 'id' column.
features_df, labels_df = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        'data/training_set_features.csv',
        'data/training_set_labels.csv',
    )
)

In [3]:
# Guard: features and labels must list the same ids in the same order,
# otherwise rows and targets would be silently mismatched during training.
np.testing.assert_array_equal(
    features_df.index.to_numpy(),
    labels_df.index.to_numpy(),
)

In [4]:
# Split the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numeric.
numeric_cols, categorical_cols = (
    features_df.columns[mask].values
    for mask in (
        ~(features_df.dtypes == "object"),
        features_df.dtypes == "object",
    )
)

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score

# Shared seed for any component that uses randomness. Defining it here does not
# seed anything by itself: it has to be passed explicitly (for example as an
# estimator's random_state) to have an effect.
RANDOM_SEED = 6    # Set a random seed for reproducibility!

In [6]:
# Numeric columns: standardise first, then fill remaining gaps with the column mean.
# StandardScaler disregards NaNs when fitting and passes them through on transform,
# so the imputer still sees the missing entries afterwards.
numerical_transformer = Pipeline([
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='mean')),
])

# Categorical columns: fill gaps with the most frequent value, then map each
# category to an integer code. Categories not seen during fit are encoded as -1.
categorical_transformer = Pipeline([
    ('simple_imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinal',
        OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value'),
    ),
])

# Route each group of columns through its own preprocessing pipeline.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Earlier experiment kept for reference (score noted with it: 0.7792592592592593):
# model = CatBoostClassifier(n_estimators=1000, learning_rate=0.05, rsm=1, random_strength=1, silent=False, loss_function='MultiClass')

# A random forest draws bootstrap samples and candidate features at random.
# Seeding it with RANDOM_SEED makes the cross-validation scores and the final
# predictions repeatable from one run of the notebook to the next.
model = RandomForestClassifier(random_state=RANDOM_SEED)

# End-to-end estimator: preprocessing followed by the classifier, so the
# transformers are fitted only on the data handed to fit().
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

In [7]:
%%time

# Two-fold cross-validated accuracy of the complete pipeline on the training set.
scores = cross_val_score(
    estimator=pipeline,
    X=features_df,
    y=labels_df["status_group"],
    scoring='accuracy',
    cv=2,
)
print(scores)

[0.80121212 0.79676768]
Wall time: 11.9 s


In [8]:
# Average accuracy across the cross-validation folds.
np.mean(scores)

0.798989898989899

In [9]:
# Test-set features, indexed by 'id' like the training data.
test_features_df = pd.read_csv(
    filepath_or_buffer="data/test_set_features.csv",
    index_col="id",
)

In [10]:
%%time

# Fit the pipeline on every training row, then label the test features.
# fit() returns the fitted pipeline, so predict() can be chained directly.
test_preds = (
    pipeline
    .fit(features_df, labels_df["status_group"])
    .predict(test_features_df)
)

Wall time: 12 s


In [11]:
# Display the predicted labels for the test set (the array repr is abbreviated).
test_preds

array(['non functional', 'functional', 'functional', ..., 'functional',
       'functional', 'non functional'], dtype=object)

In [12]:
# Submission template, indexed by 'id'; its status_group column is filled in below.
submission_df = pd.read_csv(
    filepath_or_buffer="data/submission_format.csv",
    index_col="id",
)

In [13]:
# Guard: the template must list the same ids, in the same order, as the test
# features, because predictions are assigned to it positionally.
np.testing.assert_array_equal(
    test_features_df.index.to_numpy(),
    submission_df.index.to_numpy(),
)

In [14]:
# Put the predicted labels into the template's status_group column.
submission_df = submission_df.assign(status_group=test_preds)

In [15]:
# Preview the first five rows of the submission.
submission_df.iloc[:5]

Unnamed: 0_level_0,status_group
id,Unnamed: 1_level_1
50785,non functional
51630,functional
17168,functional
45559,non functional
49871,functional


In [16]:
# Write the predictions to the submission file, keeping the id index as a column.
# The output folder is created first: to_csv does not create missing directories,
# so on a fresh checkout the write would otherwise fail.
submission_path = Path('submissions/V2.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=True)