In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.metrics import r2_score
import category_encoders as ce

In [2]:
# Column groups used both for CSV parsing here and for preprocessing later on.
numeric_features = ['C_VEHS', 'A_DAGE', 'A_PERS', 'A_VAGE']
categorical_features = [
    'C_MNTH', 'C_WDAY', 'A_CHUR', 'C_CONF', 'C_RCFG', 'C_WTHR',
    'C_RSUR', 'C_RALN', 'C_TRAF', 'V_TYPE', 'A_DSEX', 'P_SAFE',
]

# Numeric columns are parsed as floats; categorical codes are parsed as text
# so that codes such as '01' are not turned into numbers.
dtypes = {
    **dict.fromkeys(numeric_features, 'float'),
    **dict.fromkeys(categorical_features, 'str'),
}

data = pd.read_csv('crash_transformed.csv', dtype=dtypes)
# Re-type the string codes as pandas categoricals.
data = data.astype(dict.fromkeys(categorical_features, 'category'))

In [3]:
# Show the dtype every column ended up with after loading.
dict(data.dtypes.items())

{'C_MNTH': CategoricalDtype(categories=['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
                   '11', '12'],
                  ordered=False),
 'C_WDAY': CategoricalDtype(categories=['1', '2', '3', '4', '5', '6', '7'], ordered=False),
 'A_CHUR': CategoricalDtype(categories=['0', '1', '2', '3'], ordered=False),
 'C_SEV': dtype('int64'),
 'C_VEHS': dtype('float64'),
 'C_CONF': CategoricalDtype(categories=['01', '02', '03', '04', '05', '06', '21', '22', '23', '24',
                   '25', '31', '32', '33', '34', '35', '36', '41', '42'],
                  ordered=False),
 'C_RCFG': CategoricalDtype(categories=['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
                   '13'],
                  ordered=False),
 'C_WTHR': CategoricalDtype(categories=['1', '2', '3', '4', '5', '6', '7', '8'], ordered=False),
 'C_RSUR': CategoricalDtype(categories=['1', '10', '2', '3', '4', '5', '6', '7', '8', '9'], ordered=False),
 'C_RALN': CategoricalDtype(categorie

In [4]:
# Split the frame into the C_SEV column (y) and every remaining column (X).
target_column = 'C_SEV'
y = data[target_column]
X = data.drop(columns=[target_column])

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X, y,
    test_size=0.2,
    random_state=123,
)

In [6]:
# Numeric columns: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# The categorical columns are divided between two encoders: the first group is
# one-hot encoded, the second group goes through the CatBoost target encoder.
one_hot_categories = ['C_MNTH', 'C_WDAY', 'A_CHUR', 'A_DSEX']
cat_boost_categories = ['C_CONF', 'C_RCFG', 'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'V_TYPE', 'P_SAFE']

# Missing codes are replaced by the most frequent one before encoding.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# that was not present when it was fitted; with the default ('error') a single
# unseen code in the held-out data would make transform()/score() raise.
categorical_transformer_one_hot = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Same imputation, followed by the CatBoost encoder from category_encoders.
categorical_transformer_cat_boost = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', ce.CatBoostEncoder()),
])

In [7]:
# Pair each transformer with the list of columns it is applied to.
column_routes = [
    ('numeric', numeric_transformer, numeric_features),
    ('categorical_one_hot', categorical_transformer_one_hot, one_hot_categories),
    ('categorical_cat_booster', categorical_transformer_cat_boost, cat_boost_categories),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [8]:
# Candidate models; each one is later used as the final step of a pipeline
# that is fitted and then scored.
#
# SelectFromModel is a feature selector (a transformer): it exposes no
# predict()/score(), so placing it last in a scored pipeline raises
# AttributeError. It is therefore chained with an actual classifier, so the
# L1-penalised logistic regression first prunes features (|coef| >= 0.08) and a
# second logistic regression is trained on the surviving columns.
#
# n_jobs is not passed to LogisticRegression: with solver='liblinear' it has no
# effect and only triggers a warning. random_state pins the stochastic parts of
# both models so repeated runs give the same scores (same seed as the split).
classifiers = [
    Pipeline(steps=[
        ('selector', SelectFromModel(
            LogisticRegression(C=1, penalty='l1', solver='liblinear', random_state=123),
            threshold=0.08)),
        ('estimator', LogisticRegression(C=1, penalty='l1', solver='liblinear', random_state=123)),
    ]),
    RandomForestClassifier(n_jobs=-1, random_state=123),
]

In [None]:
# Fit each candidate behind the shared preprocessor on the training split and
# print its score on the held-out split.
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    test_score = pipe.score(X_test, y_test)
    print("model score: %.3f" % test_score)