In [116]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.linear_model import Lasso, LogisticRegression, LinearRegression
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.metrics import r2_score
from sklearn import svm
import category_encoders as ce

from multiprocessing import Process,Pool
import copy
import time
import os
import threading

In [117]:
numeric_features = ['C_VEHS', 'A_DAGE', 'A_PERS', 'A_VAGE']
categorical_features = ['C_MNTH', 'C_WDAY', 'A_CHUR', 'C_CONF', 'C_RCFG', 'C_WTHR','C_RSUR','C_RALN','C_TRAF','V_TYPE','A_DSEX','P_SAFE']

dtypes = {}
for feature in numeric_features:
    dtypes[feature] = 'float' 
for feature in categorical_features:
    dtypes[feature] = 'str' 

data = pd.read_csv('crash_transformed.csv',dtype=dtypes,)
#data[categorical_features] = data[categorical_features].astype('category')
#data[numeric_features] = data[numeric_features].astype('float64')

In [118]:
data.dtypes.to_dict()

{'C_MNTH': dtype('O'),
 'C_WDAY': dtype('O'),
 'A_CHUR': dtype('O'),
 'C_SEV': dtype('int64'),
 'C_VEHS': dtype('float64'),
 'C_CONF': dtype('O'),
 'C_RCFG': dtype('O'),
 'C_WTHR': dtype('O'),
 'C_RSUR': dtype('O'),
 'C_RALN': dtype('O'),
 'C_TRAF': dtype('O'),
 'V_TYPE': dtype('O'),
 'A_DSEX': dtype('O'),
 'A_DAGE': dtype('float64'),
 'P_SAFE': dtype('O'),
 'A_PERS': dtype('float64'),
 'A_VAGE': dtype('float64')}

In [119]:
X = data.drop('C_SEV',axis=1)
y = data['C_SEV']

In [120]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [121]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

one_hot_categories = ['C_MNTH', 'C_WDAY', 'A_CHUR', 'A_DSEX']
cat_boost_categories = ['C_CONF', 'C_RCFG', 'C_WTHR','C_RSUR','C_RALN','C_TRAF','V_TYPE','P_SAFE']

categorical_transformer_one_hot = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

categorical_transformer_cat_boost = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', ce.CatBoostEncoder())])

In [122]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat1', categorical_transformer_one_hot, one_hot_categories),
        ('cat2', categorical_transformer_one_hot, cat_boost_categories)])

In [123]:
#SelectFromModel(, threshold = 0.08)
classifiers = [LogisticRegression(C=1, penalty='l1', solver='liblinear',n_jobs=-1),
               RandomForestClassifier(n_jobs=-1),
               LinearRegression(n_jobs=-1),
               svm.SVC()]

TypeError: __init__() got an unexpected keyword argument 'n_jobs'

In [124]:
pipelines = []
        
for classifier in classifiers:
    pipelines.append(
        Pipeline(steps = [('preprocessor', preprocessor),('classifier',classifier)],memory='./cache'))


def fit_pipeline(pipe,xtrain,xtest,ytrain,ytest):
    start_time = time.time()
    print('Training started')
    pipe.fit(xtrain, ytrain)
    print(f'train duration {(time.time() - start_time)}')
    print("model score: %.3f" % pipe.score(xtest, ytest))
    with open(str(pipe.named_steps['classifier'])[:15]+'.txt', 'w') as f:
        f.write('Model '+str(pipe.named_steps['classifier']))
        f.write('\n')
        f.write(f'train duration {(time.time() - start_time)}')
        f.write('\n')
        f.write("model score: %.3f" % pipe.score(xtest, ytest))
        
for pipe in pipelines:
    fit_pipeline(pipe,X_train, X_test, y_train, y_test)
#fit_pipeline(pipelines[1],X_train, X_test, y_train, y_test)



Training started


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
  " = {}.".format(effective_n_jobs(self.n_jobs)))


train duration 1226.3494708538055


AttributeError: 'SelectFromModel' object has no attribute 'score'

In [None]:
threading = False

if threading:
    threads = []
    for pipe in pipelines:
        t = threading.Thread(target=fit_pipeline,args=pipe)
        threads.append(t)
        t.start()

In [None]:
parallelize = False

In [None]:
if parallelize:
    from nbmultitask import ProcessWithLogAndControls
    from multiprocessing import Value
    # !pip install --user nbmultitask
    tasks = []
    for pipe in pipelines:
        tasks.append(ProcessWithLogAndControls(target=fit_pipeline, args=pipe))
    tasks[0].control_panel()

