In [176]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVC

In [177]:
df = pd.read_csv('dataset_traffic_accident_prediction1.csv')

In [178]:
df = df.dropna(subset=['Accident'])

In [179]:
x = df.drop(columns='Accident')
y = df['Accident']

In [180]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,train_size=0.8,random_state=42)

In [181]:
num_cols = xtrain.select_dtypes(include='number').columns
cat_cols = xtrain.select_dtypes(include='object').columns

In [182]:
num_preprocessor = Pipeline(
    steps=[
        ('num_impute',SimpleImputer(strategy="mean")),
        ('scaler',StandardScaler())
    ]
)
    
cat_preprocessor = Pipeline(
    steps=[
        ('cat_impute',SimpleImputer(strategy="constant",fill_value="unknown")),
        ('encoder',OneHotEncoder(sparse_output=False))
    ]
)


In [183]:
# step 2
preprocessor = ColumnTransformer(
    transformers=[
        ('num_preprocessor',num_preprocessor,num_cols),
        ('cat_preprocessor',cat_preprocessor,cat_cols)
    ]
)

In [184]:
# step 1
pipeline = Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('model',SVC(C=1.0,kernel="sigmoid",gamma="scale"))
    ]
)

In [185]:
pipeline.fit(xtrain,ytrain)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_preprocessor', ...), ('cat_preprocessor', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,C,1.0
,kernel,'sigmoid'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [186]:
pipeline.score(xtrain,ytrain)

0.609717868338558

In [187]:
pipeline.score(xtest,ytest)

0.64375

In [188]:
params = {
    'model__C': [0.01,0.1,1,10,100],
    'model__kernel': ['rbf','sigmoid']
}

In [None]:
gridsearch = GridSearchCV(pipeline,params)
gridsearch.fit(xtrain,ytrain)

0,1,2
,estimator,Pipeline(step...='sigmoid'))])
,param_grid,"{'model__C': [0.01, 0.1, ...], 'model__kernel': ['rbf', 'sigmoid']}"
,scoring,
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num_preprocessor', ...), ('cat_preprocessor', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,C,0.01
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [190]:
gridsearch.best_params_

{'model__C': 0.01, 'model__kernel': 'rbf'}

In [191]:
gridsearch.best_estimator_

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_preprocessor', ...), ('cat_preprocessor', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,C,0.01
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [193]:
gridsearch.best_score_

np.float64(0.7006274606299213)