import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit

# Load the data
data = pd.read_csv('acc_data_sampling_50.csv')

# Separate the predictors from the target. 'time' is dropped together with the
# target column, so it is never offered as a candidate feature.
# NOTE(review): LogisticRegression needs numeric inputs; this assumes every
# remaining column in the CSV is already numeric/encoded -- confirm.
X = data.drop(columns=['accident_severity', 'time'])
y = data['accident_severity']

# One stratified shuffle split with a fixed seed: 80% train / 20% test.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Add the constant column used by the intercept-only baseline.
# assign() keeps each frame's own row index. Concatenating a freshly indexed
# frame along axis=1 would align on index labels instead, producing NaN-filled
# extra rows that no longer line up with y_train.
# The column is added to X_test as well because the baseline is scored on it.
X_train = X_train.assign(Intercept=1.0)
X_test = X_test.assign(Intercept=1.0)

# Selected features, in the order they were picked. A list (rather than a set)
# keeps the column order, and therefore the fitted models, stable across runs.
selected_features = []

# Candidate features: every real predictor. The constant 'Intercept' column
# only exists for the baseline model and is not a selectable feature.
remaining_features = [column for column in X_train.columns if column != 'Intercept']

# Baseline: a logistic regression that sees nothing but the constant column.
log_reg = LogisticRegression(random_state=42, solver='lbfgs', max_iter=1000).fit(X_train[['Intercept']], y_train)

# Accuracy every later feature subset has to beat in order to be reported.
best_accuracy = log_reg.score(X_test[['Intercept']], y_test)

# Best-scoring subset seen so far (empty means "baseline only").
best_subset = []

# NOTE(review): candidates are ranked by their accuracy on the held-out split,
# so the printed accuracies are optimistically biased. Consider ranking on a
# separate validation split or with cross-validation on the training data.

# Greedy forward selection: each round adds the single remaining feature that
# gives the highest accuracy, until every feature has been added.
while remaining_features:

    # Start below any possible accuracy so that a candidate is always chosen,
    # even when every candidate scores 0.
    best_feature = None
    feature_accuracy = float('-inf')

    # Try adding each remaining feature to the features selected so far
    for feature in remaining_features:

        # Train a logistic regression model on the candidate subset
        features = selected_features + [feature]
        log_reg = LogisticRegression(random_state=42, solver='lbfgs', max_iter=1000).fit(X_train[features], y_train)

        # Score the candidate subset on the held-out data
        accuracy = log_reg.score(X_test[features], y_test)

        # Strict '>' keeps the first of several equally good candidates
        if accuracy > feature_accuracy:
            best_feature = feature
            feature_accuracy = accuracy

    # Commit this round's winner, whether or not it improved on the best so far
    selected_features.append(best_feature)
    remaining_features.remove(best_feature)

    # Report (and remember) the subset only when it beats the best accuracy so far
    if feature_accuracy > best_accuracy:
        best_accuracy = feature_accuracy
        best_subset = list(selected_features)
        print(f'Selected features: {list(selected_features)}\nAccuracy: {best_accuracy:.4f}')


In [1]:
pip install mlxtend --user


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import StratifiedShuffleSplit

In [3]:
# Read 'data_process_v2.csv' into the `data` DataFrame.
data = pd.read_csv('data_process_v2.csv')


In [4]:
# Number of columns in the loaded frame (second entry of its shape).
data.shape[1]

27

In [5]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column of `data`.
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
data = data.copy()

# One encoder per column. A single shared LabelEncoder is refit on each column
# and ends up remembering only the last column's classes, which makes the
# codes of every other column impossible to map back to their labels.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    data[col] = encoder.fit_transform(data[col])

# `le` stays defined for any later cell that refers to it: as before, it is
# the encoder fitted on the last categorical column (unfitted if there is none).
le = label_encoders[categorical_cols[-1]] if categorical_cols else LabelEncoder()

In [6]:
# Model inputs: the target is 'accident_severity'; the predictors are all
# remaining columns except 'time'.
y = data['accident_severity']
X = data.drop(['accident_severity', 'time'], axis=1)

# One stratified shuffle split with a fixed seed: 80% train / 20% test.
sss = StratifiedShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
for train_index, test_index in sss.split(X, y):  # n_splits=1, so this runs once
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

In [7]:
from sklearn.linear_model import SGDClassifier

In [8]:
 # Number of predictor columns in X.
 len(X.columns)

25

In [9]:
# Linear classifier trained with SGD. The seed is fixed (42, matching the
# other seeded steps in this file) so the feature-elimination scores can be
# reproduced from one run to the next.
clf = SGDClassifier(random_state=42)

In [10]:
# Sequential backward selection around `clf`: forward=False without floating,
# target of 10 features, subsets scored by 5-fold cross-validated accuracy,
# with verbose progress logging.
sfs_options = dict(
    k_features=10,
    forward=False,
    floating=False,
    verbose=2,
    scoring="accuracy",
    cv=5,
)
backward_feature_elimination = SequentialFeatureSelector(clf, **sfs_options).fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  1.0min finished

[2023-05-04 05:21:32] Features: 24/10 -- score: 0.47326337149633035[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:   54.1s finished

[2023-05-04 05:22:26] Features: 23/10 -- score: 0.4840370847478953[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  23 out of  23 | elapsed:   54.7s finished

[2023-05-04 05:23:21] Features: 22/10 -- score: 0.4914135816963917[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Do

In [11]:
# Show the names of the features kept by the fitted selector.
backward_feature_elimination.k_feature_names_

('propulsion_code',
 'age_of_vehicle',
 'number_of_vehicles',
 'day_of_week',
 'road_type',
 'junction_control',
 '2nd_road_class',
 'pedestrian_crossing-human_control',
 'weather_conditions',
 'urban_or_rural_area')