In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [3]:
data = pd.read_csv('act_dataset.csv')

In [3]:
data.head()

Unnamed: 0,time,trt,age,wtkg,hemo,homo,drugs,karnof,oprior,z30,...,str2,strat,symptom,treat,offtrt,cd40,cd420,cd80,cd820,cid
0,948,2,48,89.8128,0,0,0,100,0,0,...,0,1,0,1,0,422,477,566,324,0
1,1002,3,61,49.4424,0,0,0,90,0,1,...,1,3,0,1,0,162,218,392,564,1
2,961,3,45,88.452,0,1,1,90,0,1,...,1,3,0,1,1,326,274,2063,1893,0
3,1166,3,47,85.2768,0,1,0,100,0,1,...,1,3,0,1,0,287,394,1590,966,0
4,1090,0,43,66.6792,0,1,0,100,0,1,...,1,3,0,0,0,504,353,870,782,0


In [4]:
data.isnull().sum()

time       0
trt        0
age        0
wtkg       0
hemo       0
homo       0
drugs      0
karnof     0
oprior     0
z30        0
zprior     0
preanti    0
race       0
gender     0
str2       0
strat      0
symptom    0
treat      0
offtrt     0
cd40       0
cd420      0
cd80       0
cd820      0
cid        0
dtype: int64

In [14]:
# Step 2: Apply Stepwise Forward Selection algorithm
X = data.drop(columns=['cid'])  # Adjust 'target_column' with your target variable
y = data['cid']

In [15]:
from time import time
tic_fwd = time()
rf = RandomForestClassifier().fit(X, y)
sfs_forward = SequentialFeatureSelector(rf, n_features_to_select='auto', direction='forward').fit(X, y)
toc_fwd = time()
print('Time taken for forward selection: ', toc_fwd - tic_fwd, 'seconds')

# Step 4: Print the features that were selected
selected_features_forward = X.columns[sfs_forward.get_support()]
print('Selected features using forward selection: ', selected_features_forward)



Time taken for forward selection:  307.04339933395386 seconds
Selected features using forward selection:  Index(['time', 'wtkg', 'hemo', 'oprior', 'zprior', 'str2', 'strat', 'treat',
       'offtrt', 'cd40', 'cd420'],
      dtype='object')


In [16]:
#save reduced dataset to csv file
reduced_data = data[selected_features_forward]
reduced_data.to_csv('sfs-reduced_act_dataset.csv', index=False)

In [20]:
X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(reduced_data, y, test_size=0.2, random_state=42)
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
# Step 5: Train various classifiers on both datasets
classifiers = {
    'RandomForest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier()
}

In [24]:
import numpy as np
# Step 6: Evaluate the performance for both original and reduced datsets of classifiers

for classifier_name, classifier in classifiers.items():
    classifier.fit(X_train_orig, y_train_orig)
    # Convert the array to a contiguous array
    X_test_orig = np.ascontiguousarray(X_test_orig)
    y_pred_orig = classifier.predict(X_test_orig)
    accuracy_orig = accuracy_score(y_test_orig, y_pred_orig)
    precision_orig = precision_score(y_test_orig, y_pred_orig, average='weighted')
    recall_orig = recall_score(y_test_orig, y_pred_orig, average='weighted')
    f1_orig = f1_score(y_test_orig, y_pred_orig, average='weighted')
    print('Classifier: ', classifier_name)
    print('Accuracy (original): ', accuracy_orig)
    print('Precision (original): ', precision_orig)
    print('Recall (original): ', recall_orig)
    print('F1 (original): ', f1_orig)

    classifier.fit(X_train_reduced, y_train_reduced)
    y_pred_reduced = classifier.predict(X_test_reduced)
    accuracy_reduced = accuracy_score(y_test_reduced, y_pred_reduced)
    precision_reduced = precision_score(y_test_reduced, y_pred_reduced, average='weighted')
    recall_reduced = recall_score(y_test_reduced, y_pred_reduced, average='weighted')
    f1_reduced = f1_score(y_test_reduced, y_pred_reduced, average='weighted')
    print('Accuracy (reduced): ', accuracy_reduced)
    print('Precision (reduced): ', precision_reduced)
    print('Recall (reduced): ', recall_reduced)
    print('F1 (reduced): ', f1_reduced)
    print('---------------------------------')



Classifier:  RandomForest
Accuracy (original):  0.8785046728971962
Precision (original):  0.8756950720264413
Recall (original):  0.8785046728971962
F1 (original):  0.8705735213931372
Accuracy (reduced):  0.8714953271028038
Precision (reduced):  0.8664600664133374
Recall (reduced):  0.8714953271028038
F1 (reduced):  0.8654223114019032
---------------------------------
Classifier:  SVM
Accuracy (original):  0.8457943925233645
Precision (original):  0.8375111417048353
Recall (original):  0.8457943925233645
F1 (original):  0.8373522539857065
Accuracy (reduced):  0.8598130841121495
Precision (reduced):  0.8552298626799207
Recall (reduced):  0.8598130841121495
F1 (reduced):  0.8566490740617647
---------------------------------




Classifier:  KNN
Accuracy (original):  0.8271028037383178
Precision (original):  0.8158892929977768
Recall (original):  0.8271028037383178
F1 (original):  0.8167407443810092
Accuracy (reduced):  0.8551401869158879
Precision (reduced):  0.8524396999064977
Recall (reduced):  0.8551401869158879
F1 (reduced):  0.853581398574723
---------------------------------
