In [95]:
from sklearn.model_selection import train_test_split

# models will be created using six algorithms
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

#feature select methods
from sklearn.feature_selection import RFE, SelectFromModel, SelectKBest, f_classif

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score


In [96]:
from scipy.io import arff
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler

data = arff.loadarff('project-2018-BRFSS-arthritis.arff')
df = pd.DataFrame(dataset[0])
df.head()
df = df.replace(b'?', np.nan)

In [97]:
# Create an imputer for categorical columns with strategy as 'most_frequent' (mode)
cat_imputer = SimpleImputer(strategy='most_frequent')

# Create an imputer for numerical columns with strategy as 'mean'
num_imputer = SimpleImputer(strategy='mean')

In [98]:
# for categorical column fill the missing value with mode, for numeric value fill the missing value with mean 
for column in df.columns[:-1]:
    if df[column].dtype == 'O':
        df[column] = cat_imputer.fit_transform(df[column].values.reshape(-1, 1))
    else:
        df[column] = num_imputer.fit_transform(df[column].values.reshape(-1, 1))
        
# Replace infinity values with a large finite value
df = df.replace([np.inf, -np.inf], np.finfo(np.float64).max)

In [114]:
# drop the rows with missing value in class attribute column
df_dropped = df.dropna()

In [101]:
X = df.drop('havarth3', 1) #Fearture Matrix
y = df['havarth3']         #Target Variable        

# Scale the data to avoid values too large for float64
scaler = StandardScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

  X = df.drop('havarth3', 1) #Fearture Matrix


In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

#convert the multi-label data into a binary array
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_train = y_train.argmax(axis=1)
y_test = mlb.fit_transform(y_test)
y_test = y_test.argmax(axis=1)

In [103]:
# Select 8 best features based on ANOVA F-value
selector = SelectKBest(f_classif, k=8)
X_train_new = selector.fit_transform(X_train, y_train)
X_test_new = selector.transform(X_test)

In [121]:
# Define the models
models = {
    'Naive Bayes': GaussianNB(),
    # increase the number of iterations allowed for the logistic regression model to 1000
    'Logistic': LogisticRegression(max_iter=1000),
    'SVM': SVC(probability=True),
    'Decision Tree': DecisionTreeClassifier(),
    'Neural': MLPClassifier(),
    'KNN': KNeighborsClassifier()
}

In [105]:
# Feature selection methods
feature_selection_methods = {
    'RFE': RFE(estimator=LogisticRegression(), n_features_to_select=8),
    'SelectFromModel': SelectFromModel(LogisticRegression()),
    'SelectKBest': SelectKBest(f_classif, k=8)
}

In [124]:
#create a dataframe to store the results
results = pd.DataFrame(columns=['Feature Selection Method', 'Model', 'Accuracy', 'Precision', 'Recall', 'AUC-ROC'])


# Apply feature selection methods and models
for fs_name, fs_method in feature_selection_methods.items():
    X_train_new = fs_method.fit_transform(X_train, y_train)
    X_test_new = fs_method.transform(X_test)
    
    print(f"Feature Selection Method: {fs_name}")
    for model_name, model in models.items():
        model.fit(X_train_new, y_train)
        y_pred = model.predict(X_test_new)
        y_prob = model.predict_proba(X_test_new)[:, 1]

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=1)
        recall = recall_score(y_test, y_pred)
        auc_roc = roc_auc_score(y_test, y_prob)
        
        results = results.append({
            'Feature Selection Method': fs_name,
            'Model': model_name,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'AUC-ROC': auc_roc
        }, ignore_index=True)
        

Feature Selection Method: RFE
Feature Selection Method: SelectFromModel
Feature Selection Method: SelectKBest


In [123]:
results

Unnamed: 0,Feature Selection Method,Model,Accuracy,Precision,Recall,AUC-ROC
0,RFE,Naive Bayes,0.485255,0.753623,0.339528,0.584785
1,RFE,Logistic,0.667225,0.667225,1.0,0.56254
2,RFE,SVM,0.667225,0.667225,1.0,0.497715
3,RFE,Decision Tree,0.592828,0.701245,0.679056,0.549498
4,RFE,Neural,0.332775,1.0,0.0,0.5
5,RFE,KNN,0.595845,0.676088,0.756906,0.529722
6,SelectFromModel,Naive Bayes,0.667225,0.667225,1.0,0.568833
7,SelectFromModel,Logistic,0.667225,0.667225,1.0,0.562588
8,SelectFromModel,SVM,0.667225,0.667225,1.0,0.545103
9,SelectFromModel,Decision Tree,0.658512,0.666438,0.977398,0.536708
