In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn import svm
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
import plotly.express as px

# Folder that holds the input CSV files.
# A raw string keeps the backslash literal: in a normal string "\C" is an
# invalid escape sequence, which newer Python versions warn about.
directory = r'D:\CSV_files'

# Per-group accumulators for feature vectors; filled by the loading loop below.
autism = []
td = []

# Load every file in `directory` and turn it into windowed correlation features.
# Each file is cut into N_WINDOWS consecutive blocks of WINDOW_LENGTH columns.
# For each block, the correlation matrix between the rows is computed and its
# strict upper triangle (diagonal excluded) is flattened into one feature vector.
N_WINDOWS = 6
WINDOW_LENGTH = 30


def _window_features(recording, n_windows=N_WINDOWS, window_length=WINDOW_LENGTH):
    """Return one flattened upper-triangle correlation vector per column window.

    Parameters
    ----------
    recording : pandas.DataFrame
        Table read from one input file.
    n_windows : int
        Number of consecutive column windows to extract.
    window_length : int
        Number of columns in each window.

    Returns
    -------
    list of list
        `n_windows` feature vectors, in window order.
    """
    vectors = []
    for window in range(n_windows):
        start = window * window_length
        frame = recording.iloc[:, start:start + window_length]
        # Transposing first makes .corr() correlate the rows of `frame`.
        corr = np.asarray(frame.T.corr())
        upper = np.triu_indices(corr.shape[1], k=1)
        vectors.append(list(corr[upper]))
    return vectors


# os.listdir() returns entries in arbitrary order. Sorting fixes the row order,
# so the seeded train/test split further down picks the same rows on every machine.
for filename in sorted(os.listdir(directory)):
    recording = pd.read_csv(os.path.join(directory, filename), header=None)
    # A file whose name (up to the first ".") ends in "1" goes to `autism`;
    # every other file goes to `td`.
    target = autism if filename.split(".")[0].endswith("1") else td
    target.extend(_window_features(recording))

# Stack the autism rows on top of the TD rows to form one feature matrix.
X = np.vstack([autism, td])

# Labels follow the same row order: 1 for autism, 0 for TD.
autism_labels = [1 for _ in autism]
td_labels = [0 for _ in td]
y = np.array(autism_labels + td_labels)

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
# NOTE(review): the split is row-wise, and each input file contributes several
# rows, so rows from the same file can end up on both sides. Confirm that is
# intended. X_test / y_test are also not used in the cells visible here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate models, keyed by the display name used in log lines and plot titles.
# Estimators that draw random numbers while fitting get a fixed seed (the same
# value used for the train/test split) so that reruns produce the same scores.
RANDOM_STATE = 42
classifiers = {
    "SVM Linear": svm.SVC(kernel='linear'),
    "SVM Sigmoid": svm.SVC(kernel='sigmoid'),
    "SVM RBF": svm.SVC(kernel='rbf'),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "XGBoost": xgb.XGBClassifier(random_state=RANDOM_STATE),
    "MLP Classifier": MLPClassifier(random_state=RANDOM_STATE)
}

# Results of the feature-count sweep:
#   accuracy_dict[name][i] is the mean 5-fold accuracy of classifier `name`
#   when trained on the top feature_count_list[i] features.
accuracy_dict = {classifier_name: [] for classifier_name in classifiers.keys()}
feature_count_list = []

FEATURE_STEP = 100    # the sweep starts at this many features and grows by it
FEATURE_CAP = 27700   # upper bound on the sweep (also limited by the width of X)

# KFold is unshuffled here, so the folds are the same for every classifier and
# every feature count. Build them once instead of once per combination.
kf = KFold(n_splits=5)
fold_indices = list(kf.split(X_train))

max_features = min(X.shape[1], FEATURE_CAP)
for num_features in range(FEATURE_STEP, max_features + 1, FEATURE_STEP):
    # Per-classifier validation accuracies for this feature count, one per fold.
    fold_accuracies = {classifier_name: [] for classifier_name in classifiers}

    for train_index, val_index in fold_indices:
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

        # Select the top k features using the ANOVA F-value. The selector is
        # fitted on the training part of the fold only, and it does not depend
        # on the classifier, so it is fitted once and shared by all of them.
        selector = SelectKBest(score_func=f_classif, k=num_features)
        X_selected = selector.fit_transform(X_train_fold, y_train_fold)

        # Keep the same columns in the validation part of the fold.
        X_val_selected = selector.transform(X_val_fold)

        for classifier_name, classifier in classifiers.items():
            # Train on the selected features, then score on the validation fold.
            classifier.fit(X_selected, y_train_fold)
            y_pred = classifier.predict(X_val_selected)
            fold_accuracies[classifier_name].append(accuracy_score(y_val_fold, y_pred))

    # Record the feature count together with all of its results, only after
    # every fold has finished. If the sweep is stopped part-way through a
    # feature count, the x and y series used for plotting stay aligned.
    feature_count_list.append(num_features)
    for classifier_name in classifiers:
        average_accuracy = np.mean(fold_accuracies[classifier_name])
        accuracy_dict[classifier_name].append(average_accuracy)
        print(f'5-Fold Accuracy for {classifier_name} with {num_features} features: {average_accuracy * 100:.2f}%')


5-Fold Accuracy for SVM Linear with 100 features: 70.21%
5-Fold Accuracy for SVM Sigmoid with 100 features: 72.07%
5-Fold Accuracy for SVM RBF with 100 features: 74.30%
5-Fold Accuracy for Random Forest with 100 features: 70.40%
5-Fold Accuracy for Logistic Regression with 100 features: 73.18%
5-Fold Accuracy for KNN with 100 features: 75.24%
5-Fold Accuracy for XGBoost with 100 features: 67.79%




5-Fold Accuracy for MLP Classifier with 100 features: 72.26%




In [2]:

# Plot accuracy against the number of selected features, one figure per
# classifier, and save each figure as a static JPG and an interactive HTML page.

# The feature matrix is identical for every classifier, so its DataFrame is
# built once here rather than on every loop iteration.
# NOTE(review): this is X exactly as assembled above. It is neither ranked nor
# reduced to the selected features, and every classifier gets an identical copy.
# Confirm whether the accuracy-curve data was meant to be exported instead.
ranked_feature_df = pd.DataFrame(X)

for classifier_name, accuracy_list in accuracy_dict.items():
    # If the sweep was stopped part-way, the two series can differ in length
    # and px.line would reject them. Plot only the points that have both values.
    n_points = min(len(feature_count_list), len(accuracy_list))
    fig = px.line(x=feature_count_list[:n_points], y=accuracy_list[:n_points],
                  labels={"x": "Number of Top Features", "y": "Accuracy"},
                  title=f"Accuracy Plot for {classifier_name}")

    fig.update_layout(legend_title_text="Classifier")

    # Output file names are derived from the classifier's display name.
    image_file_name = f"accuracy_plot_{classifier_name}.jpg"
    html_file_name = f"accuracy_plot_{classifier_name}_interactive.html"
    csv_file_name = f"csvdata_accuracy_plot_{classifier_name}.csv"

    # Save the figure as a JPG image and as a standalone interactive HTML file.
    fig.write_image(image_file_name)
    fig.write_html(html_file_name)

    # Write the feature matrix next to the plots, without the index column.
    ranked_feature_df.to_csv(csv_file_name, index=False)
