# In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold, LeaveOneOut
from scipy.spatial.distance import pdist, squareform
from matplotlib.patches import Ellipse
from scipy.stats import chi2


def evaluate_models(directory_path, output_path="all_first.csv"):
    """Evaluate an LDA classifier on every CSV file in a directory.

    For each ``*.csv`` file in ``directory_path`` the rows with ``test == 1``
    are selected; the ``sample`` column is used as the class label and every
    remaining column except ``test`` as a feature. Three figures are recorded
    per file:

    * mean accuracy over a shuffled 5-fold stratified cross-validation,
    * mean accuracy over leave-one-out cross-validation,
    * mean pairwise Euclidean distance between the class centroids in the
      LDA-projected space.

    Accuracies are stored as percentages. One row per file is written to
    ``output_path`` as CSV, and a confirmation line is printed.

    Args:
        directory_path: Directory that is scanned (non-recursively) for files
            whose name ends in ``.csv``.
        output_path: Destination of the summary CSV. Defaults to
            ``"all_first.csv"`` in the current working directory.

    Returns:
        None. The results are written to ``output_path``.
    """
    result_columns = [
        'File',
        'Stratified K-Fold Accuracy',
        'LOO CV Accuracy',
        'Mean Centroid Distance',
    ]

    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the results file is reproducible between runs and machines.
    csv_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.csv'))
    results = []

    for csv_file in csv_files:
        polymers = pd.read_csv(os.path.join(directory_path, csv_file))
        polymers_data = polymers[polymers["test"] == 1]
        y = polymers_data["sample"]
        x = polymers_data.drop(["test", "sample"], axis=1)

        # LDA can produce at most min(n_classes - 1, n_features) components.
        max_components = min(y.nunique() - 1, x.shape[1])

        stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
        kfold_accuracy = _lda_cv_accuracy(x, y, stratified_kfold.split(x, y))
        loo_accuracy = _lda_cv_accuracy(x, y, LeaveOneOut().split(x))

        results.append({
            'File': csv_file,
            'Stratified K-Fold Accuracy': kfold_accuracy * 100,
            'LOO CV Accuracy': loo_accuracy * 100,
            'Mean Centroid Distance': _lda_mean_centroid_distance(x, y, max_components),
        })

    # Passing the columns explicitly keeps the header present even when the
    # directory contains no CSV files.
    results_df = pd.DataFrame(results, columns=result_columns)
    results_df.to_csv(output_path, index=False)
    print(f"\nResults saved to '{output_path}'.")


def _lda_cv_accuracy(x, y, splits):
    """Return the mean held-out LDA accuracy over ``(train, test)`` index splits."""
    fold_scores = []
    for train_index, test_index in splits:
        # n_components only affects transform(), not prediction, so it is left
        # at its default here; forcing it would make fit() raise whenever a
        # training fold happens to be missing one of the classes.
        lda = LinearDiscriminantAnalysis().fit(x.iloc[train_index], y.iloc[train_index])
        fold_scores.append(lda.score(x.iloc[test_index], y.iloc[test_index]))
    return np.mean(fold_scores)


def _lda_mean_centroid_distance(x, y, n_components):
    """Return the mean pairwise distance between class centroids in LDA space."""
    # Fit on all selected rows so the projection does not depend on which
    # sample a cross-validation fold happened to leave out.
    lda = LinearDiscriminantAnalysis(n_components=n_components).fit(x, y)
    x_scores = lda.transform(x)
    x_scores_df = pd.DataFrame(
        x_scores,
        columns=[f'Component {i + 1}' for i in range(x_scores.shape[1])],
    )
    x_scores_df['sample'] = y.values
    centroids = x_scores_df.groupby('sample').mean()
    return np.mean(pdist(centroids.values, metric='euclidean'))

# Entry point of the cell: evaluate every CSV file found in the data folder.
# Change the path below to point at your own dataset directory.
directory_path = "data/all_first"
evaluate_models(directory_path=directory_path)