<a href="https://colab.research.google.com/github/Shubham2004yadav/temp/blob/main/LLPS_vs_non_LLPS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/Zhou-Yetong/Opt_PredLLPS.git

In [None]:
!ls Opt_PredLLPS/datasets
!pip install biopython

In [None]:
!git clone https://github.com/Zhou-Yetong/Opt_PredLLPS.git


In [None]:
# List the dataset files; the cells below read LLPS.fasta and non_LLPS.fasta from this directory.
!ls Opt_PredLLPS/datasets


In [None]:
from Bio import SeqIO
import pandas as pd

# Parse the positive (LLPS) and negative (non-LLPS) FASTA files, in that order,
# and keep every record of each file in memory as a list.
llps_records, non_llps_records = (
    list(SeqIO.parse(fasta_path, "fasta"))
    for fasta_path in (
        "Opt_PredLLPS/datasets/LLPS.fasta",
        "Opt_PredLLPS/datasets/non_LLPS.fasta",
    )
)


In [None]:
# One table per class: the raw sequence string plus a constant class label.
llps_df = pd.DataFrame({
    'sequence': list(map(lambda rec: str(rec.seq), llps_records)),
    'label': ['LLPS' for _ in llps_records],
})

non_llps_df = pd.DataFrame({
    'sequence': list(map(lambda rec: str(rec.seq), non_llps_records)),
    'label': ['non-LLPS' for _ in non_llps_records],
})

# Stack positives on top of negatives and renumber the rows 0..n-1.
df = pd.concat([llps_df, non_llps_df], ignore_index=True)
df.sample(5)  # View random 5 rows


In [None]:
!pip install biopython


In [None]:

from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Define amino acid volume (Å^3) and polarity (Grantham scale) dictionaries
# Both tables are keyed by one-letter residue code and hold one entry for each of the
# 20 standard amino acids. compute_features reads them with .get(aa, 0), so any other
# symbol contributes 0 to the per-sequence average.
aa_volume = {
    'A':  88.6, 'R': 173.4, 'N': 114.1, 'D': 111.1, 'C': 108.5,
    'Q': 143.8, 'E': 138.4, 'G':  60.1, 'H': 153.2, 'I': 166.7,
    'L': 166.7, 'K': 168.6, 'M': 162.9, 'F': 189.9, 'P':  112.7,
    'S':  89.0, 'T':  116.1, 'W': 227.8, 'Y': 193.6, 'V':  140.0
}
aa_polarity = {
    'A':   8.1, 'R':  10.5, 'N':  11.6, 'D':  13.0, 'C':   5.5,
    'Q':  10.5, 'E':  12.3, 'G':   9.0, 'H':  10.4, 'I':   5.2,
    'L':   4.9, 'K':  11.3, 'M':   5.7, 'F':   5.2, 'P':   8.0,
    'S':   9.2, 'T':   8.6, 'W':   5.4, 'Y':   6.2, 'V':   5.9
}

def compute_features(seq):
    """Return six physicochemical descriptors of one protein sequence.

    The sequence is upper-cased first. The result is a pandas Series with the
    keys 'hydrophobicity' (GRAVY), 'aromaticity', 'pI', 'net_charge' (at pH 7.0),
    'volume' and 'polarity'. The last two are per-residue means taken from the
    module-level aa_volume / aa_polarity tables; residues missing from a table
    count as 0 but still count towards the sequence length.
    """
    residues = seq.upper()
    protein = ProteinAnalysis(residues)

    descriptors = {}
    descriptors['hydrophobicity'] = protein.gravy()
    descriptors['aromaticity'] = protein.aromaticity()
    descriptors['pI'] = protein.isoelectric_point()
    descriptors['net_charge'] = protein.charge_at_pH(7.0)

    # Mean table value over the whole sequence, one table at a time.
    n_residues = len(residues)
    for key, table in (('volume', aa_volume), ('polarity', aa_polarity)):
        descriptors[key] = sum(table.get(res, 0) for res in residues) / n_residues

    return pd.Series(descriptors)

# Apply to each sequence
features_df = df['sequence'].apply(compute_features)
# Remove feature columns left by an earlier run of this cell before joining, otherwise
# every re-run appends a second copy and df[['hydrophobicity', ...]] returns duplicates.
df = pd.concat(
    [df.drop(columns=features_df.columns, errors='ignore'), features_df],
    axis=1,
)
df.head()


In [None]:
# Numeric target: 1 = LLPS, 0 = non-LLPS (any other label value would map to NaN).
df['label_enc'] = df['label'].map({'LLPS': 1, 'non-LLPS': 0})


In [None]:
# Feature matrix (the six descriptors from compute_features) and the binary target column.
X = df[['hydrophobicity','volume','polarity','aromaticity','pI','net_charge']]
y = df['label_enc']


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=y keeps the class ratio the same in
# both parts, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=104, stratify=y)


In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, stratify=y)  # no random_state


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyper-parameters; random_state only fixes tie-breaking.
clf = DecisionTreeClassifier(random_state=104)
clf.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Score the fitted tree on the held-out split.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy:.2f}")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", cm)



# Optional: detailed classification report
# target_names follow the label encoding used for y: 0 -> 'non-LLPS', 1 -> 'LLPS'.
print(classification_report(y_test, y_pred, target_names=['non-LLPS','LLPS']))


In [None]:
# The reported model keeps the fixed random state defined above. This cell is exploratory
# only: it scans seeds to see which one gives the highest test accuracy.
# It works on its own trial_* variables so that clf, X_train, X_test, y_train, y_test and
# y_pred from the cells above are NOT overwritten (the tree plotted below is therefore the
# fixed-seed model, not whichever seed happened to run last).
# NOTE: a seed chosen by its test-set accuracy is selected on the test data, so the best
# score printed here is optimistic and should not be reported as model performance.

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

best_accuracy = 0
best_seed = None

for seed in range(1000):
    trial_X_train, trial_X_test, trial_y_train, trial_y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y)

    trial_clf = DecisionTreeClassifier(random_state=seed)
    trial_clf.fit(trial_X_train, trial_y_train)
    acc = accuracy_score(trial_y_test, trial_clf.predict(trial_X_test))

    # Strict '>' keeps the lowest seed among ties.
    if acc > best_accuracy:
        best_accuracy = acc
        best_seed = seed

print(f"Best accuracy: {best_accuracy:.2f} with seed: {best_seed}")



In [None]:
import matplotlib.pyplot as plt
from sklearn import tree

plt.figure(figsize=(12,8))
# feature_names is passed as a plain list: a list is accepted by every scikit-learn
# release, whereas a pandas Index has been rejected by parameter validation in some.
tree.plot_tree(clf, feature_names=list(X.columns), class_names=['non-LLPS','LLPS'], filled=True)
plt.show()


In [None]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Predefined property scales
# One entry per standard amino acid, keyed by one-letter code. These rebind the names
# aa_volume / aa_polarity, which compute_physicochemical_features reads with .get(aa, 0).
aa_volume = {
    'A': 88.6, 'R': 173.4, 'N': 114.1, 'D': 111.1, 'C': 108.5,
    'Q': 143.8, 'E': 138.4, 'G': 60.1, 'H': 153.2, 'I': 166.7,
    'L': 166.7, 'K': 168.6, 'M': 162.9, 'F': 189.9, 'P': 112.7,
    'S': 89.0, 'T': 116.1, 'W': 227.8, 'Y': 193.6, 'V': 140.0
}
aa_polarity = {
    'A': 8.1, 'R': 10.5, 'N': 11.6, 'D': 13.0, 'C': 5.5,
    'Q': 10.5, 'E': 12.3, 'G': 9.0, 'H': 10.4, 'I': 5.2,
    'L': 4.9, 'K': 11.3, 'M': 5.7, 'F': 5.2, 'P': 8.0,
    'S': 9.2, 'T': 8.6, 'W': 5.4, 'Y': 6.2, 'V': 5.9
}

def compute_physicochemical_features(sequence):
    """Return a dict of eight physicochemical descriptors for one protein sequence.

    Keys, in insertion order: 'hydrophobicity' (GRAVY), 'aromaticity', 'volume',
    'polarity', 'pI', 'net_charge' (at pH 7.0), 'instability_index' and
    'molecular_weight'. 'volume' and 'polarity' are per-residue means over the
    upper-cased sequence using the module-level aa_volume / aa_polarity tables;
    residues absent from a table contribute 0.
    """
    residues = sequence.upper()
    protein = ProteinAnalysis(residues)

    def mean_over_residues(table):
        # Average table value across the full sequence length.
        return sum(table.get(res, 0) for res in residues) / len(residues)

    features = {}
    features['hydrophobicity'] = protein.gravy()
    features['aromaticity'] = protein.aromaticity()
    features['volume'] = mean_over_residues(aa_volume)
    features['polarity'] = mean_over_residues(aa_polarity)
    features['pI'] = protein.isoelectric_point()
    features['net_charge'] = protein.charge_at_pH(7.0)
    features['instability_index'] = protein.instability_index()
    features['molecular_weight'] = protein.molecular_weight()
    return features


In [None]:
# Compute the descriptor dict for every sequence. A sequence whose analysis raises is
# reported and left out of both lists, so feature_dicts and labels stay the same length.
feature_dicts = []
labels = []

for seq, label_name in zip(df['sequence'], df['label']):
    label = 1 if label_name == 'LLPS' else 0
    try:
        feats = compute_physicochemical_features(seq)
    except Exception as e:
        print(f"Skipping sequence due to error: {e}")
    else:
        feature_dicts.append(feats)
        labels.append(label)


In [None]:
# One row per successfully analysed sequence; 'label' is 1 for LLPS and 0 otherwise.
# Rows follow the order of feature_dicts, which omits any skipped sequence, so the
# index here is not guaranteed to line up with df.
features_df = pd.DataFrame(feature_dicts)
features_df['label'] = labels
features_df.head()


In [None]:
# Dictionary format: {index: {'hydrophobicity': ..., 'volume': ..., ...}, ...}
# Keys are the row index of features_df; the 'label' column is excluded.
feature_dictionary = features_df.drop(columns=['label']).to_dict(orient='index')


In [None]:
# Row count of df (every loaded sequence, including any skipped during feature extraction).
print(f"Analyzing {len(df)} proteins in the dataset")

In [None]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from sklearn.preprocessing import StandardScaler

X = features_df.drop(columns=['label']).values
y = features_df['label'].values

# PCA
# PCA follows raw variance, so the features are standardised first; otherwise
# molecular_weight, which is on a far larger numeric scale than fractions such as
# aromaticity, would determine the leading components almost by itself.
X_std = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_std)

# FDA (LDA)
# Two classes allow at most one discriminant axis, hence n_components=1.
lda = LDA(n_components=1)
X_lda = lda.fit_transform(X, y)


In [None]:
import pandas as pd

# Suppose df is already defined and has a column 'sequence'
# Example:
# df = pd.DataFrame({'sequence': ['MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQANL...',
#                                 'GATRAGGATGAGGAGAGAGAGAGTGCTAGCTCCTG...',
#                                 ...],
#                    'label':   ['LLPS', 'non-LLPS', ...]})

# List the 20 standard amino acids in a fixed order
aa_list = ['A','C','D','E','F','G','H','I','K','L',
           'M','N','P','Q','R','S','T','V','W','Y']

def compute_aac_dict(sequence):
    """
    Given a protein sequence string, return a dict of
    normalized frequencies for each of the 20 amino acids.

    Keys follow the order of aa_list. The sequence is upper-cased first, and
    frequencies are divided by the full sequence length, so symbols outside
    aa_list lower the total below 1. An empty sequence yields 0.0 for every
    amino acid instead of raising ZeroDivisionError.
    """
    seq = sequence.upper()
    length = len(seq)
    if length == 0:
        return {aa: 0.0 for aa in aa_list}
    # str.count gives the number of occurrences of each one-letter code.
    return {aa: seq.count(aa) / length for aa in aa_list}

# Build the full dictionary for all rows in df
aac_dict = {}
for idx, row in df.iterrows():
    seq = row['sequence']
    aac_dict[idx] = compute_aac_dict(seq)

# Example: print AAC for the first 3 proteins
# Slicing the keys keeps the preview short and cannot raise KeyError on a small dataset.
for i in list(aac_dict)[:3]:
    print(f"Protein index {i}:")
    print(aac_dict[i])
    print()


In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import matplotlib.pyplot as plt
import random

# Assuming df is already defined with 'sequence' and 'label' columns
# Example df:
# df = pd.DataFrame({
#     'sequence': ['MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQANL',
#                  'GATRAGGATGAGGAGAGAGAGAGTGCTAGCTCCTG',
#                  ...],
#     'label': ['LLPS', 'non-LLPS', ...]
# })

# List of 20 standard amino acids
aa_list = ['A','C','D','E','F','G','H','I','K','L',
           'M','N','P','Q','R','S','T','V','W','Y']

def compute_aac_dict(sequence):
    """Compute amino acid composition for a given sequence.

    Returns a dict mapping each entry of aa_list to its count divided by the
    length of the upper-cased sequence. Symbols outside aa_list are not counted
    but still contribute to the length. An empty sequence returns 0.0 for every
    amino acid instead of raising ZeroDivisionError.
    """
    seq = sequence.upper()
    length = len(seq)
    if length == 0:
        return {aa: 0.0 for aa in aa_list}
    return {aa: seq.count(aa) / length for aa in aa_list}

# Compute AAC for all sequences
aac_data = [compute_aac_dict(seq) for seq in df['sequence']]
aac_df = pd.DataFrame(aac_data, index=df.index)

# Randomly select 5 amino acids
# A dedicated seeded generator makes the selection (and the figure below) reproducible
# without touching the global random state; change the seed to draw a different subset.
random_aas = random.Random(42).sample(aa_list, 5)
print("Randomly selected amino acids:", random_aas)

# Prepare data for analysis
X = aac_df[random_aas].values
y = df['label'].values

# PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# LDA (Fisher's Linear Discriminant)
lda = LDA(n_components=1)
X_lda = lda.fit_transform(X, y)

# Side-by-side view: PCA scatter on the left, 1-D LDA projection on the right.
fig, (ax_pca, ax_lda) = plt.subplots(1, 2, figsize=(12, 5))

# PCA panel: one scatter series per class label
for label in np.unique(y):
    members = y == label
    ax_pca.scatter(X_pca[members, 0], X_pca[members, 1], label=label)
ax_pca.set_xlabel('Principal Component 1')
ax_pca.set_ylabel('Principal Component 2')
ax_pca.set_title('PCA of Random 5 Amino Acids')
ax_pca.legend()

# LDA panel: the single discriminant is drawn along x, with every point at y = 0
for label in np.unique(y):
    projected = X_lda[y == label]
    ax_lda.scatter(projected, np.zeros_like(projected), label=label)
ax_lda.set_xlabel('Linear Discriminant')
ax_lda.set_yticks([])
ax_lda.set_title('LDA of Random 5 Amino Acids')
ax_lda.legend()

fig.tight_layout()
plt.show()

# Numeric summaries of both fits
print("PCA explained variance ratio:", pca.explained_variance_ratio_)
print("LDA coefficients:", lda.coef_)

In [None]:
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score
from tqdm import tqdm  # for progress bar

# Assuming df has 'sequence' and 'label' columns
aa_list = ['A','C','D','E','F','G','H','I','K','L',
           'M','N','P','Q','R','S','T','V','W','Y']

def compute_aac_dict(sequence):
    """Return the amino acid composition of a sequence as {residue: frequency}.

    Frequencies are counts of each aa_list entry in the upper-cased sequence
    divided by the sequence length. An empty sequence returns 0.0 for every
    residue instead of raising ZeroDivisionError.
    """
    seq = sequence.upper()
    length = len(seq)
    if length == 0:
        return {aa: 0.0 for aa in aa_list}
    return {aa: seq.count(aa) / length for aa in aa_list}

# Compute AAC for all sequences
# aac_df has one row per sequence (same index as df) and one column per amino acid.
aac_data = [compute_aac_dict(seq) for seq in df['sequence']]
aac_df = pd.DataFrame(aac_data, index=df.index)
# X_all is the full 20-column matrix; evaluate_features below selects its own columns from aac_df.
X_all = aac_df[aa_list].values
y = df['label'].map({'LLPS': 1, 'non-LLPS': 0}).values  # Convert to binary

# Function to evaluate feature combination
def evaluate_features(features):
    """Score one combination of amino-acid columns with PCA followed by LDA.

    Parameters
    ----------
    features : iterable of str
        Column names of the module-level aac_df to use.

    Returns
    -------
    (float, features)
        Accuracy of the LDA predictions and the unchanged `features` argument.
        The accuracy is measured on the same rows the model was fitted on
        (module-level y), so it is a training accuracy, not a held-out estimate.
        If fitting or prediction raises, the accuracy is reported as 0.0.
    """
    X = aac_df[list(features)].values

    # PCA + LDA evaluation
    try:
        # Reduce to 2 components with PCA
        X_pca = PCA(n_components=2).fit_transform(X)

        # Apply LDA on PCA results
        lda = LDA()
        lda.fit(X_pca, y)

        # Get predictions
        acc = accuracy_score(y, lda.predict(X_pca))

        # Return both accuracy and the features
        return acc, features
    except Exception:
        # A failed fit scores 0.0. Catching Exception rather than using a bare except
        # lets KeyboardInterrupt through, so the long combination loop can be stopped.
        return 0.0, features

# Score every 5-element subset of the 20 amino acids
all_combinations = list(combinations(aa_list, 5))

print(f"Evaluating {len(all_combinations)} combinations...")
results = [evaluate_features(combo) for combo in tqdm(all_combinations)]

# Highest accuracy first (ties keep their original relative order)
results.sort(key=lambda pair: pair[0], reverse=True)

# Leaderboard
print("\nTop 10 performing 5-feature combinations:")
for rank, (acc, features) in enumerate(results[:10], start=1):
    print(f"{rank}. Features: {features} | Accuracy: {acc:.4f}")

# Winner
best_acc, best_features = results[0]
print(f"\nBest 5-feature combination: {best_features} with accuracy {best_acc:.4f}")

# Per amino acid: mean accuracy over all combinations that contain it.
# Totals and appearance counts are gathered in one pass, then divided.
feature_performance = {aa: 0 for aa in aa_list}
appearances = {aa: 0 for aa in aa_list}
for acc, features in results:
    for aa in features:
        feature_performance[aa] += acc
        appearances[aa] += 1

for aa in feature_performance:
    feature_performance[aa] /= appearances[aa]

print("\nAverage accuracy contribution of each amino acid:")
for aa, score in sorted(feature_performance.items(), key=lambda item: -item[1]):
    print(f"{aa}: {score:.4f}")

# Visualize top combinations
import matplotlib.pyplot as plt

top_n = 10
leaders = results[:top_n]
top_accs = [score for score, _ in leaders]
top_feature_sets = [','.join(members) for _, members in leaders]

# Horizontal bars, drawn in reverse so the best combination ends up at the top.
fig, ax = plt.subplots(figsize=(12, 6))
positions = range(top_n)
ax.barh(positions, top_accs[::-1], color='skyblue')
ax.set_yticks(positions)
ax.set_yticklabels(top_feature_sets[::-1])
ax.set_xlabel('Accuracy')
ax.set_title(f'Top {top_n} 5-Feature Combinations Performance')
fig.tight_layout()
plt.show()

In [None]:
# Number of rows currently in df.
print(f"Analyzing {len(df)} proteins in the dataset")

In [None]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# --- Load Data (Assuming df has 'sequence' and 'label') ---
aa_list = ['A','C','D','E','F','G','H','I','K','L',
           'M','N','P','Q','R','S','T','V','W','Y']

# --- Compute AAC (Amino Acid Composition) ---
def compute_aac(sequence):
    """Return {residue: frequency} for the 20 residues in aa_list.

    Each frequency is the residue's count in the upper-cased sequence divided
    by the sequence length. An empty sequence returns 0.0 for every residue
    instead of raising ZeroDivisionError.
    """
    seq = sequence.upper()
    length = len(seq)
    if length == 0:
        return {aa: 0.0 for aa in aa_list}
    return {aa: seq.count(aa)/length for aa in aa_list}

# Create AAC DataFrame
# Columns come out in aa_list order because compute_aac builds its dict in that order.
aac_data = [compute_aac(seq) for seq in df['sequence']]
aac_df = pd.DataFrame(aac_data)
y = df['label'].map({'LLPS': 1, 'non-LLPS': 0})  # Binary labels

# --- LDA Model ---
lda = LDA()

# --- Evaluate with 5-Fold Cross-Validation ---
# cross_val_score fits clones of `lda`, so the estimator itself is still unfitted afterwards.
cv_scores = cross_val_score(lda, aac_df, y, cv=5, scoring='accuracy')
mean_accuracy = np.mean(cv_scores)
std_accuracy = np.std(cv_scores)

print(f"LDA Mean Accuracy (5-Fold CV): {mean_accuracy:.4f} ± {std_accuracy:.4f}")

# --- Train LDA on Full Data (for feature analysis) ---
lda.fit(aac_df, y)

# --- Get Feature Coefficients (Importance) ---
# NOTE(review): sorting by the signed coefficient lists the most positive values first,
# so strongly negative coefficients never reach the top 10 - confirm this is intended.
feature_importance = pd.DataFrame({
    'Amino Acid': aa_list,
    'LDA Coefficient': lda.coef_[0]
}).sort_values('LDA Coefficient', ascending=False)

print("\nTop Discriminative Amino Acids:")
print(feature_importance.head(10))

In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import cross_val_score
# from itertools import combinations
# import random
# from tqdm import tqdm

# aa_list = ['A','C','D','E','F','G','H','I','K','L',
#            'M','N','P','Q','R','S','T','V','W','Y']

# def compute_aac(sequence):
#     seq = sequence.upper()
#     length = len(seq)
#     return {aa: seq.count(aa)/length for aa in aa_list}

# aac_data = [compute_aac(seq) for seq in df['sequence']]
# aac_df = pd.DataFrame(aac_data)
# y = df['label'].map({'LLPS': 1, 'non-LLPS': 0})

# clf = RandomForestClassifier(n_estimators=100, random_state=42)
# clf.fit(aac_df, y)

# top_10_aa = aac_df.columns[np.argsort(clf.feature_importances_)[-10:]].tolist()
# print(f"Top 10 Candidate Amino Acids: {top_10_aa}")

# candidate_combos = list(combinations(top_10_aa, 5))
# random.shuffle(candidate_combos)

# best_acc = 0
# best_combo = None

# for combo in tqdm(candidate_combos[:500], desc="Testing Top Combos"):  # Early stop after 500
#     X = aac_df[list(combo)]
#     acc = cross_val_score(clf, X, y, cv=3, scoring='accuracy').mean()  # 3-fold for speed
#     if acc > best_acc:
#         best_acc = acc
#         best_combo = combo
#         print(f"New Best: {combo} | Accuracy: {acc:.4f}")

# print(f"\n🔥 Best 5 Features: {best_combo} | Accuracy: {best_acc:.4f}")
# clf_final = RandomForestClassifier(n_estimators=200, random_state=42)
# clf_final.fit(aac_df[list(best_combo)], y)

# pd.Series(clf_final.feature_importances_, index=best_combo).sort_values().plot.barh()
# plt.xlabel("Importance")
# plt.title("Top 5 Feature Importance")
# plt.show()

In [None]:
# Colab-only helper: lets the user pick local files to upload into the session.
from google.colab import files
uploaded = files.upload()  # Opens a file picker dialog

In [None]:
# Load protein data from LLPS.xls and build feature dictionary with random values whenever missing
import pandas as pd
import random

# Load the Excel file (must be uploaded to Colab first)
# NOTE: this rebinds df, replacing the FASTA-derived table used by the cells above.
df = pd.read_excel("LLPS.xls")

# Preview column names (for your reference; remove/comment out if you don’t need to see this)
print("Columns in sheet:", list(df.columns))

# The list of features you expect
# Any name here that is not a column of the sheet is filled with a random value below.
feature_columns = [
    "molecular_weight", "aromaticity", "hydrophobicity", "instability_index",
    "aliphatic_index", "isoelectric_point", "net_charge_pH7", "flexibility",
    "volume_mean", "polarity_mean", "charge_density", "entropy_shannon",
    "percent_disorder", "fraction_hydrophobic", "fraction_polar",
    "fraction_charged", "fraction_positive", "fraction_negative",
    "temperature_optimum", "ph_optimum"
]

# Create a nested dictionary: protein_name -> {feature_name: value}
# WARNING: missing cells, '-' placeholders and absent columns are all filled with an
# unseeded random float in [0, 1). Those entries are placeholders, not measurements,
# and they change on every run.
protein_feature_dict = {}

for _, row in df.iterrows():
    protein_name = row.get("Protein name", None)
    # row.get returns None only when the column itself is absent; a blank cell comes back
    # as NaN. Skip both so that NaN never ends up as a dictionary key.
    if protein_name is None or pd.isna(protein_name):
        # If your column is named differently, fix it here
        continue

    features = {}
    for feat in feature_columns:
        if feat in df.columns:
            raw_val = row[feat]
            # If the cell is '-' (as a string) or is NaN, assign a random float between 0 and 1
            if (isinstance(raw_val, str) and raw_val.strip() == "-") or pd.isna(raw_val):
                val = random.random()
            else:
                val = raw_val
        else:
            # Column not present at all → give a random value
            val = random.random()

        features[feat] = val

    # Rows sharing a protein name overwrite each other; the last one wins.
    protein_feature_dict[protein_name] = features

# Dump every protein: a header line, one indented line per feature, then a blank line.
for protein, feats in protein_feature_dict.items():
    report_lines = [f"{protein}:"]
    report_lines.extend(f"  {feat_name}: {value}" for feat_name, value in feats.items())
    print("\n".join(report_lines), end="\n\n")


In [None]:
import itertools

def funct(dict):
    """Return every 5-key combination of the mapping's keys as a list of tuples.

    Combinations follow the key order of the mapping.
    Raises ValueError when the mapping has fewer than 5 keys.
    """
    key_names = [key for key in dict.keys()]
    if not len(key_names) >= 5:
        raise ValueError("Dictionary must contain at least 5 keys")
    return [combo for combo in itertools.combinations(key_names, 5)]

# Hand-written example values for one protein; only the 20 keys matter to funct below.
feature_dict = {
    'hydrophobicity': 0.5,
    'aromaticity': 0.3,
    'isoelectric_point': 7.0,
    'net_charge_pH7': -1.2,
    'instability_index': 45.0,
    'molecular_weight': 25000,
    'flexibility': 0.8,
    'volume_mean': 120.5,
    'polarity_mean': 8.2,
    'fraction_hydrophobic': 0.4,
    'fraction_polar': 0.3,
    'fraction_charged': 0.2,
    'fraction_positive': 0.1,
    'fraction_negative': 0.1,
    'percent_disorder': 0.6,
    'aliphatic_index': 85.0,
    'charge_density': -0.0001,
    'entropy_shannon': 2.5,
    'temperature_optimum': 37.0,
    'ph_optimum': 7.4
}

# All 5-key combinations of the example feature names.
combinations = funct(feature_dict)

# Print first 5 combinations as example
print("Total combinations:", len(combinations))
print("First 5 combinations:")
# Slice to the five announced above instead of printing the whole list.
for combo in combinations[:5]:
    print(combo)

In [None]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Synthetic demo data: X is standard-normal noise and y is random 0/1 labels, so the
# "best" combination found below carries no biological meaning.
np.random.seed(42)
num_samples = 100
feature_names = ['hydrophobicity', 'aromaticity', 'isoelectric_point', 'net_charge_pH7','instability_index', 'molecular_weight', 'flexibility', 'volume_mean','polarity_mean', 'fraction_hydrophobic', 'fraction_polar', 'fraction_charged','fraction_positive', 'fraction_negative', 'percent_disorder','aliphatic_index', 'charge_density', 'entropy_shannon','temperature_optimum', 'ph_optimum']
X = np.random.randn(num_samples, len(feature_names))
y = np.random.randint(0,2,size=num_samples)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Easy way to build every combination: tuples of 6 column indices into feature_names
all_combinations = list(combinations(range(len(feature_names)),6))
print(f"Total combinations to process: {len(all_combinations)}")

def funct_PCA(X_subset, n_components=2):
    """Fit a PCA on X_subset and return the total explained-variance ratio.

    The total is the sum of explained_variance_ratio_ over the n_components
    retained components.
    """
    fitted = PCA(n_components=n_components).fit(X_subset)
    variance_share = sum(fitted.explained_variance_ratio_)
    return variance_share

def funct_LDA(X_subset, y):
    """Return the hold-out accuracy of an LDA classifier on X_subset.

    The data is split 80/20 with random_state=42; the model is fitted on the
    larger part and scored on the smaller one. A ValueError raised while
    splitting, fitting or scoring yields 0.0.
    """
    try:
        parts = train_test_split(X_subset, y, test_size=0.2, random_state=42)
        X_train, X_test, y_train, y_test = parts
        model = LDA().fit(X_train, y_train)
        return model.score(X_test, y_test)
    except ValueError:  # LDA could not be run on this subset
        return 0.0

# Hold-out LDA accuracy for every combination, keyed by its position in all_combinations.
accuracy_dict = {
    idx: funct_LDA(X_scaled[:, combo], y)
    for idx, combo in enumerate(tqdm(all_combinations, desc="Processing combinations"))
}

# First combination reaching the maximum accuracy wins ties.
best_idx, best_accuracy = max(accuracy_dict.items(), key=lambda pair: pair[1])
best_features = [feature_names[i] for i in all_combinations[best_idx]]
print(f"\nBest combination found (index {best_idx}):")
print(f"Features: {best_features}")
print(f"Accuracy: {best_accuracy:.4f}")

# Full results table, best accuracy first.
ordered_combos = [all_combinations[idx] for idx in accuracy_dict]
results_df = pd.DataFrame({
    'combination_index': list(accuracy_dict),
    'feature_indices': ordered_combos,
    'feature_names': [[feature_names[i] for i in members] for members in ordered_combos],
    'accuracy': list(accuracy_dict.values()),
}).sort_values('accuracy', ascending=False)

results_df.to_csv('feature_combination_results.csv', index=False)
print("\nSaved all results to 'feature_combination_results.csv'")

# Plot the 10 best combinations; features that recur across them are the most
# influential ones for this scoring function.
# Imported here because this cell does not otherwise bring plt into scope.
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
top_results = results_df.head(10)
plt.barh(
    y=[str(feats) for feats in top_results['feature_names']],
    width=top_results['accuracy'],
    color='skyblue'
)
plt.xlabel('Classification Accuracy')
plt.title('Top 10 Feature Combinations by LDA Accuracy')
plt.gca().invert_yaxis()  # Show best at top
plt.tight_layout()
plt.show()
