In [2]:
# import 
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Feature Selection based on the type

The idea is to find the best 10 genes. The user of the dashboard will be able to choose better which genes they would be interested to visualize

## Random Forest and Features Importance Approach

In [4]:
from sklearn.ensemble import RandomForestClassifier


# 1. Prepare Data
X = pd.read_csv('data/zfish_formatted.csv').drop(columns=['Type'])
X = pd.get_dummies(X, columns=['hpf'], drop_first=True)
y = pd.read_csv('data/zfish_formatted.csv')['Type']

# 2. Train Model
# n_estimators=100 is usually enough for feature importance
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X, y)

# 3. Extract & Sort Importances
importances = pd.DataFrame({
    'Gene': X.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

# 4. View Top 10
print(importances.head(10))

# 5. Visualize
# Prepare the data (Top 10)
top_10 = importances.head(10)

# Create the figure
fig = px.bar(
    top_10,
    x='Importance',
    y='Gene',
    orientation='h',  # Horizontal
    title="Top 10 Genes Driving Cell Type Identity",
    labels={'Importance': 'Importance Score', 'Gene': 'Gene Name'},
    template="plotly_white"  # Clean, professional background
)

# Crucial: Invert Y-axis so the highest bar is at the top
fig.update_layout(yaxis=dict(autorange="reversed"))

fig.show()

        Gene  Importance
302  gene302    0.015517
911  gene911    0.013091
912  gene912    0.012599
574  gene574    0.012501
805  gene805    0.012317
723  gene723    0.011434
698  gene698    0.010013
602  gene602    0.009801
828  gene828    0.009483
789  gene789    0.009457


In [5]:
# check accuracy my random forest model
accuracy = rf.score(X, y)
print(f"Random Forest Model Accuracy: {accuracy:.2%}")

Random Forest Model Accuracy: 100.00%


This 10 genes are the best 10 genes if you want to see how the vary between different cell types 

In [7]:
# saving results
top_10.to_csv('results/feature_importances_top10_rf.csv', index=False)

## Approach using ANOVA
the idea is to check which genes have sensitive different averages based on the cell type

In [9]:
from sklearn.feature_selection import SelectKBest, f_classif

# Compute ANOVA F-value for each gene
selector = SelectKBest(f_classif, k='all')
selector.fit(X, y)

anova_scores = pd.DataFrame({
    'Gene_an': X.columns,
    'F_Score': selector.scores_
}).sort_values(by='F_Score', ascending=False)

print(anova_scores.head(10))

     Gene_an       F_Score
699  gene699  10282.991972
698  gene698   9612.653611
47    gene47   9560.630877
547  gene547   9439.986328
302  gene302   9283.856359
633  gene633   8994.750737
990  gene990   8665.892242
673  gene673   8437.590507
316  gene316   8380.709796
707  gene707   8219.423161


In [10]:
# Save top 10 ANOVA scores
anova_top_10 = anova_scores.head(10)

anova_top_10.to_csv('results/feature_importances_top10_anova.csv', index=False)

## Comparison between two approaches and best genes

In [11]:
# interesection from both methods
top_rf = set(importances['Gene'].head(30))
top_anova = set(anova_scores['Gene_an'].head(30))
intersection = top_rf.intersection(top_anova)
print("Common Top Genes from Both Methods:")
for gene in intersection:
    print(gene)

Common Top Genes from Both Methods:
gene47
gene692
gene698
gene302
gene547
gene574


In [12]:
# save best genes with f-score and importance score to csv
best_genes = anova_scores[anova_scores['Gene_an'].isin(intersection)].merge(
    importances[importances['Gene'].isin(intersection)], left_on='Gene_an', right_on='Gene')
best_genes.to_csv('results/feature_importances_intersection.csv', index=False)
