In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the AGORA2 details workbook; the first spreadsheet column is used as the row index.
df_agora = pd.read_excel(
    io='AGORA2_details.xlsx',
    index_col=0,
)
# Show the first five rows as a quick check that the sheet parsed as expected.
df_agora.head(n=5)

Unnamed: 0_level_0,PubSeedID,2 - complete comparative genomics; 1 - certain comparative genomics; 0 - no comparative genomics,Strain,Species,Genus,Family,Order,Class,Phylum,Kingdom,...,Motility,Sample Body Site,Sample Body Subsite,Genome Size * assembled,Gene Count * assembled,GC Count * assembled,GC * assembled,CDS Count * assembled,CDS % * assembled,Genome link
MicrobeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abiotrophia_defectiva_ATCC_49176,Abiotrophia defectiva ATCC 49176 (592010.4),2,Abiotrophia defectiva ATCC 49176,Abiotrophia defectiva,Abiotrophia,Aerococcaceae,Lactobacillales,Bacilli,Firmicutes,Bacteria,...,Nonmotile,,0.0,2041839.0,2009.0,959280.0,47.0,1950.0,97.06,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...
Acaricomes_phytoseiuli_DSM_14247,Acaricomes phytoseiuli DSM 14247 (1120917.3),1,Acaricomes phytoseiuli DSM 14247,Acaricomes phytoseiuli,Acaricomes,Micrococcaceae,Micrococcales,Actinomycetia,Actinobacteria,Bacteria,...,Nonmotile,0.0,0.0,2419519.0,2387.0,1506904.0,62.0,2326.0,97.44,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...
Acaryochloris_marina_MBIC11017,,0,Acaryochloris marina MBIC11017,Acaryochloris marina,Acaryochloris,Acaryochloridaceae,Synechococcales,unclassified Cyanobacteria,Cyanobacteria,Bacteria,...,Nonmotile,0.0,0.0,8361599.0,8488.0,3926263.0,47.0,8409.0,99.07,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...
Acetanaerobacterium_elongatum_CGMCC_1_5012,,0,Acetanaerobacterium elongatum,Acetanaerobacterium elongatum,Acetanaerobacterium,Oscillospiraceae,Eubacteriales,Clostridia,Firmicutes,Bacteria,...,Motile,0.0,0.0,2916215.0,2842.0,1429234.0,49.0,2773.0,97.57,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900...
Acetatifactor_muris_GP69,,0,Acetatifactor muris,Acetatifactor muris,Acetatifactor,Lachnospiraceae,Eubacteriales,Clostridia,Firmicutes,Bacteria,...,Nonmotile,0.0,0.0,6013646.0,6002.0,2868509.0,48.0,5901.0,98.32,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900...


In [None]:
# Keep only the rows whose Family is among the 20 most frequent Family values.
top_families = (
    df_agora['Family']
    .value_counts()
    .nlargest(20)
    .index
)
df_agora_top_family = df_agora.loc[df_agora['Family'].isin(top_families)]

In [8]:
# Keep only the rows whose Class is among the 20 most frequent Class values.
top_classes = (
    df_agora['Class']
    .value_counts()
    .nlargest(20)
    .index
)
df_agora_top_class = df_agora.loc[df_agora['Class'].isin(top_classes)]

In [None]:
# Demonstration of a scale -> PCA -> t-SNE -> scatter-plot pipeline.
# The input here is scikit-learn's bundled digits dataset, used as dummy data;
# replace X (feature matrix) and y (one label per row) to run it on real data.
data = load_digits()
X = data.data
y = data.target

# Step 1: (Recommended) Scale features to zero mean and unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: (Optional but recommended for very high-dimensional data) Reduce with PCA first.
# PCA cannot return more components than min(n_samples, n_features), so the cap
# of 50 is bounded by both dimensions of the matrix, not only the feature count.
n_pca_components = min(50, *X_scaled.shape)
# random_state pins the randomized SVD solver that PCA may select automatically;
# without it the t-SNE input (and therefore the embedding) can differ between runs.
pca = PCA(n_components=n_pca_components, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Step 3: Run t-SNE.
# The iteration count is deliberately left at the library default (1000): the
# keyword was renamed from `n_iter` to `max_iter` in scikit-learn 1.5, so passing
# either name explicitly fails on some installed versions.
tsne = TSNE(
    n_components=2,                               # 2D visualization
    perplexity=min(30, X_scaled.shape[0] - 1),    # Must stay below the number of samples
    early_exaggeration=12.0,                      # Default is usually reasonable
    learning_rate='auto',                         # Let the library pick the value
    method='barnes_hut',                          # Approximate method, faster for large datasets
    angle=0.5,                                    # Default accuracy vs speed tradeoff
    random_state=42,                              # Fixed seed for the embedding
    verbose=1                                     # For progress output
)
X_tsne = tsne.fit_transform(X_pca)

# Step 4: Plot the t-SNE results, one colour per distinct label in y.
fig, ax = plt.subplots(figsize=(8, 6))
palette = sns.color_palette("hsv", len(np.unique(y)))
sns.scatterplot(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    hue=y,
    palette=palette,
    legend='full',
    alpha=0.7,
    ax=ax,
)
ax.set_title('t-SNE projection of dataset')
ax.set_xlabel('t-SNE 1')
ax.set_ylabel('t-SNE 2')
ax.legend(title='Class')
fig.tight_layout()
plt.show()
