In [None]:
import pandas as pd

# Workbook with the SA-P vs SA differential-expression results.
file_path = "/content/SA-P vs SA.xlsx"
xls = pd.ExcelFile(file_path)

# Parse every worksheet in one call; the result is a dict keyed by sheet name.
sheets = xls.parse(sheet_name=None)

In [None]:
import matplotlib.pyplot as plt

# Full DEG table as parsed from the 'All-DEGs' worksheet.
df_degs = sheets['All-DEGs']

# Number of rows per distinct value of the 'category' column.
category_counts = df_degs['category'].value_counts()

# Vertical bar chart of those tallies.
category_counts.plot.bar(
    title='DEG Category Distribution',
    xlabel='Category',
    ylabel='Count',
)
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import seaborn as sns

# y axis of the volcano plot: negated base-10 log of the p-value.
# df_degs is the same object stored in `sheets`, so the column is added there as well.
df_degs['-log10(pvalue)'] = np.negative(np.log10(df_degs['Pvalue']))

# Scatter of effect size against significance, coloured by DEG category.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df_degs,
    x='logFC',
    y='-log10(pvalue)',
    hue='category',
    alpha=0.7,
    ax=ax,
)
ax.set_title('Volcano Plot: SA-P vs SA')
ax.axvline(0, linestyle='--', color='grey')  # reference line at zero fold change
ax.set_xlabel('log2 Fold Change')
ax.set_ylabel('-log10(p-value)')
ax.legend()
plt.show()


Staph only

In [None]:
# Pull the 'Annotated-only' worksheet out of the already-parsed `sheets` dict
df_annotated = sheets['Annotated-only']

# Show the first rows so the available annotation columns can be inspected
# (later cells read 'Organism', 'gene_id' and 'Gene Names (primary)' from this frame)
df_annotated.head()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: annotation rows whose 'Organism' text contains "Staphylococcus aureus"
# (rows with a missing organism count as non-matches).
sa_annotated = df_annotated.loc[
    df_annotated['Organism'].str.contains('Staphylococcus aureus', na=False)
]

# Step 2: the distinct gene_ids carried by those rows.
sa_gene_ids = set(sa_annotated['gene_id'])

# Step 3: independent copy of the DEG rows whose gene_id is in that set.
df_sa_degs = sheets['All-DEGs'].loc[
    lambda degs: degs['gene_id'].isin(sa_gene_ids)
].copy()

# Step 4: volcano-plot y axis.
df_sa_degs['-log10(pvalue)'] = np.negative(np.log10(df_sa_degs['Pvalue']))

# Step 5: volcano plot restricted to the S. aureus rows.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df_sa_degs,
    x='logFC',
    y='-log10(pvalue)',
    hue='category',
    alpha=0.7,
    ax=ax,
)
ax.set_title('Volcano Plot (S. aureus only): SA-P vs SA')
ax.axvline(0, linestyle='--', color='grey')
ax.set_xlabel('log2 Fold Change')
ax.set_ylabel('-log10(p-value)')
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Fixed colour for each DEG category; the keys are the values expected in 'category'.
palette = dict(
    Upregulated='green',
    Downregulated='salmon',
    Nonsignificant='blue',
)

# Recompute the y-axis column so this cell does not rely on an earlier one having set it.
df_sa_degs['-log10(pvalue)'] = np.negative(np.log10(df_sa_degs['Pvalue']))

# Volcano plot with the custom palette; every category uses the same round marker.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df_sa_degs,
    x='logFC',
    y='-log10(pvalue)',
    hue='category',
    style='category',
    palette=palette,
    markers=['o', 'o', 'o'],
    edgecolor='none',
    s=50,
    alpha=0.7,
    ax=ax,
)
ax.axvline(0, linestyle='--', color='gray')
ax.set_xlabel('log2 Fold Change')
ax.set_ylabel('-log10(p-value)')
ax.set_title('Volcano Plot (S. aureus only): SA-P vs SA')
ax.legend(title='Category')
fig.tight_layout()
plt.show()


In [None]:
# Rows per DEG category within the S. aureus subset.
category_counts_sa = df_sa_degs['category'].value_counts()

# Light colour for each known category; anything else falls back to gray below.
bar_colors = dict(
    Upregulated='lightgreen',
    Downregulated='lightsalmon',
    Nonsignificant='lightblue',
)

# Bar chart with one colour per category and horizontal tick labels.
fig, ax = plt.subplots(figsize=(6, 5))
category_counts_sa.plot.bar(
    ax=ax,
    color=[bar_colors.get(name, 'gray') for name in category_counts_sa.index],
    rot=0,
)
ax.set_title('DEG Category Distribution (S. aureus only)')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()


In [None]:
from matplotlib_venn import venn3
import matplotlib.pyplot as plt

# One set of gene_ids per DEG category, taken from the S. aureus table.
upregulated_genes = set(
    df_sa_degs.loc[df_sa_degs['category'] == 'Upregulated', 'gene_id']
)
downregulated_genes = set(
    df_sa_degs.loc[df_sa_degs['category'] == 'Downregulated', 'gene_id']
)
nonsignificant_genes = set(
    df_sa_degs.loc[df_sa_degs['category'] == 'Nonsignificant', 'gene_id']
)

# Three-set Venn diagram of those gene_id sets.
# NOTE(review): each row has one category, so overlaps can only arise if a gene_id
# appears on several rows with different categories - confirm this is intended.
plt.figure(figsize=(6, 6))
venn3(
    subsets=[upregulated_genes, downregulated_genes, nonsignificant_genes],
    set_labels=('Upregulated', 'Downregulated', 'Nonsignificant'),
)
plt.title('Venn Diagram of S. aureus DEG Categories')
plt.show()


In [None]:
from matplotlib_venn import venn3
import matplotlib.pyplot as plt

# Hex colours for the three circles, in the same order as the sets below.
venn_colors = [
    '#A8E6CF',  # pastel green
    '#FFB3BA',  # pastel pink
    '#B3CDE3',  # pastel blue
]

# Same Venn diagram as before, drawn with the custom colours at 70% opacity.
plt.figure(figsize=(6, 6))
venn3(
    subsets=[upregulated_genes, downregulated_genes, nonsignificant_genes],
    set_labels=('Upregulated', 'Downregulated', 'Nonsignificant'),
    set_colors=venn_colors,
    alpha=0.7,
)
plt.title('Venn Diagram of S. aureus DEG Categories (Pastel)')
plt.show()


In [None]:
# Load the KEGG pathway counts sheet
df_kegg = sheets['count-of-pathways']

# The sheet is addressed by position here.
# NOTE(review): assumes column 0 holds the pathway name and column 1 the gene count - confirm.
pathway_col, count_col = df_kegg.columns[0], df_kegg.columns[1]

# Ten rows with the largest value in the count column.
df_kegg_sorted = df_kegg.sort_values(by=count_col, ascending=False).head(10)

import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar per pathway. `palette` is only honoured together with `hue`
# (passing it alone is deprecated in seaborn 0.13), so the pathway column is
# also used as hue; dodge=False keeps one full-width bar per pathway.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=df_kegg_sorted,
    x=count_col,
    y=pathway_col,
    hue=pathway_col,
    dodge=False,
    palette='pastel',
    ax=ax,
)
# The hue legend would merely repeat the y tick labels; drop it if one was created.
kegg_legend = ax.get_legend()
if kegg_legend is not None:
    kegg_legend.remove()
ax.set_xlabel('Gene Count')
ax.set_ylabel('KEGG Pathway')
ax.set_title('Top 10 KEGG Pathways (S. aureus DEGs)')
fig.tight_layout()
plt.show()


In [None]:
# Map Trinity gene_id to Gene Names (primary) using the Annotated-only sheet

# gene_id -> primary gene name, skipping rows without a name
gene_name_map = df_annotated.set_index('gene_id')['Gene Names (primary)'].dropna().to_dict()

# Attach the gene name to every S. aureus DEG row
df_named_degs = df_sa_degs.copy()
df_named_degs['gene_name'] = df_named_degs['gene_id'].map(gene_name_map)

# Drop entries without gene names
df_named_degs = df_named_degs.dropna(subset=['gene_name'])

# Use only Upregulated and Downregulated categories
df_named_filtered = df_named_degs[df_named_degs['category'].isin(['Upregulated', 'Downregulated'])]

# Gene names per category (a name can occur more than once if several gene_ids share it)
up_genes = df_named_filtered[df_named_filtered['category'] == 'Upregulated']['gene_name'].tolist()
down_genes = df_named_filtered[df_named_filtered['category'] == 'Downregulated']['gene_name'].tolist()

# Combine for plotting: upregulated first, then downregulated
all_genes = up_genes + down_genes
categories = ['Upregulated'] * len(up_genes) + ['Downregulated'] * len(down_genes)

# Evenly spaced points on the unit circle, one per entry of all_genes.
# Positions are kept as a list aligned with all_genes rather than a dict keyed by
# gene name: with a dict, repeated names overwrite each other and every duplicate
# is drawn on top of the last occurrence.
total = len(all_genes)
angles = np.linspace(0, 2 * np.pi, total, endpoint=False)
positions = [(np.cos(a), np.sin(a)) for a in angles]

# Plot
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_aspect('equal')
ax.axis('off')

# Draw center nodes for categories
ax.plot(0, 0.2, 'o', markersize=15, color='lightgreen')
ax.text(0, 0.27, 'Upregulated', ha='center', fontsize=12)

ax.plot(0, -0.2, 'o', markersize=15, color='lightsalmon')
ax.text(0, -0.27, 'Downregulated', ha='center', fontsize=12)

# Draw each gene node, a line to its category hub, and its label
for gene, cat, (x, y) in zip(all_genes, categories, positions):
    color = 'lightgreen' if cat == 'Upregulated' else 'lightsalmon'
    center_y = 0.2 if cat == 'Upregulated' else -0.2

    ax.plot(x, y, 'o', markersize=6, color=color, alpha=0.8)
    ax.plot([0, x], [center_y, y], color=color, alpha=0.5, linewidth=1)
    ax.text(x, y, gene, fontsize=6, ha='center', va='center')

plt.title("Chord-style Circular Plot of Upregulated and Downregulated Genes (Gene Names)", fontsize=14)
plt.show()


In [None]:
# For a cleaner layout, we'll sort genes by category and arrange them symmetrically

# Rows ordered by category so that each category occupies one contiguous arc
df_named_filtered_sorted = df_named_filtered.sort_values(by='category')
sorted_genes = df_named_filtered_sorted['gene_name'].tolist()
sorted_categories = df_named_filtered_sorted['category'].tolist()

# Evenly spaced points on the unit circle, one per row.
# Kept as a list aligned with sorted_genes (not a dict keyed by name) so that
# rows sharing a gene name each keep their own position.
total = len(sorted_genes)
angles = np.linspace(0, 2 * np.pi, total, endpoint=False)
positions = [(np.cos(a), np.sin(a)) for a in angles]

# Plot
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_aspect('equal')
ax.axis('off')

# Hub coordinates for the two categories
category_centers = {
    'Upregulated': (0, 0.4),
    'Downregulated': (0, -0.4)
}

# Draw central category nodes, labelled just above (up) or below (down) the hub
for cat, (x, y) in category_centers.items():
    color = 'lightgreen' if cat == 'Upregulated' else 'lightsalmon'
    ax.plot(x, y, 'o', markersize=15, color=color)
    ax.text(x, y + 0.1 if cat == 'Upregulated' else y - 0.1, cat, ha='center', fontsize=12)

# Draw each gene node, a line to its category hub, and its label
for gene, cat, (x, y) in zip(sorted_genes, sorted_categories, positions):
    cx, cy = category_centers[cat]
    color = 'lightgreen' if cat == 'Upregulated' else 'lightsalmon'

    ax.plot(x, y, 'o', markersize=6, color=color, alpha=0.8)
    ax.plot([cx, x], [cy, y], color=color, alpha=0.6, linewidth=1)
    ax.text(x, y, gene, fontsize=6, ha='center', va='center')

plt.title("Refined Circular Chord-style Plot (Gene Names)", fontsize=14)
plt.tight_layout()
plt.show()


In [None]:
# Create a circular barplot using gene names and category (like user image)

# Every gene gets the same bar height; the column is also read by later cells
df_named_filtered_sorted['value'] = 1  # uniform bar height

# Bar colour per row, derived from its category
bar_colors = df_named_filtered_sorted['category'].map({
    'Upregulated': 'lightgreen',
    'Downregulated': 'lightsalmon'
}).tolist()

# One bar per row, evenly spaced around a polar axis
fig, ax = plt.subplots(figsize=(12, 12), subplot_kw={'projection': 'polar'})
theta = np.linspace(0, 2 * np.pi, len(df_named_filtered_sorted), endpoint=False)
radii = df_named_filtered_sorted['value'].tolist()
bars = ax.bar(theta, radii, width=0.04, color=bar_colors, edgecolor='black')

# Gene-name labels just outside the bars, written radially outward.
# On the left half of the circle the text is turned by a further 180 degrees and
# anchored at its right end, so it stays upright and still runs away from the
# centre instead of back over the bars.
for angle, label in zip(theta, df_named_filtered_sorted['gene_name']):
    rotation = np.rad2deg(angle)
    if np.pi/2 < angle < 3*np.pi/2:
        rotation += 180
        alignment = 'right'
    else:
        alignment = 'left'
    ax.text(angle, 1.15, label, rotation=rotation, rotation_mode='anchor',
            ha=alignment, va='center', fontsize=6)

# Formatting: hide tick labels and leave head-room above the unit-height bars
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_ylim(0, 1.2)
ax.set_title("Circular Barplot: Upregulated and Downregulated Genes (Pastel Colors)", fontsize=14)
plt.tight_layout()
plt.show()


In [None]:
# Use pastel pink for downregulated and pastel green for upregulated genes
pastel_bar_colors = df_named_filtered_sorted['category'].map({
    'Upregulated': '#A8E6CF',     # pastel green
    'Downregulated': '#FFB3BA'    # pastel pink
}).tolist()

# Redraw the circular barplot with updated colors.
# Reads the 'value' column that the previous barplot cell adds to the frame.
fig, ax = plt.subplots(figsize=(12, 12), subplot_kw={'projection': 'polar'})
theta = np.linspace(0, 2 * np.pi, len(df_named_filtered_sorted), endpoint=False)
radii = df_named_filtered_sorted['value'].tolist()
bars = ax.bar(theta, radii, width=0.04, color=pastel_bar_colors, edgecolor='black')

# Gene-name labels just outside the bars, written radially outward.
# On the left half of the circle the text is turned by a further 180 degrees and
# anchored at its right end, so it stays upright and still runs away from the
# centre instead of back over the bars.
for angle, label in zip(theta, df_named_filtered_sorted['gene_name']):
    rotation = np.rad2deg(angle)
    if np.pi/2 < angle < 3*np.pi/2:
        rotation += 180
        alignment = 'right'
    else:
        alignment = 'left'
    ax.text(angle, 1.15, label, rotation=rotation, rotation_mode='anchor',
            ha=alignment, va='center', fontsize=6)

# Formatting: hide tick labels and leave head-room above the unit-height bars
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_ylim(0, 1.2)
ax.set_title("Circular Barplot: Upregulated (Pastel Green) & Downregulated (Pastel Pink) Genes", fontsize=14)
plt.tight_layout()
plt.show()


In [None]:
# GO term counts as parsed from the 'count-of-GO-terms' worksheet.
df_go = sheets['count-of-GO-terms']

# Peek at the first rows (not displayed, since this is not the cell's last expression).
df_go.head()

import seaborn as sns
import matplotlib.pyplot as plt

# Five highest-'Count' rows for each of the three 'Category' codes.
top_bp, top_mf, top_cc = (
    df_go[df_go['Category'] == go_code]
    .sort_values(by='Count', ascending=False)
    .head(5)
    for go_code in ('BP', 'MF', 'CC')
)

# Stack the three selections into one frame for a single figure.
df_go_top_combined = pd.concat([top_bp, top_mf, top_cc], axis=0)

# Horizontal bars, one per GO term, coloured by GO category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=df_go_top_combined,
    y='Description',
    x='Count',
    hue='Category',
    palette='pastel',
    ax=ax,
)
ax.set_xlabel('Gene Count')
ax.set_ylabel('GO Term')
ax.set_title('Top Enriched GO Terms (BP, MF, CC)')
ax.legend(title='GO Category')
fig.tight_layout()
plt.show()


In [None]:
# Use pastel pink for downregulated and pastel green for upregulated genes
pastel_bar_colors = df_named_filtered_sorted['category'].map({
    'Upregulated': '#A8E6CF',     # pastel green
    'Downregulated': '#FFB3BA'    # pastel pink
}).tolist()

# Redraw the circular barplot with updated colors (this version has no title).
# Reads the 'value' column that the first circular-barplot cell adds to the frame.
fig, ax = plt.subplots(figsize=(12, 12), subplot_kw={'projection': 'polar'})
theta = np.linspace(0, 2 * np.pi, len(df_named_filtered_sorted), endpoint=False)
radii = df_named_filtered_sorted['value'].tolist()
bars = ax.bar(theta, radii, width=0.04, color=pastel_bar_colors, edgecolor='black')

# Gene-name labels just outside the bars, written radially outward.
# On the left half of the circle the text is turned by a further 180 degrees and
# anchored at its right end, so it stays upright and still runs away from the
# centre instead of back over the bars.
for angle, label in zip(theta, df_named_filtered_sorted['gene_name']):
    rotation = np.rad2deg(angle)
    if np.pi/2 < angle < 3*np.pi/2:
        rotation += 180
        alignment = 'right'
    else:
        alignment = 'left'
    ax.text(angle, 1.15, label, rotation=rotation, rotation_mode='anchor',
            ha=alignment, va='center', fontsize=6)

# Formatting: hide tick labels and leave head-room above the unit-height bars
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_ylim(0, 1.2)

plt.tight_layout()
plt.show()

In [None]:
# Step 1: annotation rows whose 'Organism' text contains "Staphylococcus aureus".
sa_annotated = df_annotated.loc[
    df_annotated['Organism'].str.contains('Staphylococcus aureus', na=False)
]

# Step 2: gene_id -> primary gene name for those rows (rows without a name are skipped).
sa_gene_name_map = (
    sa_annotated.set_index('gene_id')['Gene Names (primary)']
    .dropna()
    .to_dict()
)

# Step 3: independent copy of the DEG rows whose gene_id has a name in the map.
df_sa_only = sheets['All-DEGs'].loc[
    lambda degs: degs['gene_id'].isin(sa_gene_name_map.keys())
].copy()

# Step 4: attach the gene name.
df_sa_only['gene_name'] = df_sa_only['gene_id'].map(sa_gene_name_map)

# Step 5: up/down-regulated rows only, and only those that have a gene name.
is_up_or_down = df_sa_only['category'].isin(['Upregulated', 'Downregulated'])
df_sa_only_filtered = df_sa_only[is_up_or_down].dropna(subset=['gene_name'])

# Step 6: heatmap input - logFC indexed by gene name, ordered by category then logFC.
df_sa_only_filtered['value'] = 1  # for barplot if needed
heatmap_data = (
    df_sa_only_filtered[['gene_name', 'logFC', 'category']]
    .copy()
    .set_index('gene_name')
    .sort_values(by=['category', 'logFC'])
)

# Per-row category colour (computed here but not passed to the heatmap below).
row_colors = heatmap_data['category'].map({
    'Upregulated': '#A8E6CF',     # pastel green
    'Downregulated': '#FFB3BA'    # pastel pink
})
heatmap_values = heatmap_data.drop(columns='category')

# Single-column heatmap of logFC with every gene name shown on the y axis.
fig, ax = plt.subplots(figsize=(6, 14))
sns.heatmap(
    heatmap_values,
    cmap='vlag',
    linewidths=0.5,
    linecolor='gray',
    cbar_kws={'label': 'log2 Fold Change'},
    yticklabels=True,
    ax=ax,
)
ax.set_title("Heatmap of S. aureus Differentially Expressed Genes (log2FC)")
ax.set_xlabel("Condition")
ax.set_ylabel("Gene Name")
fig.tight_layout()
plt.show()


In [None]:
# Same logFC heatmap, drawn with the sequential 'YlGnBu' colormap.
fig, ax = plt.subplots(figsize=(6, 14))
sns.heatmap(
    heatmap_values,
    cmap='YlGnBu',  # yellow -> green -> blue
    linewidths=0.5,
    linecolor='gray',
    cbar_kws={'label': 'log2 Fold Change'},
    yticklabels=True,
    ax=ax,
)
ax.set_title("Heatmap of S. aureus DEGs (Blue-Green-Yellow Color Scale)", fontsize=14)
ax.set_xlabel("Condition")
ax.set_ylabel("Gene Name")
fig.tight_layout()
plt.show()


In [None]:
# Same heatmap on a taller canvas so the per-gene y tick labels have more room.
fig, ax = plt.subplots(figsize=(7, 24))  # increased height
sns.heatmap(
    heatmap_values,
    cmap='YlGnBu',
    linewidths=0.5,
    linecolor='gray',
    cbar_kws={'label': 'log2 Fold Change'},
    yticklabels=True,
    ax=ax,
)
ax.set_title("Heatmap of S. aureus DEGs (Improved Label Spacing)", fontsize=14)
ax.set_xlabel("Condition")
ax.set_ylabel("Gene Name")
fig.tight_layout()
plt.show()


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Expression columns for the S. aureus rows: complete cases only, with 'SA-P'
# renamed to 'SA_P'.
df_sa_pca = (
    df_sa_only[['gene_id', 'SA', 'SA-P']]
    .dropna()
    .rename(columns={'SA-P': 'SA_P'})
)

# Standardise the two expression columns (zero mean, unit variance per column).
X = df_sa_pca[['SA', 'SA_P']]
X_scaled = StandardScaler().fit_transform(X)

# Two-component PCA; each row (gene) becomes one point.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)

# Component scores with the matching gene_id attached by position.
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2']).assign(
    gene_id=df_sa_pca['gene_id'].values
)

# Scatter of the two components; axis labels carry the explained-variance share.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', alpha=0.7, ax=ax)
ax.set_title('PCA of S. aureus Gene Expression (SA vs SA-P)')
ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.2f}% variance)')
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.2f}% variance)')
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# gene_id -> category, built from the up/down-regulated subset only.
gene_category_map = dict(
    zip(df_sa_only_filtered['gene_id'], df_sa_only_filtered['category'])
)

# Genes that are not in that subset have no entry and therefore get NaN here.
# NOTE(review): seaborn appears to leave rows with a missing hue out of the plot,
# so only up/down-regulated genes would be drawn - confirm this is intended.
pca_df['category'] = pca_df['gene_id'].map(gene_category_map)

# Same PCA scatter, coloured by DEG category.
category_palette = {
    'Upregulated': '#A8E6CF',     # pastel green
    'Downregulated': '#FFB3BA'    # pastel pink
}
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=pca_df,
    x='PC1',
    y='PC2',
    hue='category',
    palette=category_palette,
    alpha=0.8,
    ax=ax,
)

ax.set_title('PCA of S. aureus Gene Expression (Colored by DEG Category)')
ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.2f}% variance)')
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.2f}% variance)')
ax.legend(title='Category')
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Install required library
# !pip install umap-learn

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import umap

# Reuse the worksheets parsed in the first cell instead of opening the workbook
# again from a second, relative path ('SA-P vs SA.xlsx') that only resolves when
# the working directory happens to be the folder used in the first cell.
df_all_degs = sheets['All-DEGs']
df_annotated = sheets['Annotated-only']

# Filter for Staphylococcus aureus genes and build gene_id -> primary gene name
df_annotated_sa = df_annotated[df_annotated['Organism'].str.contains('Staphylococcus aureus', na=False)]
gene_map = df_annotated_sa.set_index('gene_id')['Gene Names (primary)'].dropna().to_dict()
df_sa_only = df_all_degs[df_all_degs['gene_id'].isin(gene_map.keys())].copy()
df_sa_only['gene_name'] = df_sa_only['gene_id'].map(gene_map)

# Only up and down regulated genes
df_sa_filtered = df_sa_only[df_sa_only['category'].isin(['Upregulated', 'Downregulated'])]
# Require a gene name and both expression values; the PCA cell drops incomplete
# rows in the same way, and missing values would otherwise reach the embedding.
df_sa_filtered = df_sa_filtered.dropna(subset=['gene_name', 'SA', 'SA-P'])

# Standardise the two expression columns before embedding
X = df_sa_filtered[['SA', 'SA-P']]
X_scaled = StandardScaler().fit_transform(X)

# Run UMAP with a fixed seed so the embedding is repeatable
umap_model = umap.UMAP(random_state=42)
umap_result = umap_model.fit_transform(X_scaled)

# Embedding coordinates with gene name and category attached by position
df_umap = pd.DataFrame(umap_result, columns=['UMAP1', 'UMAP2'])
df_umap['gene_name'] = df_sa_filtered['gene_name'].values
df_umap['category'] = df_sa_filtered['category'].values

# Plot UMAP, coloured by category; both categories use the same round marker
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df_umap,
    x='UMAP1',
    y='UMAP2',
    hue='category',
    palette={
        'Upregulated': '#A8E6CF',
        'Downregulated': '#FFB3BA'
    },
    style='category',
    markers=['o', 'o'],
    s=50,
    alpha=0.8
)
plt.title("UMAP of S. aureus Gene Expression")
plt.xlabel("UMAP1")
plt.ylabel("UMAP2")
plt.legend(title='Category')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# !pip install plotly pandas openpyxl

import plotly.graph_objects as go
import pandas as pd

# Reuse the worksheets parsed in the first cell instead of opening the workbook
# again from a relative path that differs from the one used there.
df_degs = sheets["All-DEGs"]
df_annot = sheets["Annotated-only"]

# Filter for S. aureus genes and build gene_id -> primary gene name
df_annot_sa = df_annot[df_annot['Organism'].str.contains('Staphylococcus aureus', na=False)]
gene_map = df_annot_sa.set_index('gene_id')['Gene Names (primary)'].dropna().to_dict()
df_sa = df_degs[df_degs['gene_id'].isin(gene_map.keys())].copy()
df_sa['gene_name'] = df_sa['gene_id'].map(gene_map)

# Keep only Up/Down regulated rows that have a gene name
df_sankey = df_sa[df_sa['category'].isin(['Upregulated', 'Downregulated'])]
df_sankey = df_sankey.dropna(subset=['gene_name'])

# Take top 20 genes by absolute logFC
top_genes = df_sankey.reindex(df_sankey['logFC'].abs().sort_values(ascending=False).index).head(20)
gene_labels = top_genes['gene_name'].tolist()
categories = top_genes['category'].tolist()

# Build Sankey structure: one node per distinct label, one link gene -> category.
# dict.fromkeys de-duplicates while keeping first-seen order; going through set()
# would reorder the nodes differently from one kernel session to the next.
nodes = list(dict.fromkeys(gene_labels + categories))
node_indices = {node: i for i, node in enumerate(nodes)}
sources = [node_indices[gene] for gene in gene_labels]
targets = [node_indices[cat] for cat in categories]
values = [1] * len(gene_labels)  # every link has the same weight

# Node colours: pastel for the two category nodes, grey for gene nodes
node_colors = []
for node in nodes:
    if node == 'Upregulated':
        node_colors.append('#A8E6CF')  # pastel green
    elif node == 'Downregulated':
        node_colors.append('#FFB3BA')  # pastel pink
    else:
        node_colors.append('lightgrey')  # gene nodes

# Sankey plot
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=15,
        line=dict(color="black", width=0.5),
        label=nodes,
        color=node_colors
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values,
        color="rgba(160,160,160,0.4)"
    )
)])

fig.update_layout(title_text="Sankey Diagram: Gene Expression Categories", font_size=12)
fig.show()


In [None]:
# Horizontal bars of 'Count' per GO 'Description', straight from the worksheet.
sheets['count-of-GO-terms'].plot.barh(
    x='Description',
    y='Count',
    title='GO Term Enrichment',
    figsize=(8, 6),
)
plt.tight_layout()
plt.show()

# Horizontal bars of 'Count' per 'Pathway', straight from the worksheet.
sheets['count-of-pathways'].plot.barh(
    x='Pathway',
    y='Count',
    title='Pathway Enrichment',
    figsize=(6, 4),
)
plt.tight_layout()
plt.show()


In [None]:
# GO term counts on a larger canvas; rows are drawn in sheet order, and the
# inverted y axis puts the first row at the top.
fig, ax = plt.subplots(figsize=(10, 10))
ax.barh(
    sheets['count-of-GO-terms']['Description'],
    sheets['count-of-GO-terms']['Count'],
    color='skyblue',
)
ax.set_title('GO Term Enrichment', fontsize=14)
ax.set_xlabel('Count')
ax.set_ylabel('GO Term Description')
plt.xticks(fontsize=10)
plt.yticks(fontsize=9)
ax.invert_yaxis()
ax.grid(axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Pathway counts, drawn the same way.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    sheets['count-of-pathways']['Pathway'],
    sheets['count-of-pathways']['Count'],
    color='lightgreen',
)
ax.set_title('Pathway Enrichment', fontsize=14)
ax.set_xlabel('Count')
ax.set_ylabel('Pathway')
plt.xticks(fontsize=10)
plt.yticks(fontsize=9)
ax.invert_yaxis()
ax.grid(axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Separate GO terms by category and prepare for individual plotting
go_df = sheets['count-of-GO-terms']
go_df_sorted = go_df.sort_values(by='Count', ascending=False)  # largest count first

# Number of terms shown per category
top_n = 10

# Distinct values of the 'Category' column, one figure each
categories = go_df_sorted['Category'].unique()

# Bar colour per category code; unknown codes fall back to gray below
category_colors = {
    'BP': 'skyblue',
    'MF': 'lightcoral',
    'CC': 'mediumseagreen'
}

# Plot top N GO terms for each category.
# The subset stays in descending order: barh draws the first row at the bottom and
# invert_yaxis() then flips the axis, so the largest count ends up at the top.
# (Re-sorting ascending before inverting would place the largest at the bottom.)
for category in categories:
    subset = go_df_sorted[go_df_sorted['Category'] == category].head(top_n)
    plt.figure(figsize=(10, 6))
    plt.barh(subset['Description'], subset['Count'], color=category_colors.get(category, 'gray'))
    plt.title(f'Top {top_n} GO Terms - {category}', fontsize=16)
    plt.xlabel('Count')
    plt.ylabel('GO Term Description')
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=10)
    plt.gca().invert_yaxis()
    plt.grid(axis='x', linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()


In [None]:
# Pastel bar colour per GO category code; unknown codes fall back to lightgray below
pastel_colors = {
    'BP': '#AEC6CF',  # pastel blue
    'MF': '#FFB6B6',  # pastel red
    'CC': '#B2E2B2'   # pastel green
}

# Re-plot with pastel colors (uses categories, go_df_sorted and top_n from the previous cell).
# The subset stays in descending order: barh draws the first row at the bottom and
# invert_yaxis() then flips the axis, so the largest count ends up at the top.
for category in categories:
    subset = go_df_sorted[go_df_sorted['Category'] == category].head(top_n)
    plt.figure(figsize=(10, 6))
    plt.barh(subset['Description'], subset['Count'], color=pastel_colors.get(category, 'lightgray'))
    plt.title(f'Top {top_n} GO Terms - {category}', fontsize=16)
    plt.xlabel('Count')
    plt.ylabel('GO Term Description')
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=10)
    plt.gca().invert_yaxis()
    plt.grid(axis='x', linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()


In [None]:
# Annotated rows whose category is 'Downregulated'.
# Note: this rebinds `downregulated_genes`, which the Venn-diagram cell defines as
# a set of gene_ids; from here on the name refers to a DataFrame.
df_annotated = sheets['Annotated-only']
downregulated_genes = df_annotated.loc[df_annotated['category'].eq('Downregulated')]

# Number of such rows.
downregulated_count = len(downregulated_genes)

# The five rows with the lowest logFC.
top_downregulated = downregulated_genes.sort_values(by='logFC').head(5)

# Cell output: the count together with the key columns of those five rows.
summary_columns = ['gene_id', 'logFC', 'Uniprot-Id', 'Protein names']
downregulated_count, top_downregulated[summary_columns]


In [None]:
# GO rows restricted to the three known category codes.
# NOTE(review): nothing here filters by regulation direction; the "downregulated"
# naming presumably reflects what the worksheet contains - confirm.
downregulated_go = go_df[go_df['Category'].isin(['BP', 'MF', 'CC'])]

# One frame per category, largest 'Count' first.
bp_down, mf_down, cc_down = (
    downregulated_go[downregulated_go['Category'] == go_code]
    .sort_values(by='Count', ascending=False)
    for go_code in ('BP', 'MF', 'CC')
)

import matplotlib.pyplot as plt

# Pastel bar colour per GO category code.
pastel_colors = dict(
    BP='#AEC6CF',  # pastel blue
    MF='#FFB6B6',  # pastel red
    CC='#B2E2B2',  # pastel green
)

# Reusable horizontal bar chart for one GO category
def plot_go_terms(df, category, color):
    """Draw and show a horizontal bar chart of GO term counts.

    Args:
        df: frame with a 'Description' column (bar labels) and a 'Count'
            column (bar lengths); rows are drawn in the given order.
        category: text appended to the fixed title prefix
            'Downregulated GO Terms - '.
        color: matplotlib colour used for every bar.

    The y axis is inverted, so the first row of ``df`` appears at the top.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(df['Description'], df['Count'], color=color)
    ax.set_title(f'Downregulated GO Terms - {category}', fontsize=16)
    ax.set_xlabel('Count')
    ax.set_ylabel('GO Term Description')
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=10)
    ax.invert_yaxis()
    ax.grid(axis='x', linestyle='--', alpha=0.6)
    fig.tight_layout()
    plt.show()

# One chart per GO category, each in its own pastel colour.
for go_frame, go_title, go_code in (
    (bp_down, 'Biological Process (BP)', 'BP'),
    (mf_down, 'Molecular Function (MF)', 'MF'),
    (cc_down, 'Cellular Component (CC)', 'CC'),
):
    plot_go_terms(go_frame, go_title, pastel_colors[go_code])


In [None]:
# Select top 10 for each GO category.
# bp_down / mf_down / cc_down are already sorted with the largest 'Count' first, and
# plot_go_terms inverts the y axis, so keeping that order puts the largest count at
# the top - the same orientation as the full per-category charts. Re-sorting
# ascending here would turn the chart upside down.
bp_down_top10 = bp_down.head(10)
mf_down_top10 = mf_down.head(10)
cc_down_top10 = cc_down.head(10)

# Reuse plotting function to show top 10 only.
# plot_go_terms already prefixes the title with 'Downregulated GO Terms - ', so the
# label passed here only adds the 'Top 10' qualifier and the category name.
plot_go_terms(bp_down_top10, 'Top 10 - Biological Process (BP)', pastel_colors['BP'])
plot_go_terms(mf_down_top10, 'Top 10 - Molecular Function (MF)', pastel_colors['MF'])
plot_go_terms(cc_down_top10, 'Top 10 - Cellular Component (CC)', pastel_colors['CC'])


In [None]:
import seaborn as sns
import numpy as np

# Source table for the heatmap: the 'Annotated-only' worksheet.
df_annot = sheets['Annotated-only']

# Rows classified as up- or down-regulated.
df_sig = df_annot[df_annot['category'].isin(['Upregulated', 'Downregulated'])]

# Expression columns plus category, indexed by gene_id.
heatmap_data = df_sig[['gene_id', 'SA', 'SA-P', 'category']].copy().set_index('gene_id')

# Element-wise log2(x + 1) of both expression columns (for visual contrast).
normalized = np.log2(heatmap_data[['SA', 'SA-P']] + 1)
normalized['category'] = heatmap_data['category']

# Group rows by category (descending label order).
normalized_sorted = normalized.sort_values(by='category', ascending=False)

# Only the numeric columns go into the heatmap.
plot_data = normalized_sorted.drop(columns='category')

# Heatmap in blue-green shades.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    plot_data,
    cmap="BuGn",
    linewidths=0.2,
    linecolor='grey',
    cbar_kws={'label': 'log2(Expression + 1)'},
    ax=ax,
)
ax.set_title("Heatmap of Upregulated and Downregulated Genes (SA vs SA-P)", fontsize=14)
ax.set_xlabel("Condition")
ax.set_ylabel("Gene ID")
fig.tight_layout()
plt.show()


In [None]:
# Use annotated protein names instead of Trinity IDs for better readability
# (fall back to gene_id where the protein name is missing)

# gene_id -> protein name lookup
annotated_names = df_sig[['gene_id', 'Protein names']].set_index('gene_id')

# Expression columns and category for the significant rows
heatmap_data = df_sig[['SA', 'SA-P', 'category']].copy()

# Row labels: protein name where present, otherwise the gene_id.
# Both Series come from df_sig and share its row index, so fillna lines them up
# row by row. This replaces joining the lookup back on gene_id and calling
# fillna(heatmap_data.index): fillna rejects an Index as fill value, and the join
# would multiply rows whenever a gene_id occurs more than once.
heatmap_data.index = df_sig['Protein names'].fillna(df_sig['gene_id'])

# Element-wise log2(x + 1) of both expression columns
normalized = heatmap_data[['SA', 'SA-P']].apply(lambda x: np.log2(x + 1))
normalized['category'] = heatmap_data['category']

# Sort by category
normalized_sorted = normalized.sort_values(by='category', ascending=False)

# Drop category for plotting
plot_data = normalized_sorted.drop(columns='category')

# Generate the updated heatmap
plt.figure(figsize=(12, 12))
sns.heatmap(plot_data, cmap="BuGn", linewidths=0.2, linecolor='grey', cbar_kws={'label': 'log2(Expression + 1)'})
plt.title("Heatmap of Annotated Upregulated and Downregulated Genes (SA vs SA-P)", fontsize=14)
plt.xlabel("Condition")
plt.ylabel("Protein Name")
plt.tight_layout()
plt.show()


In [None]:
# Use annotated protein names instead of Trinity IDs for better readability
# (fall back to gene_id where the protein name is missing)

# gene_id -> protein name lookup
annotated_names = df_sig[['gene_id', 'Protein names']].set_index('gene_id')

# Expression columns and category for the significant rows
heatmap_data = df_sig[['SA', 'SA-P', 'category']].copy()

# Row labels: protein name where present, otherwise the gene_id.
# Both Series come from df_sig and share its row index, so fillna lines them up
# row by row. This replaces joining the lookup back on gene_id and calling
# fillna(heatmap_data.index): fillna rejects an Index as fill value, and the join
# would multiply rows whenever a gene_id occurs more than once.
heatmap_data.index = df_sig['Protein names'].fillna(df_sig['gene_id'])

# Element-wise log2(x + 1) of both expression columns
normalized = heatmap_data[['SA', 'SA-P']].apply(lambda x: np.log2(x + 1))
normalized['category'] = heatmap_data['category']

# Sort by category
normalized_sorted = normalized.sort_values(by='category', ascending=False)

# Drop category for plotting
plot_data = normalized_sorted.drop(columns='category')

# Generate the updated heatmap
plt.figure(figsize=(12, 12))
sns.heatmap(plot_data, cmap="BuGn", linewidths=0.2, linecolor='grey', cbar_kws={'label': 'log2(Expression + 1)'})
plt.title("Heatmap of Annotated Upregulated and Downregulated Genes (SA vs SA-P)", fontsize=14)
plt.xlabel("Condition")
plt.ylabel("Protein Name")
plt.tight_layout()
plt.show()


In [None]:
# Build the row labels directly from df_sig rather than through a join.
df_sig = df_annot[df_annot['category'].isin(['Upregulated', 'Downregulated'])]
heatmap_data = df_sig[['SA', 'SA-P', 'category', 'Protein names']].copy()

# Index = protein name, with gene_id filling the gaps (aligned on df_sig's row index).
heatmap_data.index = heatmap_data['Protein names'].fillna(df_sig['gene_id'])

# The name now lives in the index, so the column is no longer needed.
heatmap_data = heatmap_data.drop(columns='Protein names')

# Element-wise log2(x + 1) of both expression columns.
normalized = np.log2(heatmap_data[['SA', 'SA-P']] + 1)
normalized['category'] = heatmap_data['category']

# Group rows by category (descending label order).
normalized_sorted = normalized.sort_values(by='category', ascending=False)

# Only the numeric columns go into the heatmap.
plot_data = normalized_sorted.drop(columns='category')

# Heatmap with the BuGn colormap.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    plot_data,
    cmap="BuGn",
    linewidths=0.2,
    linecolor='grey',
    cbar_kws={'label': 'log2(Expression + 1)'},
    ax=ax,
)
ax.set_title("Heatmap of Annotated Upregulated and Downregulated Genes (SA vs SA-P)", fontsize=14)
ax.set_xlabel("Condition")
ax.set_ylabel("Protein Name")
fig.tight_layout()
plt.show()


In [None]:
# Twenty upregulated rows with the highest logFC and twenty downregulated rows
# with the lowest logFC.
top_up = (
    df_annot.loc[df_annot['category'] == 'Upregulated']
    .sort_values(by='logFC', ascending=False)
    .head(20)
)
top_down = (
    df_annot.loc[df_annot['category'] == 'Downregulated']
    .sort_values(by='logFC')
    .head(20)
)

# Stack both selections (upregulated rows first).
df_top = pd.concat([top_up, top_down])

# Expression columns plus category, labelled by protein name (gene_id as fallback).
heatmap_data = df_top[['SA', 'SA-P', 'category', 'Protein names']].copy()
heatmap_data.index = heatmap_data['Protein names'].fillna(df_top['gene_id'])  # fallback to gene ID
heatmap_data = heatmap_data.drop(columns='Protein names')

# Element-wise log2(x + 1), then group rows by category and drop the label column.
normalized = np.log2(heatmap_data[['SA', 'SA-P']] + 1)
normalized['category'] = heatmap_data['category']
normalized_sorted = normalized.sort_values(by='category', ascending=False)
plot_data = normalized_sorted.drop(columns='category')

# Heatmap of the selected rows.
fig, ax = plt.subplots(figsize=(10, 14))
sns.heatmap(
    plot_data,
    cmap="BuGn",
    linewidths=0.5,
    linecolor='grey',
    cbar_kws={'label': 'log2(Expression + 1)'},
    ax=ax,
)
ax.set_title("Heatmap of Top 20 Upregulated and Downregulated Proteins (SA vs SA-P)", fontsize=14)
ax.set_xlabel("Condition")
ax.set_ylabel("Protein Name")
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import networkx as nx
from math import pi, cos, sin

# Reuse the same top 40 dataframe from previous step
df_top['Regulation'] = df_top['category'].map({'Upregulated': 'Up', 'Downregulated': 'Down'})
df_top['Protein'] = df_top['Protein names'].fillna(df_top['gene_id'])

# Create mock groups for circos plot: split into categories for visualization.
# NOTE: group membership is a seeded random sample of df_top, not a functional
# annotation. The four samples are drawn independently, so a protein can land
# in several groups.
groups = {
    'Translation': df_top.sample(10, random_state=1)['Protein'].tolist(),
    'Metabolism': df_top.sample(10, random_state=2)['Protein'].tolist(),
    'Transport': df_top.sample(10, random_state=3)['Protein'].tolist(),
    'Cell Wall': df_top.sample(10, random_state=4)['Protein'].tolist()
}

# Build the graph: one node per distinct protein. Re-adding an existing node
# overwrites its 'group' attribute, so the last group listing a protein wins.
G = nx.Graph()
for category, proteins in groups.items():
    for protein in proteins:
        G.add_node(protein, group=category)

# Random edges within and between categories.
# A dedicated seeded generator makes the figure identical on every run.
import random
edge_rng = random.Random(42)
all_proteins = sum(groups.values(), [])
# Sample from the distinct nodes: the flattened list can contain a protein
# twice, which would otherwise allow an edge from a node to itself.
unique_proteins = list(G.nodes)
for _ in range(30):
    a, b = edge_rng.sample(unique_proteins, 2)
    G.add_edge(a, b)

# Circular positions: nodes evenly spaced on the unit circle in insertion order
angle = 2 * pi / len(G.nodes)
positions = {node: (cos(i * angle), sin(i * angle)) for i, node in enumerate(G.nodes)}

# Assign group color
group_colors = {
    'Translation': 'skyblue',
    'Metabolism': 'lightcoral',
    'Transport': 'mediumseagreen',
    'Cell Wall': 'plum'
}
node_colors = [group_colors[G.nodes[node]['group']] for node in G.nodes]

# Draw
plt.figure(figsize=(10, 10))
nx.draw(G, pos=positions, node_size=300, with_labels=False,
        node_color=node_colors, edge_color='gray', alpha=0.6, width=1.5)

# Draw labels slightly outward (radius 1.1), truncated to 15 characters.
# str() keeps this working when the fallback gene ID is not a string.
for node, (x, y) in positions.items():
    plt.text(x*1.1, y*1.1, str(node)[:15], fontsize=7, ha='center', va='center')

plt.title("Circos-style Protein Interaction Plot", fontsize=16)
plt.axis('off')
plt.tight_layout()
plt.show()


In [None]:
import random

import matplotlib.patches as mpatches
from matplotlib import cm
from matplotlib.path import Path

# Circos-like chord diagram of the top proteins, drawn with matplotlib on a polar axis.
# NOTE: chords join randomly chosen label pairs; they are decorative and do not
# encode measured interactions.

# Collect display labels per regulation direction (protein name, or gene ID
# when the name is missing), truncated to 20 characters for readability.
regulation_groups = {'Up': [], 'Down': []}
for protein_name, gene_id, regulation in zip(
        df_top['Protein names'], df_top['gene_id'], df_top['Regulation']):
    label = protein_name if pd.notnull(protein_name) else gene_id
    regulation_groups[regulation].append(str(label)[:20])

# Merge labels and assign evenly spaced angles.
# NOTE(review): angles are keyed by the truncated label, so two proteins whose
# names share the same first 20 characters end up at the same angle.
all_labels = regulation_groups['Up'] + regulation_groups['Down']
n_labels = len(all_labels)
angle_step = 2 * pi / n_labels
angles = {label: i * angle_step for i, label in enumerate(all_labels)}

# Compute positions
positions = {label: (cos(angles[label]), sin(angles[label])) for label in all_labels}

# Colors follow the order of all_labels (Up block first, then Down block).
# Assigning by position stays correct even if a truncated label occurs in both groups.
colors = (['#66c2a5'] * len(regulation_groups['Up'])
          + ['#fc8d62'] * len(regulation_groups['Down']))

# Create plot
plt.figure(figsize=(12, 12))
ax = plt.subplot(111, polar=True)
ax.set_ylim(0, 10)
ax.axis('off')

# Draw nodes on the outer ring (r = 10) with labels just outside it
for i, label in enumerate(all_labels):
    theta = angles[label]
    ax.plot([theta], [10], 'o', color=colors[i], markersize=10)
    ax.text(theta, 10.5, label, rotation=theta*180/pi-90, rotation_mode='anchor',
            ha='right' if theta > pi else 'left', va='center', fontsize=8)

# Draw curved chords between random label pairs.
# The generator is seeded so the figure is reproducible.
chord_rng = random.Random(42)
for _ in range(30):
    a, b = chord_rng.sample(all_labels, 2)
    theta1, theta2 = angles[a], angles[b]
    verts = [
        (theta1, 10),
        ((theta1 + theta2)/2, 5),
        (theta2, 10)
    ]
    # MOVETO followed by a CURVE3 pair makes a quadratic Bezier whose control
    # point is the middle vertex. The previous LINETO codes drew straight
    # segments instead of a curve.
    codes = [Path.MOVETO, Path.CURVE3, Path.CURVE3]
    path = Path(verts, codes)
    patch = mpatches.PathPatch(path, facecolor='none', edgecolor='grey', lw=1, alpha=0.5)
    ax.add_patch(patch)

# Add legend
legend_patches = [mpatches.Patch(color='#66c2a5', label='Upregulated'),
                  mpatches.Patch(color='#fc8d62', label='Downregulated')]
plt.legend(handles=legend_patches, loc='upper right', bbox_to_anchor=(1.1, 1.1))
plt.title("Complex Circos-style Chord Plot of Top Proteins", fontsize=14)
plt.show()


In [None]:
# Redraw the previous circos-style plot, giving each of the 30 chords its own
# colour for visual complexity. Reuses all_labels, angles and colors from the
# previous cell.
import random

from matplotlib.path import Path

color_palette = sns.color_palette("hls", 30)  # 30 distinct colors for 30 chords

# Create new plot
plt.figure(figsize=(12, 12))
ax = plt.subplot(111, polar=True)
ax.set_ylim(0, 10)
ax.axis('off')

# Draw nodes
for i, label in enumerate(all_labels):
    theta = angles[label]
    ax.plot([theta], [10], 'o', color=colors[i], markersize=10)
    ax.text(theta, 10.5, label, rotation=theta*180/pi-90, rotation_mode='anchor',
            ha='right' if theta > pi else 'left', va='center', fontsize=8)

# Draw colorful chords between random label pairs (decorative only).
# The generator is seeded so the figure is reproducible.
chord_rng = random.Random(42)
for idx in range(30):
    a, b = chord_rng.sample(all_labels, 2)
    theta1, theta2 = angles[a], angles[b]
    verts = [
        (theta1, 10),
        ((theta1 + theta2)/2, 5),
        (theta2, 10)
    ]
    # Quadratic Bezier through the middle control point. The previous codes
    # [1, 2, 2] were MOVETO/LINETO/LINETO and produced straight segments.
    codes = [Path.MOVETO, Path.CURVE3, Path.CURVE3]
    path = Path(verts, codes)
    patch = mpatches.PathPatch(path, facecolor='none', edgecolor=color_palette[idx], lw=2, alpha=0.8)
    ax.add_patch(patch)

# Add legend
legend_patches = [mpatches.Patch(color='#66c2a5', label='Upregulated'),
                  mpatches.Patch(color='#fc8d62', label='Downregulated')]
plt.legend(handles=legend_patches, loc='upper right', bbox_to_anchor=(1.1, 1.1))
plt.title("Colorful Circos-style Chord Plot of Top Proteins", fontsize=14)
plt.show()


#######################################################

In [None]:
import pandas as pd

# STEP 1: Specify the file path
file_path = "/content/SA-C vs SA.xlsx"  # or full path if not in same folder

# STEP 2-4: Open the workbook, list its sheets and load each one into a dict of
# DataFrames. The context manager closes the underlying file handle as soon as
# the sheets are parsed.
with pd.ExcelFile(file_path) as excel_file:
    print("Available sheets:", excel_file.sheet_names)
    sheets = {sheet_name: excel_file.parse(sheet_name) for sheet_name in excel_file.sheet_names}

# Optional: Preview the first few rows of each sheet
for name, df in sheets.items():
    print(f"\nSheet: {name}")
    print(df.head())


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Bar chart of how many genes fall into each DEG category (SA-C vs SA).
# 'All-DEGs' holds every gene; 'P-value-sig' would be the filtered alternative.
df_degs = sheets['All-DEGs']
deg_counts = df_degs['category'].value_counts()

plt.figure(figsize=(6, 4))
deg_counts.plot.bar(color=['blue'])
plt.gca().set(
    title="DEG Category Distribution",
    xlabel="Category",
    ylabel="Number of Genes",
)
plt.xticks(rotation=0)  # keep category names horizontal
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import seaborn as sns

# Colour assigned to each DEG category in the volcano plot.
category_colors = dict(
    Upregulated='orange',
    Downregulated='blue',
    Nonsignificant='green',
)

# Work on a copy so the sheet stored in `sheets` is left untouched.
df_degs = sheets['All-DEGs'].copy()
df_degs['-log10(pvalue)'] = np.negative(np.log10(df_degs['Pvalue']))
df_degs['color'] = df_degs['category'].map(category_colors)

# Volcano plot: effect size (logFC) against significance (-log10 p).
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df_degs,
    x='logFC',
    y='-log10(pvalue)',
    hue='category',
    palette=category_colors,
    edgecolor=None,
    alpha=0.7,
)

# Reference lines: p = 0.05 (horizontal) and zero fold change (vertical).
reference_style = dict(linestyle='--', color='gray', linewidth=1)
plt.axhline(-np.log10(0.05), **reference_style)
plt.axvline(0, **reference_style)

plt.title("Volcano Plot: SA-C vs SA", fontsize=14)
plt.xlabel("log2 Fold Change")
plt.ylabel("-log10(P-value)")
plt.legend(title='Category')
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import seaborn as sns

# Significance axis for the volcano plot: -log10 of the raw p-value.
df_degs['-log10(pvalue)'] = np.negative(np.log10(df_degs['Pvalue']))

# Volcano plot with seaborn's default hue colours.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df_degs,
    x='logFC',
    y='-log10(pvalue)',
    hue='category',
    alpha=0.7,
)
plt.axvline(0, color='grey', linestyle='--')  # zero fold change
plt.title('Volcano Plot: SA-C vs SA')
plt.xlabel('log2 Fold Change')
plt.ylabel('-log10(p-value)')
plt.legend()
plt.show()

In [None]:
# GO term counts for this comparison (copied so the loaded sheet stays untouched).
go_df = sheets['Count-of GO-terms'].copy()

# For each GO namespace keep the 10 terms with the highest Count.
_go_top10 = {
    go_code: go_df[go_df['Category'] == go_code]
    .sort_values(by='Count', ascending=False)
    .head(10)
    for go_code in ('BP', 'MF', 'CC')
}
top_bp = _go_top10['BP']
top_mf = _go_top10['MF']
top_cc = _go_top10['CC']

# Pastel bar colour per GO namespace.
pastel_colors = dict(
    BP='#AEC6CF',  # pastel blue
    MF='#FFB6B6',  # pastel red
    CC='#B2E2B2',  # pastel green
)

# Plotting function
def plot_top_go_terms(df, category_name, color):
    """Show a horizontal bar chart of GO term counts.

    Args:
        df: frame providing a 'Description' column (bar labels) and a
            'Count' column (bar lengths); rows are drawn in the given order.
        category_name: text appended to the "Top 10 GO Terms - " title.
        color: bar colour passed to matplotlib.
    """
    plt.figure(figsize=(10, 6))
    axes = plt.gca()
    axes.barh(df['Description'], df['Count'], color=color)
    axes.set_title(f"Top 10 GO Terms - {category_name}", fontsize=14)
    axes.set_xlabel("Gene Count")
    axes.set_ylabel("GO Term Description")
    plt.xticks(fontsize=11)
    plt.yticks(fontsize=9)
    # The first row of df ends up at the top of the chart.
    axes.invert_yaxis()
    axes.grid(axis='x', linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()

# One chart per GO namespace, each in its pastel colour.
for _go_frame, _go_title, _go_code in (
    (top_bp, 'Biological Process (BP)', 'BP'),
    (top_mf, 'Molecular Function (MF)', 'MF'),
    (top_cc, 'Cellular Component (CC)', 'CC'),
):
    plot_top_go_terms(_go_frame, _go_title, pastel_colors[_go_code])

In [None]:
# "Downregulated" GO term charts.
# NOTE(review): no downregulated-only GO table is used here. down_go_df is a
# plain copy of go_df, so these charts show the same terms as the overall
# top-10 charts, only under "Downregulated" titles. Replace down_go_df with a
# genuinely filtered table before interpreting them.
down_go_df = go_df.copy()  # Replace with actual downregulated-only GO terms if available

# Top 10 terms by Count within each GO namespace.
_down_top10 = {
    go_code: down_go_df[down_go_df['Category'] == go_code]
    .sort_values(by='Count', ascending=False)
    .head(10)
    for go_code in ('BP', 'MF', 'CC')
}
down_bp = _down_top10['BP']
down_mf = _down_top10['MF']
down_cc = _down_top10['CC']

# One chart per namespace, reusing the pastel colour scheme.
for _go_frame, _go_title, _go_code in (
    (down_bp, 'Downregulated - Biological Process (BP)', 'BP'),
    (down_mf, 'Downregulated - Molecular Function (MF)', 'MF'),
    (down_cc, 'Downregulated - Cellular Component (CC)', 'CC'),
):
    plot_top_go_terms(_go_frame, _go_title, pastel_colors[_go_code])


In [None]:
# Heatmap of the top-20 DEGs with shortened row labels and a wider figure.
# NOTE(review): `top20` must already exist in the kernel; it is not created in this cell.

# Row label: protein name (gene ID when missing), cut to at most 50 characters.
_full_labels = top20['Protein names'].fillna(top20['gene_id'])
top20['label_short'] = _full_labels.str[:50]

# Expression of the two conditions, indexed by the short label, then log2(x + 1).
heatmap_top20_short = top20[['label_short', 'SA', 'SA-C']].set_index('label_short')
heatmap_log2_top20_short = (heatmap_top20_short + 1).apply(np.log2)

# Plot
plt.figure(figsize=(14, 12))
sns.heatmap(
    heatmap_log2_top20_short,
    cmap="BuGn",
    linewidths=0.4,
    linecolor='grey',
    cbar_kws={'label': 'log2(Expression + 1)'},
)
plt.title("Heatmap of Top 20 DEGs (Truncated Labels): SA vs SA-C", fontsize=14)
plt.xlabel("Condition")
plt.ylabel("Protein Name")
plt.tight_layout()
plt.show()


########################

In [None]:
import pandas as pd

# STEP 1: Specify the file path
file_path = "/content/SA-E vs SA.xlsx"  # or full path if not in same folder

# STEP 2-4: Open the workbook, list its sheets and load each one into a dict of
# DataFrames. The context manager closes the underlying file handle as soon as
# the sheets are parsed.
with pd.ExcelFile(file_path) as excel_file:
    print("Available sheets:", excel_file.sheet_names)
    sheets = {sheet_name: excel_file.parse(sheet_name) for sheet_name in excel_file.sheet_names}

# Optional: Preview the first few rows of each sheet
for name, df in sheets.items():
    print(f"\nSheet: {name}")
    print(df.head())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Bar chart of how many genes fall into each DEG category (SA-E vs SA).
# In this workbook the sheet is named 'all-DEGs' (lower-case 'a').
df_degs = sheets['all-DEGs']
deg_counts = df_degs['category'].value_counts()

plt.figure(figsize=(6, 4))
deg_counts.plot.bar(color=['blue'])
plt.gca().set(
    title="DEG Category Distribution",
    xlabel="Category",
    ylabel="Number of Genes",
)
plt.xticks(rotation=0)  # keep category names horizontal
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import seaborn as sns

# Significance axis for the volcano plot: -log10 of the raw p-value.
df_degs['-log10(pvalue)'] = np.negative(np.log10(df_degs['Pvalue']))

# Volcano plot with seaborn's default hue colours.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df_degs,
    x='logFC',
    y='-log10(pvalue)',
    hue='category',
    alpha=0.7,
)
plt.axvline(0, color='grey', linestyle='--')  # zero fold change
plt.title('Volcano Plot: SA-E vs SA')
plt.xlabel('log2 Fold Change')
plt.ylabel('-log10(p-value)')
plt.legend()
plt.show()

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Fixed colour per DEG category (hex codes).
custom_palette = dict(
    Upregulated='#ff7f0e',
    Downregulated='#1f77b4',
    Nonsignificant='#2ca02c',
)

# Significance axis: -log10 of the raw p-value.
df_degs['-log10(pvalue)'] = np.negative(np.log10(df_degs['Pvalue']))

# Volcano plot coloured with the fixed palette.
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df_degs, x='logFC', y='-log10(pvalue)',
                hue='category', palette=custom_palette, alpha=0.7)

plt.axvline(0, color='grey', linestyle='--')  # zero fold change
plt.title('Volcano Plot: SA-E vs SA')
plt.xlabel('log2 Fold Change')
plt.ylabel('-log10(p-value)')
plt.legend(title='Category')
plt.show()


In [None]:
# Re-import everything after environment reset
import pandas as pd
import matplotlib.pyplot as plt

# Reload the uploaded Excel file
file_path_e = "/content/SA-E vs SA.xlsx"

# Read all sheets; the context manager closes the workbook handle once parsing is done.
with pd.ExcelFile(file_path_e) as excel_file_e:
    sheets_e = {sheet_name: excel_file_e.parse(sheet_name) for sheet_name in excel_file_e.sheet_names}

# Extract GO term sheet: the first sheet whose name contains "go" (case-insensitive).
# Fail with an explicit message instead of a bare IndexError when none matches.
go_sheet_name_e = next((name for name in sheets_e if 'go' in name.lower()), None)
if go_sheet_name_e is None:
    raise ValueError(
        f"No GO-term sheet found in {file_path_e!r}; available sheets: {list(sheets_e)}"
    )
go_df_e = sheets_e[go_sheet_name_e]

# Get top 10 GO terms per category
bp_e = go_df_e[go_df_e['Category'] == 'BP'].sort_values(by='Count', ascending=False).head(10)
mf_e = go_df_e[go_df_e['Category'] == 'MF'].sort_values(by='Count', ascending=False).head(10)
cc_e = go_df_e[go_df_e['Category'] == 'CC'].sort_values(by='Count', ascending=False).head(10)

# Define pastel colors (one bar colour per GO namespace)
pastel_colors = {
    'BP': '#AEC6CF',
    'MF': '#FFB6B6',
    'CC': '#B2E2B2'
}

# Plotting function
def plot_top_go_terms(df, category_name, color):
    """Show a horizontal bar chart of GO term counts.

    Args:
        df: frame providing a 'Description' column (bar labels) and a
            'Count' column (bar lengths); rows are drawn in the given order.
        category_name: text appended to the "Top 10 GO Terms - " title.
        color: bar colour passed to matplotlib.
    """
    plt.figure(figsize=(10, 6))
    chart = plt.gca()
    chart.barh(df['Description'], df['Count'], color=color)
    chart.set_title(f"Top 10 GO Terms - {category_name}", fontsize=14)
    chart.set_xlabel("Gene Count")
    chart.set_ylabel("GO Term Description")
    plt.xticks(fontsize=11)
    plt.yticks(fontsize=9)
    # The first row of df ends up at the top of the chart.
    chart.invert_yaxis()
    chart.grid(axis='x', linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()

# One chart per GO namespace for the SA-E vs SA comparison.
for _go_frame, _go_title, _go_code in (
    (bp_e, 'Biological Process (BP) - SA-E vs SA', 'BP'),
    (mf_e, 'Molecular Function (MF) - SA-E vs SA', 'MF'),
    (cc_e, 'Cellular Component (CC) - SA-E vs SA', 'CC'),
):
    plot_top_go_terms(_go_frame, _go_title, pastel_colors[_go_code])


############

In [None]:
# Re-import all required libraries after environment reset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Workbook for each comparison.
file_p = "/content/SA-P vs SA.xlsx"
file_c = "/content/SA-C vs SA.xlsx"
file_e = "/content/SA-E vs SA.xlsx"

# Read the "Annotated-only" sheet of every workbook, in P, C, E order.
df_p, df_c, df_e = (
    pd.read_excel(workbook, sheet_name="Annotated-only")
    for workbook in (file_p, file_c, file_e)
)

# Create functional categories manually based on annotations
def categorize_function(row):
    """Assign a coarse functional category by keyword lookup.

    `row` is converted with str() and lower-cased, then checked against the
    keyword lists below in order; the first category with a keyword occurring
    as a substring is returned.

    Args:
        row: value to classify (any object; only its string form is used).

    Returns:
        One of "Virulence", "Transport", "Metabolism", "Survival",
        "Cell Wall", or "Other" when no keyword matches.
    """
    # Order matters: earlier categories take precedence over later ones.
    keyword_table = (
        ("Virulence", ("virulence", "toxin", "hemolysin")),
        ("Transport", ("transport", "transporter", "channel")),
        ("Metabolism", ("metabolism", "oxidase", "dehydrogenase", "synthetase")),
        ("Survival", ("survival", "stress", "regulator", "response")),
        ("Cell Wall", ("cell wall", "membrane", "capsule")),
    )
    lowered = str(row).lower()
    for label, keywords in keyword_table:
        if any(word in lowered for word in keywords):
            return label
    return "Other"

# Classify every annotated protein by keyword match on its protein name
df_p['Function'] = df_p['Protein names'].apply(categorize_function)
df_c['Function'] = df_c['Protein names'].apply(categorize_function)
df_e['Function'] = df_e['Protein names'].apply(categorize_function)

# Count proteins per category; the Series name becomes the heatmap column label
count_p = df_p['Function'].value_counts().rename("SA-P vs SA")
count_c = df_c['Function'].value_counts().rename("SA-C vs SA")
count_e = df_e['Function'].value_counts().rename("SA-E vs SA")

# Fixed row order for the heatmap
ordered = ['Virulence', 'Metabolism', 'Transport', 'Survival', 'Cell Wall', 'Other']

# Combine into one DataFrame (rows = categories, columns = comparisons).
# Reindex BEFORE filling/casting: a category absent from all three sheets would
# otherwise come back from reindex as a NaN row, turn the frame into floats and
# make the fmt="d" annotation below fail.
combined = (
    pd.concat([count_p, count_c, count_e], axis=1)
    .reindex(ordered)
    .fillna(0)
    .astype(int)
)

# Plot heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(combined, annot=True, cmap="YlGnBu", fmt="d", linewidths=0.5, linecolor='grey')
plt.title("Functional Category Enrichment across SA Comparisons")
plt.ylabel("Functional Category")
plt.xlabel("Condition")
plt.tight_layout()
plt.show()


In [None]:
# Update categorization to classify 'secretion' instead of grouping into 'Other'
def refine_function(row):
    """Assign a coarse functional category by keyword lookup, including "Secretion".

    `row` is converted with str() and lower-cased, then checked against the
    keyword lists below in order; the first category with a keyword occurring
    as a substring is returned.

    Args:
        row: value to classify (any object; only its string form is used).

    Returns:
        One of "Virulence", "Transport", "Metabolism", "Survival",
        "Cell Wall", "Secretion", or "Other" when no keyword matches.
    """
    # Order matters: "Secretion" is tested last, so a name that also matches
    # an earlier category keeps that earlier category.
    keyword_table = (
        ("Virulence", ("virulence", "toxin", "hemolysin")),
        ("Transport", ("transport", "transporter", "channel")),
        ("Metabolism", ("metabolism", "oxidase", "dehydrogenase", "synthetase")),
        ("Survival", ("survival", "stress", "regulator", "response")),
        ("Cell Wall", ("cell wall", "membrane", "capsule")),
        ("Secretion", ("secretion", "secreting", "export", "excretion")),
    )
    lowered = str(row).lower()
    for label, keywords in keyword_table:
        if any(word in lowered for word in keywords):
            return label
    return "Other"

# Re-classify every annotated protein with the refined keyword rules
df_p['Function'] = df_p['Protein names'].apply(refine_function)
df_c['Function'] = df_c['Protein names'].apply(refine_function)
df_e['Function'] = df_e['Protein names'].apply(refine_function)

# Recompute counts; the Series name becomes the heatmap column label
count_p = df_p['Function'].value_counts().rename("SA-P vs SA")
count_c = df_c['Function'].value_counts().rename("SA-C vs SA")
count_e = df_e['Function'].value_counts().rename("SA-E vs SA")

# Rows shown in the heatmap; "Other" is not listed and is therefore dropped.
refined_order = ['Virulence', 'Metabolism', 'Transport', 'Survival', 'Cell Wall', 'Secretion']

# Combine and reindex.
# Reindex BEFORE filling/casting: a category absent from all three sheets would
# otherwise come back from reindex as a NaN row, turn the frame into floats and
# make the fmt="d" annotation below fail.
combined = (
    pd.concat([count_p, count_c, count_e], axis=1)
    .reindex(refined_order)
    .fillna(0)
    .astype(int)
)

# Plot updated heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(combined, annot=True, cmap="YlGnBu", fmt="d", linewidths=0.5, linecolor='grey')
plt.title("Functional Category Enrichment (with Secretion) across SA Comparisons")
plt.ylabel("Functional Category")
plt.xlabel("Condition")
plt.tight_layout()
plt.show()


In [None]:
# Re-import necessary libraries after environment reset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pathway count tables, one per workbook (the sheet name differs in each file),
# each tagged with the comparison it came from.
path_p = pd.read_excel(
    "/content/SA-P vs SA.xlsx", sheet_name="count-of-pathways"
).assign(Comparison='SA-P vs SA')
path_c = pd.read_excel(
    "/content/SA-C vs SA.xlsx", sheet_name="Pathway-count"
).assign(Comparison='SA-C vs SA')
path_e = pd.read_excel(
    "/content/SA-E vs SA.xlsx", sheet_name="count-of pathways"
).assign(Comparison='SA-E vs SA')

# Stack the three tables.
path_all = pd.concat([path_p, path_c, path_e], ignore_index=True)

# Within each comparison keep the 10 pathways with the highest Count.
top_pathways = (
    path_all
    .sort_values(['Comparison', 'Count'], ascending=[True, False])
    .groupby('Comparison')
    .head(10)
)

# One horizontal bar chart per comparison, in order of first appearance.
for comparison, subset in top_pathways.groupby('Comparison', sort=False):
    plt.figure(figsize=(12, 6))
    sns.barplot(data=subset, y='Pathway', x='Count', palette='viridis')
    plt.title(f"Top 10 Enriched Pathways - {comparison}", fontsize=14)
    plt.xlabel("Gene Count")
    plt.ylabel("Pathway Name")
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd

# Read the first sheet of the workbook into `df`.
file_path = "/content/Pseudo_heat.xlsx"
df = pd.read_excel(io=file_path)

# Preview the first rows to check the column layout.
df.head(n=5)


In [None]:
# Lower-cased GO annotation text; missing annotations become empty strings.
# NOTE(review): `categories` (category name -> keyword list) must already exist
# in the kernel; it is not defined in this cell.
df["GO_Lower"] = df["Gene Ontology (GO)"].fillna("").str.lower()

# Per category: number of rows whose GO text contains at least one keyword.
category_scores = {
    category: df["GO_Lower"].apply(
        lambda go_text, kws=keywords: any(kw in go_text for kw in kws)
    ).sum()
    for category, keywords in categories.items()
}

# Single-row frame: one column per category.
heatmap_df = pd.DataFrame(category_scores, index=["SA-P"])

# Scale by the largest category count so the maximum becomes 1.
row_max = heatmap_df.iloc[0].max()
heatmap_df_normalized = heatmap_df.div(row_max)

# Plot heatmap
plt.figure(figsize=(10, 4))
sns.heatmap(
    heatmap_df_normalized,
    cmap="Oranges",
    annot=True,
    fmt=".2f",
    cbar_kws={"label": "Normalized Score"},
)
plt.title("Functional Category Enrichment in S. aureus (SA-P vs SA)")
plt.ylabel("Condition")
plt.xlabel("Functional Category")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Read the first sheet of the workbook into `df`.
file_path = "/content/Candida_heat.xlsx"
df = pd.read_excel(io=file_path)

# Preview the first rows to check the column layout.
df.head(n=5)

In [None]:
# Lower-cased GO annotation text; missing annotations become empty strings.
# NOTE(review): `categories` (category name -> keyword list) must already exist
# in the kernel; it is not defined in this cell.
df["GO_Lower"] = df["Gene Ontology (GO)"].fillna("").str.lower()

# Per category: number of rows whose GO text contains at least one keyword.
category_scores = {
    category: df["GO_Lower"].apply(
        lambda go_text, kws=keywords: any(kw in go_text for kw in kws)
    ).sum()
    for category, keywords in categories.items()
}

# Single-row frame: one column per category.
heatmap_df = pd.DataFrame(category_scores, index=["SA-C"])

# Scale by the largest category count so the maximum becomes 1.
row_max = heatmap_df.iloc[0].max()
heatmap_df_normalized = heatmap_df.div(row_max)

# Plot heatmap
plt.figure(figsize=(10, 4))
sns.heatmap(
    heatmap_df_normalized,
    cmap="Oranges",
    annot=True,
    fmt=".2f",
    cbar_kws={"label": "Normalized Score"},
)
plt.title("Functional Category Enrichment in S. aureus (SA-C vs SA)")
plt.ylabel("Condition")
plt.xlabel("Functional Category")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Read the first sheet of the workbook into `df`.
file_path = "/content/Entero_heat.xlsx"
df = pd.read_excel(io=file_path)

# Preview the first rows to check the column layout.
df.head(n=5)

In [None]:
# Lower-cased GO annotation text; missing annotations become empty strings.
# NOTE(review): `categories` (category name -> keyword list) must already exist
# in the kernel; it is not defined in this cell.
df["GO_Lower"] = df["Gene Ontology (GO)"].fillna("").str.lower()

# Per category: number of rows whose GO text contains at least one keyword.
category_scores = {
    category: df["GO_Lower"].apply(
        lambda go_text, kws=keywords: any(kw in go_text for kw in kws)
    ).sum()
    for category, keywords in categories.items()
}

# Single-row frame: one column per category.
heatmap_df = pd.DataFrame(category_scores, index=["SA-E"])

# Scale by the largest category count so the maximum becomes 1.
row_max = heatmap_df.iloc[0].max()
heatmap_df_normalized = heatmap_df.div(row_max)

# Plot heatmap
plt.figure(figsize=(10, 4))
sns.heatmap(
    heatmap_df_normalized,
    cmap="Oranges",
    annot=True,
    fmt=".2f",
    cbar_kws={"label": "Normalized Score"},
)
plt.title("Functional Category Enrichment in S. aureus (SA-E vs SA)")
plt.ylabel("Condition")
plt.xlabel("Functional Category")
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the Excel file and parse its first sheet.
# The context manager closes the workbook handle once the sheet is read.
file_path = '/content/SAP.xlsx'
with pd.ExcelFile(file_path) as xls:
    df = xls.parse(xls.sheet_names[0])

# Clean the data: keep only the three columns the plot needs
df = df[['Gene Names', 'category', 'logFC']].copy()
# Fill down missing gene names. Series.ffill() replaces the deprecated
# fillna(method='ffill') spelling.
# NOTE(review): a row without a gene name inherits the name of the row above;
# confirm that this matches how the sheet was laid out.
df['Gene Names'] = df['Gene Names'].ffill()
df = df.dropna(subset=['Gene Names', 'logFC', 'category'])  # Drop rows with any remaining missing values

# Sort data by logFC (largest first) so bars run from most up- to most down-regulated
df = df.sort_values(by='logFC', ascending=False).reset_index(drop=True)

# Prepare for circular plot: one evenly spaced angle per gene
N = len(df)
angles = np.linspace(0, 2 * np.pi, N, endpoint=False)
values = df['logFC'].tolist()
colors = ['green' if cat == 'Upregulated' else 'red' for cat in df['category']]

# Plot
fig, ax = plt.subplots(figsize=(12, 12), subplot_kw={'projection': 'polar'})
bars = ax.bar(angles, values, width=0.35, color=colors, edgecolor='black')

# Label setup: one tick per bar, labelled with the gene name
ax.set_xticks(angles)
ax.set_xticklabels(df['Gene Names'], fontsize=7, rotation=90)

# Remove radial grid and axis
ax.set_yticklabels([])
ax.set_yticks([])
ax.spines['polar'].set_visible(False)

plt.title('Circular Barplot: Gene Expression (Up vs Downregulated)', y=1.08)
plt.tight_layout()
plt.show()


In [None]:
# Same circular barplot, with each gene name drawn as rotated text just outside the bars.
fig, ax = plt.subplots(figsize=(14, 14), subplot_kw=dict(projection='polar'))

# Plot bars
bars = ax.bar(angles, values, width=0.35, color=colors, edgecolor='black')

# Labels sit slightly beyond the largest logFC value.
outer_r = max(values) + 0.1
for theta, gene_name in zip(angles, df['Gene Names']):
    degrees = np.degrees(theta)
    # Text between 90 and 270 degrees is anchored on its left edge, otherwise on its right.
    in_left_half = np.pi/2 <= theta <= 3*np.pi/2
    # Rotations above 180 degrees are reduced by 180.
    text_rotation = degrees - 180 if degrees > 180 else degrees
    ax.text(
        theta,
        outer_r,
        gene_name,
        rotation=text_rotation,
        rotation_mode='anchor',
        ha='left' if in_left_half else 'right',
        va='center',
        fontsize=7,
    )

# Remove y-ticks, angular ticks and the outer spine
ax.set_yticks([])
ax.set_yticklabels([])
ax.set_xticks([])
ax.spines['polar'].set_visible(False)

plt.title('Circular Barplot: Gene Expression (Adjusted Labels)', y=1.08)
plt.tight_layout()
plt.show()


In [None]:
# Circular barplot recoloured with pastel tones.
from matplotlib.colors import to_rgba

# RGBA tuples for the two regulation directions.
pastel_green = to_rgba("#A8E6CF")  # light green
pastel_red = to_rgba("#FF8C94")    # light red

# One colour per gene: green for 'Upregulated', red for everything else.
pastel_colors = []
for cat in df['category']:
    pastel_colors.append(pastel_green if cat == 'Upregulated' else pastel_red)

fig, ax = plt.subplots(figsize=(14, 14), subplot_kw=dict(projection='polar'))

# Draw bars
bars = ax.bar(angles, values, width=0.35, color=pastel_colors, edgecolor='black')

# Gene names as rotated text slightly beyond the largest logFC value.
outer_r = max(values) + 0.1
for theta, gene_name in zip(angles, df['Gene Names']):
    degrees = np.degrees(theta)
    # Text between 90 and 270 degrees is anchored on its left edge, otherwise on its right.
    in_left_half = np.pi/2 <= theta <= 3*np.pi/2
    # Rotations above 180 degrees are reduced by 180.
    text_rotation = degrees - 180 if degrees > 180 else degrees
    ax.text(
        theta,
        outer_r,
        gene_name,
        rotation=text_rotation,
        rotation_mode='anchor',
        ha='left' if in_left_half else 'right',
        va='center',
        fontsize=7,
    )

# Strip ticks and the outer spine.
ax.set_yticks([])
ax.set_yticklabels([])
ax.set_xticks([])
ax.spines['polar'].set_visible(False)

plt.title('Circular Barplot: Gene Expression (Pastel Colors)', y=1.08)
plt.tight_layout()
plt.show()


In [None]:
# Pastel circular barplot with all labels placed on one common outer radius.
fig, ax = plt.subplots(figsize=(14, 14), subplot_kw=dict(projection='polar'))

# Draw bars with pastel colors
bars = ax.bar(angles, values, width=0.35, color=pastel_colors, edgecolor='black')

# All labels share this radius, a little beyond the largest logFC value.
label_radius = max(values) + 0.3
for theta, gene_name in zip(angles, df['Gene Names']):
    degrees = np.degrees(theta)
    # Strictly between 90 and 270 degrees the text is right-aligned and rotated
    # by an extra 180 degrees; elsewhere it is left-aligned at the bar's own angle.
    if np.pi/2 < theta < 3*np.pi/2:
        side, text_rotation = 'right', degrees + 180
    else:
        side, text_rotation = 'left', degrees

    ax.text(
        theta,
        label_radius,
        gene_name,
        rotation=text_rotation,
        rotation_mode='anchor',
        ha=side,
        va='center',
        fontsize=7,
        clip_on=False,
    )

# Remove axis elements
ax.set_yticks([])
ax.set_yticklabels([])
ax.set_xticks([])
ax.spines['polar'].set_visible(False)

plt.title('Circular Barplot: Uniform Labels (Pastel Colors)', y=1.08)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the Excel file and parse its first sheet.
# The context manager closes the workbook handle once the sheet is read.
file_path = '/content/SAC.xlsx'
with pd.ExcelFile(file_path) as xls:
    df = xls.parse(xls.sheet_names[0])

# Clean the data: keep only the three columns the plot needs
df = df[['Gene Names', 'category', 'logFC']].copy()
# Fill down missing gene names. Series.ffill() replaces the deprecated
# fillna(method='ffill') spelling.
# NOTE(review): a row without a gene name inherits the name of the row above;
# confirm that this matches how the sheet was laid out.
df['Gene Names'] = df['Gene Names'].ffill()
df = df.dropna(subset=['Gene Names', 'logFC', 'category'])  # Drop rows with any remaining missing values

# Sort data by logFC (largest first) so bars run from most up- to most down-regulated
df = df.sort_values(by='logFC', ascending=False).reset_index(drop=True)

# Prepare for circular plot: one evenly spaced angle per gene
N = len(df)
angles = np.linspace(0, 2 * np.pi, N, endpoint=False)
values = df['logFC'].tolist()
colors = ['green' if cat == 'Upregulated' else 'red' for cat in df['category']]

# Plot
fig, ax = plt.subplots(figsize=(12, 12), subplot_kw={'projection': 'polar'})
bars = ax.bar(angles, values, width=0.35, color=colors, edgecolor='black')

# Label setup: one tick per bar, labelled with the gene name
ax.set_xticks(angles)
ax.set_xticklabels(df['Gene Names'], fontsize=7, rotation=90)

# Remove radial grid and axis
ax.set_yticklabels([])
ax.set_yticks([])
ax.spines['polar'].set_visible(False)

plt.title('Circular Barplot: Gene Expression (Up vs Downregulated)', y=1.08)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the Excel file and parse its first sheet.
# The context manager closes the workbook handle once the sheet is read.
file_path = '/content/SAC.xlsx'
with pd.ExcelFile(file_path) as xls:
    df = xls.parse(xls.sheet_names[0])

# Clean and sort the data
df = df[['Gene Names', 'category', 'logFC']].copy()
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
# NOTE(review): a row without a gene name inherits the name of the row above;
# confirm that this matches how the sheet was laid out.
df['Gene Names'] = df['Gene Names'].ffill()
df = df.dropna(subset=['Gene Names', 'logFC', 'category'])
df = df.sort_values(by='logFC', ascending=False).reset_index(drop=True)

# Prepare for circular plot: one evenly spaced angle per gene
N = len(df)
angles = np.linspace(0, 2 * np.pi, N, endpoint=False)
values = df['logFC'].tolist()
colors = ['green' if cat == 'Upregulated' else 'red' for cat in df['category']]

# Plot
fig, ax = plt.subplots(figsize=(14, 14), subplot_kw={'projection': 'polar'})
bars = ax.bar(angles, values, width=0.35, color=colors, edgecolor='black')

# Label placement: rotate and align text outward.
# All labels share one radius, just beyond the largest logFC value.
label_radius = max(values) + 0.5
for angle, label in zip(angles, df['Gene Names']):
    rotation = np.degrees(angle)
    # Strictly between 90 and 270 degrees the text is right-aligned and
    # rotated by an extra 180 degrees.
    alignment = 'right' if np.pi/2 < angle < 3*np.pi/2 else 'left'
    rotation_adj = rotation + 180 if alignment == 'right' else rotation
    ax.text(angle, label_radius, label,
            rotation=rotation_adj,
            rotation_mode='anchor',
            ha=alignment, va='center',
            fontsize=6, clip_on=False)

# Aesthetics
ax.set_yticklabels([])
ax.set_yticks([])
ax.set_xticks([])
ax.spines['polar'].set_visible(False)
plt.title('Circular Barplot: Gene Expression (Up vs Downregulated)', y=1.08)
plt.tight_layout()
plt.show()


In [None]:
# Keep at most 240 genes: the 120 largest logFC among 'Upregulated' rows and
# the 120 smallest logFC among 'Downregulated' rows.
up_rows = df[df['category'] == 'Upregulated']
down_rows = df[df['category'] == 'Downregulated']
top_up = up_rows.nlargest(120, 'logFC')
top_down = down_rows.nsmallest(120, 'logFC')
df_240 = pd.concat([top_up, top_down]).reset_index(drop=True)

# One evenly spaced angle and one bar height per selected gene.
N = len(df_240)
angles = np.linspace(0, 2 * np.pi, N, endpoint=False)
values = df_240['logFC'].tolist()

# Pastel bar colours: green for 'Upregulated', red for everything else.
from matplotlib.colors import to_rgba
pastel_green = to_rgba("#A8E6CF")  # pastel green
pastel_red = to_rgba("#FF8C94")    # pastel red
colors_240 = []
for cat in df_240['category']:
    colors_240.append(pastel_green if cat == 'Upregulated' else pastel_red)

# Plot
fig, ax = plt.subplots(figsize=(14, 14), subplot_kw=dict(projection='polar'))
bars = ax.bar(angles, values, width=0.35, color=colors_240, edgecolor='black')

# All labels share one radius, just beyond the largest logFC value.
label_radius = max(values) + 0.5
for theta, gene_name in zip(angles, df_240['Gene Names']):
    degrees = np.degrees(theta)
    # Strictly between 90 and 270 degrees the text is right-aligned and rotated
    # by an extra 180 degrees; elsewhere it is left-aligned at the bar's own angle.
    if np.pi/2 < theta < 3*np.pi/2:
        side, text_rotation = 'right', degrees + 180
    else:
        side, text_rotation = 'left', degrees
    ax.text(
        theta, label_radius, gene_name,
        rotation=text_rotation,
        rotation_mode='anchor',
        ha=side, va='center',
        fontsize=6, clip_on=False,
    )

# Strip ticks and the outer spine.
ax.set_yticks([])
ax.set_yticklabels([])
ax.set_xticks([])
ax.spines['polar'].set_visible(False)
plt.tight_layout()
plt.show()


In [None]:
# Fraction of each category to keep. The earlier comments disagreed
# (50% in one place, 80% in another); the code has always used 0.5.
TOP_FRACTION = 0.5
n_up = int(len(df[df['category'] == 'Upregulated']) * TOP_FRACTION)
n_down = int(len(df[df['category'] == 'Downregulated']) * TOP_FRACTION)

# Take the strongest TOP_FRACTION of each category by logFC
top_up = df[df['category'] == 'Upregulated'].nlargest(n_up, 'logFC')
top_down = df[df['category'] == 'Downregulated'].nsmallest(n_down, 'logFC')

# Combine into one dataframe
df_subset = pd.concat([top_up, top_down]).reset_index(drop=True)

# Prepare for circular plot: one evenly spaced angle per gene
N = len(df_subset)
angles = np.linspace(0, 2 * np.pi, N, endpoint=False)
values = df_subset['logFC'].tolist()

# Use pastel colors
from matplotlib.colors import to_rgba
pastel_green = to_rgba("#A8E6CF")
pastel_red = to_rgba("#FF8C94")
colors = [pastel_green if cat == 'Upregulated' else pastel_red for cat in df_subset['category']]

# A fixed 0.35 rad width makes neighbouring bars overlap as soon as there are
# 18 or more genes, so cap the width at the angular slot each bar actually has.
bar_width = min(0.35, 2 * np.pi / max(N, 1))

# Plot
fig, ax = plt.subplots(figsize=(14, 14), subplot_kw={'projection': 'polar'})
bars = ax.bar(angles, values, width=bar_width, color=colors, edgecolor='black')

# Label placement: all labels sit on one circle just outside the tallest bar.
# Left-half labels are flipped by 180 degrees so they stay readable.
label_radius = max(values) + 0.5
for angle, label in zip(angles, df_subset['Gene Names']):
    rotation = np.degrees(angle)
    alignment = 'right' if np.pi/2 < angle < 3*np.pi/2 else 'left'
    rotation_adj = rotation + 180 if alignment == 'right' else rotation
    ax.text(angle, label_radius, label,
            rotation=rotation_adj,
            rotation_mode='anchor',
            ha=alignment, va='center',
            fontsize=6, clip_on=False)

# Clean aesthetics: hide all ticks and the outer circle
ax.set_yticklabels([])
ax.set_yticks([])
ax.set_xticks([])
ax.spines['polar'].set_visible(False)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the first sheet of the SA-E workbook
file_path = '/content/SAE.xlsx'
xls = pd.ExcelFile(file_path)
df = xls.parse(xls.sheet_names[0])

# Clean and sort the data (only the three columns the plot needs)
df = df[['Gene Names', 'category', 'logFC']].copy()
# Rows with a blank gene name inherit the name from the row above.
# Series.ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
df['Gene Names'] = df['Gene Names'].ffill()
df = df.dropna(subset=['Gene Names', 'logFC', 'category'])
df = df.sort_values(by='logFC', ascending=False).reset_index(drop=True)

# Prepare for circular plot: one evenly spaced angle per gene
N = len(df)
angles = np.linspace(0, 2 * np.pi, N, endpoint=False)
values = df['logFC'].tolist()
# Every category other than 'Upregulated' is drawn in red
colors = ['green' if cat == 'Upregulated' else 'red' for cat in df['category']]

# A fixed 0.35 rad width makes neighbouring bars overlap as soon as there are
# 18 or more genes, so cap the width at the angular slot each bar actually has.
bar_width = min(0.35, 2 * np.pi / max(N, 1))

# Plot
fig, ax = plt.subplots(figsize=(14, 14), subplot_kw={'projection': 'polar'})
bars = ax.bar(angles, values, width=bar_width, color=colors, edgecolor='black')

# Label placement: every label starts just outside the tallest bar and reads outward.
# Labels on the left half are rotated a further 180 degrees and right-aligned so
# they are not drawn upside-down.
label_radius = max(values) + 0.5
for angle, label in zip(angles, df['Gene Names']):
    rotation = np.degrees(angle)
    alignment = 'right' if np.pi/2 < angle < 3*np.pi/2 else 'left'
    rotation_adj = rotation + 180 if alignment == 'right' else rotation
    ax.text(angle, label_radius, label,
            rotation=rotation_adj,
            rotation_mode='anchor',
            ha=alignment, va='center',
            fontsize=6, clip_on=False)

# Aesthetics: hide all ticks and the outer circle
ax.set_yticklabels([])
ax.set_yticks([])
ax.set_xticks([])
ax.spines['polar'].set_visible(False)
plt.title('Circular Barplot: Gene Expression (Up vs Downregulated)', y=1.08)
plt.tight_layout()
plt.show()


In [None]:
# Fraction of each category to keep. The earlier comments said 80%,
# but the code has always used 0.5.
TOP_FRACTION = 0.5
n_up = int(len(df[df['category'] == 'Upregulated']) * TOP_FRACTION)
n_down = int(len(df[df['category'] == 'Downregulated']) * TOP_FRACTION)

# Take the strongest TOP_FRACTION of each category by logFC
top_up = df[df['category'] == 'Upregulated'].nlargest(n_up, 'logFC')
top_down = df[df['category'] == 'Downregulated'].nsmallest(n_down, 'logFC')

# Combine into one dataframe
df_subset = pd.concat([top_up, top_down]).reset_index(drop=True)

# Prepare for circular plot: one evenly spaced angle per gene
N = len(df_subset)
angles = np.linspace(0, 2 * np.pi, N, endpoint=False)
values = df_subset['logFC'].tolist()

# Use pastel colors
from matplotlib.colors import to_rgba
pastel_green = to_rgba("#A8E6CF")
pastel_red = to_rgba("#FF8C94")
colors = [pastel_green if cat == 'Upregulated' else pastel_red for cat in df_subset['category']]

# A fixed 0.35 rad width makes neighbouring bars overlap as soon as there are
# 18 or more genes, so cap the width at the angular slot each bar actually has.
bar_width = min(0.35, 2 * np.pi / max(N, 1))

# Plot
fig, ax = plt.subplots(figsize=(14, 14), subplot_kw={'projection': 'polar'})
bars = ax.bar(angles, values, width=bar_width, color=colors, edgecolor='black')

# Label placement: all labels sit on one circle just outside the tallest bar.
# Left-half labels are flipped by 180 degrees so they stay readable.
label_radius = max(values) + 0.5
for angle, label in zip(angles, df_subset['Gene Names']):
    rotation = np.degrees(angle)
    alignment = 'right' if np.pi/2 < angle < 3*np.pi/2 else 'left'
    rotation_adj = rotation + 180 if alignment == 'right' else rotation
    ax.text(angle, label_radius, label,
            rotation=rotation_adj,
            rotation_mode='anchor',
            ha=alignment, va='center',
            fontsize=6, clip_on=False)

# Clean aesthetics: hide all ticks and the outer circle
ax.set_yticklabels([])
ax.set_yticks([])
ax.set_xticks([])
ax.spines['polar'].set_visible(False)
plt.tight_layout()
plt.show()


In [None]:
# Re-import libraries after code execution environment reset
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Open the SA-P workbook (absolute path; the file must already exist under /content)
file_path = '/content/SAP.xlsx'
xls = pd.ExcelFile(file_path)

# Parse only the first sheet; the heatmap cell that follows reads `df_heatmap`
df_heatmap = xls.parse(xls.sheet_names[0])

# Display the column names as the cell output (the next cell needs 'Gene Names' and 'logFC')
df_heatmap.columns


In [None]:
# Prepare data: one logFC value per gene name
heatmap_data = df_heatmap[['Gene Names', 'logFC']].dropna()
# drop_duplicates keeps the first row seen for each gene name
heatmap_data = heatmap_data.drop_duplicates(subset='Gene Names')

# Use gene names as row labels (reassign instead of mutating with inplace=True)
heatmap_data = heatmap_data.set_index('Gene Names')

# No normalisation is applied: raw logFC values are plotted.
# RdYlGn is a diverging colormap, so pin its midpoint to logFC = 0; otherwise the
# neutral colour lands on the middle of the data range rather than on "no change".
plt.figure(figsize=(8, 20))
sns.heatmap(heatmap_data, cmap='RdYlGn', center=0, linewidths=0.2, annot=False, cbar_kws={'label': 'log₂ Fold Change'})
plt.title('Gene Expression Heatmap (logFC)')
plt.xlabel('logFC')
plt.ylabel('Gene Names')
plt.tight_layout()
plt.show()


In [None]:
# Re-import libraries after code execution environment reset
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Open the SA-C workbook (absolute path; the file must already exist under /content)
file_path = '/content/SAC.xlsx'
xls = pd.ExcelFile(file_path)

# Parse only the first sheet; the heatmap cell that follows reads `df_heatmap`
df_heatmap = xls.parse(xls.sheet_names[0])

In [None]:
# Prepare data: one logFC value per gene name
heatmap_data = df_heatmap[['Gene Names', 'logFC']].dropna()
# drop_duplicates keeps the first row seen for each gene name
heatmap_data = heatmap_data.drop_duplicates(subset='Gene Names')

# Use gene names as row labels (reassign instead of mutating with inplace=True)
heatmap_data = heatmap_data.set_index('Gene Names')

# No normalisation is applied: raw logFC values are plotted.
# RdYlGn is a diverging colormap, so pin its midpoint to logFC = 0; otherwise the
# neutral colour lands on the middle of the data range rather than on "no change".
plt.figure(figsize=(8, 20))
sns.heatmap(heatmap_data, cmap='RdYlGn', center=0, linewidths=0.2, annot=False, cbar_kws={'label': 'log₂ Fold Change'})
plt.title('Gene Expression Heatmap (logFC)')
plt.xlabel('logFC')
plt.ylabel('Gene Names')
plt.tight_layout()
plt.show()

In [None]:
# Re-import libraries after code execution environment reset
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Open the SA-E workbook (absolute path; the file must already exist under /content)
file_path = '/content/SAE.xlsx'
xls = pd.ExcelFile(file_path)

# Parse only the first sheet; the heatmap cell that follows reads `df_heatmap`
df_heatmap = xls.parse(xls.sheet_names[0])

# Display the column names as the cell output (the next cell needs 'Gene Names' and 'logFC')
df_heatmap.columns


In [None]:
# Prepare data: one logFC value per gene name
heatmap_data = df_heatmap[['Gene Names', 'logFC']].dropna()
# drop_duplicates keeps the first row seen for each gene name
heatmap_data = heatmap_data.drop_duplicates(subset='Gene Names')

# Use gene names as row labels (reassign instead of mutating with inplace=True)
heatmap_data = heatmap_data.set_index('Gene Names')

# No normalisation is applied: raw logFC values are plotted.
# RdYlGn is a diverging colormap, so pin its midpoint to logFC = 0; otherwise the
# neutral colour lands on the middle of the data range rather than on "no change".
plt.figure(figsize=(8, 20))
sns.heatmap(heatmap_data, cmap='RdYlGn', center=0, linewidths=0.2, annot=False, cbar_kws={'label': 'log₂ Fold Change'})
plt.title('Gene Expression Heatmap (logFC)')
plt.xlabel('logFC')
plt.ylabel('Gene Names')
plt.tight_layout()
plt.show()

In [None]:
# Install required library
# pip install umap-learn

import pandas as pd
import umap
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Read the workbook's default (first) sheet
df = pd.read_excel("/content/SAP.xlsx")

# Keep complete rows of the two numeric features plus the DEG label,
# then standardise the features before embedding
feature_cols = ['logFC', 'Pvalue']
df = df[feature_cols + ['category']].dropna()
scaler = StandardScaler()
scaled = scaler.fit_transform(df[feature_cols])

# 2-D UMAP embedding with a fixed seed
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(scaled)

# Attach both embedding coordinates in one step
df = df.assign(UMAP1=embedding[:, 0], UMAP2=embedding[:, 1])

# One fixed colour per regulation direction
custom_palette = dict(Upregulated='green', Downregulated='orange')

# Scatter of the embedding, coloured by category
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='UMAP1',
    y='UMAP2',
    hue='category',
    palette=custom_palette,
)
plt.title("UMAP Projection of Gene Expression")
plt.tight_layout()
plt.show()



In [None]:
# Install required library
# pip install umap-learn

import pandas as pd
import umap
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Read the workbook's default (first) sheet
df = pd.read_excel("/content/SAC.xlsx")

# Keep complete rows of the two numeric features plus the DEG label,
# then standardise the features before embedding
feature_cols = ['logFC', 'Pvalue']
df = df[feature_cols + ['category']].dropna()
scaler = StandardScaler()
scaled = scaler.fit_transform(df[feature_cols])

# 2-D UMAP embedding with a fixed seed
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(scaled)

# Attach both embedding coordinates in one step
df = df.assign(UMAP1=embedding[:, 0], UMAP2=embedding[:, 1])

# One fixed colour per regulation direction
custom_palette = dict(Upregulated='green', Downregulated='orange')

# Scatter of the embedding, coloured by category
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='UMAP1',
    y='UMAP2',
    hue='category',
    palette=custom_palette,
)
plt.title("UMAP Projection of Gene Expression")
plt.tight_layout()
plt.show()



In [None]:
# Install required library
# pip install umap-learn

import pandas as pd
import umap
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Read the workbook's default (first) sheet
df = pd.read_excel("/content/SAE.xlsx")

# Keep complete rows of the two numeric features plus the DEG label,
# then standardise the features before embedding
feature_cols = ['logFC', 'Pvalue']
df = df[feature_cols + ['category']].dropna()
scaler = StandardScaler()
scaled = scaler.fit_transform(df[feature_cols])

# 2-D UMAP embedding with a fixed seed
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(scaled)

# Attach both embedding coordinates in one step
df = df.assign(UMAP1=embedding[:, 0], UMAP2=embedding[:, 1])

# One fixed colour per regulation direction
custom_palette = dict(Upregulated='green', Downregulated='orange')

# Scatter of the embedding, coloured by category
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='UMAP1',
    y='UMAP2',
    hue='category',
    palette=custom_palette,
)
plt.title("UMAP Projection of Gene Expression")
plt.tight_layout()
plt.show()

Staphylococcus aureus genes only: SA-C vs SA

In [None]:
import pandas as pd

# Workbook for the SA-C vs SA comparison (absolute path; the file must already exist under /content)
file_path = "/content/SA-C vs SA.xlsx"
xls = pd.ExcelFile(file_path)

# Parse every sheet into a dict keyed by sheet name.
# Later cells in this section read sheets['All-DEGs'], sheets['Annotated-only'] and sheets['Pathway-count'].
sheets = {sheet: xls.parse(sheet) for sheet in xls.sheet_names}

In [None]:
import matplotlib.pyplot as plt

# All differential-expression rows for this comparison
df_degs = sheets['All-DEGs']

# Number of genes per DEG category, most frequent first
category_counts = df_degs['category'].value_counts()

# Draw the counts as a vertical bar chart
category_counts.plot.bar(
    title='DEG Category Distribution',
    xlabel='Category',
    ylabel='Count',
)
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import seaborn as sns

# Floor p-values at the smallest positive p-value present: -log10(0) is infinite
# and cannot be placed on the y-axis, so exact zeros would otherwise be lost.
pvalues = df_degs['Pvalue']
p_floor = pvalues[pvalues > 0].min()  # NaN when no positive p-value exists; clip() then changes nothing
# NOTE: df_degs is the same object as sheets['All-DEGs'], so the new column is added to that sheet too.
df_degs['-log10(pvalue)'] = -np.log10(pvalues.clip(lower=p_floor))

# Volcano plot: effect size (logFC) against significance, coloured by DEG category
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df_degs, x='logFC', y='-log10(pvalue)', hue='category', alpha=0.7)
plt.title('Volcano Plot: SA-C vs SA')
plt.axvline(0, linestyle='--', color='grey')  # marks "no change"
plt.xlabel('log2 Fold Change')
plt.ylabel('-log10(p-value)')
plt.legend()
plt.show()

In [None]:
# Take the 'Annotated-only' sheet from the already-parsed workbook (no file access here).
# Later cells use its 'Organism', 'gene_id' and 'Gene Names (primary)' columns.
df_annotated = sheets['Annotated-only']

# Display the first rows as the cell output to check the available annotation columns
df_annotated.head()

In [None]:

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Keep annotation rows whose organism mentions Staphylococcus aureus
# (rows with a missing organism are treated as non-matches)
sa_annotated = df_annotated[df_annotated['Organism'].str.contains('Staphylococcus aureus', na=False)]

# Step 2: Get the set of S. aureus gene_ids
sa_gene_ids = set(sa_annotated['gene_id'])

# Step 3: Filter the original DEG dataframe to include only S. aureus genes
# (.copy() so the new column below does not touch the sheet itself)
df_sa_degs = sheets['All-DEGs'][sheets['All-DEGs']['gene_id'].isin(sa_gene_ids)].copy()

# Step 4: Calculate -log10(pvalue) for the volcano plot.
# Floor p-values at the smallest positive p-value present: -log10(0) is infinite
# and cannot be placed on the y-axis, so exact zeros would otherwise be lost.
pvalues = df_sa_degs['Pvalue']
p_floor = pvalues[pvalues > 0].min()  # NaN when no positive p-value exists; clip() then changes nothing
df_sa_degs['-log10(pvalue)'] = -np.log10(pvalues.clip(lower=p_floor))

# Step 5: Plot volcano plot for S. aureus only
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df_sa_degs, x='logFC', y='-log10(pvalue)', hue='category', alpha=0.7)
plt.title('Volcano Plot (S. aureus only): SA-C vs SA')
plt.axvline(0, linestyle='--', color='grey')  # marks "no change"
plt.xlabel('log2 Fold Change')
plt.ylabel('-log10(p-value)')
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Fixed colour per DEG category (named colours, not pastel shades)
palette = {
    'Upregulated': 'green',
    'Downregulated': 'salmon',
    'Nonsignificant': 'blue'
}

# Recalculate the '-log10(pvalue)' column.
# Floor p-values at the smallest positive p-value present: -log10(0) is infinite
# and cannot be placed on the y-axis, so exact zeros would otherwise be lost.
pvalues = df_sa_degs['Pvalue']
p_floor = pvalues[pvalues > 0].min()  # NaN when no positive p-value exists; clip() then changes nothing
df_sa_degs['-log10(pvalue)'] = -np.log10(pvalues.clip(lower=p_floor))

# Volcano plot with borderless round markers.
# style='category' plus three identical markers keeps every category round;
# the list must be at least as long as the number of categories present.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df_sa_degs,
    x='logFC',
    y='-log10(pvalue)',
    hue='category',
    palette=palette,
    style='category',
    markers=['o', 'o', 'o'],
    edgecolor='none',
    s=50,
    alpha=0.7
)
plt.axvline(0, linestyle='--', color='gray')  # marks "no change"
plt.xlabel('log2 Fold Change')
plt.ylabel('-log10(p-value)')
plt.title('Volcano Plot (S. aureus only): SA-C vs SA')
plt.legend(title='Category')
plt.tight_layout()
plt.show()


In [None]:
from matplotlib_venn import venn3
import matplotlib.pyplot as plt

# Split the S. aureus gene IDs into one set per DEG category
_category = df_sa_degs['category']
_gene_ids = df_sa_degs['gene_id']
upregulated_genes = set(_gene_ids[_category == 'Upregulated'])
downregulated_genes = set(_gene_ids[_category == 'Downregulated'])
nonsignificant_genes = set(_gene_ids[_category == 'Nonsignificant'])

# Three-set Venn diagram of the category gene sets
plt.figure(figsize=(6, 6))
venn3(
    [upregulated_genes, downregulated_genes, nonsignificant_genes],
    set_labels=('Upregulated', 'Downregulated', 'Nonsignificant'),
)
plt.title('Venn Diagram of S. aureus SA_C DEG Categories')
plt.show()


In [None]:
from matplotlib_venn import venn3
import matplotlib.pyplot as plt

# Pastel colours, in the same order as the sets passed to venn3 below
# (up, down, nonsignificant). The bar-plot cell that follows reuses this list.
venn_colors = ['#A8E6CF', '#FFB3BA', '#B3CDE3']  # pastel green, pink, blue

# Redraw the Venn diagram with custom colours.
# The three gene sets are the ones built in the previous cell.
plt.figure(figsize=(6, 6))
venn3(
    [upregulated_genes, downregulated_genes, nonsignificant_genes],
    set_labels=('Upregulated', 'Downregulated', 'Nonsignificant'),
    set_colors=venn_colors,
    alpha=0.7
)
plt.title('Venn Diagram of S. aureus only DEG Categories SA_C (Pastel)')
plt.show()


In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Count the number of distinct gene IDs in each category
# (the sets come from the Venn-diagram cell)
category_counts = pd.Series({
    'Upregulated': len(upregulated_genes),
    'Downregulated': len(downregulated_genes),
    'Nonsignificant': len(nonsignificant_genes)
}).reset_index()

category_counts.columns = ['Category', 'Count']

# Create the bar plot.
# Colours are assigned through `hue`: passing `palette` without `hue` is
# deprecated in seaborn >= 0.13. dodge=False keeps full-width bars on
# seaborn versions that would otherwise offset bars per hue level.
plt.figure(figsize=(6, 5))
ax_counts = sns.barplot(data=category_counts, x='Category', y='Count',
                        hue='Category', palette=venn_colors, dodge=False)
# hue repeats the x-axis, so a legend (drawn by some seaborn versions) adds nothing
if ax_counts.get_legend() is not None:
    ax_counts.get_legend().remove()
plt.title('DEG Category Distribution (S. aureus only, SA_C)')
plt.ylabel('Gene Count')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

file_path = "/content/SA-C vs SA.xlsx"
xls = pd.ExcelFile(file_path)

# Load all sheets into dictionary
sheets = {sheet: xls.parse(sheet) for sheet in xls.sheet_names}
# Load the KEGG pathway counts sheet
df_kegg = sheets['Pathway-count']

# Ten rows with the largest value in the second column.
# Columns are addressed by position: this assumes column 0 is the pathway name
# and column 1 is the gene count -- TODO confirm against the sheet.
df_kegg_sorted = df_kegg.sort_values(by=df_kegg.columns[1], ascending=False).head(10)

import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar plot of the top pathways.
# Colours are assigned through `hue`: passing `palette` without `hue` is
# deprecated in seaborn >= 0.13. dodge=False keeps full-width bars on
# seaborn versions that would otherwise offset bars per hue level.
plt.figure(figsize=(10, 6))
ax_kegg = sns.barplot(
    data=df_kegg_sorted,
    x=df_kegg_sorted.columns[1],
    y=df_kegg_sorted.columns[0],
    hue=df_kegg_sorted.columns[0],
    palette='pastel',
    dodge=False
)
# hue repeats the y-axis, so a legend (drawn by some seaborn versions) adds nothing
if ax_kegg.get_legend() is not None:
    ax_kegg.get_legend().remove()
plt.xlabel('Gene Count')
plt.ylabel('KEGG Pathway')
plt.title('Top 10 KEGG Pathways (S. aureus SA_C DEGs)')
plt.tight_layout()
plt.show()


In [None]:
# Map gene_id to 'Gene Names (primary)' using the Annotated-only sheet

# Create mapping dictionary (gene_ids without a primary name are left out)
gene_name_map = df_annotated.set_index('gene_id')['Gene Names (primary)'].dropna().to_dict()

# Add the gene name next to each S. aureus DEG row
df_named_degs = df_sa_degs.copy()
df_named_degs['gene_name'] = df_named_degs['gene_id'].map(gene_name_map)

# Drop entries without gene names
df_named_degs = df_named_degs.dropna(subset=['gene_name'])

# Use only Upregulated and Downregulated categories
df_named_filtered = df_named_degs[df_named_degs['category'].isin(['Upregulated', 'Downregulated'])]

# Prepare data
up_genes = df_named_filtered[df_named_filtered['category'] == 'Upregulated']['gene_name'].tolist()
down_genes = df_named_filtered[df_named_filtered['category'] == 'Downregulated']['gene_name'].tolist()

# Combine for plotting: upregulated genes first, then downregulated
all_genes = up_genes + down_genes
categories = ['Upregulated'] * len(up_genes) + ['Downregulated'] * len(down_genes)

# Assign circular positions: one angle on the unit circle per row
total = len(all_genes)
angles = np.linspace(0, 2 * np.pi, total, endpoint=False)

# Plot
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_aspect('equal')
ax.axis('off')

# Draw center nodes for categories
ax.plot(0, 0.2, 'o', markersize=15, color='lightgreen')
ax.text(0, 0.27, 'Upregulated', ha='center', fontsize=12)

ax.plot(0, -0.2, 'o', markersize=15, color='lightsalmon')
ax.text(0, -0.27, 'Downregulated', ha='center', fontsize=12)

# Draw gene nodes and connecting lines.
# Positions are taken per row, not looked up by gene name: several gene_ids can
# share one gene name, and a name-keyed lookup would stack those rows on a
# single point and leave the other slots on the circle empty.
for gene, cat, angle in zip(all_genes, categories, angles):
    x, y = np.cos(angle), np.sin(angle)
    color = 'lightgreen' if cat == 'Upregulated' else 'lightsalmon'
    center_y = 0.2 if cat == 'Upregulated' else -0.2

    ax.plot(x, y, 'o', markersize=6, color=color, alpha=0.8)
    ax.plot([0, x], [center_y, y], color=color, alpha=0.5, linewidth=1)
    ax.text(x, y, gene, fontsize=6, ha='center', va='center')

plt.title("Chord-style Circular Plot of Upregulated and Downregulated Genes (Gene Names)", fontsize=14)
plt.show()


In [None]:
# For a cleaner layout, sort genes by category so each category occupies one arc

# Combine and sort (the two circular-barplot cells that follow reuse this frame)
df_named_filtered_sorted = df_named_filtered.sort_values(by='category')
sorted_genes = df_named_filtered_sorted['gene_name'].tolist()
sorted_categories = df_named_filtered_sorted['category'].tolist()

# One evenly spaced angle on the unit circle per row
total = len(sorted_genes)
angles = np.linspace(0, 2 * np.pi, total, endpoint=False)

# Plot
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_aspect('equal')
ax.axis('off')

# Category node positions
category_centers = {
    'Upregulated': (0, 0.4),
    'Downregulated': (0, -0.4)
}

# Draw central category nodes, with the label on the outer side of each node
for cat, (x, y) in category_centers.items():
    color = 'lightgreen' if cat == 'Upregulated' else 'lightsalmon'
    ax.plot(x, y, 'o', markersize=15, color=color)
    ax.text(x, y + 0.1 if cat == 'Upregulated' else y - 0.1, cat, ha='center', fontsize=12)

# Draw gene nodes and connecting lines.
# Positions are taken per row, not looked up by gene name: several gene_ids can
# share one gene name, and a name-keyed lookup would stack those rows on a
# single point and leave the other slots on the circle empty.
for gene, cat, angle in zip(sorted_genes, sorted_categories, angles):
    x, y = np.cos(angle), np.sin(angle)
    cx, cy = category_centers[cat]
    color = 'lightgreen' if cat == 'Upregulated' else 'lightsalmon'

    ax.plot(x, y, 'o', markersize=6, color=color, alpha=0.8)
    ax.plot([cx, x], [cy, y], color=color, alpha=0.6, linewidth=1)
    ax.text(x, y, gene, fontsize=6, ha='center', va='center')

plt.title("Refined Circular Chord-style Plot (Gene Names)", fontsize=14)
plt.tight_layout()
plt.show()


In [None]:
# Circular barplot with one equal-height bar per gene, coloured by DEG category

# Uniform bar height: the bars only mark category membership, not fold change.
# The pastel redraw in the next cell reads this column as well.
df_named_filtered_sorted['value'] = 1

# Use different colors for up and downregulated genes
bar_colors = df_named_filtered_sorted['category'].map({
    'Upregulated': 'lightgreen',
    'Downregulated': 'lightsalmon'
}).tolist()

# Plot
fig, ax = plt.subplots(figsize=(12, 12), subplot_kw={'projection': 'polar'})
n_genes = len(df_named_filtered_sorted)
theta = np.linspace(0, 2 * np.pi, n_genes, endpoint=False)
radii = df_named_filtered_sorted['value'].tolist()
# Cap the bar width at the angular slot per gene so bars never overlap
bar_width = min(0.04, 2 * np.pi / max(n_genes, 1))
bars = ax.bar(theta, radii, width=bar_width, color=bar_colors, edgecolor='black')

# Add gene name labels around the circle, reading outward from the bar tips.
# Left-half labels are rotated a further 180 degrees and right-aligned so they
# are not upside-down; right-half labels are left-aligned so they extend away
# from the bars instead of back over them.
for angle, label in zip(theta, df_named_filtered_sorted['gene_name']):
    on_left_half = np.pi/2 < angle < 3*np.pi/2
    rotation = np.rad2deg(angle) + (180 if on_left_half else 0)
    alignment = 'right' if on_left_half else 'left'
    ax.text(angle, 1.15, label, rotation=rotation, rotation_mode='anchor',
            ha=alignment, va='center', fontsize=6)

# Formatting
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_ylim(0, 1.2)
ax.set_title("Circular Barplot: Upregulated and Downregulated Genes (Pastel Colors)", fontsize=14)
plt.tight_layout()
plt.show()


In [None]:
# Use pastel pink for downregulated and pastel green for upregulated genes
pastel_bar_colors = df_named_filtered_sorted['category'].map({
    'Upregulated': '#A8E6CF',     # pastel green
    'Downregulated': '#FFB3BA'    # pastel pink
}).tolist()

# Redraw the circular barplot with updated colors.
# The 'value' column (uniform bar height) is created by the previous cell.
fig, ax = plt.subplots(figsize=(12, 12), subplot_kw={'projection': 'polar'})
n_genes = len(df_named_filtered_sorted)
theta = np.linspace(0, 2 * np.pi, n_genes, endpoint=False)
radii = df_named_filtered_sorted['value'].tolist()
# Cap the bar width at the angular slot per gene so bars never overlap
bar_width = min(0.04, 2 * np.pi / max(n_genes, 1))
bars = ax.bar(theta, radii, width=bar_width, color=pastel_bar_colors, edgecolor='black')

# Add gene name labels around the circle, reading outward from the bar tips.
# Left-half labels are rotated a further 180 degrees and right-aligned so they
# are not upside-down; right-half labels are left-aligned so they extend away
# from the bars instead of back over them.
for angle, label in zip(theta, df_named_filtered_sorted['gene_name']):
    on_left_half = np.pi/2 < angle < 3*np.pi/2
    rotation = np.rad2deg(angle) + (180 if on_left_half else 0)
    alignment = 'right' if on_left_half else 'left'
    ax.text(angle, 1.15, label, rotation=rotation, rotation_mode='anchor',
            ha=alignment, va='center', fontsize=6)

# Formatting
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_ylim(0, 1.2)

plt.tight_layout()
plt.show()

In [None]:
# Step 1: Filter annotated genes for S. aureus only
sa_annotated = df_annotated[df_annotated['Organism'].str.contains('Staphylococcus aureus', na=False)]

# Step 2: Map gene_id to primary gene name for S. aureus only
# (gene_ids without a primary name are left out of the map)
sa_gene_name_map = sa_annotated.set_index('gene_id')['Gene Names (primary)'].dropna().to_dict()

# Step 3: Filter All-DEGs to keep only S. aureus gene IDs that have a name
df_sa_only = sheets['All-DEGs'][sheets['All-DEGs']['gene_id'].isin(sa_gene_name_map.keys())].copy()

# Step 4: Add gene name column
df_sa_only['gene_name'] = df_sa_only['gene_id'].map(sa_gene_name_map)

# Step 5: Keep only rows with gene names and relevant categories.
# .copy() makes this an independent frame, so adding a column below cannot
# raise SettingWithCopyWarning or write into df_sa_only.
df_sa_only_filtered = df_sa_only[
    df_sa_only['category'].isin(['Upregulated', 'Downregulated'])
].dropna(subset=['gene_name']).copy()

# Step 6: Prepare for heatmap
df_sa_only_filtered['value'] = 1  # for barplot if needed
heatmap_data = df_sa_only_filtered[['gene_name', 'logFC', 'category']].copy()
heatmap_data = heatmap_data.set_index('gene_name')
# Rows grouped by category, and by increasing logFC within each category
heatmap_data = heatmap_data.sort_values(by=['category', 'logFC'])

# Per-row category colours. NOTE: sns.heatmap below does not receive this,
# so it does not appear in the figure.
row_colors = heatmap_data['category'].map({
    'Upregulated': '#A8E6CF',     # pastel green
    'Downregulated': '#FFB3BA'    # pastel pink
})
heatmap_values = heatmap_data.drop(columns='category')

# Plot. 'vlag' is a diverging colormap, so pin its midpoint to logFC = 0;
# otherwise the neutral colour lands on the middle of the data range.
plt.figure(figsize=(6, 14))
sns.heatmap(
    heatmap_values,
    cmap='vlag',
    center=0,
    linewidths=0.5,
    linecolor='gray',
    cbar_kws={'label': 'log2 Fold Change'},
    yticklabels=True
)
plt.title("Heatmap of S. aureus Differentially Expressed Genes (log2FC)")
plt.xlabel("Condition")
plt.ylabel("Gene Name")
plt.tight_layout()
plt.show()


In [None]:
# Same logFC heatmap as the previous cell, redrawn with the sequential
# yellow-green-blue colormap
heatmap_style = dict(
    cmap='YlGnBu',
    linewidths=0.5,
    linecolor='gray',
    cbar_kws={'label': 'log2 Fold Change'},
    yticklabels=True,  # label every row
)

plt.figure(figsize=(6, 14))
sns.heatmap(heatmap_values, **heatmap_style)
plt.title("Heatmap of S. aureus DEGs (Blue-Green-Yellow Color Scale)", fontsize=14)
plt.xlabel("Condition")
plt.ylabel("Gene Name")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Per-gene values of the 'SA' and 'SA-C' columns for the S. aureus genes;
# rows with any missing value are dropped and 'SA-C' is renamed to 'SA_C'
df_sa_pca = (
    df_sa_only[['gene_id', 'SA', 'SA-C']]
    .dropna()
    .rename(columns={'SA-C': 'SA_C'})
)

# Standardise both condition columns before PCA
X = df_sa_pca[['SA', 'SA_C']]
X_scaled = StandardScaler().fit_transform(X)

# Two-component PCA (genes are the observations, conditions the features)
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)

# Component scores with the matching gene_id attached by position
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
pca_df['gene_id'] = df_sa_pca['gene_id'].to_numpy()

# Scatter of the component scores; axis labels carry the explained variance
plt.figure(figsize=(8, 6))
sns.scatterplot(x='PC1', y='PC2', alpha=0.7, data=pca_df)
plt.title('PCA of S. aureus Gene Expression (SA vs SA-C)')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.2f}% variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.2f}% variance)')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Look up each gene's DEG category. Only Up/Downregulated genes are in
# df_sa_only_filtered, so every other gene gets NaN here.
gene_category_map = dict(
    zip(df_sa_only_filtered['gene_id'], df_sa_only_filtered['category'])
)
pca_df['category'] = pca_df['gene_id'].map(gene_category_map)

# Same PCA scatter as before, now coloured by DEG category
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=pca_df,
    x='PC1',
    y='PC2',
    hue='category',
    palette={
        'Upregulated': '#A8E6CF',     # pastel green
        'Downregulated': '#FFB3BA',   # pastel pink
    },
    alpha=0.8,
)

plt.title('PCA of S. aureus Gene Expression (Colored by DEG Category)')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.2f}% variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.2f}% variance)')
plt.legend(title='Category')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Install required library
# !pip install umap-learn

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import umap

# Load the two sheets this cell needs straight from the workbook
file_path = "/content/SA-C vs SA.xlsx"
xls = pd.ExcelFile(file_path)

df_all_degs = xls.parse('All-DEGs')
df_annotated = xls.parse('Annotated-only')

# Filter for Staphylococcus aureus genes that have a primary gene name
df_annotated_sa = df_annotated[df_annotated['Organism'].str.contains('Staphylococcus aureus', na=False)]
gene_map = df_annotated_sa.set_index('gene_id')['Gene Names (primary)'].dropna().to_dict()
df_sa_only = df_all_degs[df_all_degs['gene_id'].isin(gene_map.keys())].copy()
df_sa_only['gene_name'] = df_sa_only['gene_id'].map(gene_map)

# Only up and down regulated genes
df_sa_filtered = df_sa_only[df_sa_only['category'].isin(['Upregulated', 'Downregulated'])]
# Drop rows with a missing name or a missing feature value: the PCA cell drops
# incomplete rows as well, and missing feature values would otherwise be passed
# through the scaler into UMAP.
df_sa_filtered = df_sa_filtered.dropna(subset=['gene_name', 'SA', 'SA-C'])

# Prepare data for UMAP: standardise the two condition columns
X = df_sa_filtered[['SA', 'SA-C']]
X_scaled = StandardScaler().fit_transform(X)

# Run UMAP with a fixed seed
umap_model = umap.UMAP(random_state=42)
umap_result = umap_model.fit_transform(X_scaled)

# Prepare UMAP plot dataframe (.values attaches labels by row position)
df_umap = pd.DataFrame(umap_result, columns=['UMAP1', 'UMAP2'])
df_umap['gene_name'] = df_sa_filtered['gene_name'].values
df_umap['category'] = df_sa_filtered['category'].values

# Plot UMAP. style='category' with two identical markers keeps both categories round.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df_umap,
    x='UMAP1',
    y='UMAP2',
    hue='category',
    palette={
        'Upregulated': '#A8E6CF',
        'Downregulated': '#FFB3BA'
    },
    style='category',
    markers=['o', 'o'],
    s=50,
    alpha=0.8
)
plt.title("UMAP of S. aureus Gene Expression")
plt.xlabel("UMAP1")
plt.ylabel("UMAP2")
plt.legend(title='Category')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# !pip install plotly pandas openpyxl

import plotly.graph_objects as go
import pandas as pd

# Load the two sheets this cell needs
xls = pd.ExcelFile("/content/SA-C vs SA.xlsx")
df_degs = xls.parse("All-DEGs")
df_annot = xls.parse("Annotated-only")

# Filter for S. aureus genes that have a primary gene name
df_annot_sa = df_annot[df_annot['Organism'].str.contains('Staphylococcus aureus', na=False)]
gene_map = df_annot_sa.set_index('gene_id')['Gene Names (primary)'].dropna().to_dict()
df_sa = df_degs[df_degs['gene_id'].isin(gene_map.keys())].copy()
df_sa['gene_name'] = df_sa['gene_id'].map(gene_map)

# Keep only Up/Down regulated
df_sankey = df_sa[df_sa['category'].isin(['Upregulated', 'Downregulated'])]
df_sankey = df_sankey.dropna(subset=['gene_name'])

# Take top 20 genes by absolute logFC
top_genes = df_sankey.reindex(df_sankey['logFC'].abs().sort_values(ascending=False).index).head(20)
gene_labels = top_genes['gene_name'].tolist()
categories = top_genes['category'].tolist()

# Build Sankey structure: one node per distinct gene name plus one per category.
# dict.fromkeys removes duplicates while keeping first-seen order, so node
# indices are the same on every run (set() order changes between kernel sessions).
nodes = list(dict.fromkeys(gene_labels + categories))
node_indices = {node: i for i, node in enumerate(nodes)}
sources = [node_indices[gene] for gene in gene_labels]
targets = [node_indices[cat] for cat in categories]
values = [1] * len(gene_labels)  # every gene -> category link has equal weight

# Define pastel colors
node_colors = []
for node in nodes:
    if node == 'Upregulated':
        node_colors.append('#A8E6CF')  # pastel green
    elif node == 'Downregulated':
        node_colors.append('#FFB3BA')  # pastel pink
    else:
        node_colors.append('lightgrey')  # gene nodes

# Sankey plot
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=15,
        line=dict(color="black", width=0.5),
        label=nodes,
        color=node_colors
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values,
        color="rgba(160,160,160,0.4)"
    )
)])

fig.update_layout(title_text="Sankey Diagram: Gene Expression Categories", font_size=12)
fig.show()


Staphylococcus aureus genes only: SA-E vs SA

In [None]:
import pandas as pd

# Workbook for the SA-E vs SA comparison (absolute path; the file must already exist under /content)
file_path = "/content/SA-E vs SA.xlsx"
xls = pd.ExcelFile(file_path)

# Parse every sheet into a dict keyed by sheet name.
# Later cells in this section read sheets['all-DEGs'] (lower-case 'all' here) and sheets['Annotated-only'].
sheets = {sheet: xls.parse(sheet) for sheet in xls.sheet_names}


In [None]:
import matplotlib.pyplot as plt

# All differential-expression rows for this comparison
df_degs = sheets['all-DEGs']

# Number of genes per DEG category, most frequent first
category_counts = df_degs['category'].value_counts()

# Draw the counts as a vertical bar chart
category_counts.plot.bar(
    title='DEG Category Distribution',
    xlabel='Category',
    ylabel='Count',
)
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import seaborn as sns

# Floor p-values at the smallest positive p-value present: -log10(0) is infinite
# and cannot be placed on the y-axis, so exact zeros would otherwise be lost.
pvalues = df_degs['Pvalue']
p_floor = pvalues[pvalues > 0].min()  # NaN when no positive p-value exists; clip() then changes nothing
# NOTE: df_degs is the same object as sheets['all-DEGs'], so the new column is added to that sheet too.
df_degs['-log10(pvalue)'] = -np.log10(pvalues.clip(lower=p_floor))

# Volcano plot: effect size (logFC) against significance, coloured by DEG category
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df_degs, x='logFC', y='-log10(pvalue)', hue='category', alpha=0.7)
plt.title('Volcano Plot: SA-E vs SA')
plt.axvline(0, linestyle='--', color='grey')  # marks "no change"
plt.xlabel('log2 Fold Change')
plt.ylabel('-log10(p-value)')
plt.legend()
plt.show()

In [None]:
# Take the 'Annotated-only' sheet from the already-parsed workbook (no file access here).
# The next cell uses its 'Organism' and 'gene_id' columns.
df_annotated = sheets['Annotated-only']

# Display the first rows as the cell output to check the available annotation columns
df_annotated.head()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Annotation rows whose 'Organism' text mentions Staphylococcus aureus
# (rows with a missing organism count as non-matches).
sa_organism_mask = df_annotated['Organism'].str.contains('Staphylococcus aureus', na=False)
sa_annotated = df_annotated[sa_organism_mask]

# Distinct gene_ids carried by those annotation rows.
sa_gene_ids = set(sa_annotated['gene_id'])

# Rows of the 'all-DEGs' sheet for those gene_ids; copied so the column added
# below is not written into the frame stored in `sheets`.
all_degs_sheet = sheets['all-DEGs']
df_sa_degs = all_degs_sheet[all_degs_sheet['gene_id'].isin(sa_gene_ids)].copy()

# Significance on a -log10 scale for the y-axis.
df_sa_degs['-log10(pvalue)'] = np.log10(df_sa_degs['Pvalue']).mul(-1)

# Volcano plot restricted to the S. aureus subset.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df_sa_degs, x='logFC', y='-log10(pvalue)', hue='category', alpha=0.7, ax=ax)
ax.set_title('Volcano Plot (S. aureus only): SA-E vs SA')
ax.axvline(0, linestyle='--', color='grey')  # reference line at zero fold change
ax.set_xlabel('log2 Fold Change')
ax.set_ylabel('-log10(p-value)')
ax.legend()
plt.tight_layout()
plt.show()


In [None]:
# Fixed colour for each DEG category shown in the plot.
palette = {
    'Upregulated': 'green',
    'Downregulated': 'salmon',
    'Nonsignificant': 'blue'
}

# Recompute the significance column so the cell does not rely on an earlier one.
df_sa_degs['-log10(pvalue)'] = np.log10(df_sa_degs['Pvalue']).mul(-1)

# Volcano plot with one round, edge-free marker style for all categories.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df_sa_degs,
    x='logFC',
    y='-log10(pvalue)',
    hue='category',
    palette=palette,
    style='category',
    markers=['o', 'o', 'o'],
    edgecolor='none',
    s=50,
    alpha=0.7,
    ax=ax
)
ax.axvline(0, linestyle='--', color='gray')  # reference line at zero fold change
ax.set_xlabel('log2 Fold Change')
ax.set_ylabel('-log10(p-value)')
ax.set_title('Volcano Plot (S. aureus only): SA-E vs SA')
ax.legend(title='Category')
plt.tight_layout()
plt.show()


In [None]:
# NOTE(review): this cell repeats the preceding volcano-plot cell verbatim.

# Category -> colour lookup used for the scatter points.
palette = dict(Upregulated='green', Downregulated='salmon', Nonsignificant='blue')

# Refresh the -log10(p) column before plotting.
df_sa_degs['-log10(pvalue)'] = -np.log10(df_sa_degs['Pvalue'])

# Collect the scatter options once so the plotting call stays short.
volcano_options = dict(
    x='logFC',
    y='-log10(pvalue)',
    hue='category',
    palette=palette,
    style='category',
    markers=['o', 'o', 'o'],  # same round marker for every category
    edgecolor='none',
    s=50,
    alpha=0.7,
)

plt.figure(figsize=(8, 6))
sns.scatterplot(data=df_sa_degs, **volcano_options)
plt.axvline(0, linestyle='--', color='gray')  # reference line at zero fold change
plt.xlabel('log2 Fold Change')
plt.ylabel('-log10(p-value)')
plt.title('Volcano Plot (S. aureus only): SA-E vs SA')
plt.legend(title='Category')
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn3

# Load Excel File
file_path = "/content/SA-E vs SA.xlsx"
xls = pd.ExcelFile(file_path)

# Restrict the DEG table to S. aureus genes so the data matches the plot title.
# Assigning the raw 'all-DEGs' sheet to df_sa_degs would replace the S. aureus
# subset with genes from every organism, here and in any later cell that
# reads df_sa_degs.
annotated_sheet = xls.parse("Annotated-only")
sa_gene_ids = set(
    annotated_sheet.loc[
        annotated_sheet['Organism'].str.contains('Staphylococcus aureus', na=False),
        'gene_id',
    ]
)
all_degs_sheet = xls.parse("all-DEGs")
df_sa_degs = all_degs_sheet[all_degs_sheet['gene_id'].isin(sa_gene_ids)].copy()

# Extract the set of gene_ids in each category
set_up = set(df_sa_degs[df_sa_degs['category'] == 'Upregulated']['gene_id'])
set_down = set(df_sa_degs[df_sa_degs['category'] == 'Downregulated']['gene_id'])
set_ns = set(df_sa_degs[df_sa_degs['category'] == 'Nonsignificant']['gene_id'])

# Plot Venn Diagram of the three category sets
plt.figure(figsize=(6, 6))
venn3(
    [set_up, set_down, set_ns],
    set_labels=('Upregulated', 'Downregulated', 'Nonsignificant'),
    set_colors=['green', 'salmon', 'blue'],
    alpha=0.6
)
plt.title('Venn Diagram of Gene Categories (S. aureus: SA-E vs SA)')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load Excel File
file_path = "/content/SA-E vs SA.xlsx"
xls = pd.ExcelFile(file_path)

# Restrict the DEG table to S. aureus genes so the counts match the plot title.
# Assigning the raw 'all-DEGs' sheet to df_sa_degs would count genes from
# every organism and leave the unfiltered frame behind for later cells.
annotated_sheet = xls.parse("Annotated-only")
sa_gene_ids = set(
    annotated_sheet.loc[
        annotated_sheet['Organism'].str.contains('Staphylococcus aureus', na=False),
        'gene_id',
    ]
)
all_degs_sheet = xls.parse("all-DEGs")
df_sa_degs = all_degs_sheet[all_degs_sheet['gene_id'].isin(sa_gene_ids)].copy()

# Count genes in each category
category_counts = df_sa_degs['category'].value_counts()

# Colour per category; categories missing from this dict fall back to gray below
colors = {
    'Upregulated': 'lightgreen',
    'Downregulated': 'lightsalmon',
    'Nonsignificant': 'lightblue'
}

# Bar plot, one bar per category in value_counts order
plt.figure(figsize=(6, 5))
category_counts.plot(
    kind='bar',
    color=[colors.get(cat, 'gray') for cat in category_counts.index]
)
plt.title('DEG Category Distribution (S. aureus: SA-E vs SA)')
plt.xlabel('Category')
plt.ylabel('Gene Count')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Re-open the SA-E vs SA workbook and parse all of its worksheets.
file_path = "/content/SA-E vs SA.xlsx"
xls = pd.ExcelFile(file_path)

sheets = dict((sheet_name, xls.parse(sheet_name)) for sheet_name in xls.sheet_names)

# Worksheet holding the per-pathway counts.
df_kegg = sheets['count-of pathways']

# Columns are addressed by position.
# NOTE(review): assumes column 0 is the pathway name and column 1 the count - confirm against the sheet.
pathway_col = df_kegg.columns[0]
count_col = df_kegg.columns[1]

# Ten rows with the largest value in the count column.
df_kegg_sorted = df_kegg.sort_values(by=count_col, ascending=False).head(10)

import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart: count on x, pathway on y.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=df_kegg_sorted,
    x=count_col,
    y=pathway_col,
    palette='pastel',
    ax=ax
)
ax.set_xlabel('Gene Count')
ax.set_ylabel('KEGG Pathway')
ax.set_title('Top 10 KEGG Pathways (S. aureus SA_E DEGs)')
plt.tight_layout()
plt.show()

In [None]:
# Map Trinity gene_id to Gene Names (primary) using the Annotated-only sheet

# gene_id -> primary gene name, skipping annotation rows without a name
gene_name_map = df_annotated.set_index('gene_id')['Gene Names (primary)'].dropna().to_dict()

# Attach the gene name to every DEG row (NaN where the gene_id has no name)
df_named_degs = df_sa_degs.copy()
df_named_degs['gene_name'] = df_named_degs['gene_id'].map(gene_name_map)

# Drop entries without gene names
df_named_degs = df_named_degs.dropna(subset=['gene_name'])

# Use only Upregulated and Downregulated categories
df_named_filtered = df_named_degs[df_named_degs['category'].isin(['Upregulated', 'Downregulated'])]

# Gene names per category, in table order
up_genes = df_named_filtered[df_named_filtered['category'] == 'Upregulated']['gene_name'].tolist()
down_genes = df_named_filtered[df_named_filtered['category'] == 'Downregulated']['gene_name'].tolist()

# Combine for plotting: upregulated genes first, then downregulated
all_genes = up_genes + down_genes
categories = ['Upregulated'] * len(up_genes) + ['Downregulated'] * len(down_genes)

# Assign evenly spaced positions on the unit circle, one per plotted row
total = len(all_genes)
angles = np.linspace(0, 2 * np.pi, total, endpoint=False)
coords = list(zip(np.cos(angles), np.sin(angles)))

# Name -> coordinate lookup, kept for callers that want to look a gene up.
# It must not drive the drawing loop: several gene_ids can share one gene
# name, and a dict keeps only the last position for a repeated name, which
# would stack those genes on a single point and leave gaps on the circle.
positions = dict(zip(all_genes, coords))

# Plot
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_aspect('equal')
ax.axis('off')

# Draw center nodes for categories
ax.plot(0, 0.2, 'o', markersize=15, color='lightgreen')
ax.text(0, 0.27, 'Upregulated', ha='center', fontsize=12)

ax.plot(0, -0.2, 'o', markersize=15, color='lightsalmon')
ax.text(0, -0.27, 'Downregulated', ha='center', fontsize=12)

# Draw gene nodes and connecting lines, using each row's own coordinate
for gene, cat, (x, y) in zip(all_genes, categories, coords):
    color = 'lightgreen' if cat == 'Upregulated' else 'lightsalmon'
    center_y = 0.2 if cat == 'Upregulated' else -0.2

    ax.plot(x, y, 'o', markersize=6, color=color, alpha=0.8)
    ax.plot([0, x], [center_y, y], color=color, alpha=0.5, linewidth=1)
    ax.text(x, y, gene, fontsize=6, ha='center', va='center')

plt.title("Chord-style Circular Plot of Upregulated and Downregulated Genes (Gene Names)", fontsize=14)
plt.show()


In [None]:
# For a cleaner layout, sort genes by category so each category occupies one
# contiguous arc of the circle.

# Combine and sort
df_named_filtered_sorted = df_named_filtered.sort_values(by='category')
sorted_genes = df_named_filtered_sorted['gene_name'].tolist()
sorted_categories = df_named_filtered_sorted['category'].tolist()

# Evenly spaced positions on the unit circle, one per plotted row
total = len(sorted_genes)
angles = np.linspace(0, 2 * np.pi, total, endpoint=False)
coords = list(zip(np.cos(angles), np.sin(angles)))

# Name -> coordinate lookup, kept for callers that want to look a gene up.
# It must not drive the drawing loop: a repeated gene name keeps only its
# last position in a dict, which would stack those genes on a single point.
positions = dict(zip(sorted_genes, coords))

# Plot
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_aspect('equal')
ax.axis('off')

# Category node positions (inside the circle, above and below the centre)
category_centers = {
    'Upregulated': (0, 0.4),
    'Downregulated': (0, -0.4)
}

# Draw central category nodes, with the label offset away from the centre
for cat, (x, y) in category_centers.items():
    color = 'lightgreen' if cat == 'Upregulated' else 'lightsalmon'
    ax.plot(x, y, 'o', markersize=15, color=color)
    ax.text(x, y + 0.1 if cat == 'Upregulated' else y - 0.1, cat, ha='center', fontsize=12)

# Draw gene nodes and connecting lines, using each row's own coordinate
for gene, cat, (x, y) in zip(sorted_genes, sorted_categories, coords):
    cx, cy = category_centers[cat]
    color = 'lightgreen' if cat == 'Upregulated' else 'lightsalmon'

    ax.plot(x, y, 'o', markersize=6, color=color, alpha=0.8)
    ax.plot([cx, x], [cy, y], color=color, alpha=0.6, linewidth=1)
    ax.text(x, y, gene, fontsize=6, ha='center', va='center')

plt.title("Refined Circular Chord-style Plot (Gene Names)", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Circular (polar) bar chart: one equal-height bar per gene, coloured by category.

# Every gene gets the same bar height, so only colour and label distinguish bars.
df_named_filtered_sorted['value'] = 1

# Bar colour per row, looked up from the row's category.
category_colors = {
    'Upregulated': 'lightgreen',
    'Downregulated': 'lightsalmon'
}
bar_colors = df_named_filtered_sorted['category'].map(category_colors).tolist()

# One evenly spaced angle per gene around the full circle.
n_bars = len(df_named_filtered_sorted)
fig, ax = plt.subplots(figsize=(12, 12), subplot_kw=dict(projection='polar'))
theta = np.linspace(0, 2 * np.pi, n_bars, endpoint=False)
radii = df_named_filtered_sorted['value'].tolist()
bars = ax.bar(theta, radii, width=0.04, color=bar_colors, edgecolor='black')

# Gene-name labels just outside the bars, rotated to the bar's angle.
for angle, label in zip(theta, df_named_filtered_sorted['gene_name']):
    on_left_half = np.pi / 2 < angle < 3 * np.pi / 2
    ax.text(
        angle, 1.15, label,
        rotation=np.rad2deg(angle),
        rotation_mode='anchor',
        ha='left' if on_left_half else 'right',
        va='center',
        fontsize=6
    )

# Hide tick labels and leave room above the bars for the gene names.
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_ylim(0, 1.2)
ax.set_title("Circular Barplot: Upregulated and Downregulated Genes (Pastel Colors)", fontsize=14)
plt.tight_layout()
plt.show()


In [None]:
# Same circular bar chart, redrawn with pastel hex colours.
# Reads the 'value' column already present on df_named_filtered_sorted.
pastel_by_category = {
    'Upregulated': '#A8E6CF',     # pastel green
    'Downregulated': '#FFB3BA'    # pastel pink
}
pastel_bar_colors = df_named_filtered_sorted['category'].map(pastel_by_category).tolist()

# One evenly spaced angle per gene around the full circle.
n_bars = len(df_named_filtered_sorted)
fig, ax = plt.subplots(figsize=(12, 12), subplot_kw=dict(projection='polar'))
theta = np.linspace(0, 2 * np.pi, n_bars, endpoint=False)
radii = df_named_filtered_sorted['value'].tolist()
bars = ax.bar(theta, radii, width=0.04, color=pastel_bar_colors, edgecolor='black')

# Gene-name labels just outside the bars, rotated to the bar's angle.
for angle, label in zip(theta, df_named_filtered_sorted['gene_name']):
    on_left_half = np.pi / 2 < angle < 3 * np.pi / 2
    ax.text(
        angle, 1.15, label,
        rotation=np.rad2deg(angle),
        rotation_mode='anchor',
        ha='left' if on_left_half else 'right',
        va='center',
        fontsize=6
    )

# Hide tick labels and leave room above the bars for the gene names.
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_ylim(0, 1.2)

plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Keep only genes whose absolute log fold change exceeds the cutoff.
# The filter is on 'logFC' (effect size), not on expression level.
threshold = 0.1  # absolute logFC cutoff
# .copy() makes filtered_df an independent frame, so adding 'value' below does
# not write into a slice of df_named_filtered_sorted (SettingWithCopyWarning).
filtered_df = df_named_filtered_sorted[df_named_filtered_sorted['logFC'].abs() > threshold].copy()

# Uniform bar height; colour per row comes from its category
filtered_df['value'] = 1  # uniform height
pastel_bar_colors = filtered_df['category'].map({
    'Upregulated': '#A8E6CF',
    'Downregulated': '#FFB3BA'
}).tolist()

# Prepare polar coordinates: one evenly spaced angle per remaining gene
theta = np.linspace(0, 2 * np.pi, len(filtered_df), endpoint=False)
radii = filtered_df['value'].tolist()

# Draw circular barplot
fig, ax = plt.subplots(figsize=(12, 12), subplot_kw={'projection': 'polar'})
bars = ax.bar(theta, radii, width=0.04, color=pastel_bar_colors, edgecolor='black')

# Label every 3rd gene for spacing
for i, (angle, label) in enumerate(zip(theta, filtered_df['gene_name'])):
    if i % 3 == 0:
        rotation = np.rad2deg(angle)
        alignment = 'left' if np.pi/2 < angle < 3*np.pi/2 else 'right'
        ax.text(angle, 1.15, label, rotation=rotation, rotation_mode='anchor',
                ha=alignment, va='center', fontsize=5)

# Clean up formatting: no tick labels, headroom above the bars for the labels
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_ylim(0, 1.2)
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Keep genes whose absolute logFC is above the cutoff, as an independent copy.
threshold = 0.3
passes_cutoff = df_named_filtered_sorted['logFC'].abs() > threshold
filtered_df = df_named_filtered_sorted[passes_cutoff].copy()
filtered_df['value'] = 1  # uniform bar height

# Colour per row, looked up from its category.
pastel_by_category = {
    'Upregulated': '#A8E6CF',
    'Downregulated': '#FFB3BA'
}
pastel_bar_colors = filtered_df['category'].map(pastel_by_category).tolist()

# One evenly spaced angle per remaining gene.
theta = np.linspace(0, 2 * np.pi, len(filtered_df), endpoint=False)
radii = filtered_df['value'].tolist()

# Polar bar chart.
fig, ax = plt.subplots(figsize=(14, 14), subplot_kw=dict(projection='polar'))
bars = ax.bar(theta, radii, width=0.045, color=pastel_bar_colors, edgecolor='black')

# Label every second gene; labels on the left half get an extra 180 degrees
# of rotation and left alignment.
for i, (angle, label) in enumerate(zip(theta, filtered_df['gene_name'])):
    if i % 2 != 0:
        continue
    if np.pi/2 < angle < 3*np.pi/2:
        alignment = 'left'
        rotation = np.rad2deg(angle) + 180
    else:
        alignment = 'right'
        rotation = np.rad2deg(angle)
    ax.text(angle, 1.18, label,
            rotation=rotation,
            rotation_mode='anchor',
            ha=alignment,
            va='center',
            fontsize=6,
            fontweight='bold')

# Hide tick labels and leave headroom for the labels.
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_ylim(0, 1.25)
plt.tight_layout()
plt.show()


In [None]:
# Step 1: Filter annotated genes for S. aureus only
sa_annotated = df_annotated[df_annotated['Organism'].str.contains('Staphylococcus aureus', na=False)]

# Step 2: Map gene_id to primary gene name for S. aureus only (unnamed genes dropped)
sa_gene_name_map = sa_annotated.set_index('gene_id')['Gene Names (primary)'].dropna().to_dict()

# Step 3: Filter the 'all-DEGs' sheet to keep only S. aureus gene IDs
df_sa_only = sheets['all-DEGs'][sheets['all-DEGs']['gene_id'].isin(sa_gene_name_map.keys())].copy()

# Step 4: Add gene name column
df_sa_only['gene_name'] = df_sa_only['gene_id'].map(sa_gene_name_map)

# Step 5: Keep only rows with gene names and relevant categories.
# .copy() makes this an independent frame so the column added in Step 6 is not
# written into a slice of df_sa_only (avoids SettingWithCopyWarning).
df_sa_only_filtered = df_sa_only[
    df_sa_only['category'].isin(['Upregulated', 'Downregulated'])
].dropna(subset=['gene_name']).copy()

# Step 6: Prepare for heatmap: index by gene name, order by category then logFC
df_sa_only_filtered['value'] = 1  # for barplot if needed
heatmap_data = df_sa_only_filtered[['gene_name', 'logFC', 'category']].copy()
heatmap_data = heatmap_data.set_index('gene_name')
heatmap_data = heatmap_data.sort_values(by=['category', 'logFC'])

# Per-row category colours (computed here but not passed to sns.heatmap below)
row_colors = heatmap_data['category'].map({
    'Upregulated': '#A8E6CF',     # pastel green
    'Downregulated': '#FFB3BA'    # pastel pink
})
# Only the numeric logFC column is drawn
heatmap_values = heatmap_data.drop(columns='category')

# Plot
plt.figure(figsize=(6, 14))
sns.heatmap(
    heatmap_values,
    cmap='vlag',
    # Anchor the diverging colormap at zero so its neutral colour marks
    # "no change" and the two hues separate negative from positive logFC,
    # instead of pivoting on the midpoint of the observed range.
    center=0,
    linewidths=0.5,
    linecolor='gray',
    cbar_kws={'label': 'log2 Fold Change'},
    yticklabels=True
)
plt.title("Heatmap of S. aureus Differentially Expressed Genes (log2FC)")
plt.xlabel("Condition")
plt.ylabel("Gene Name")
plt.tight_layout()
plt.show()


In [None]:
# Redraw the logFC heatmap on a taller figure with the sequential 'YlGnBu' colormap.
fig, ax = plt.subplots(figsize=(8, 40))
sns.heatmap(
    heatmap_values,
    cmap='YlGnBu',
    linewidths=0.5,
    linecolor='gray',
    cbar_kws={'label': 'log2 Fold Change'},
    yticklabels=True,  # show a label for every row
    ax=ax
)
ax.set_title("Heatmap of S. aureus DEGs (Blue-Green-Yellow Color Scale)", fontsize=14)
ax.set_xlabel("Condition")
ax.set_ylabel("Gene Name")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Take gene_id plus the 'SA' and 'SA-E' columns from the S. aureus DEG table,
# drop rows with any missing value, and rename 'SA-E' to 'SA_E'.
df_sa_pca = (
    df_sa_only[['gene_id', 'SA', 'SA-E']]
    .dropna()
    .rename(columns={'SA-E': 'SA_E'})
)

# Standardise the two columns before PCA.
X = df_sa_pca[['SA', 'SA_E']]
X_scaled = StandardScaler().fit_transform(X)

# Two-component PCA; each row (gene) becomes one point.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)

# Component scores alongside the gene_id of each row.
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
pca_df['gene_id'] = df_sa_pca['gene_id'].to_numpy()

# Scatter of PC1 vs PC2 with explained-variance percentages on the axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', alpha=0.7, ax=ax)
ax.set_title('PCA of S. aureus Gene Expression (SA vs SA-E)')
ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.2f}% variance)')
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.2f}% variance)')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# gene_id -> category lookup built from the Up/Down-regulated S. aureus rows;
# gene_ids absent from it end up with a missing category in pca_df.
gene_category_map = dict(zip(df_sa_only_filtered['gene_id'], df_sa_only_filtered['category']))
pca_df['category'] = pca_df['gene_id'].map(gene_category_map)

# Colour used for each category in the scatter.
category_palette = {
    'Upregulated': '#A8E6CF',     # pastel green
    'Downregulated': '#FFB3BA'    # pastel pink
}

# PCA scatter coloured by DEG category.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='category',
                palette=category_palette, alpha=0.8, ax=ax)

ax.set_title('PCA of S. aureus Gene Expression (Colored by DEG Category)')
ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.2f}% variance)')
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.2f}% variance)')
ax.legend(title='Category')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Install required library
# !pip install umap-learn

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import umap

# Load your Excel file
file_path = "/content/SA-E vs SA.xlsx"
xls = pd.ExcelFile(file_path)

df_all_degs = xls.parse('all-DEGs')
df_annotated = xls.parse('Annotated-only')

# Filter for Staphylococcus aureus genes and attach their primary gene names
df_annotated_sa = df_annotated[df_annotated['Organism'].str.contains('Staphylococcus aureus', na=False)]
gene_map = df_annotated_sa.set_index('gene_id')['Gene Names (primary)'].dropna().to_dict()
df_sa_only = df_all_degs[df_all_degs['gene_id'].isin(gene_map.keys())].copy()
df_sa_only['gene_name'] = df_sa_only['gene_id'].map(gene_map)

# Only up and down regulated genes
df_sa_filtered = df_sa_only[df_sa_only['category'].isin(['Upregulated', 'Downregulated'])]
# Drop rows lacking a gene name or either value used as a UMAP feature, so no
# NaN is carried into scaling and embedding. Frames without missing values
# are unaffected.
df_sa_filtered = df_sa_filtered.dropna(subset=['gene_name', 'SA', 'SA-E'])

# Prepare data for UMAP: standardise the two feature columns
X = df_sa_filtered[['SA', 'SA-E']]
X_scaled = StandardScaler().fit_transform(X)

# Run UMAP with a fixed seed
umap_model = umap.UMAP(random_state=42)
umap_result = umap_model.fit_transform(X_scaled)

# Prepare UMAP plot dataframe; .values keeps rows aligned by position with X
df_umap = pd.DataFrame(umap_result, columns=['UMAP1', 'UMAP2'])
df_umap['gene_name'] = df_sa_filtered['gene_name'].values
df_umap['category'] = df_sa_filtered['category'].values

# Plot UMAP, coloured by category with the same round marker for both
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df_umap,
    x='UMAP1',
    y='UMAP2',
    hue='category',
    palette={
        'Upregulated': '#A8E6CF',
        'Downregulated': '#FFB3BA'
    },
    style='category',
    markers=['o', 'o'],
    s=50,
    alpha=0.8
)
plt.title("UMAP of S. aureus Gene Expression")
plt.xlabel("UMAP1")
plt.ylabel("UMAP2")
plt.legend(title='Category')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# !pip install plotly pandas openpyxl

import plotly.graph_objects as go
import pandas as pd

# Load Excel file
xls = pd.ExcelFile("/content/SA-E vs SA.xlsx")
df_degs = xls.parse("all-DEGs")
df_annot = xls.parse("Annotated-only")

# Filter for S. aureus genes and attach their primary gene names
df_annot_sa = df_annot[df_annot['Organism'].str.contains('Staphylococcus aureus', na=False)]
gene_map = df_annot_sa.set_index('gene_id')['Gene Names (primary)'].dropna().to_dict()
df_sa = df_degs[df_degs['gene_id'].isin(gene_map.keys())].copy()
df_sa['gene_name'] = df_sa['gene_id'].map(gene_map)

# Keep only Up/Down regulated rows that have a gene name
df_sankey = df_sa[df_sa['category'].isin(['Upregulated', 'Downregulated'])]
df_sankey = df_sankey.dropna(subset=['gene_name'])

# Take top 20 genes by absolute logFC
top_genes = df_sankey.reindex(df_sankey['logFC'].abs().sort_values(ascending=False).index).head(20)
gene_labels = top_genes['gene_name'].tolist()
categories = top_genes['category'].tolist()

# Build Sankey structure.
# dict.fromkeys de-duplicates while keeping first-seen order (genes ranked by
# |logFC|, then categories). A plain set() would give a node order that can
# change from one kernel session to the next, making the figure irreproducible.
nodes = list(dict.fromkeys(gene_labels + categories))
node_indices = {node: i for i, node in enumerate(nodes)}
# One link per selected row: gene node -> category node, each with weight 1
sources = [node_indices[gene] for gene in gene_labels]
targets = [node_indices[cat] for cat in categories]
values = [1] * len(gene_labels)

# Define pastel colors: category nodes are coloured, gene nodes are light grey
category_node_colors = {
    'Upregulated': '#A8E6CF',    # pastel green
    'Downregulated': '#FFB3BA',  # pastel pink
}
node_colors = [category_node_colors.get(node, 'lightgrey') for node in nodes]

# Sankey plot
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=15,
        line=dict(color="black", width=0.5),
        label=nodes,
        color=node_colors
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values,
        color="rgba(160,160,160,0.4)"
    )
)])

fig.update_layout(title_text="Sankey Diagram: Gene Expression Categories", font_size=12)
fig.show()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Open the SA-P vs SA workbook and read the two sheets used below.
file_path = "/content/SA-P vs SA.xlsx"  # Update path as needed
xls = pd.ExcelFile(file_path)

df_annotated = xls.parse("Annotated-only")
df_degs = xls.parse("All-DEGs")

# Annotation rows whose 'Organism' text mentions Staphylococcus aureus.
sa_rows = df_annotated['Organism'].str.contains("Staphylococcus aureus", na=False)
df_sa_annotated = df_annotated[sa_rows]

# Inner join on gene_id; the '_x' columns used below come from the annotated sheet.
df_merged = df_sa_annotated.merge(df_degs, on='gene_id')

# Downregulated genes only.
df_down = df_merged[df_merged['category_x'] == 'Downregulated']

# The 100 rows with the lowest logFC.
df_down_top = df_down.sort_values(by='logFC_x').head(100)

# 'SA' and 'SA-P' values indexed by primary gene name, as log2(value + 1).
heatmap_data = np.log2(df_down_top.set_index('Gene Names (primary)')[['SA_x', 'SA-P_x']] + 1)

# Heatmap of the transformed values.
fig, ax = plt.subplots(figsize=(8, 10))
sns.heatmap(heatmap_data, annot=False, cmap='YlGnBu', linewidths=0.5, ax=ax)
ax.set_title('Top Downregulated Genes in S. aureus (log2-transformed expression)')
ax.set_xlabel('Condition')
ax.set_ylabel('Gene Name (Primary)')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Open the SA-C vs SA workbook and read the two sheets used below.
file_path = "/content/SA-C vs SA.xlsx"  # Update path as needed
xls = pd.ExcelFile(file_path)

df_annotated = xls.parse("Annotated-only")
df_degs = xls.parse("All-DEGs")

# Annotation rows whose 'Organism' text mentions Staphylococcus aureus.
sa_rows = df_annotated['Organism'].str.contains("Staphylococcus aureus", na=False)
df_sa_annotated = df_annotated[sa_rows]

# Inner join on gene_id; the '_x' columns used below come from the annotated sheet.
df_merged = df_sa_annotated.merge(df_degs, on='gene_id')

# Split into downregulated and upregulated genes.
df_down = df_merged[df_merged['category_x'] == 'Downregulated']
df_up = df_merged[df_merged['category_x'] == 'Upregulated']

# 50 lowest-logFC downregulated rows and 50 highest-logFC upregulated rows.
df_down_top = df_down.sort_values(by='logFC_x').head(50)
df_up_top = df_up.sort_values(by='logFC_x', ascending=False).head(50)

# Stack them: downregulated block first, upregulated block second.
df_combined = pd.concat([df_down_top, df_up_top])

# 'SA' and 'SA-C' values indexed by primary gene name, as log2(value + 1).
heatmap_data = np.log2(df_combined.set_index('Gene Names (primary)')[['SA_x', 'SA-C_x']] + 1)

# Heatmap of the transformed values.
fig, ax = plt.subplots(figsize=(10, 14))
sns.heatmap(heatmap_data, annot=False, cmap='YlGnBu', linewidths=0.5, ax=ax)
ax.set_title('Top 50 Up & Downregulated Genes in S. aureus (log2-transformed expression)')
ax.set_xlabel('Condition')
ax.set_ylabel('Gene Name (Primary)')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Open the SA-E vs SA workbook and read the two sheets used below.
file_path = "/content/SA-E vs SA.xlsx"  # Update path as needed
xls = pd.ExcelFile(file_path)

df_annotated = xls.parse("Annotated-only")
df_degs = xls.parse("all-DEGs")

# Annotation rows whose 'Organism' text mentions Staphylococcus aureus.
sa_rows = df_annotated['Organism'].str.contains("Staphylococcus aureus", na=False)
df_sa_annotated = df_annotated[sa_rows]

# Inner join on gene_id; the '_x' columns used below come from the annotated sheet.
df_merged = df_sa_annotated.merge(df_degs, on='gene_id')

# Split into downregulated and upregulated genes.
df_down = df_merged[df_merged['category_x'] == 'Downregulated']
df_up = df_merged[df_merged['category_x'] == 'Upregulated']

# 50 lowest-logFC downregulated rows and 50 highest-logFC upregulated rows.
df_down_top = df_down.sort_values(by='logFC_x').head(50)
df_up_top = df_up.sort_values(by='logFC_x', ascending=False).head(50)

# Stack them: downregulated block first, upregulated block second.
df_combined = pd.concat([df_down_top, df_up_top])

# 'SA' and 'SA-E' values indexed by primary gene name, as log2(value + 1).
heatmap_data = np.log2(df_combined.set_index('Gene Names (primary)')[['SA_x', 'SA-E_x']] + 1)

# Heatmap of the transformed values.
fig, ax = plt.subplots(figsize=(10, 14))
sns.heatmap(heatmap_data, annot=False, cmap='YlGnBu', linewidths=0.5, ax=ax)
ax.set_title('Top 50 Up & Downregulated Genes in S. aureus (log2-transformed expression)')
ax.set_xlabel('Condition')
ax.set_ylabel('Gene Name (Primary)')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Open the SA-E vs SA workbook.
file_path = "/content/SA-E vs SA.xlsx"  # <-- Update path if needed
xls = pd.ExcelFile(file_path)

# Annotated sheet and its S. aureus subset (the subset is not used by the plot below).
df_annotated = xls.parse("Annotated-only")
staph_rows = df_annotated['Organism'].str.contains("Staphylococcus aureus", na=False)
df_staph = df_annotated[staph_rows]

# GO term counts; the columns are renamed by position.
# NOTE(review): assumes the sheet has exactly three columns in this order - confirm.
df_go = xls.parse("count of GO-terms")
df_go.columns = ['GO Term', 'Count', 'Category']

# Five highest-count terms within each GO category.
df_bp = df_go.loc[df_go['Category'].eq('BP')].sort_values(by='Count', ascending=False).head(5)
df_mf = df_go.loc[df_go['Category'].eq('MF')].sort_values(by='Count', ascending=False).head(5)
df_cc = df_go.loc[df_go['Category'].eq('CC')].sort_values(by='Count', ascending=False).head(5)

# Re-stamp the category label on each subset.
df_bp = df_bp.assign(Category='BP')
df_mf = df_mf.assign(Category='MF')
df_cc = df_cc.assign(Category='CC')

# Stack the three subsets in BP, MF, CC order with a fresh index.
df_go_top5_combined = pd.concat([df_bp, df_mf, df_cc], ignore_index=True)

# Horizontal bars: count per GO term, coloured by category.
fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(
    data=df_go_top5_combined,
    x='Count',
    y='GO Term',
    hue='Category',
    palette='Set2',
    ax=ax
)
ax.set_xlabel('Gene Count')
ax.set_ylabel('GO Term')
ax.set_title('Top 5 GO Terms by Category (BP, MF, CC) - SA-E vs SA')
ax.legend(title='GO Category')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load Excel file.
# Absolute path, matching the other cells that read this workbook; a bare
# relative name only resolves when the kernel's working directory is /content.
file_path = "/content/SA-C vs SA.xlsx"  # Update if path changes
xls = pd.ExcelFile(file_path)

# Load the annotated sheet and filter for Staphylococcus aureus
# (df_staph is not used by the plot below)
df_annotated = xls.parse("Annotated-only")
df_staph = df_annotated[df_annotated['Organism'].str.contains("Staphylococcus aureus", na=False)]

# Load the GO term count sheet
df_go = xls.parse("Count-of GO-terms")

# Rename columns for consistency (by position).
# NOTE(review): assumes the sheet has exactly three columns in this order - confirm.
df_go.columns = ['GO Term', 'Count', 'Category']

# Get top 5 terms for each GO category. Each subset is already filtered on its
# own Category value, so the label needs no re-assignment afterwards (writing
# into these head() slices would only risk a SettingWithCopyWarning).
df_bp = df_go[df_go['Category'] == 'BP'].sort_values(by='Count', ascending=False).head(5)
df_mf = df_go[df_go['Category'] == 'MF'].sort_values(by='Count', ascending=False).head(5)
df_cc = df_go[df_go['Category'] == 'CC'].sort_values(by='Count', ascending=False).head(5)

# Combine into one dataframe, in BP, MF, CC order with a fresh index
df_go_top5_combined = pd.concat([df_bp, df_mf, df_cc], ignore_index=True)

# Plot the GO terms: count per term, coloured by category
plt.figure(figsize=(12, 7))
sns.barplot(
    data=df_go_top5_combined,
    x='Count',
    y='GO Term',
    hue='Category',
    palette='Set2'
)
plt.xlabel('Gene Count')
plt.ylabel('GO Term')
plt.title('Top 5 GO Terms by Category (BP, MF, CC) - SA-C vs SA')
plt.legend(title='GO Category')
plt.tight_layout()
plt.show()
