In [None]:
import pandas as pd
from sklearn.cluster import KMeans, Birch
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

In [None]:
# Read the clustering dataset; the first row of the file supplies the column names
csv_path = '/content/Clustering.csv'
data = pd.read_csv(csv_path, header=0)

In [None]:
# Display the loaded DataFrame as the cell output for a quick visual check
data

In [None]:
# Columns used as clustering inputs, in the order the plots below rely on
feature_columns = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']
features = data[feature_columns]

In [None]:
# Keep only complete rows: a sample with any missing measurement is discarded.
# The surviving index labels are reused later to write cluster labels back into `data`.
features = features.dropna(axis=0, how='any')



In [None]:
# Standardize every feature column so that no single one dominates
# the distance-based methods used below
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the standardized features onto their first two principal components
pca = PCA(n_components=2)
pca_features = pca.fit_transform(scaled_features)

# One marker / colour pair per feature column, in column order
markers = ['o', 's', '^', 'D', 'v', '*']
colors = ['r', 'g', 'b', 'c', 'm', 'y']

# Column position of the largest raw (unscaled) value in each row
dominant_feature = features.values.argmax(axis=1)

plt.figure(figsize=(8, 6))

# One scatter series per feature: the samples whose largest raw value is that feature
for i, (feature, marker_style, point_color) in enumerate(zip(features.columns, markers, colors)):
    mask = dominant_feature == i
    xs = pca_features[mask, 0]
    ys = pca_features[mask, 1]
    plt.scatter(xs, ys, marker=marker_style, color=point_color, label=feature, alpha=0.6)

plt.title('PCA Visualization with Feature-Based Symbols')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()


In [None]:
pip install umap-learn


In [None]:
import numpy as np
import umap
import matplotlib.pyplot as plt

# Embed the standardized features in two dimensions with a seeded UMAP model
umap_model = umap.UMAP(n_components=2, random_state=42)
umap_features = umap_model.fit_transform(scaled_features)

# One marker / colour pair per feature column, in column order
markers = ['o', 's', '^', 'D', 'v', '*']
colors = ['r', 'g', 'b', 'c', 'm', 'y']

# Column position of the largest raw (unscaled) value in each row
dominant_feature = features.values.argmax(axis=1)

plt.figure(figsize=(8, 6))

# One scatter series per feature: the samples whose largest raw value is that feature
for i, (feature, marker_style, point_color) in enumerate(zip(features.columns, markers, colors)):
    mask = dominant_feature == i
    xs = umap_features[mask, 0]
    ys = umap_features[mask, 1]
    plt.scatter(xs, ys, marker=marker_style, color=point_color, label=feature, alpha=0.7)

# Titles, axis labels and tick sizes (no grid is drawn)
plt.title('UMAP Visualization of Features', fontsize=16)
plt.xlabel('UMAP Component 1', fontsize=14)
plt.ylabel('UMAP Component 2', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.legend(fontsize=12)

plt.show()


In [None]:
import numpy as np
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

# Ward linkage on the standardized features
linkage_matrix = sch.linkage(scaled_features, method='ward')

plt.figure(figsize=(10, 6))

# Leaves are labelled with the row labels of `features`;
# branches below a distance of 5 are coloured per cluster
dendrogram_options = dict(labels=features.index, leaf_rotation=90, leaf_font_size=10, color_threshold=5)
sch.dendrogram(linkage_matrix, **dendrogram_options)

plt.title('Hierarchical Clustering Dendrogram', fontsize=16)
plt.xlabel('Samples', fontsize=14)
plt.ylabel('Cluster Distance', fontsize=14)

plt.show()


In [None]:
# --- KMeans Clustering ---
# n_init is set explicitly so the number of restarts does not depend on the installed
# scikit-learn version's default; 10 matches the later K-Means cell in this notebook.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)  # You can change the number of clusters
kmeans_labels = kmeans.fit_predict(scaled_features)

In [None]:
# Write the K-Means labels back onto the full table.
# Rows that were dropped before clustering keep the placeholder value.
PLACEHOLDER_CLUSTER = -1
data['KMeans_Cluster'] = PLACEHOLDER_CLUSTER

# `features.index` holds the labels of exactly the rows that were clustered
clustered_rows = features.index
data.loc[clustered_rows, 'KMeans_Cluster'] = kmeans_labels

In [None]:
# Count how often each EsxA value occurs within every K-Means cluster
esxa_counts_by_cluster = data.groupby('KMeans_Cluster')['EsxA'].value_counts()
print(esxa_counts_by_cluster)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of EsxA per K-Means cluster (placeholder cluster -1 included when present)
plt.figure(figsize=(8, 6))
sns.boxplot(data=data, x='KMeans_Cluster', y='EsxA')
plt.title('Distribution of EsxA Expression Across Clusters')
plt.xlabel('Cluster')
plt.ylabel('EsxA Expression')
plt.show()


In [None]:
from sklearn.decomposition import PCA

# PCA is scale-sensitive, so project the same standardized matrix that K-Means was
# fitted on; fitting on the raw `features` would let the column with the largest
# numeric range dominate the components and misrepresent the clusters.
pca = PCA(n_components=2)
pca_features = pca.fit_transform(scaled_features)  # all six features, EsxA included

# Cluster labels for exactly the rows that were clustered (same row order as scaled_features)
cluster_labels_for_pca = data.loc[features.index, 'KMeans_Cluster']

plt.figure(figsize=(8, 6))
# Colour by the aligned subset of labels, not by the full data['KMeans_Cluster'] column
plt.scatter(pca_features[:, 0], pca_features[:, 1], c=cluster_labels_for_pca, cmap='viridis', edgecolors='k')

plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('K-Means Clustering with PCA')
plt.colorbar(label='Cluster')
plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# ... (rest of your code) ...

# Plot K-Means clustering with PCA
print("Explained Variance Ratios:", pca.explained_variance_ratio_)
plt.figure(figsize=(8, 6))
plt.scatter(pca_features[:, 0], pca_features[:, 1], c=cluster_labels_for_pca, cmap='viridis', edgecolors='k', alpha=0.7, label='Clusters')

# Mark each feature direction in PCA space
feature_vectors = pca.components_.T  # Get the principal components
feature_names = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']
# Use valid marker styles instead of single letters
markers = ['o', 's', '^', 'd', 'v', 'x']  # Example: 'o' for circle, 's' for square, etc.

for i, (feature, marker) in enumerate(zip(feature_names, markers)):
    plt.scatter(feature_vectors[i, 0] * max(pca_features[:, 0]),
                feature_vectors[i, 1] * max(pca_features[:, 1]),
                marker=marker, s=100, label=feature, edgecolors='black')

plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('K-Means Clustering with PCA & Feature Marking')
plt.colorbar(label='Cluster')
plt.legend()
plt.show()

In [None]:
# Save the plot with high quality
plt.savefig('pca_kmeans_clustering.png', dpi=300,)

In [None]:
from sklearn.manifold import TSNE
!pip install umap-learn  # Install umap-learn
# Instead of:
# import umap.umap_ as umap
import umap # Import umap instead of umap.umap_
import matplotlib.pyplot as plt

# --- t-SNE ---
# scikit-learn requires perplexity < n_samples; cap it so a small dataset does not
# raise. With more than 30 samples this is the original perplexity of 30.
tsne_perplexity = min(30, len(scaled_features) - 1)
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
tsne_features = tsne.fit_transform(scaled_features)

# Samples in the 2-D embedding, coloured by their K-Means cluster
plt.figure(figsize=(8, 6))
plt.scatter(tsne_features[:, 0], tsne_features[:, 1], c=cluster_labels_for_pca, cmap='viridis', edgecolors='k')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.title('t-SNE Clustering Visualization')
plt.colorbar(label='Cluster')
plt.show()



In [None]:
# --- t-SNE ---
# scikit-learn requires perplexity < n_samples; cap it so a small dataset does not
# raise. With more than 30 samples this is the original perplexity of 30.
tsne_perplexity = min(30, len(scaled_features) - 1)
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
tsne_features = tsne.fit_transform(scaled_features)

plt.figure(figsize=(8, 6))
cluster_scatter = plt.scatter(tsne_features[:, 0], tsne_features[:, 1], c=cluster_labels_for_pca, cmap='viridis', edgecolors='k', alpha=0.7, label='Clusters')

# Add one legend marker per feature (uses `feature_names` and `markers` from the PCA cell).
# NOTE(review): each marker sits at the embedding mean plus i * 0.5; the positions are
# not computed from the feature values, so they do not show a feature "direction".
centre_x = np.mean(tsne_features[:, 0])
centre_y = np.mean(tsne_features[:, 1])
for i, (feature, marker) in enumerate(zip(feature_names, markers)):
    plt.scatter(centre_x + i * 0.5, centre_y + i * 0.5,
                marker=marker, s=100, label=feature, edgecolors='black')

plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.title('t-SNE Clustering Visualization with Feature Marking')
# Pass the cluster scatter explicitly: without it plt.colorbar() attaches to the most
# recent plt.scatter call, which here is a feature marker that carries no colour data.
plt.colorbar(cluster_scatter, label='Cluster')
plt.legend()
plt.show()

In [None]:
# --- UMAP ---
# Seeded 2-D UMAP embedding of the standardized features
umap_reducer = umap.UMAP(n_components=2, random_state=42)
umap_features = umap_reducer.fit_transform(scaled_features)

# Samples in the embedding, coloured by their K-Means cluster
plt.figure(figsize=(8, 6))
embedding_x = umap_features[:, 0]
embedding_y = umap_features[:, 1]
plt.scatter(embedding_x, embedding_y, c=cluster_labels_for_pca, cmap='viridis', edgecolors='k')
plt.title('UMAP Clustering Visualization')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.colorbar(label='Cluster')
plt.show()


In [None]:
# --- UMAP ---
umap_reducer = umap.UMAP(n_components=2, random_state=42)
umap_features = umap_reducer.fit_transform(scaled_features)

plt.figure(figsize=(8, 6))
cluster_scatter = plt.scatter(umap_features[:, 0], umap_features[:, 1], c=cluster_labels_for_pca, cmap='viridis', edgecolors='k', alpha=0.7, label='Clusters')

# Add one legend marker per feature (uses `feature_names` and `markers` from the PCA cell).
# NOTE(review): each marker sits at the embedding mean plus i * 0.5; the positions are
# not computed from the feature values, so they do not show a feature "direction".
centre_x = np.mean(umap_features[:, 0])
centre_y = np.mean(umap_features[:, 1])
for i, (feature, marker) in enumerate(zip(feature_names, markers)):
    plt.scatter(centre_x + i * 0.5, centre_y + i * 0.5,
                marker=marker, s=100, label=feature, edgecolors='black')

plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.title('UMAP Clustering Visualization with Feature Marking')
# Pass the cluster scatter explicitly: without it plt.colorbar() attaches to the most
# recent plt.scatter call, which here is a feature marker that carries no colour data.
plt.colorbar(cluster_scatter, label='Cluster')
plt.legend()
plt.show()

In [None]:
# EsxA per cluster as a seaborn line plot (seaborn aggregates the samples that share an x value)

plt.figure(figsize=(8, 6))
sns.lineplot(data=data, x='KMeans_Cluster', y='EsxA', marker="o", label="EsxA Expression")

plt.title('Trend of EsxA Expression Across Clusters')
plt.xlabel('Cluster')
plt.ylabel('EsxA Expression')
# One tick per cluster label actually present in the data
cluster_ticks = sorted(data['KMeans_Cluster'].unique())
plt.xticks(ticks=cluster_ticks)
plt.grid(True)
plt.legend()
plt.show()


In [None]:
plt.figure(figsize=(10, 6))

# One single-point series per sample: x is the sample's cluster as a text
# category, y is its EsxA value
for sample in data.index:
    cluster_label = 'Cluster ' + str(data.at[sample, 'KMeans_Cluster'])
    esxa_value = data.at[sample, 'EsxA']
    plt.plot([cluster_label], [esxa_value], marker="o", linestyle="-", alpha=0.7)

plt.title('EsxA Expression Trends for All Samples Across Clusters')
plt.xlabel('Cluster')
plt.ylabel('EsxA Expression')
plt.grid(True)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))

# Sorted cluster labels present in the data; used for the x-axis ticks
unique_clusters = sorted(data['KMeans_Cluster'].unique())

# One single-point series per sample so that every sample gets its own legend entry
for sample in data.index:
    cluster = data.at[sample, 'KMeans_Cluster']
    esxA_value = data.at[sample, 'EsxA']
    sample_label = f"Sample {sample}"
    plt.plot(cluster, esxA_value, marker="o", linestyle="-", alpha=0.7, label=sample_label)

plt.title('EsxA Expression Trends for All Samples Across Clusters')
plt.xlabel('Cluster')
plt.ylabel('EsxA Expression')
plt.xticks(ticks=unique_clusters)
# Two-column legend anchored outside the right edge of the axes
plt.legend(title="Samples", bbox_to_anchor=(1.05, 1), loc='upper left', ncol=2)
plt.grid(True)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))

# Sort data by clusters for correct plotting
sorted_data = data.sort_values(by='KMeans_Cluster')

# Draw the cluster-ordered EsxA series once. The previous per-sample loop never used
# its loop variable and re-plotted this identical full series for every row.
plt.plot(sorted_data['KMeans_Cluster'], sorted_data['EsxA'], marker="o", linestyle="-", alpha=0.7)

plt.xlabel('Cluster')
plt.ylabel('EsxA Expression')
plt.title('EsxA Expression Trends for All Samples Across Clusters')
plt.xticks(sorted(data['KMeans_Cluster'].unique()))  # One tick per cluster label
plt.grid(True)

plt.show()


In [None]:
import seaborn as sns

# Plot from a copy whose cluster column is text, so seaborn treats the clusters as
# categories. `data` itself keeps its integer labels; converting it in place forces
# later cells to cast the column back before they can use it for colour mapping.
pairplot_data = data.assign(KMeans_Cluster=data['KMeans_Cluster'].astype(str))
sns.pairplot(pairplot_data, hue='KMeans_Cluster', diag_kind='kde', palette='viridis')
plt.show()


In [None]:
import seaborn as sns

# Two annotated heatmaps of the PCA coordinate matrix, one per title.
# NOTE(review): both figures draw the same `pca_features` array; neither shows
# cluster predictions despite the titles - confirm what was intended.
for heatmap_title in ("Kmeans Cluster Prediction", "Hierarchical Clustering Cluster Prediction"):
    plt.figure(figsize=(12, 12))
    sns.heatmap(pca_features, annot=True)
    plt.title(heatmap_title, fontsize =10)

In [None]:
import matplotlib.pyplot as plt

# Scatter every virulence factor against EsxA, coloured by K-Means cluster.
# One loop replaces five copy-pasted cells that differed only in the column name.
for factor in ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin']:
    plt.figure(figsize=(8, 6))
    # Cast to int so colour mapping also works if the cluster column holds strings
    plt.scatter(data[factor], data['EsxA'], c=data['KMeans_Cluster'].astype(int), cmap='viridis', edgecolors='k')

    plt.xlabel(factor)
    plt.ylabel('EsxA Expression')
    plt.title(f'K-Means Clustering: {factor} vs EsxA')
    plt.colorbar(label='Cluster')
    plt.show()

In [None]:
# --- Hierarchical Clustering ---
# Ward linkage on the standardized features, drawn with default leaf labels
linked = linkage(scaled_features, method='ward')

plt.figure(figsize=(10, 7))
dendrogram(linked)
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.title('Hierarchical Clustering Dendrogram')
plt.show()

In [None]:
# BIRCH Clustering on the standardized features (3 clusters)
birch = Birch(n_clusters=3)
birch_labels = birch.fit_predict(scaled_features)

# Rows that were dropped before clustering keep the placeholder value
BIRCH_PLACEHOLDER = -1
data['BIRCH_Cluster'] = BIRCH_PLACEHOLDER

# `features.index` holds the labels of exactly the rows that were clustered
clustered_rows = features.index
data.loc[clustered_rows, 'BIRCH_Cluster'] = birch_labels

# --- Preview the table with its cluster columns ---
print(data.head())

# Write the labelled table to disk without the row index
data.to_csv('clustered_data.csv', index=False)

In [None]:
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer  # Import the imputer

# Select the six feature columns for every row of `data`.
# NOTE(review): this rebinds `features` (previously the NaN-free subset) and also
# overwrites `pca` / `pca_features`; earlier cells re-run after this one will see
# the new objects.
feature_columns = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']
features = data[feature_columns]

# Fill missing values with the column mean so every row can be projected
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Two-component PCA of the imputed (unscaled) features
pca = PCA(n_components=2)
pca_features = pca.fit_transform(features_imputed)

# Rows coloured by BIRCH label; rows that were not clustered carry the placeholder -1
plt.figure(figsize=(8, 6))
component_1 = pca_features[:, 0]
component_2 = pca_features[:, 1]
plt.scatter(component_1, component_2, c=data['BIRCH_Cluster'], cmap='viridis', edgecolors='k')

plt.title('PCA Visualization of BIRCH Clusters')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster')
plt.show()

In [None]:
import seaborn as sns

# Plot from a copy whose BIRCH column is text, so seaborn treats the clusters as
# categories. `data` keeps its integer labels, so the PCA scatter above (which colours
# by data['BIRCH_Cluster']) still works if it is re-run after this cell.
birch_pairplot_data = data.assign(BIRCH_Cluster=data['BIRCH_Cluster'].astype(str))

sns.pairplot(birch_pairplot_data, hue='BIRCH_Cluster', diag_kind='kde', palette='viridis')
plt.show()


###############

In [None]:
pip install pandas seaborn matplotlib


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Reload the dataset (this replaces the `data` table that carried the cluster columns)
data = pd.read_csv('/content/Clustering.csv', header=0)

# Staphyloxanthin is shown on its own strip; every other column goes into the main heatmap
staphyloxanthin_scores = data[['Staphyloxanthin']]
numerical_data = data.drop(columns=['Staphyloxanthin'])

# Main heatmap: one row per sample, one column per remaining variable
plt.figure(figsize=(10, 8))
main_heatmap_options = dict(cmap='YlGnBu', linewidths=0.5, fmt='.2f', cbar=True)
sns.heatmap(numerical_data, **main_heatmap_options)
plt.title('Heatmap of Virulence Factors')
plt.show()

# Staphyloxanthin strip: transposed so the samples run along the x-axis
plt.figure(figsize=(10, 1))
score_row = staphyloxanthin_scores.T
sns.heatmap(score_row, cmap='viridis', cbar=False, fmt='d')
plt.title('Staphyloxanthin Scores')
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage
from sklearn.impute import SimpleImputer  # Import SimpleImputer

# Read the CSV file
data = pd.read_csv('/content/Clustering.csv', header=0)

# Separate Staphyloxanthin from other features
numerical_data = data.drop(columns=['Staphyloxanthin'])
staphyloxanthin_scores = data[['Staphyloxanthin']]  # Kept separately; not used in this cell

# Replace missing values with the column mean, then restore the column names
imputer = SimpleImputer(strategy='mean')
numerical_data_imputed = pd.DataFrame(imputer.fit_transform(numerical_data),
                                      columns=numerical_data.columns)

# Ward linkage over samples (rows) and over variables (columns)
row_linkage = linkage(numerical_data_imputed, method='ward')
col_linkage = linkage(numerical_data_imputed.T, method='ward')

# Clustered heatmap ordered by the precomputed linkages
g = sns.clustermap(numerical_data_imputed, cmap='YlGnBu', linewidths=0.5,
                   row_linkage=row_linkage, col_linkage=col_linkage,
                   figsize=(10, 8), cbar=True, fmt='.2f')

# clustermap builds a multi-axes figure, and plt.title() would title whichever axes
# pyplot considers current rather than the heatmap. Title the column-dendrogram axes,
# which sit directly above the heatmap.
g.ax_col_dendrogram.set_title('Hierarchical Clustering Heatmap', fontsize=14)
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage
from sklearn.impute import SimpleImputer

# Read the CSV file
data = pd.read_csv('/content/Clustering.csv', header=0)

# Separate Staphyloxanthin from other features
numerical_data = data.drop(columns=['Staphyloxanthin'])
staphyloxanthin_scores = data[['Staphyloxanthin']]  # Kept separately; not used in this cell

# Replace missing values with the column mean, then restore column names and row labels
imputer = SimpleImputer(strategy='mean')
numerical_data_imputed = pd.DataFrame(imputer.fit_transform(numerical_data),
                                      columns=numerical_data.columns,
                                      index=numerical_data.index)

# Ward linkage over samples (rows) and over variables (columns)
row_linkage = linkage(numerical_data_imputed, method='ward')
col_linkage = linkage(numerical_data_imputed.T, method='ward')

# Clustered heatmap with every row and column label shown
g = sns.clustermap(numerical_data_imputed, cmap='YlGnBu', linewidths=0.5,
                   row_linkage=row_linkage, col_linkage=col_linkage,
                   figsize=(12, 10), cbar=True, annot=False, fmt='.2f',
                   dendrogram_ratio=(0.1, 0.2),  # Share of the figure given to the dendrograms
                   xticklabels=True, yticklabels=True)

# clustermap builds a multi-axes figure, and plt.title() would title whichever axes
# pyplot considers current rather than the heatmap. Title the column-dendrogram axes,
# which sit directly above the heatmap.
g.ax_col_dendrogram.set_title('Hierarchical Clustering Heatmap', fontsize=14)
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer # Import SimpleImputer

# Load your CSV file
df = pd.read_csv('/content/Clustering.csv', header=0)

# Number of clusters, matching the earlier K-Means run
n_clusters = 3

# Select the feature columns by name, as the rest of the notebook does, instead of
# dropping the first column by position: positional slicing silently loses a feature
# when the file has no leading ID column.
feature_columns = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']

# Fill missing values with the mean of each column
imputer = SimpleImputer(strategy='mean')
data_imputed = imputer.fit_transform(df[feature_columns])

# NOTE(review): clustering here runs on imputed but unscaled values, unlike the
# earlier K-Means cell that used standardized features - confirm this is intended.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["Cluster"] = kmeans.fit_predict(data_imputed)

# Long format for the violin plot: one row per (sample, feature) pair
df_melted = df.melt(id_vars=["Cluster"], value_vars=feature_columns,
                    var_name="Feature", value_name="Value")

# Violin plots for all features, split by cluster
plt.figure(figsize=(12, 6))
sns.violinplot(x="Feature", y="Value", hue="Cluster", data=df_melted, palette="Set2")
plt.xticks(rotation=45)
plt.title("Violin Plot of Features Across Clusters")
plt.legend(title="Cluster")
plt.show()

##########################

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Predict EsxA from the other five virulence factors
predictor_columns = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin']

# `data` was re-read from the CSV above and may contain missing values (earlier cells
# drop or impute them). Fit only on complete rows so the model never receives NaN.
model_data = data[predictor_columns + ['EsxA']].dropna()
X = model_data[predictor_columns]
y = model_data['EsxA']

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

# Feature Importance Plot
importances = rf.feature_importances_
feature_names = X.columns  # NOTE: rebinds `feature_names` from the plotting cells above

plt.figure(figsize=(8, 5))
sns.barplot(x=importances, y=feature_names, palette='viridis')
plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.title("Feature Importance using Random Forest")
plt.show()


In [None]:
import pandas as pd
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')

# Load your CSV file
df = pd.read_csv('/content/Clustering.csv',header=0)

columns = df.columns
# Column totals are computed once instead of inside the pair loop
column_totals = [df.iloc[:, k].sum() for k in range(len(columns))]

# Long-format links, one per unordered pair of columns. They are stored under their
# own name so the `data` DataFrame used by the analysis cells is not replaced by a list.
# NOTE(review): the "interaction strength" is just the sum of the two column totals,
# a placeholder rather than a measured interaction.
chord_links = []
for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        value = column_totals[i] + column_totals[j]
        if value > 0:  # Keep only pairs with a positive combined total
            chord_links.append((columns[i], columns[j], value))

# Convert to Chord format
chord_data = hv.Chord(chord_links)

# Styling the Chord Diagram
chord_data.opts(opts.Chord(cmap='Category10', edge_color='source', labels='name', node_color='index'))

# Show the plot
hv.output(chord_data)


In [None]:
import pandas as pd
import holoviews as hv
from holoviews import opts
from bokeh.models import ColorBar, LinearColorMapper
hv.extension('bokeh')

# Load CSV file
df = pd.read_csv('/content/Clustering.csv',header=0)

columns = df.columns
# Column totals are computed once instead of inside the pair loop
column_totals = [df.iloc[:, k].sum() for k in range(len(columns))]

# Long-format links, one per unordered pair of columns. They are stored under their
# own name so the `data` DataFrame used by the analysis cells is not replaced by a list.
# NOTE(review): the "interaction strength" is just the sum of the two column totals,
# a placeholder rather than a measured interaction.
chord_links = []
for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        value = column_totals[i] + column_totals[j]
        if value > 0:  # Keep only pairs with a positive combined total
            chord_links.append((columns[i], columns[j], value))

# Convert to Chord format
chord_data = hv.Chord(chord_links)

# Styling the Chord Diagram
chord_data.opts(
    opts.Chord(
        cmap='Category10',      # Node colors
        edge_color='value',     # Color edges based on interaction strength
        labels='name',          # Add labels to nodes
        node_color='index',     # Color nodes differently
        edge_line_width=2,      # Edge thickness (the Chord option is edge_line_width, not line_width)
        edge_alpha=0.8,         # Edge transparency
        width=800, height=800   # Adjust figure size
    )
)

# Show the plot
hv.output(chord_data)


In [None]:
import pandas as pd
import holoviews as hv
from holoviews import opts
from bokeh.models import ColorBar, LinearColorMapper
hv.extension('bokeh')

# Load CSV file
df = pd.read_csv('/content/Clustering.csv',header=0)

columns = df.columns
# Column totals are computed once instead of inside the pair loop
column_totals = [df.iloc[:, k].sum() for k in range(len(columns))]

# Long-format links, one per unordered pair of columns. They are stored under their
# own name so the `data` DataFrame used by the analysis cells is not replaced by a list.
# NOTE(review): the "interaction strength" is just the sum of the two column totals,
# a placeholder rather than a measured interaction.
chord_links = []
for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        value = column_totals[i] + column_totals[j]
        if value > 0:  # Keep only pairs with a positive combined total
            chord_links.append((columns[i], columns[j], value))

# Convert to Chord format
chord_data = hv.Chord(chord_links)

# The Bokeh colour mapper that used to be built here was never passed to the plot
# (and its min()/max() failed on an empty link list), so it has been dropped.

# Styling the Chord Diagram
chord_data.opts(
    opts.Chord(
        cmap='Category10',      # Node colors
        edge_color='value',     # Color edges based on interaction strength
        labels='name',          # Add labels to nodes
        node_color='index',     # Color nodes differently
        edge_line_width=2,      # Edge thickness
        edge_alpha=0.8,         # Edge transparency
        width=800, height=800   # Adjust figure size
    )
)

# Show the plot
hv.output(chord_data)

In [None]:
pip install pandas holoviews bokeh numpy


In [None]:
import pandas as pd
import holoviews as hv
from holoviews import opts
import numpy as np
hv.extension('bokeh')

# Load CSV file
df = pd.read_csv('/content/Clustering.csv',header=0)

columns = df.columns
# Column totals are computed once instead of inside the pair loop
column_totals = [df.iloc[:, k].sum() for k in range(len(columns))]

# Long-format links, one per unordered pair of columns. They are stored under their
# own name so the `data` DataFrame used by the analysis cells is not replaced by a list.
# NOTE(review): the "interaction strength" is just the sum of the two column totals,
# a placeholder rather than a measured interaction.
chord_links = []
for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        value = column_totals[i] + column_totals[j]
        if value > 0:  # Keep only pairs with a positive combined total
            chord_links.append((columns[i], columns[j], value))

# Unique node names in first-seen order. dict.fromkeys keeps the order stable between
# runs, whereas list(set(...)) could reorder the nodes (and their colours) on every run.
nodes = list(dict.fromkeys([link[0] for link in chord_links] + [link[1] for link in chord_links]))

# Convert to Holoviews Chord format: (links, node table keyed by 'index')
chord_data = hv.Chord((chord_links, hv.Dataset(nodes, 'index')))

# Apply Styling
chord_data.opts(
    opts.Chord(
        cmap='Category20',      # Color scheme for nodes
        edge_color=hv.dim('value'),  # Color edges based on interaction strength
        edge_cmap='Viridis',    # Gradient color for edges
        labels='index',         # Show node labels
        node_color='index',     # Different color for each node
        node_size=15,           # Make nodes larger
        edge_line_width=2,      # Thicker edges
        edge_alpha=0.8,         # Transparency for better visibility
        width=800, height=800   # Adjust figure size
    )
)

# Show the plot
hv.output(chord_data)


In [None]:
import pandas as pd
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')

# Load CSV file
df = pd.read_csv('/content/Clustering.csv',header=0)

columns = df.columns
# Column totals are computed once instead of inside the pair loop
column_totals = [df.iloc[:, k].sum() for k in range(len(columns))]

# Long-format links, one per unordered pair of columns. They are stored under their
# own name so the `data` DataFrame used by the analysis cells is not replaced by a list.
# NOTE(review): the "interaction strength" is just the sum of the two column totals,
# a placeholder rather than a measured interaction.
chord_links = []
for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        value = column_totals[i] + column_totals[j]
        if value > 0:  # Keep only pairs with a positive combined total
            chord_links.append((columns[i], columns[j], value))

# Unique node names in first-seen order. dict.fromkeys keeps the order stable between
# runs, whereas list(set(...)) could reorder the nodes (and their colours) on every run.
nodes = list(dict.fromkeys([link[0] for link in chord_links] + [link[1] for link in chord_links]))

# Convert to Holoviews Chord format: (links, node table keyed by 'index')
chord_data = hv.Chord((chord_links, hv.Dataset(nodes, 'index')))

# Apply Styling
chord_data.opts(
    opts.Chord(
        cmap='Category20',           # Different colors for nodes
        edge_cmap='Plasma',          # Gradient color for edges (instead of black)
        edge_color=hv.dim('value'),  # Map color to interaction strength
        labels='index',              # Show node labels
        node_color='index',          # Different color for each node
        node_size=15,                # Increase node size
        edge_line_width=2,           # Make edges thicker
        edge_alpha=0.8,              # Transparency for better visualization
        width=800, height=800        # Adjust figure size
    )
)

# Show the plot
hv.output(chord_data)


In [None]:
import pandas as pd
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')

# Load CSV file
df = pd.read_csv('/content/Clustering.csv',header=0)

columns = df.columns
# Column totals are computed once instead of inside the pair loop
column_totals = [df.iloc[:, k].sum() for k in range(len(columns))]

# Long-format links, one per unordered pair of columns. They are stored under their
# own name so the `data` DataFrame used by the analysis cells is not replaced by a list.
# NOTE(review): the "interaction strength" is just the sum of the two column totals,
# a placeholder rather than a measured interaction.
chord_links = []
for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        value = column_totals[i] + column_totals[j]
        if value > 0:  # Keep only pairs with a positive combined total
            chord_links.append((columns[i], columns[j], value))

# Unique node names in first-seen order. dict.fromkeys keeps the order stable between
# runs, whereas list(set(...)) could reorder the nodes (and their colours) on every run.
nodes = list(dict.fromkeys([link[0] for link in chord_links] + [link[1] for link in chord_links]))

# Convert to Holoviews Chord format: (links, node table keyed by 'index')
chord_data = hv.Chord((chord_links, hv.Dataset(nodes, 'index')))

# Apply Styling
chord_data.opts(
    opts.Chord(
        cmap='Category20',           # Different colors for nodes
        edge_cmap='Plasma',          # Gradient color for edges
        edge_color=hv.dim('value'),  # Map color to interaction strength
        labels='index',              # Show node labels
        node_color='index',          # Different color for each node
        node_size=15,                # Increase node size
        edge_line_width=3,           # Make edges thicker
        edge_alpha=0.9,              # Slight transparency
        width=800, height=800        # Adjust figure size
    )
)

# Show the plot
hv.output(chord_data)


In [None]:
import pandas as pd
import holoviews as hv
from holoviews import opts
import numpy as np
hv.extension('bokeh')

# Load CSV file
df = pd.read_csv('/content/Clustering.csv', header=0)

columns = df.columns

# Seeded generator so the diagram is identical on every run.
# NOTE(review): the link values are random placeholders in [1, 99], not measured
# interaction strengths - replace them with real values before interpreting the figure.
rng = np.random.default_rng(42)

# Long-format links, one per unordered pair of columns. They are stored under their
# own name so the `data` DataFrame used by the analysis cells is not replaced by a list.
sankey_links = []
for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        value = int(rng.integers(1, 100))  # Always >= 1, so no positivity filter is needed
        sankey_links.append((columns[i], columns[j], value))

# Unique node names in first-seen order (stable between runs, unlike list(set(...)))
node_names = list(dict.fromkeys([link[0] for link in sankey_links] + [link[1] for link in sankey_links]))
node_mapping = {name: i for i, name in enumerate(node_names)}

# Links re-expressed with integer node ids, as (source id, target id, value)
edges = [(node_mapping[link[0]], node_mapping[link[1]], link[2]) for link in sankey_links]
# Node table: 'index' is the key dimension and 'name' the value dimension used for labels
nodes = hv.Dataset([(i, name) for name, i in node_mapping.items()], 'index', 'name')

# Creating Sankey diagram (alternative to Chord)
sankey = hv.Sankey((edges, nodes)).opts(
    cmap='Category20',             # Unique color for each node
    edge_cmap='Plasma',            # Colored inside curves
    edge_color=hv.dim('value'),    # Map edge color to interaction strength
    labels='name',                 # Show node labels
    node_color='index',            # Assign unique colors to nodes
    node_size=20,                  # Increase node size for visibility
    edge_line_width=3,             # Thicker edges
    edge_alpha=0.8,                # Slight transparency
    width=900, height=900          # Adjust figure size
)

# Show the plot
hv.output(sankey)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_chord_diagram import chord_diagram

# Load CSV data
df = pd.read_csv('/content/Clustering.csv', header=0)

# Generate a random matrix for interactions
size = len(df.columns)
matrix = np.random.randint(1, 100, size=(size, size))

# Define labels
labels = df.columns.tolist()

# Define colors for inside curved edges
cmap = plt.get_cmap("rainbow")  # ðŸŒˆ Different colors for curves
colors = [cmap(i / size) for i in range(size)]

# Create Chord Diagram
fig, ax = plt.subplots(figsize=(10, 10), dpi=300)
chord_diagram(matrix, names=labels, ax=ax, cmap="rainbow", directed=False, chord_colors=colors)

# Adjust and show plot
plt.title("Circular Chord Diagram with Colored Inside Curves")
plt.show()


In [None]:
!pip install mpl-chord-diagram # install mpl_chord_diagram module

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_chord_diagram import chord_diagram # import chord_diagram

# Load the CSV only to obtain the column names used as chord labels
df = pd.read_csv('/content/Clustering.csv', header=0)

# Define labels (one arc per CSV column)
labels = df.columns.tolist()
size = len(labels)

# Placeholder interaction matrix: the values are synthetic and are NOT derived
# from the CSV. A seeded local generator keeps the figure identical across
# re-runs without touching NumPy's global random state.
rng = np.random.default_rng(42)
matrix = rng.integers(1, 100, size=(size, size))

# One colour per node, evenly spaced along the rainbow colormap
cmap = plt.get_cmap("rainbow")
colors = [cmap(i / size) for i in range(size)]

# Create Chord Diagram
fig, ax = plt.subplots(figsize=(10, 10), dpi=300)
chord_diagram(matrix, names=labels, ax=ax, cmap="rainbow", directed=False, chord_colors=colors)

# Adjust and show plot
plt.title("Circular Chord Diagram with Colored Inside Curves")
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_chord_diagram import chord_diagram

# Load the CSV only to obtain the column names used as chord labels
df = pd.read_csv('/content/Clustering.csv', header=0)

# Define labels (same as the dataframe columns)
labels = df.columns.tolist()
size = len(labels)

# Placeholder interaction matrix (replace with real interaction data). The
# values are synthetic; a seeded local generator keeps the figure identical
# across re-runs without touching NumPy's global random state.
rng = np.random.default_rng(42)
matrix = rng.integers(1, 100, size=(size, size))

# One colour per node, evenly spaced along the rainbow colormap
cmap = plt.get_cmap("rainbow")
colors = [cmap(i / size) for i in range(size)]

# Create Chord Diagram
fig, ax = plt.subplots(figsize=(10, 10), dpi=300)
chord_diagram(matrix, names=labels, ax=ax, cmap="rainbow", directed=False, chord_colors=colors)

# Add a color bar spanning the range of matrix values.
# NOTE(review): the chord colours above are chosen by node index (i / size),
# not by matrix value, so this bar may not describe the colours actually
# drawn -- confirm before interpreting it as interaction strength.
sm = plt.cm.ScalarMappable(cmap="rainbow", norm=plt.Normalize(vmin=matrix.min(), vmax=matrix.max()))
sm.set_array([])  # Empty array is fine for color bar
cbar = fig.colorbar(sm, ax=ax, orientation='vertical', pad=0.01)
cbar.set_label('Interaction Strength')

# Adjust plot layout and display
plt.title("Circular Chord Diagram with Colored Inside Curves")
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Read the virulence-factor table
data = pd.read_csv('/content/Clustering.csv', header=0)

# Keep the six clustering columns and discard incomplete rows in one step
features = data[['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']].dropna()

# Z-score every column
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Re-attach the column names so they can serve as x-axis categories
scaled_df = pd.DataFrame(scaled_features, columns=features.columns)

# Draw one polyline per sample across the six factors
plt.figure(figsize=(10, 6))
for _, sample in scaled_df.iterrows():
    plt.plot(scaled_df.columns, sample, marker='o', linestyle='-')

plt.gca().set(
    xlabel='Virulence Factors',
    ylabel='Standardized Value',
    title='Line Graph of All Samples',
)
plt.xticks(rotation=45)
plt.gca().grid(True)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import numpy as np

# Read the virulence-factor table
data = pd.read_csv('/content/Clustering.csv', header=0)

# Keep the six clustering columns and discard incomplete rows in one step
features = data[['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']].dropna()

# Z-score every column
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Re-attach the column names so they can serve as x-axis categories
scaled_df = pd.DataFrame(scaled_features, columns=features.columns)

# One distinct hue per sample
num_samples = len(scaled_df)
colors = sns.color_palette("husl", num_samples)

# Draw one coloured, labelled polyline per sample
plt.figure(figsize=(10, 6))
for idx, line_color in enumerate(colors):
    plt.plot(
        scaled_df.columns,
        scaled_df.iloc[idx],
        marker='o',
        linestyle='-',
        color=line_color,
        label=f'Sample {idx+1}',
    )

plt.gca().set(
    xlabel='Virulence Factors',
    ylabel='Standardized Value',
    title='Line Graph of All Samples with Color Chart',
)
plt.xticks(rotation=45)
plt.gca().grid(True)

# Legend sits to the right of the axes, vertically centred
plt.legend(title="Samples", loc='center left', bbox_to_anchor=(1, 0.5))

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Read the virulence-factor table
data = pd.read_csv('/content/Clustering.csv', header=0)

# Keep the six clustering columns, then discard incomplete rows
factor_columns = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']
features = data[factor_columns].dropna()

# Z-score every column
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Re-attach the column names so they can serve as x-axis categories
scaled_df = pd.DataFrame(scaled_features, columns=features.columns)

# One distinct hue per sample
num_samples = len(scaled_df)
colors = sns.color_palette("husl", num_samples)

# Draw one coloured, labelled polyline per sample
plt.figure(figsize=(10, 6))
for position in range(num_samples):
    plt.plot(
        scaled_df.columns,
        scaled_df.iloc[position],
        color=colors[position],
        label=f'Sample {position+1}',
        marker='o',
        linestyle='-',
    )

plt.gca().set(
    xlabel='Virulence Factors',
    ylabel='Standardized Value',
    title='Line Graph of All Samples with Color Chart',
)
plt.xticks(rotation=45)
plt.gca().grid(True)

# Two-column legend anchored just outside the top-right corner of the axes
plt.legend(title="Samples", ncol=2, loc='upper left', bbox_to_anchor=(1.05, 1))

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the CSV file
data = pd.read_csv('/content/Clustering.csv', header=0)

# Model columns (EsxA is assumed to be the target; change as needed)
feature_cols = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin']
target_col = 'EsxA'

# Remove incomplete rows BEFORE slicing out X and y. Dropping rows from `data`
# after the slices are taken would leave `features`/`target` untouched and let
# NaNs reach the scaler and the model.
data = data.dropna(subset=feature_cols + [target_col])
features = data[feature_cols]
target = data[target_col]

# Standardize the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

# Train Random Forest Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get feature importance scores
feature_importance = rf.feature_importances_
feature_names = features.columns

# Convert to DataFrame for plotting, most important feature first
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importance as a bar chart
plt.figure(figsize=(8, 5))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette="viridis")
plt.title('Feature Importance (Random Forest)')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.show()

# Optional: the same scores as a line graph
plt.figure(figsize=(8, 5))
plt.plot(importance_df['Feature'], importance_df['Importance'], marker='o', linestyle='-', color='b')
plt.title('Feature Importance (Line Graph)')
plt.xlabel('Features')
plt.ylabel('Importance Score')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

# Load the CSV file
data = pd.read_csv('/content/Clustering.csv', header=0)

# Model columns (EsxA is used as the target; change as needed)
feature_cols = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin']
target_col = 'EsxA'

# Remove incomplete rows BEFORE slicing out X and y. Dropping rows from `data`
# after the slices are taken would leave `features`/`target` untouched and let
# NaNs reach the scaler and the model.
data = data.dropna(subset=feature_cols + [target_col])
features = data[feature_cols]
target = data[target_col]

# Standardize the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

# Train a larger, depth-limited forest so the per-tree spread is visible
rf = RandomForestRegressor(n_estimators=500, random_state=42, max_depth=10)
rf.fit(X_train, y_train)

# Collect the importance vector of every tree: one row per tree, one column per feature
n_trees = len(rf.estimators_)
feature_importance_matrix = np.zeros((n_trees, len(features.columns)))

for i, tree in enumerate(rf.estimators_):
    feature_importance_matrix[i, :] = tree.feature_importances_

# Convert to DataFrame for better visualization
importance_df = pd.DataFrame(feature_importance_matrix, columns=features.columns)

# Violin plot: distribution of each feature's importance across the trees
plt.figure(figsize=(10, 6))
sns.violinplot(data=importance_df, palette="Set2")
plt.title('Random Forest Feature Importance (Forest-like View)')
plt.xlabel('Features')
plt.ylabel('Importance Score')
plt.xticks(rotation=45)
plt.grid(True)

plt.show()


In [None]:
from sklearn.tree import plot_tree

# Draw the first tree of the forest currently bound to `rf`.
# feature_names is passed as a plain list because some scikit-learn releases
# validate this argument strictly and reject a pandas Index.
# NOTE(review): assumes `rf` was fitted on standardized features, in which case
# the split thresholds shown are in standardized units -- confirm.
plt.figure(figsize=(20, 10))
plot_tree(rf.estimators_[0], feature_names=list(features.columns), filled=True, rounded=True)
plt.title("Single Decision Tree from Random Forest")
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv('/content/Clustering.csv', header=0)

# Model columns (change the target as needed)
feature_cols = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin']
target_col = 'EsxA'

# Remove incomplete rows BEFORE slicing out X and y. Dropping rows from `data`
# after the slices are taken would leave `features`/`target` untouched and let
# NaNs reach the scaler and the model.
data = data.dropna(subset=feature_cols + [target_col])
features = data[feature_cols]
target = data[target_col]

# Standardize the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

# Train Random Forest Model
rf = RandomForestRegressor(n_estimators=1000, random_state=42, max_depth=5)
rf.fit(X_train, y_train)

# Forest-level importances plus their standard deviation across individual trees
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)

# Ascending order, so the most important feature ends up at the top of the barh chart
indices = np.argsort(importances)

# Horizontal bars with +/- one standard deviation as error bars
plt.figure(figsize=(8, 6))
plt.barh(range(len(indices)), importances[indices], xerr=std[indices], color='forestgreen', alpha=0.7)
plt.yticks(range(len(indices)), [features.columns[i] for i in indices])
plt.xlabel("Feature Importance Score")
plt.title("Random Forest Feature Importance (Meta-Analysis Style)")
plt.grid(axis="x", linestyle="--", alpha=0.6)
plt.show()


In [None]:
# Horizontal bars, one per feature, sized by the forest-level importance
# computed in an earlier cell (`importances`, `features`).
plt.figure(figsize=(10, 6))
sns.barplot(y=features.columns, x=importances, palette="Greens_r")
plt.gca().set(
    xlabel="Feature Importance Score",
    ylabel="Features",
    title="Random Forest Feature Importance (Tree Trunk View)",
)
plt.gca().grid(True)
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load dataset
data = pd.read_csv('/content/Clustering.csv', header=0)

# Model columns (adjust the target column to the dataset)
feature_cols = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin']
target_col = 'EsxA'

# Remove incomplete rows BEFORE slicing out X and y. Dropping rows from `data`
# after the slices are taken would leave `features`/`target` untouched and let
# NaNs reach the scaler and the model.
data = data.dropna(subset=feature_cols + [target_col])
features = data[feature_cols]
target = data[target_col]

# Standardize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

# Train Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Forest prediction per test sample, and the spread of the individual trees' predictions
predictions = rf.predict(X_test)
std_dev = np.std([tree.predict(X_test) for tree in rf.estimators_], axis=0)

# Create a forest plot-style visualization
plt.figure(figsize=(10, 6))
sns.scatterplot(x=predictions, y=range(len(predictions)), color='green', label="Prediction")
# The bars are one standard deviation across trees, not a 95% interval,
# so the legend entry says so.
plt.errorbar(predictions, range(len(predictions)), xerr=std_dev, fmt='o', color='black', label="+/- 1 SD across trees")
plt.axvline(x=np.mean(predictions), color='red', linestyle='--', label="Mean Prediction")

plt.xlabel("Predicted Value")
plt.ylabel("Samples")
plt.title("Random Forest Predictions for Individual Samples (Forest Plot Style)")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.6)
plt.show()


In [None]:
import seaborn as sns

# Stack every tree's importance vector: rows are trees, columns are features
feature_importances = np.vstack([estimator.feature_importances_ for estimator in rf.estimators_])

# Label the columns with the feature names
importance_df = pd.DataFrame(feature_importances, columns=features.columns)

# One violin per feature, showing how its importance varies across the trees
plt.figure(figsize=(12, 6))
sns.violinplot(data=importance_df, palette="Greens")
plt.gca().set(
    title='Random Forest Feature Importance (Forest-Like View)',
    xlabel='Features',
    ylabel='Importance Score',
)
plt.xticks(rotation=45)
plt.gca().grid(True, linestyle="--", alpha=0.6)
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load dataset
data = pd.read_csv('/content/Clustering.csv', header=0)

# Model columns (change the target based on the dataset)
feature_cols = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin']
target_col = 'EsxA'

# Remove incomplete rows BEFORE slicing out X and y. Dropping rows from `data`
# after the slices are taken would leave `features`/`target` untouched and let
# NaNs reach the scaler and the model.
data = data.dropna(subset=feature_cols + [target_col])
features = data[feature_cols]
target = data[target_col]

# Standardize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

# Train Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Forest prediction per test sample, and the spread of the individual trees' predictions
predictions = rf.predict(X_test)
std_dev = np.std([tree.predict(X_test) for tree in rf.estimators_], axis=0)

# Generate unique colors for each sample
colors = sns.color_palette("husl", len(predictions))

# One marker with a +/- 1 SD horizontal bar per test sample
plt.figure(figsize=(12, 7))
for i in range(len(predictions)):
    # Only the first 10 samples get a legend entry, to keep the legend short
    plt.errorbar(predictions[i], i, xerr=std_dev[i], fmt='o', color=colors[i], label=f"Sample {i+1}" if i < 10 else "")

# Add mean prediction line
plt.axvline(x=np.mean(predictions), color='red', linestyle='--', label="Mean Prediction")

plt.xlabel("Predicted Value")
plt.ylabel("Samples")
plt.title("Random Forest Predictions for All Samples (Forest Plot Style)")
plt.legend(loc="upper left", bbox_to_anchor=(1, 1), fontsize="small", ncol=2)  # Legend outside to avoid clutter
plt.grid(True, linestyle="--", alpha=0.6)
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Load dataset
data = pd.read_csv('/content/Clustering.csv', header=0)

# Model columns (change the target based on the dataset)
feature_cols = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin']
target_col = 'EsxA'

# Remove incomplete rows BEFORE slicing out X and y. Dropping rows from `data`
# after the slices are taken would leave `features`/`target` untouched and let
# NaNs reach the scaler and the model.
data = data.dropna(subset=feature_cols + [target_col])
features = data[feature_cols]
target = data[target_col]

# Standardize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Train Random Forest on all samples (no train-test split), so the
# predictions below are in-sample
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(features_scaled, target)

# Forest prediction per sample, and the spread of the individual trees' predictions
predictions = rf.predict(features_scaled)
std_dev = np.std([tree.predict(features_scaled) for tree in rf.estimators_], axis=0)

# Generate unique colors for each sample
colors = sns.color_palette("husl", len(predictions))

# One marker with a +/- 1 SD horizontal bar per sample
plt.figure(figsize=(12, 7))
for i in range(len(predictions)):
    # Labels are attached to the first 15 samples only; no legend is drawn in this cell
    plt.errorbar(predictions[i], i, xerr=std_dev[i], fmt='o', color=colors[i], label=f"Sample {i+1}" if i < 15 else "")

# Add mean prediction line
plt.axvline(x=np.mean(predictions), color='red', linestyle='--', label="Mean Prediction")

plt.xlabel("Predicted Value")
plt.ylabel("Samples")
plt.title("Random Forest Predictions for All Samples (Forest Plot Style)")

plt.grid(True, linestyle="--", alpha=0.6)
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Load dataset
data = pd.read_csv('/content/Clustering.csv', header=0)

# Model columns (change the target based on the dataset)
feature_cols = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin']
target_col = 'EsxA'

# Remove incomplete rows BEFORE slicing out X and y. Dropping rows from `data`
# after the slices are taken would leave `features`/`target` untouched and let
# NaNs reach the scaler and the model.
data = data.dropna(subset=feature_cols + [target_col])
features = data[feature_cols]
target = data[target_col]

# Standardize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Train Random Forest on all samples (no train-test split), so the
# predictions below are in-sample
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(features_scaled, target)

# Forest prediction per sample, and the spread of the individual trees' predictions
predictions = rf.predict(features_scaled)
std_dev = np.std([tree.predict(features_scaled) for tree in rf.estimators_], axis=0)

# Generate unique colors for each sample
num_samples = len(predictions)
colors = sns.color_palette("husl", num_samples)

# One labelled marker with a +/- 1 SD horizontal bar per sample
plt.figure(figsize=(12, 7))
for i in range(num_samples):
    plt.errorbar(predictions[i], i, xerr=std_dev[i], fmt='o', color=colors[i], label=f"Sample {i+1}")

# Add mean prediction line
plt.axvline(x=np.mean(predictions), color='red', linestyle='--', label="Mean Prediction")

plt.xlabel("Predicted Value")
plt.ylabel("Samples")
plt.title("Random Forest Predictions for All Samples (Forest Plot Style)")

# Place legend outside the axes and break it into multiple columns
plt.legend(loc="upper left", bbox_to_anchor=(1.05, 1), fontsize="small", ncol=5)
plt.grid(True, linestyle="--", alpha=0.6)
plt.show()


In [None]:
import shap

# Build a SHAP explainer for the fitted forest `rf`, passing the standardized
# feature matrix as its second argument, then explain that same matrix.
# Relies on `rf`, `features_scaled` and `features` left behind by an earlier cell.
explainer = shap.Explainer(rf, features_scaled)
shap_values = explainer(features_scaled)

# Dot-style summary plot. The unscaled `features` frame is passed alongside the
# SHAP values; NOTE(review): presumably this supplies feature names and the
# per-point colouring -- confirm it has the same row order as features_scaled.
shap.summary_plot(shap_values, features, plot_type="dot")


In [None]:
# Histogram (20 bins) with a KDE overlay of the `predictions` array from an
# earlier cell; the dashed red line marks their mean.
plt.figure(figsize=(8, 5))
sns.histplot(predictions, bins=20, kde=True, color="skyblue")
plt.axvline(np.mean(predictions), label="Mean", color='red', linestyle='--')
plt.gca().set(
    xlabel="Predicted Value",
    ylabel="Frequency",
    title="Distribution of Predictions",
)
plt.legend()
plt.show()


In [None]:
import shap

# Rebuild the SHAP explainer and values for the fitted forest `rf` on the
# standardized feature matrix (both come from an earlier cell).
explainer = shap.Explainer(rf, features_scaled)
shap_values = explainer(features_scaled)

# Pick one sample (e.g., sample index 10)
# initjs() loads the JavaScript needed to render the interactive plot in the notebook.
shap.initjs()
# Position 10 is hard-coded for both the SHAP values and the feature row, so the
# data must contain at least 11 samples.
shap.force_plot(explainer.expected_value, shap_values[10].values, features.iloc[10])


In [None]:

!pip install umap-learn # install the umap-learn package
import umap
import numpy as np

# Replace NaN before handing the matrix to UMAP: each missing standardized
# value is filled with the mean of its column (the per-column means broadcast
# across the rows). `features_scaled` is deliberately rebound so that later
# cells see the filled matrix.
features_scaled = np.nan_to_num(features_scaled, nan=np.nanmean(features_scaled, axis=0))

# Fit the 2-D embedding once. random_state pins UMAP's stochastic layout so the
# figure is reproducible between runs.
reducer = umap.UMAP(n_neighbors=10, min_dist=0.1, metric='euclidean', random_state=42)
embedding = reducer.fit_transform(features_scaled)

# Scatter of the embedding, coloured by the `predictions` array from an earlier
# cell (must have one value per row of features_scaled)
plt.figure(figsize=(8, 6))
plt.scatter(embedding[:, 0], embedding[:, 1], c=predictions, cmap='coolwarm', s=50, alpha=0.8)
plt.colorbar(label="Prediction Value")
plt.title("UMAP Projection of Feature Space")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.show()

In [None]:
import pandas as pd
from joypy import joyplot
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor  # Assuming this is the model you used

# Requires a fitted RandomForestRegressor `rf`, the matrix `features_scaled`
# and the DataFrame `features` from earlier cells.

# Predict on the scaled matrix and cut the result to the number of feature rows
predictions = rf.predict(features_scaled)[:len(features)]

# Copy of the feature frame with the predictions attached as an extra column
df_plot = features.assign(Predictions=predictions)

# Ridge plot grouped by the prediction column.
# NOTE(review): "Predictions" is continuous, so this presumably yields one
# group per distinct value -- confirm that is intended.
joyplot(
    data=df_plot,
    by="Predictions",
    figsize=(10, 6),
    colormap=plt.cm.coolwarm,  # colormap object rather than the name "coolwarm"
)
plt.gca().set_title("Ridge Plot: Prediction Distribution Across Features")
plt.show()

In [None]:
# Group samples into (up to) 10 quantile bins of their predicted value.
# duplicates="drop" merges bins whose edges coincide, which happens when many
# predictions are tied; without it pd.qcut raises "Bin edges must be unique".
df_plot["Predictions_Binned"] = pd.qcut(predictions, q=10, labels=False, duplicates="drop")

# One ridge group per prediction bin
joyplot(
    data=df_plot,
    by="Predictions_Binned",
    colormap=plt.cm.coolwarm,
    figsize=(10, 6)
)
plt.title("Ridge Plot with Binned Predictions")
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from joypy import joyplot

# Bin the predictions into (up to) 10 quantile groups. duplicates="drop" merges
# bins whose edges coincide (tied predictions) instead of raising.
df_plot["Predictions_Binned"] = pd.qcut(predictions, q=10, labels=False, duplicates="drop")

# Define colormap and sample one colour per bin actually present
colormap = plt.cm.coolwarm
num_bins = df_plot["Predictions_Binned"].nunique()
colors = colormap(np.linspace(0, 1, num_bins))

# Create legend patches
patches = [mpatches.Patch(color=colors[i], label=f"Bin {i}") for i in range(num_bins)]

# joyplot builds its own figure and returns it with its axes. Creating a
# separate figure via plt.subplots() beforehand would only produce an empty
# extra plot, and a legend attached to that figure would not appear next to
# the ridges -- so the legend goes on the figure joyplot returns.
fig, axes = joyplot(
    data=df_plot,
    by="Predictions_Binned",
    colormap=colormap,
    figsize=(10, 6)
)

# Add legend to the ridge-plot figure
fig.legend(handles=patches, title="Prediction Bins", loc="upper right", fontsize=8)

plt.title("Ridge Plot with Binned Predictions and Color Legend")
plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from joypy import joyplot

# Load the dataset
data = pd.read_csv('/content/Clustering.csv', header=0)

# Select features (note: `features` is a list of column names in this cell)
features = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']
df_plot = data[features].dropna()  # Drop missing values

# Define colormap and sample one colour per feature
colormap = plt.cm.coolwarm
num_features = len(features)
colors = colormap(np.linspace(0, 1, num_features))

# Create legend patches
patches = [mpatches.Patch(color=colors[i], label=features[i]) for i in range(num_features)]

# joyplot builds its own figure and returns it with its axes. Creating a
# separate figure via plt.subplots() beforehand would only produce an empty
# extra plot, and a legend attached to that figure would not appear next to
# the ridges -- so the legend goes on the figure joyplot returns.
fig, axes = joyplot(
    data=df_plot,
    by=None,  # No grouping, features are used as categories
    colormap=colormap,
    figsize=(12, 6),
    overlap=1.2
)

# Add color legend to the ridge-plot figure
fig.legend(handles=patches, title="Features", loc="upper right", fontsize=8)

plt.title("Ridge Plot of Feature Distributions")
plt.show()


In [None]:
!pip install joypy # Install the joypy library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from joypy import joyplot

# Read the table
data = pd.read_csv('/content/Clustering.csv', header=0)

# Columns to draw (here `features` is a list of names); rows with gaps are discarded
features = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']
df_plot = data[features].dropna()

# Evenly spaced coolwarm colours, one per column
colormap = plt.cm.coolwarm
num_features = len(features)
colors = colormap(np.linspace(0, 1, num_features))

# Small figure whose axes are handed to joyplot
fig, ax = plt.subplots(figsize=(6, 3))

# Ridge plot with no grouping column: each feature is its own ridge
joyplot(
    data=df_plot,
    ax=ax,
    by=None,
    overlap=1.2,
    figsize=(6, 3),
    colormap=colormap,
)

# Legend entries pair each sampled colour with its feature name
patches = [
    mpatches.Patch(color=swatch, label=column)
    for swatch, column in zip(colors, features)
]
plt.gca().legend(handles=patches, title="Features", loc="upper right", fontsize=6, bbox_to_anchor=(1.2, 1))

# Title on the current axes
plt.gca().set_title("Ridge Plot of Feature Distributions with Color Legend")

# Render
plt.show()

In [None]:
!pip install joypy  # Install the joypy library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from joypy import joyplot

# Read the table
data = pd.read_csv('/content/Clustering.csv', header=0)

# Columns to draw (here `features` is a list of names); rows with gaps are discarded
features = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']
df_plot = data[features].dropna()

# Evenly spaced coolwarm colours, one per column
colormap = plt.cm.coolwarm
num_features = len(features)
colors = colormap(np.linspace(0, 1, num_features))

# Small figure whose axes are handed to joyplot
fig, ax = plt.subplots(figsize=(6, 3))

# Ridge plot with no grouping column: each feature is its own ridge
joyplot(
    data=df_plot,
    ax=ax,
    by=None,
    overlap=1.2,
    figsize=(6, 3),
    colormap=colormap,
)

# Legend entries pair each sampled colour with its feature name; the legend is
# anchored just outside the right edge of the axes
patches = [
    mpatches.Patch(color=swatch, label=column)
    for swatch, column in zip(colors, features)
]
plt.gca().legend(handles=patches, title="Features", loc="upper left", fontsize=8, bbox_to_anchor=(1.05, 1))

# Reserve the right quarter of the figure so the legend is not clipped
plt.gcf().subplots_adjust(right=0.75)

# Title on the current axes
plt.gca().set_title("Ridge Plot of Feature Distributions with Color Legend")

# Render
plt.show()


In [None]:
!pip install joypy  # Install the joypy library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from joypy import joyplot

# Read the table
data = pd.read_csv('/content/Clustering.csv', header=0)

# Columns to draw (here `features` is a list of names); rows with gaps are discarded
features = ['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']
df_plot = data[features].dropna()

# Evenly spaced coolwarm colours, one per column
colormap = plt.cm.coolwarm
num_features = len(features)
colors = colormap(np.linspace(0, 1, num_features))

# Small figure whose axes are handed to joyplot
fig, ax = plt.subplots(figsize=(6, 3))

# Ridge plot with no grouping column: each feature is its own ridge
joyplot(
    data=df_plot,
    ax=ax,
    by=None,
    overlap=1.2,
    figsize=(6, 3),
    colormap=colormap,
)

# Compact legend (smaller text and title) placed outside the right edge
patches = [
    mpatches.Patch(color=swatch, label=column)
    for swatch, column in zip(colors, features)
]
plt.gca().legend(
    handles=patches,
    title="Features",
    title_fontsize=7,
    fontsize=6,
    loc="upper left",
    bbox_to_anchor=(1.05, 1),
)

# Reserve the right quarter of the figure so the legend is not clipped
plt.gcf().subplots_adjust(right=0.75)

# Title on the current axes
plt.gca().set_title("Ridge Plot of Feature Distributions with Color Legend")

# Render
plt.show()


#########Elbow##########

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Fit a full PCA on the standardized feature matrix.
# `scaled_features` must already be defined by an earlier cell.
pca_result = PCA().fit(scaled_features)

# Proportion of variance explained by each principal component (PCs numbered from 1)
scree_data = pd.DataFrame({
    'PC': range(1, len(pca_result.explained_variance_) + 1),
    'Variance': pca_result.explained_variance_ratio_
})

# Find the elbow point. Explained variance falls steeply and then flattens,
# i.e. the scree curve is convex and decreasing, so that is the shape the
# locator is told to expect.
knee_locator = KneeLocator(scree_data['PC'], scree_data['Variance'], curve="convex", direction="decreasing")
elbow_pc = knee_locator.elbow  # Optimal number of PCs, or None if no elbow was found

# Create the Scree Plot (thin black line and points)
plt.figure(figsize=(8, 6))
plt.plot(scree_data['PC'], scree_data['Variance'], marker='o', linestyle='-', color='black', markersize=3)

plt.xlabel('Principal Component')
plt.ylabel('Proportion of Variance')
plt.title('Scree Plot (Elbow Method)')

# Mark and annotate the elbow point when one was detected
if elbow_pc is not None:
    # PCs are 1-based, rows are 0-based; .iloc makes the lookup explicitly positional
    elbow_variance = scree_data['Variance'].iloc[int(elbow_pc) - 1]
    plt.scatter(elbow_pc, elbow_variance, s=30, c='black')
    plt.annotate(f'Elbow at PC {elbow_pc}',
                 (elbow_pc, elbow_variance),
                 textcoords="offset points",
                 xytext=(0, 10),  # Place the text 10 points above the marker
                 ha='center')

plt.grid(False)
plt.show()


In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt

# Number of KMeans clusters; also the number of Markov states below
n_clusters = 3

# Load data
data = pd.read_csv('/content/Clustering.csv', header=0)

# Extract features
features = data[['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']]

# Remove rows with any missing values
features = features.dropna()

# Standardize the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Apply KMeans clustering and get one label per complete row
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(scaled_features)

# Add the cluster labels to the original DataFrame. Rows that were dropped for
# missing values keep the placeholder -1.
data['Cluster'] = -1
data.loc[features.index, 'Cluster'] = cluster_labels

# Markov Chain Model: Estimating Transition Probabilities
# The sequence of cluster assignments, in CSV row order, is treated as the
# state sequence. NOTE(review): this assumes consecutive rows form a meaningful
# sequence -- confirm against how the file is ordered.
transition_matrix = np.zeros((n_clusters, n_clusters))

# Count transitions between each row and the next one
states = data['Cluster'].to_numpy()
for current_state, next_state in zip(states[:-1], states[1:]):
    # Skip any pair that involves an unclustered (placeholder) row
    if current_state == -1 or next_state == -1:
        continue

    transition_matrix[current_state, next_state] += 1

# Normalize each row to sum to 1. A state with no observed outgoing transition
# has a zero row sum; its row is left as all zeros instead of dividing by zero
# and filling the matrix with NaN.
row_sums = transition_matrix.sum(axis=1, keepdims=True)
transition_matrix_normalized = np.divide(
    transition_matrix,
    row_sums,
    out=np.zeros_like(transition_matrix),
    where=row_sums > 0,
)

# Print the normalized transition matrix
print("Transition Matrix (Normalized):")
print(transition_matrix_normalized)

# Visualize the transition matrix as a heatmap
tick_labels = [f'Cluster {k}' for k in range(n_clusters)]
plt.figure(figsize=(6, 5))
plt.imshow(transition_matrix_normalized, cmap='Blues', interpolation='nearest')
plt.colorbar(label='Transition Probability')
plt.title('Markov Chain Transition Matrix')
plt.xlabel('Next State')
plt.ylabel('Current State')
plt.xticks(np.arange(n_clusters), tick_labels)
plt.yticks(np.arange(n_clusters), tick_labels)
plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import networkx as nx

# Load data
data = pd.read_csv('/content/Clustering.csv', header=0)

# Extract features and remove rows with any missing values
features = data[['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']].dropna()

# Standardize the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Apply KMeans clustering
# Single place to change the number of clusters; the transition matrix and
# the graph nodes below are sized from this value.
N_CLUSTERS = 3
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)

# Fit KMeans to the scaled features and get cluster labels
cluster_labels = kmeans.fit_predict(scaled_features)

# Add the cluster labels to the original DataFrame, aligning with the correct index.
# Rows dropped for missing values keep the placeholder -1.
data['Cluster'] = -1
data.loc[features.index, 'Cluster'] = cluster_labels

# Markov Chain Model: Estimating Transition Probabilities
# The cluster assignments, read in DataFrame row order, are treated as the
# state sequence of the chain.
# NOTE(review): this presumes the row order of the CSV is a meaningful
# sequence - confirm.
cluster_sequence = data['Cluster'].to_numpy(dtype=int)
current_states = cluster_sequence[:-1]
next_states = cluster_sequence[1:]

# Skip any transition that touches an unclustered (-1) row.
observed = (current_states != -1) & (next_states != -1)

# Count transitions; np.add.at accumulates repeated (current, next) pairs.
transition_matrix = np.zeros((N_CLUSTERS, N_CLUSTERS))
np.add.at(transition_matrix, (current_states[observed], next_states[observed]), 1)

# Normalize the transition matrix by the row sums (so each row sums to 1).
# Rows with no observed outgoing transition stay all-zero rather than turning
# into NaN through a 0/0 division.
row_sums = transition_matrix.sum(axis=1)
transition_matrix_normalized = np.divide(
    transition_matrix,
    row_sums[:, np.newaxis],
    out=np.zeros_like(transition_matrix),
    where=row_sums[:, np.newaxis] > 0,
)

# Print the normalized transition matrix
print("Transition Matrix (Normalized):")
print(transition_matrix_normalized)

# Visualize the transition flow diagram using networkx
G = nx.DiGraph()

# Add one node per cluster (added up front so clusters without any edge
# still appear in the diagram)
cluster_names = [f'Cluster {k}' for k in range(N_CLUSTERS)]
G.add_nodes_from(cluster_names)

# Add edges with weights (transition probabilities); zero-probability
# transitions are omitted
for i, source in enumerate(cluster_names):
    for j, target in enumerate(cluster_names):
        probability = transition_matrix_normalized[i, j]
        if probability > 0:
            G.add_edge(source, target, weight=probability)

# Plot the directed graph (flow diagram)
plt.figure(figsize=(8, 6))
pos = nx.spring_layout(G, seed=42)  # Fixed seed keeps the layout reproducible
edges = G.edges(data=True)

nx.draw(G, pos, with_labels=True, node_size=3000, node_color='skyblue', font_size=12, font_weight='bold', arrowsize=20)
nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): f'{edata["weight"]:.2f}' for u, v, edata in edges})

plt.title('Markov Chain Transition Flow Diagram')
plt.show()


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import networkx as nx

# Load data
data = pd.read_csv('/content/Clustering.csv', header=0)

# Extract features and remove rows with any missing values
features = data[['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']].dropna()

# Standardize the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Discretize the standardized features into n_states discrete states.
# np.digitize with k edges can return k + 1 different labels (0..k), so only
# the n_states - 1 interior edges of an even split of [-3, 3] are used. The
# labels are then guaranteed to lie in 0..n_states-1 (the two outer bins are
# open-ended), which is exactly the index range of the matrices below.
# Passing n_states edges, as before, produced label n_states for any value
# >= 3 and raised IndexError when counting transitions.
n_states = 4  # Number of discrete states (bins)
state_edges = np.linspace(-3, 3, n_states + 1)[1:-1]
discrete_features = np.digitize(scaled_features, bins=state_edges)

# Build one row-normalized transition matrix per feature. Consecutive rows of
# the cleaned feature table are treated as consecutive steps of the chain.
# NOTE(review): this presumes row order is a meaningful sequence - confirm.
transition_matrices = {}
for j, feature in enumerate(features.columns):
    states = discrete_features[:, j]

    # Count transitions state[t] -> state[t + 1]; np.add.at accumulates
    # repeated (current, next) pairs.
    counts = np.zeros((n_states, n_states))
    np.add.at(counts, (states[:-1], states[1:]), 1)

    # Normalize by the row sums (so each visited row sums to 1). States that
    # are never left keep an all-zero row instead of NaN from 0/0.
    row_sums = counts.sum(axis=1, keepdims=True)
    transition_matrices[feature] = np.divide(
        counts, row_sums, out=np.zeros_like(counts), where=row_sums > 0
    )

# Print the normalized transition matrices for each feature
for feature in features.columns:
    print(f"Transition Matrix for {feature}:")
    print(transition_matrices[feature])

# Visualization of Transition Flow Diagrams for each feature
state_names = [f'State {s + 1}' for s in range(n_states)]
for feature in features.columns:
    probabilities = transition_matrices[feature]
    G = nx.DiGraph()

    # Add a node for every discrete state, including states with no edges
    G.add_nodes_from(state_names)

    # Add edges with weights (transition probabilities); zero-probability
    # transitions are omitted
    for i, source in enumerate(state_names):
        for j, target in enumerate(state_names):
            if probabilities[i, j] > 0:
                G.add_edge(source, target, weight=probabilities[i, j])

    # Plot the directed graph (flow diagram) for each feature
    plt.figure(figsize=(8, 6))
    pos = nx.spring_layout(G, seed=42)  # Fixed seed keeps the layout reproducible
    edges = G.edges(data=True)

    nx.draw(G, pos, with_labels=True, node_size=3000, node_color='skyblue', font_size=12, font_weight='bold', arrowsize=20)
    nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): f'{edata["weight"]:.2f}' for u, v, edata in edges})

    plt.title(f'Markov Chain Transition Flow Diagram for {feature}')
    plt.show()


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import matplotlib.cm as cm
from sklearn.preprocessing import StandardScaler

# Load Data
df = pd.read_csv("/content/Clustering.csv", header=0)

# Extract Features and remove rows with any missing values
features = df[['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']].dropna()

# Standardize the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Discretize the standardized features into n_states discrete states.
# np.digitize with k edges can return k + 1 different labels (0..k), so only
# the n_states - 1 interior edges of an even split of [-3, 3] are used. The
# labels then always lie in 0..n_states-1 (outer bins are open-ended) and can
# index the n_states x n_states matrix safely. With n_states edges, as
# before, any value >= 3 received label n_states and raised IndexError.
n_states = 4  # Number of discrete states (bins)
state_edges = np.linspace(-3, 3, n_states + 1)[1:-1]
discrete_features = np.digitize(scaled_features, bins=state_edges)

# Compute Transition Counts, pooled over all features: for every feature
# column, the state in row t is paired with the state in row t + 1.
# NOTE(review): this presumes row order is a meaningful sequence - confirm.
transition_matrix = np.zeros((n_states, n_states))
# np.add.at accumulates repeated (current, next) pairs; the two 2-D index
# arrays are paired element by element, i.e. within the same feature column.
np.add.at(transition_matrix, (discrete_features[:-1], discrete_features[1:]), 1)

# Normalize the transition matrix (convert counts to probabilities).
# Rows with no counts become all zeros via fillna(0).
state_names = [f"State {i+1}" for i in range(n_states)]
transition_df = pd.DataFrame(transition_matrix, index=state_names, columns=state_names)
transition_df = transition_df.div(transition_df.sum(axis=1), axis=0).fillna(0)

# Create Markov Chain Graph
G = nx.DiGraph()
for source in transition_df.index:
    for target in transition_df.columns:
        probability = transition_df.loc[source, target]
        if probability > 0:  # Only add transitions with probability > 0
            G.add_edge(source, target, weight=probability)

# Generate Pastel Colors for Nodes (States)
colors = cm.Pastel1(np.linspace(0, 1, len(transition_df.columns)))

# Assign colors to nodes in the graph
node_color_map = {state: color for state, color in zip(transition_df.columns, colors)}

# Draw Graph with Pastel Colors
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G, seed=42)  # Fixed seed keeps the layout reproducible
edges = G.edges(data=True)
edge_weights = [d['weight'] * 5 for (u, v, d) in edges]  # Scale edge width

# Draw nodes with pastel colors (only states that have at least one edge
# are present in G)
node_colors = [node_color_map[node] for node in G.nodes]
nx.draw(G, pos, with_labels=True, node_color=node_colors, edge_color="gray", width=edge_weights, node_size=2000)

plt.title("Markov Chain Model: Feature Transitions (Pastel Colors)")
plt.show()


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import matplotlib.cm as cm
from sklearn.preprocessing import StandardScaler

# Load Data
df = pd.read_csv("/content/Clustering.csv", header=0)

# Extract Features and remove rows with any missing values
features = df[['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']].dropna()

# Standardize the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Discretize the standardized features into n_states discrete states.
# np.digitize with k edges can return k + 1 different labels (0..k), so only
# the n_states - 1 interior edges of an even split of [-3, 3] are used. The
# labels then always lie in 0..n_states-1 (outer bins are open-ended). With
# n_states edges, as before, any value >= 3 received label n_states and
# raised IndexError when counting transitions.
n_states = 4  # Number of discrete states (bins)
state_edges = np.linspace(-3, 3, n_states + 1)[1:-1]
discrete_features = np.digitize(scaled_features, bins=state_edges)

# Compute Transition Counts: one (n_states x n_states) count matrix per
# feature, stacked along axis 0. The state in row t is paired with the state
# in row t + 1 of the same feature column.
# NOTE(review): this presumes row order is a meaningful sequence - confirm.
n_features = features.shape[1]
transition_matrix = np.zeros((n_features, n_states, n_states))
for feature_idx in range(n_features):
    states = discrete_features[:, feature_idx]
    # transition_matrix[feature_idx] is a view, so np.add.at updates the
    # stacked array in place and accumulates repeated pairs.
    np.add.at(transition_matrix[feature_idx], (states[:-1], states[1:]), 1)

# Normalize each row of every per-feature matrix (counts -> probabilities).
# Rows with no counts stay all-zero instead of becoming NaN through 0/0.
row_sums = transition_matrix.sum(axis=2, keepdims=True)
transition_probabilities = np.divide(
    transition_matrix, row_sums, out=np.zeros_like(transition_matrix), where=row_sums > 0
)

# One column per feature holding its flattened (row-major) probability matrix
transition_df = pd.DataFrame({
    feature_name: transition_probabilities[feature_idx].flatten()
    for feature_idx, feature_name in enumerate(features.columns)
})

# Create Markov Chain Graph
G = nx.DiGraph()

# Create edges based on transitions for each feature. The edge weight is the
# normalized transition probability; previously the raw count was used here,
# so the computed probabilities never reached the graph and the edge widths
# were scaled by counts.
for feature_idx, feature_name in enumerate(features.columns):
    for i in range(n_states):
        for j in range(n_states):
            weight = transition_probabilities[feature_idx, i, j]
            if weight > 0:  # Only add transitions with non-zero probability
                G.add_edge(f"{feature_name} - State {i+1}", f"{feature_name} - State {j+1}", weight=weight)

# Generate Pastel Colors for Nodes (States)
colors = cm.Pastel1(np.linspace(0, 1, len(G.nodes)))

# Assign colors to nodes in the graph
node_color_map = {node: color for node, color in zip(G.nodes, colors)}

# Draw Graph with Pastel Colors
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G, seed=42)  # Fixed seed keeps the layout reproducible
edges = G.edges(data=True)
edge_weights = [d['weight'] * 5 for (u, v, d) in edges]  # Scale edge width

# Draw nodes with pastel colors
node_colors = [node_color_map[node] for node in G.nodes]
nx.draw(G, pos, with_labels=True, node_color=node_colors, edge_color="gray", width=edge_weights, node_size=2000)

plt.title("Markov Chain Model: Feature Transitions (Pastel Colors with States)")
plt.show()



In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import matplotlib.cm as cm
from sklearn.preprocessing import StandardScaler

# Load Data
df = pd.read_csv("/content/Clustering.csv", header=0)

# Extract Features and remove rows with any missing values
features = df[['Lipase', 'Protease', 'Hemolysin', 'DNase', 'Staphyloxanthin', 'EsxA']].dropna()

# Standardize the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Discretize the standardized features into n_states discrete states.
# np.digitize with k edges can return k + 1 different labels (0..k), so only
# the n_states - 1 interior edges of an even split of [-3, 3] are used. The
# labels then always lie in 0..n_states-1 (outer bins are open-ended). With
# n_states edges, as before, any value >= 3 received label n_states and
# raised IndexError when counting transitions.
n_states = 4  # Number of discrete states (bins)
state_edges = np.linspace(-3, 3, n_states + 1)[1:-1]
discrete_features = np.digitize(scaled_features, bins=state_edges)

# Compute Transition Counts: one (n_states x n_states) count matrix per
# feature, stacked along axis 0. The state in row t is paired with the state
# in row t + 1 of the same feature column.
# NOTE(review): this presumes row order is a meaningful sequence - confirm.
n_features = features.shape[1]
transition_matrix = np.zeros((n_features, n_states, n_states))
for feature_idx in range(n_features):
    states = discrete_features[:, feature_idx]
    # transition_matrix[feature_idx] is a view, so np.add.at updates the
    # stacked array in place and accumulates repeated pairs.
    np.add.at(transition_matrix[feature_idx], (states[:-1], states[1:]), 1)

# Normalize each row of every per-feature matrix (counts -> probabilities).
# Rows with no counts stay all-zero instead of becoming NaN through 0/0.
row_sums = transition_matrix.sum(axis=2, keepdims=True)
transition_probabilities = np.divide(
    transition_matrix, row_sums, out=np.zeros_like(transition_matrix), where=row_sums > 0
)

# One column per feature holding its flattened (row-major) probability matrix
transition_df = pd.DataFrame({
    feature_name: transition_probabilities[feature_idx].flatten()
    for feature_idx, feature_name in enumerate(features.columns)
})

# Create Markov Chain Graph
G = nx.DiGraph()

# Create edges based on transitions for each feature. The edge weight is the
# normalized transition probability; previously the raw count was used here,
# so the computed probabilities never reached the graph and the edge widths
# were scaled by counts.
for feature_idx, feature_name in enumerate(features.columns):
    for i in range(n_states):
        for j in range(n_states):
            weight = transition_probabilities[feature_idx, i, j]
            if weight > 0:  # Only add transitions with non-zero probability
                G.add_edge(f"{feature_name} - State {i+1}", f"{feature_name} - State {j+1}", weight=weight)

# Generate Pastel Colors for Nodes (States)
colors = cm.Pastel1(np.linspace(0, 1, len(G.nodes)))

# Assign colors to nodes in the graph
node_color_map = {node: color for node, color in zip(G.nodes, colors)}

# Draw Graph with Pastel Colors
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G, seed=42)  # Fixed seed keeps the layout reproducible
edges = G.edges(data=True)
edge_weights = [d['weight'] * 5 for (u, v, d) in edges]  # Scale edge width

# Draw nodes with pastel colors
node_colors = [node_color_map[node] for node in G.nodes]
nx.draw(G, pos, with_labels=True, node_color=node_colors, edge_color="gray", width=edge_weights, node_size=2000)

plt.title("Markov Chain Model: Feature Transitions (Pastel Colors with States)")
plt.show()


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import matplotlib.cm as cm

# Load your Clustering Data (Replace with correct file path if necessary)
df = pd.read_csv("/content/Clustering.csv", header=0)

# Binarize the data: 1 where a value is > 0, otherwise 0 (missing values
# compare as False and therefore become 0).
# Only numeric/boolean columns are binarized: comparing a text column (for
# example an identifier) with 0 raises TypeError. The vectorized comparison
# also replaces the deprecated element-wise DataFrame.applymap call and
# gives the same 0/1 table for numeric input.
numeric_df = df.select_dtypes(include=['number', 'bool'])
df_binary = (numeric_df > 0).astype(int)  # Adjust this for your dataset logic
feature_names = df_binary.columns

# Compute Co-Occurrence Matrix (Transition Counts)
# Entry (a, b) of the (columns x columns) product counts the rows in which
# features a and b are both 1.
transition_matrix = np.dot(df_binary.T, df_binary)  # Co-occurrence matrix
transition_df = pd.DataFrame(transition_matrix, index=feature_names, columns=feature_names)

# Normalize Each Row (Convert Counts to Probabilities); a feature that is
# never positive has a zero row sum and its row becomes all zeros via fillna.
transition_df = transition_df.div(transition_df.sum(axis=1), axis=0).fillna(0)  # Normalize the rows

# Create Markov Chain Graph
G = nx.DiGraph()

# Add edges to the graph based on the co-occurrence matrix with weights
for source in transition_df.index:
    for target in transition_df.columns:
        probability = transition_df.loc[source, target]
        if probability > 0:  # Only add transitions with probability > 0
            G.add_edge(source, target, weight=probability)

# Generate Pastel Colors for Nodes (Features)
colors = cm.Pastel1(np.linspace(0, 1, len(feature_names)))

# Assign colors to nodes in the graph (feature names)
node_color_map = {feature: color for feature, color in zip(feature_names, colors)}

# Draw Graph with Pastel Colors
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G, seed=42)  # Positioning nodes using spring layout
edges = G.edges(data=True)
edge_weights = [d['weight'] * 5 for (u, v, d) in edges]  # Scale edge width for better visualization

# Draw nodes with pastel colors
node_colors = [node_color_map[node] for node in G.nodes]
nx.draw(G, pos, with_labels=True, node_color=node_colors, edge_color="gray", width=edge_weights, node_size=2000)

# Add title to the plot
plt.title("Markov Chain Model: Feature Co-occurrence Transition (Pastel Colors)")
plt.show()


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import matplotlib.cm as cm
from numpy.linalg import eig

# Load your Clustering Data (Replace with correct file path if necessary)
df = pd.read_csv("/content/Clustering.csv", header=0)

# Binarize the data: 1 where a value is > 0, otherwise 0 (missing values
# compare as False and therefore become 0).
# Only numeric/boolean columns are binarized: comparing a text column with 0
# raises TypeError. The vectorized comparison also replaces the deprecated
# element-wise DataFrame.applymap call and gives the same 0/1 table for
# numeric input.
numeric_df = df.select_dtypes(include=['number', 'bool'])
df_binary = (numeric_df > 0).astype(int)  # Adjust this for your dataset logic
feature_names = df_binary.columns

# Compute Co-Occurrence Matrix (Transition Counts)
# Entry (a, b) counts the rows in which features a and b are both 1.
transition_matrix = np.dot(df_binary.T, df_binary)  # Co-occurrence matrix
transition_df = pd.DataFrame(transition_matrix, index=feature_names, columns=feature_names)

# Normalize Each Row (Convert Counts to Probabilities); a feature that is
# never positive has a zero row sum and its row becomes all zeros via fillna.
transition_df = transition_df.div(transition_df.sum(axis=1), axis=0).fillna(0)  # Normalize the rows

# Create Markov Chain Graph
G = nx.DiGraph()

# Add edges to the graph based on the co-occurrence matrix with weights
for source in transition_df.index:
    for target in transition_df.columns:
        probability = transition_df.loc[source, target]
        if probability > 0:  # Only add transitions with probability > 0
            G.add_edge(source, target, weight=probability)

# Generate Pastel Colors for Nodes (Features)
colors = cm.Pastel1(np.linspace(0, 1, len(feature_names)))

# Assign colors to nodes in the graph (feature names)
node_color_map = {feature: color for feature, color in zip(feature_names, colors)}

# Draw Graph with Pastel Colors
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G, seed=42)  # Positioning nodes using spring layout
edges = G.edges(data=True)
edge_weights = [d['weight'] * 5 for (u, v, d) in edges]  # Scale edge width for better visualization

# Draw nodes with pastel colors
node_colors = [node_color_map[node] for node in G.nodes]
nx.draw(G, pos, with_labels=True, node_color=node_colors, edge_color="gray", width=edge_weights, node_size=2000)

# Add title to the plot
plt.title("Markov Chain Model: Feature Co-occurrence Transition (Pastel Colors)")
plt.show()

# Calculate Stationary Distribution (left eigenvector for eigenvalue 1)
transition_matrix_normalized = transition_df.to_numpy()
n_total = transition_matrix_normalized.shape[0]

# Features that are never positive have an all-zero row and column (the
# co-occurrence matrix is symmetric), so they are not part of the chain.
# Restrict the chain to the remaining states, whose rows sum to 1.
active = transition_matrix_normalized.sum(axis=1) > 0
active_chain = transition_matrix_normalized[np.ix_(active, active)]
n_active = active_chain.shape[0]

# Eigenvectors of the transposed matrix are the left eigenvectors of the chain
eigvals, eigvecs = eig(active_chain.T)

# Pick exactly one eigenvector: the one whose eigenvalue is closest to 1.
# A boolean np.isclose mask can match no column (round-off) or several
# columns, and flattening that would misalign the probabilities.
leading = np.argmin(np.abs(eigvals - 1))
stationary_vector = eigvecs[:, leading].real

# Normalize the stationary vector so it sums to 1 (this also fixes the sign)
stationary_active = stationary_vector / stationary_vector.sum()

# Excluded states get a long-term probability of 0
stationary_distribution = np.zeros(n_total)
stationary_distribution[active] = stationary_active

# Print the stationary distribution (long-term probabilities)
print("\nStationary Distribution (Long-Term Probabilities):")
for feature, prob in zip(feature_names, stationary_distribution):
    print(f"{feature}: {prob:.4f}")

# First-Passage Time Calculation (Expected Number of Steps to Reach a State)
# I - P is singular for any row-stochastic P (every row of I - P sums to 0),
# so it cannot be inverted directly. Use the fundamental matrix
#     Z = inv(I - P + W),   with every row of W equal to the stationary vector,
# and the mean first-passage time from state i to state j
#     M[i, j] = (delta_ij - Z[i, j] + Z[j, j]) / pi[j].
# The diagonal M[j, j] = 1 / pi[j] is the mean recurrence time of state j.
# NOTE(review): the formula assumes the chain is irreducible (all features
# linked through co-occurrences); otherwise the stationary vector is not
# unique and this step may fail or be meaningless - confirm for this data.
identity = np.eye(n_active)
stationary_rows = np.tile(stationary_active, (n_active, 1))
fundamental_matrix = np.linalg.inv(identity - active_chain + stationary_rows)
first_passage_active = (
    identity - fundamental_matrix + np.diag(fundamental_matrix)[np.newaxis, :]
) / stationary_active[np.newaxis, :]

# States outside the chain are left as NaN (never reached / never left)
first_passage_matrix = np.full((n_total, n_total), np.nan)
first_passage_matrix[np.ix_(active, active)] = first_passage_active
first_passage_df = pd.DataFrame(first_passage_matrix, index=feature_names, columns=feature_names)

# Visualize the First-Passage Times Matrix
plt.figure(figsize=(12, 8))
sns.heatmap(first_passage_df, annot=True, cmap="YlGnBu", fmt=".2f", cbar_kws={'label': 'First-Passage Time'})
plt.title("First-Passage Times Matrix")
plt.show()

