In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from modules.process_data import *
from modules.utils import *

  from .autonotebook import tqdm as notebook_tqdm
2024-06-06 18:52:08.983320: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
adata = anndata.read_h5ad("data/fede_count.h5ad")
anno_df = pd.read_csv("data/fede_mapping.csv", skiprows=4)

adata = rm_high_mt(adata, threshold=0.6)
adata = filter_cells_by_gene_counts(adata)
#sc.pp.normalize_total(adata)
#sc.pp.log1p(adata)
#sc.pp.scale(adata)

sc_df = pd.DataFrame(adata.X.toarray() if hasattr(adata.X, 'toarray') else adata.X, index=adata.obs_names, columns=adata.var_names)
sample_tags = pd.DataFrame(adata.obs.Sample_Tag)
sc_df = sc_df.join(sample_tags)

anno_df = anno_df.set_index('cell_id')[['class_name']]
anno_df = anno_df['class_name'].map(mapping1)
sc_df.index = sc_df.index.astype('int64')
anno_df.index = anno_df.index.astype('int64')
sc_df = sc_df.join(anno_df)

sc_df = sc_df[~sc_df['Sample_Tag'].isin(['Multiplet', 'Undetermined'])]


sc_df = sc_df[sc_df['Sample_Tag'].isin(['SampleTag17_flex'])]


X = sc_df.drop(['Sample_Tag', 'class_name'], axis=1).values
#Y = sc_df['Sample_Tag'].map(mapping2).astype(np.float32).values
Y = sc_df['class_name'].astype(np.float32).values

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Assuming your data is in a variable called X
# Perform PCA and store explained variance ratios
pca = PCA()
pca.fit(X)
explained_variance = np.cumsum(pca.explained_variance_ratio_)

explained_variance_100 = explained_variance[:100]

# Plot the explained variance to create an elbow plot
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(explained_variance_100) + 1), explained_variance_100, marker='o', markersize=3, linestyle='-')
plt.axhline(y=0.90, color='r', linestyle='--')  # 90% explained variance threshold
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance')
plt.title('Elbow plot for PCA')
plt.grid(True)
plt.savefig("elbow_plot_pca.png")
plt.show()

In [None]:
# Assuming your data is in a variable called X
# Perform PCA and store explained variance ratios
pca = PCA()
pca.fit(X)
explained_variance_ratio = pca.explained_variance_ratio_

# Limit to the first 100 dimensions
explained_variance_ratio_100 = explained_variance_ratio[:100]

# Plot the explained variance ratio to create a scree plot
plt.figure(figsize=(10, 6))
plt.bar(range(1, len(explained_variance_ratio_100) + 1), explained_variance_ratio_100, alpha=0.7, align='center')
plt.xlabel('Principal component')
plt.ylabel('Explained variance ratio')
plt.title('Scree plot')
plt.grid(True)
plt.savefig("scree_plot_pca.png")
plt.show()

In [None]:
# Use PCA to reduce dimensionality to the optimal number of components
optimal_components = 17
pca = PCA(n_components=optimal_components)
X_pca = pca.fit_transform(X)

In [None]:
def visualize_umap(X, Y, mapping, n_neighbors_list):
    unique_targets = np.unique(Y)
    colors = plt.cm.jet(np.linspace(0, 1, len(unique_targets)))
    markersize_scatter = 0.1
    markersize_legend = 10

    fig, axes = plt.subplots(1, len(n_neighbors_list), figsize=(20, 6))
    
    for ax, n_neighbors in zip(axes, n_neighbors_list):
        umap_2d = umap.UMAP(n_neighbors=n_neighbors, n_components=2, random_state=42)
        X_umap = umap_2d.fit_transform(X)

        for target, color in zip(unique_targets, colors):
            indices = np.where(Y == target)
            ax.scatter(X_umap[indices, 0], X_umap[indices, 1], color=color, label=mapping[target], s=markersize_scatter)
        
        ax.set_title(f'n_neighbors = {n_neighbors}')
        ax.set_xlabel('UMAP Dimension 1')
        ax.set_ylabel('UMAP Dimension 2')

    handles = [Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=markersize_legend, label=mapping[target])
               for target, color in zip(unique_targets, colors)]
    
    fig.legend(handles=handles, loc='center left', bbox_to_anchor=(1, 0.5))
    plt.tight_layout()
    plt.grid(False)
    plt.savefig('umap_optimal_neighbors.png', bbox_inches='tight')
    plt.show()

In [None]:
# Plot UMAP results for different n_neighbors values
n_neighbors_list = [10, 25, 50, 75, 100, 150]
visualize_umap(X_pca, Y, mapping, n_neighbors_list)

In [None]:
df = sc_df.drop(['Sample_Tag', 'class_name'], axis=1)

In [None]:
import pandas as pd

# Assuming df is your DataFrame
# Filter out columns starting with 'mt-'
filtered_df = df.loc[:, ~df.columns.str.startswith('mt-')]

# Calculate the mean of each column in the filtered DataFrame
mean_values = filtered_df.mean()

# Get the top 10 columns with the highest mean value
top_10_columns = mean_values.nlargest(10)

# Print the top 10 columns
print(top_10_columns)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming df is your DataFrame
gene_id = 'Malat1'
# Check if 'Malat1' is a column in the DataFrame
if gene_id in df.columns:
    # Print basic statistical measures
    print(df[gene_id].describe())
    
    # Plot the distribution using a histogram
    plt.figure(figsize=(10, 6))
    plt.hist(df[gene_id], bins=100, edgecolor='black')
    plt.title(f'Distribution of {gene_id} gene read count')
    plt.xlabel(f'{gene_id} read count')
    plt.ylabel('Number of cells')
    plt.savefig(f"histo_{gene_id}_counts.png")
    plt.show()
    
    # Plot the distribution using a boxplot
    plt.figure(figsize=(10, 6))
    plt.boxplot(df[gene_id], vert=False)
    plt.title(f'Boxplot of {gene_id} gene read count')
    plt.xlabel(f'{gene_id} read count')
    plt.ylabel('Number of cells')
    plt.savefig(f"boxplot_{gene_id}_counts.png")
    plt.show()

In [10]:
import scanpy as sc
import pandas as pd
import numpy as np

# Assuming you have a count matrix `count_data` and sample information `sample_info` in pandas DataFrames

# Create an AnnData object
# Let's assume count_data is your gene expression matrix (samples x genes)
# and sample_info contains a column 'condition' with the experimental conditions

count_data = pd.DataFrame({
    'gene1': [100, 200, 150, 300],
    'gene2': [400, 500, 450, 350],
    'gene3': [300, 200, 250, 150],
    'gene4': [100, 100, 100, 100]
}, index=['sample1', 'sample2', 'sample3', 'sample4']).astype(float)  # Ensure data is float

sample_info = pd.DataFrame({
    'condition': ['A', 'A', 'B', 'B']
}, index=['sample1', 'sample2', 'sample3', 'sample4'])

# Create an AnnData object
adata = sc.AnnData(count_data.values, obs=sample_info)

# Set the variable names (gene names)
adata.var_names = count_data.columns

# Set the observation names (sample names)
adata.obs_names = count_data.index

# Log-normalize the data
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)

# Perform differential expression analysis
sc.tl.rank_genes_groups(adata, groupby='condition', method='wilcoxon')

# Extract the results
results = adata.uns['rank_genes_groups']

# Convert results to a pandas DataFrame
gene_names = results['names'].tolist()
pvals = results['pvals'].tolist()
pvals_adj = results['pvals_adj'].tolist()  # Adjusted p-values
logfoldchanges = results['logfoldchanges'].tolist()

# Create a DataFrame with the results
df_results = pd.DataFrame({
    'gene': gene_names[0],
    'pvals': pvals[0],
    'pvals_adj': pvals_adj[0],  # Add adjusted p-values
    'logfoldchanges': logfoldchanges[0]
})

# Print the results
print(df_results)


    gene     pvals  pvals_adj  logfoldchanges
0  gene3  0.438578   0.584771        0.302036
1  gene1  0.438578   0.584771        0.621965


In [None]:
!python "modules/train_script.py" 256 15 0.0001 5 0 kan -s_layers 256 32

In [None]:
!python "modules/eval_script.py" kan kan_0_5_32.pth 0 -s_layers 256 32

In [None]:
X, Y = pickle.load(open('embed_kan_0_5_32.pth.pkl', 'rb'))

In [None]:
#PCA dim. reduction
#pca = PCA(n_components=10, random_state=42)
#pca_result = pca.fit_transform(X)

In [None]:
#UMAP dim. reduction
reducer = UMAP(n_neighbors=100, n_components=2, random_state=42)
umap_result = reducer.fit_transform(X)

In [None]:
mapping2 = {'LD_5xFAD': 1,
            'LD_NC': 2,
            'run_5xFAD': 3,
            'run_NC': 4}

mapping2 = {'Multiplet': 1,
            'SampleTag17_flex': 2,
            'SampleTag18_flex': 3,
            'SampleTag19_flex': 4,
            'SampleTag20_flex': 5,
            'Undetermined': 6}

In [None]:
mapping = {y:x for x,y in mapping1.items()}

In [None]:
visualize_umap(umap_result, Y, mapping)

In [None]:
#KMEANS clustering
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans_result = kmeans.fit_predict(umap_result)

In [None]:
#DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=100)
dbscan_result = dbscan.fit_predict(umap_result)

In [None]:
get_clustering(umap_result, dbscan_result)

In [None]:
cluster_composition = get_cluster_composition(dbscan_result, Y)

In [None]:
plot_cluster_composition(cluster_composition, mapping)

In [None]:
get_pie_chart(cluster_composition[2], mapping, min_pct=5)

In [None]:
from torch import nn
from modules.kan_model import DeepKAN
import torch
from matplotlib import pyplot as plt

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
input_dim = 29535
shared_layers = [1024,256,32]
num_knots = 5
spline_order = 3
noise_scale = 0.1
base_scale = 1.0
spline_scale = 1.0
activation = nn.SiLU
grid_epsilon = 0.02
grid_range = [-1, 1]

In [None]:
base_net = DeepKAN(input_dim, shared_layers, num_knots, spline_order,
                noise_scale, base_scale, spline_scale, activation, grid_epsilon, grid_range)

In [None]:
model_path = "kan_0_5_32.pth"
checkpoint = torch.load(model_path, map_location=device)
base_net.load_state_dict(checkpoint)
base_net.to(device)

In [None]:
# Prune the network
pruning_threshold = 0.01
base_net.prune_network(pruning_threshold)
mask = [(param != 0).float() for param in base_net.parameters()]
def apply_pruning_mask(model, mask):
    with torch.no_grad():
        for param, mask in zip(model.parameters(), mask):
            param.mul_(mask)
apply_pruning_mask(base_net, mask)

In [None]:
def print_network_parameters(model):
    for i, layer in enumerate(model.layers):
        print(f"Layer {i + 1}:")
        print("Base Weights:")
        print(layer.base_weights.data)
        print("Spline Weights:")
        print(layer.spline_weights.data)
        if layer.standalone_spline_scaling:
            print("Spline Scales:")
            print(layer.spline_scales.data)
        print("-" * 50)

In [None]:
print_network_parameters(base_net)

In [None]:
plt.plot(base_net.layers[0].spline_weights[0][0].cpu().detach().numpy())