In [None]:
import anndata
import scanpy as sc
import pandas as pd
from scipy import sparse
from modules.process_data import *
# Explicit imports come after the star import so that none of these names can be
# shadowed by whatever `modules.process_data` happens to export.
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import PercentFormatter

In [None]:
# Load the count data as an AnnData object from the project's data directory.
count_path = "data/fede_count.h5ad"
adata = anndata.read_h5ad(count_path)

In [None]:
# Read the annotation/mapping table; the first four lines of the file are skipped
# before the header row is parsed.
mapping_csv = "data/fede_mapping.csv"
anno_df = pd.read_csv(mapping_csv, skiprows=4)

In [None]:
# Translate every cell's sample tag through `mapping2` and display the result.
# The tags are read from `adata.obs` instead of `sc_df`: `sc_df` is only built in a
# later cell, so referencing it here fails on a fresh kernel (Restart & Run All).
# The 'Sample_Tag' column of `sc_df` is joined in from `adata.obs`, so the values match.
# NOTE(review): `mapping2` is not defined in any visible cell — presumably it is
# exported by `modules.process_data`; confirm.
adata.obs['Sample_Tag'].map(mapping2)

In [None]:
# Display the raw sample-tag values, one per cell.
# Read from `adata.obs` rather than `sc_df`, which does not exist yet at this point
# of a top-to-bottom run; the 'Sample_Tag' column of `sc_df` is taken from here anyway.
adata.obs['Sample_Tag'].values

In [None]:
# Pie chart of the number of cells per sample tag, slices ordered from the rarest
# tag to the most common one.
data = Counter(adata.obs['Sample_Tag'])
sorted_data = dict(sorted(data.items(), key=lambda item: item[1]))

# Materialise plain lists: dict views are not sequences, and the wedge sizes are
# coerced through NumPy, which does not expand a dict view into a 1-D array.
labels = list(sorted_data.keys())
sizes = list(sorted_data.values())

# One evenly spaced viridis colour per slice.
cmap = plt.get_cmap("viridis")
colors = cmap(np.linspace(0, 1, len(labels)))

fig, ax = plt.subplots(figsize=(10, 7))
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140, colors=colors)
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
ax.set_title('Sample tag distribution')
plt.show()

In [None]:
# Wrap the expression matrix in a DataFrame (rows: adata.obs_names, columns:
# adata.var_names). Matrices exposing `toarray` (e.g. SciPy sparse) are densified first.
if hasattr(adata.X, 'toarray'):
    dense_matrix = adata.X.toarray()
else:
    dense_matrix = adata.X
sc_df = pd.DataFrame(dense_matrix, index=adata.obs_names, columns=adata.var_names)

In [None]:
# Single-column DataFrame with each observation's sample tag, indexed like adata.obs.
sample_tags = adata.obs['Sample_Tag'].to_frame()

In [None]:
# Attach the sample tag to every cell (index-aligned left join).
# A 'Sample_Tag' column left over from an earlier execution of this cell is removed
# first; without this, re-running the cell fails with a column-overlap error.
if 'Sample_Tag' in sc_df.columns:
    sc_df = sc_df.drop(columns='Sample_Tag')
sc_df = sc_df.join(sample_tags)

In [None]:
# Histogram of the percentage of each cell's counts that come from mitochondrial
# genes (columns whose name starts with 'mt-').
mito_genes = [col for col in sc_df.columns if col.startswith('mt-')]
mito_reads = sc_df[mito_genes].sum(axis=1)

# Total counts per cell over all numeric columns.
# 'number' selects every numeric dtype; the builtin `int`/`float` selectors only match
# the default-width integer and float dtypes, so count matrices stored with another
# numeric dtype (e.g. unsigned integers) would be left out of the total.
numeric_df = sc_df.select_dtypes(include='number')
# A derived 'mito_percentage' column (added by a later cell) is not a count and must
# not be summed into the total when cells are executed out of order.
if 'mito_percentage' in numeric_df.columns:
    numeric_df = numeric_df.drop(columns='mito_percentage')
total_reads = numeric_df.sum(axis=1)

mito_percentage = (mito_reads / total_reads) * 100
mito_percentage_list = mito_percentage.tolist()

fig, ax = plt.subplots()
ax.hist(mito_percentage_list, bins=20)
ax.set_xlabel('Percentage of counts mapping to mitochondrial genes')
ax.set_ylabel('Number of cells')
plt.show()

In [None]:
# One histogram per sample tag of the percentage of counts mapping to mitochondrial
# genes. `results` maps each sample tag to the list of per-cell percentages.
mito_genes = [col for col in sc_df.columns if col.startswith('mt-')]

# Total counts per cell over all numeric columns ('number' covers every numeric dtype,
# not just the default-width int/float ones). A derived 'mito_percentage' column from
# another cell is excluded because it is not a count.
numeric_df = sc_df.select_dtypes(include='number')
if 'mito_percentage' in numeric_df.columns:
    numeric_df = numeric_df.drop(columns='mito_percentage')

# The percentage is a per-row quantity, so it is computed once for all cells and
# grouped afterwards instead of being recomputed inside every group.
mito_reads = sc_df[mito_genes].sum(axis=1)
total_reads = numeric_df.sum(axis=1)
mito_percentage = (mito_reads / total_reads) * 100

results = {}
# observed=True: when 'Sample_Tag' is categorical, only tags that actually occur are
# visited.
for sample_tag, tag_percentages in mito_percentage.groupby(sc_df['Sample_Tag'], observed=True):
    results[sample_tag] = tag_percentages.tolist()

    fig, ax = plt.subplots()
    ax.hist(results[sample_tag], bins=20)
    ax.set_xlabel('Percentage of counts mapping to mitochondrial genes')
    ax.set_ylabel('Number of cells')
    ax.set_title(f'{sample_tag}')
    plt.show()

In [None]:
# Reduce the annotation table to its 'class_name' column, indexed by 'cell_id'.
anno_df = anno_df.set_index('cell_id').loc[:, ['class_name']]

In [None]:
# Cast the row index of sc_df to int64 (in place).
# NOTE(review): presumably so it lines up with integer cell ids in the annotation
# table during the join — confirm.
int_index = sc_df.index.astype('int64')
sc_df.index = int_index

In [None]:
# Attach the class annotation to every cell (index-aligned left join); cells that have
# no row in `anno_df` get NaN.
# A 'class_name' column left over from an earlier execution of this cell is removed
# first; without this, re-running the cell fails with a column-overlap error.
if 'class_name' in sc_df.columns:
    sc_df = sc_df.drop(columns='class_name')
sc_df = sc_df.join(anno_df)

In [None]:
# Pie chart of the cell-type ('class_name') composition. Classes that make up less
# than `threshold` of all cells are pooled into a single 'Others' slice.
threshold = 0.01  # minimum fraction of cells for a class to get its own slice

class_name_values = sc_df['class_name']
class_counts = Counter(class_name_values.dropna())

# Cells without an annotation are NaN after the left join. They are counted under one
# explicit label: NaN never compares equal to itself, which makes it an unreliable
# Counter key and an uninformative 'nan' slice label.
missing_count = int(class_name_values.isna().sum())
if missing_count > 0:
    class_counts['Unannotated'] += missing_count

total_counts = sum(class_counts.values())

# Keep classes at or above the threshold; everything else is pooled.
final_counts = {
    class_name: count
    for class_name, count in class_counts.items()
    if count / total_counts >= threshold
}
other_count = total_counts - sum(final_counts.values())
if other_count > 0:
    # Add to (rather than overwrite) a real class that is itself called 'Others'.
    final_counts['Others'] = final_counts.get('Others', 0) + other_count

# Slices ordered from the rarest class to the most common one.
sorted_counts = dict(sorted(final_counts.items(), key=lambda item: item[1]))
# Materialise plain lists: dict views are not sequences, and the wedge sizes are
# coerced through NumPy, which does not expand a dict view into a 1-D array.
labels = list(sorted_counts.keys())
sizes = list(sorted_counts.values())

# One evenly spaced viridis colour per slice.
cmap = plt.get_cmap("viridis")
colors = cmap(np.linspace(0, 1, len(labels)))

fig, ax = plt.subplots(figsize=(10, 7))
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140, colors=colors)
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
ax.set_title('Cell type distribution')
plt.show()

In [None]:
# Store each cell's mitochondrial read percentage in sc_df['mito_percentage'] and plot
# its distribution for the most frequent classes, one stacked panel per class.
mito_genes = [col for col in sc_df.columns if col.startswith('mt-')]
mito_reads = sc_df[mito_genes].sum(axis=1)

# Total counts per cell over all numeric columns ('number' covers every numeric dtype,
# not just the default-width int/float ones).
numeric_df = sc_df.select_dtypes(include='number')
# On a re-run of this cell 'mito_percentage' already exists; it is a derived value,
# not a count, so it must not be summed into the total.
if 'mito_percentage' in numeric_df.columns:
    numeric_df = numeric_df.drop(columns='mito_percentage')
total_reads = numeric_df.sum(axis=1)
sc_df['mito_percentage'] = (mito_reads / total_reads) * 100

n_top_classes = 5  # number of most frequent classes to plot
top_classes = sc_df['class_name'].value_counts().nlargest(n_top_classes).index

# squeeze=False always yields a 2-D array of Axes; with the default, a single panel
# comes back as a bare Axes object and the loop below could not iterate over it.
fig, axes_grid = plt.subplots(
    len(top_classes), 1,
    figsize=(8, 5 * len(top_classes)),
    sharex=True,
    squeeze=False,
)
axes = axes_grid[:, 0]

for ax, class_name in zip(axes, top_classes):
    class_data = sc_df.loc[sc_df['class_name'] == class_name, 'mito_percentage']
    ax.hist(class_data, bins=20, alpha=0.7, color='blue')
    ax.set_title(f'{class_name}')
    ax.set_xlabel('Percentage of counts mapping to mitochondrial genes')
    ax.set_ylabel('Number of cells')
    # Values are already on a 0-100 scale; the formatter only appends the '%' sign.
    ax.xaxis.set_major_formatter(PercentFormatter())

plt.tight_layout()
plt.show()

In [None]:
# Display the frame without the two annotation columns. The result is only shown as
# the cell output; it is not assigned, so sc_df itself keeps both columns.
sc_df.drop(columns=['Sample_Tag', 'class_name'])