In [1]:
import pandas as pd
import os


In [2]:
import pandas as pd
import os

# 设置文件夹路径
folder_path = '../data/processed'

# 初始化一个空的列表来存储结果
results = []

# 遍历文件夹中的所有CSV文件
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        
        # 读取CSV文件
        df = pd.read_csv(file_path)
        
        # 统计数据总数
        total_count = len(df)
        
        # 统计不同标签的数量和每个标签的数量
        label_counts = df['label'].value_counts()
        
        # 计算每个标签的占比
        label_proportions = label_counts / total_count
        
        # 计算样本0对1的比例，并转换为整数比
        label_0_proportion = label_proportions.get(0, 0)
        label_1_proportion = label_proportions.get(1, 1)  # 确保标签1占比不为0
        
        # 将占比四舍五入为整数
        rounded_0 = round(label_0_proportion * 10)
        rounded_1 = round(label_1_proportion * 10)
        
        # 确保两个数的和为10，以保持和为1的比例
        if rounded_0 + rounded_1 != 10:
            if rounded_0 + rounded_1 > 10:
                diff = rounded_0 + rounded_1 - 10
                rounded_0 -= diff
                rounded_1 -= diff
            else:
                diff = 10 - (rounded_0 + rounded_1)
                rounded_0 += diff // 2
                rounded_1 += diff - (diff // 2)
        
        # 计算比例
        ratio = f"{rounded_0}:{rounded_1}"
        
        # 将结果添加到列表中
        results.append({
            '文件名': filename,
            '样本数': total_count,
            '标签0数量': label_counts.get(0, 0),
            '标签1数量': label_counts.get(1, 0),
            '标签0占比': f"{label_0_proportion:.2%}",
            '标签1占比': f"{label_1_proportion:.2%}",
            '样本0对1的比': ratio
        })

# 将结果列表转换为DataFrame
results_df = pd.DataFrame(results)

# 打印结果DataFrame
results_df

Unnamed: 0,文件名,样本数,标签0数量,标签1数量,标签0占比,标签1占比,样本0对1的比
0,Gm12878Sp1Pcr1xPkRep1.csv,84793,58672,26121,69.19%,30.81%,7:3
1,Gm12878Pou2f2Pcr1xPkRep1.csv,65895,42489,23406,64.48%,35.52%,6:4
2,K562Rad21V0416102PkRep1.csv,89695,53206,36489,59.32%,40.68%,6:4
3,Gm12878Bcl3V0416101PkRep1.csv,71851,49348,22503,68.68%,31.32%,7:3
4,K562Egr1V0416101PkRep1.csv,75844,37357,38487,49.26%,50.74%,5:5
5,K562E2f6sc22823V0416102PkRep1.csv,27659,13250,14409,47.90%,52.10%,5:5
6,H1hescTcf12Pcr1xPkRep1.csv,38324,25811,12513,67.35%,32.65%,7:3
7,H1hescGabpPcr1xPkRep1.csv,28611,11916,16695,41.65%,58.35%,4:6
8,Gm12878BatfPcr1xPkRep1.csv,92789,66685,26104,71.87%,28.13%,7:3
9,H1hescRad21V0416102PkRep1.csv,127760,67266,60494,52.65%,47.35%,5:5


In [5]:
import re

regex  = re.compile(r"^(Gm12878|K562|H1hesc)(.*?)(ak20|101388|sc(.*?)|Pcr(.*?)|V041(.*?))")
def get_str(str):
    res = regex.findall(str)
    return res

results_df['Cell Line'] = results_df['文件名'].apply(lambda x: get_str(x)[0][0])
results_df['TF'] = results_df['文件名'].apply(lambda x: get_str(x)[0][1])

In [9]:
order = ['Cell Line', 'TF', '样本数', '标签0数量', '标签1数量', '标签0占比','标签1占比', '样本0对1的比']
results_df = results_df[order]
results_df

Unnamed: 0,Cell Line,TF,样本数,标签0数量,标签1数量,标签0占比,标签1占比,样本0对1的比
0,Gm12878,Sp1,84793,58672,26121,69.19%,30.81%,7:3
1,Gm12878,Pou2f2,65895,42489,23406,64.48%,35.52%,6:4
2,K562,Rad21,89695,53206,36489,59.32%,40.68%,6:4
3,Gm12878,Bcl3,71851,49348,22503,68.68%,31.32%,7:3
4,K562,Egr1,75844,37357,38487,49.26%,50.74%,5:5
5,K562,E2f6,27659,13250,14409,47.90%,52.10%,5:5
6,H1hesc,Tcf12,38324,25811,12513,67.35%,32.65%,7:3
7,H1hesc,Gabp,28611,11916,16695,41.65%,58.35%,4:6
8,Gm12878,Batf,92789,66685,26104,71.87%,28.13%,7:3
9,H1hesc,Rad21,127760,67266,60494,52.65%,47.35%,5:5


In [10]:
new_column_names = {
    '样本数': 'Number of samples',
    '标签0数量': 'Number of label 0',
    '标签1数量': 'Number of label 1',
    '标签0占比': 'Ratio of label 0',
    '标签1占比': 'Ratio of label 1',
    '样本0对1的比': 'Ratio of sample 0 to sample 1'
}
# 使用rename方法替换列名
results_df.rename(columns=new_column_names, inplace=True)

# 打印修改后的DataFrame
results_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df.rename(columns=new_column_names, inplace=True)


Unnamed: 0,Cell Line,TF,Number of samples,Number of label 0,Number of label 1,Ratio of label 0,Ratio of label 1,Ratio of sample 0 to sample 1
0,Gm12878,Sp1,84793,58672,26121,69.19%,30.81%,7:3
1,Gm12878,Pou2f2,65895,42489,23406,64.48%,35.52%,6:4
2,K562,Rad21,89695,53206,36489,59.32%,40.68%,6:4
3,Gm12878,Bcl3,71851,49348,22503,68.68%,31.32%,7:3
4,K562,Egr1,75844,37357,38487,49.26%,50.74%,5:5
5,K562,E2f6,27659,13250,14409,47.90%,52.10%,5:5
6,H1hesc,Tcf12,38324,25811,12513,67.35%,32.65%,7:3
7,H1hesc,Gabp,28611,11916,16695,41.65%,58.35%,4:6
8,Gm12878,Batf,92789,66685,26104,71.87%,28.13%,7:3
9,H1hesc,Rad21,127760,67266,60494,52.65%,47.35%,5:5


In [12]:
results_df.sort_values(['Cell Line', 'TF'], inplace = True, ignore_index=True)
results_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df.sort_values(['Cell Line', 'TF'], inplace = True, ignore_index=True)


Unnamed: 0,Cell Line,TF,Number of samples,Number of label 0,Number of label 1,Ratio of label 0,Ratio of label 1,Ratio of sample 0 to sample 1
0,Gm12878,Batf,92789,66685,26104,71.87%,28.13%,7:3
1,Gm12878,Bcl11a,74418,52713,21705,70.83%,29.17%,7:3
2,Gm12878,Bcl3,71851,49348,22503,68.68%,31.32%,7:3
3,Gm12878,Bclaf,82059,52897,29162,64.46%,35.54%,6:4
4,Gm12878,Ebf,99600,59216,40384,59.45%,40.55%,6:4
5,Gm12878,Egr1,21095,10676,10419,50.61%,49.39%,5:5
6,Gm12878,Elf1,39864,21933,17931,55.02%,44.98%,6:4
7,Gm12878,Ets1,21685,14141,7544,65.21%,34.79%,7:3
8,Gm12878,Irf4,69246,47537,21709,68.65%,31.35%,7:3
9,Gm12878,Mef2a,69175,51164,18011,73.96%,26.04%,7:3


In [13]:
results_df.to_csv('../data/data_distribution.csv', encoding='utf-8', index=None)