The goal of this analysis is to design proper training and testing sets for the multiome and citeseq data. I would like the data to be equally distributed among days and donors.

In [2]:
import pandas as pd
import os
import numpy as np


# Directory that holds the single-cell data. It can be overridden through the
# SINGLE_CELL_DATA_DIR environment variable so the notebook also runs on
# machines where the data lives elsewhere; the default is the original location.
data_path = os.environ.get("SINGLE_CELL_DATA_DIR", "/home/skovtun/Single cell/Single_cell_data")
metadata = os.path.join(data_path,"metadata.csv")
# Metadata table indexed by the cell_id column.
pdata = pd.read_csv(metadata, index_col='cell_id')

In [6]:
# Keep only the multiome rows; the technology column is constant after the
# filter, so it is dropped.
is_multiome = pdata['technology'] == 'multiome'
multi = pdata.loc[is_multiome].drop(columns='technology')

In [21]:
# Split the multiome cells into a training set (about 80% of the cells, with the
# same number of cells drawn from every day) and a test set (all remaining cells).
data = multi
column1 = 'day'
column2 = 'donor'

unique_values_col1 = data[column1].nunique()
unique_values_col2 = data[column2].nunique()

# Target size of the training set: 80% of all cells.
sample_size = data.shape[0]*0.8
# Equal training quota for every distinct value of column1 (0 when there is no data).
rows_per_value = int(sample_size / unique_values_col1) if unique_values_col1 else 0
# Fixed seed so that re-running the cell reproduces exactly the same split.
random_seed = 42

# Collect one sample per value of column1 and concatenate once at the end
# instead of growing a DataFrame inside the loop.
samples_per_value = []
for val_col1 in data[column1].unique():
    rows_for_value = data[data[column1] == val_col1]
    # Sampling without replacement raises ValueError when more rows are requested
    # than exist, so the quota is capped at the number of available rows.
    n_rows = min(rows_per_value, len(rows_for_value))
    samples_per_value.append(
        rows_for_value.sample(n_rows, replace=False, random_state=random_seed)
    )

train_sample = pd.concat(samples_per_value) if samples_per_value else data.iloc[0:0]
# The test sample is every row that was not selected for training.
test_sample = data.drop(train_sample.index)

In [23]:
# Share of cells contributed by each donor within every day of the training sample.
# size() counts the rows of each (day, donor) group directly, so the result does
# not depend on how many other columns the frame has or on missing values in them.
day_final_sample = train_sample.groupby(['day','donor']).size().reset_index(name='cells_count')
cells_per_day = day_final_sample.groupby(['day'])['cells_count'].transform('sum')
day_final_sample['cells_by_day_%'] = day_final_sample['cells_count']/cells_per_day*100
# A NaN in the table below means that the donor has no cells for that day.
print("Percentage of cells from each donor for every day")
day_final_sample.pivot_table(values='cells_by_day_%', index=['donor'], columns='day')

Percentage of cells from each donor for every day


day,2,3,4,7,10
donor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13176,29.332046,24.366795,30.227799,25.563707,26.416988
27678,27.142857,21.76834,,24.648649,27.783784
31800,21.494208,30.791506,34.567568,26.11583,22.505792
32606,22.030888,23.073359,35.204633,23.671815,23.293436


In [24]:

# Share of cells contributed by each donor within every day of the test sample.
# size() counts the rows of each (day, donor) group directly, so the result does
# not depend on how many other columns the frame has or on missing values in them.
day_remaining_data = test_sample.groupby(['day','donor']).size().reset_index(name='cells_count')
cells_per_day = day_remaining_data.groupby(['day'])['cells_count'].transform('sum')
day_remaining_data['cells_by_day_%'] = day_remaining_data['cells_count']/cells_per_day*100
# A NaN in the table below means that the donor has no cells for that day.
print("Percentage of cells from each donor for every day")
day_remaining_data.pivot_table(values='cells_by_day_%', index=['donor'], columns='day')


Percentage of cells from each donor for every day


day,2,3,4,7,10
donor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13176,28.289094,24.427059,30.149026,24.33055,26.885748
27678,27.279284,21.279337,,26.11575,27.141154
31800,21.956145,30.658076,35.173863,25.683847,22.390601
32606,22.475476,23.635527,34.677111,23.869853,23.582496


In [27]:
# The donor frequencies look good, so keep the cell_id index of each multiome
# split as a plain Python list.
multi_train_sample, multi_test_sample = (
    list(split.index) for split in (train_sample, test_sample)
)

In [28]:
# Same procedure for the citeseq technology: keep only the citeseq rows and
# drop the technology column, which is constant after the filter.
is_citeseq = pdata['technology'] == 'citeseq'
cite = pdata.loc[is_citeseq].drop(columns='technology')

In [32]:
# Split the citeseq cells into a training set (about 80% of the cells, with the
# same number of cells drawn from every day) and a test set (all remaining cells).
data = cite
column1 = 'day'
column2 = 'donor'

unique_values_col1 = data[column1].nunique()
unique_values_col2 = data[column2].nunique()

# Target size of the training set: 80% of all cells.
sample_size = data.shape[0]*0.8
# Equal training quota for every distinct value of column1 (0 when there is no data).
rows_per_value = int(sample_size / unique_values_col1) if unique_values_col1 else 0
# Fixed seed so that re-running the cell reproduces exactly the same split.
random_seed = 42

# Collect one sample per value of column1 and concatenate once at the end
# instead of growing a DataFrame inside the loop.
samples_per_value = []
for val_col1 in data[column1].unique():
    rows_for_value = data[data[column1] == val_col1]
    # Sampling without replacement raises ValueError when more rows are requested
    # than exist, so the quota is capped at the number of available rows.
    n_rows = min(rows_per_value, len(rows_for_value))
    samples_per_value.append(
        rows_for_value.sample(n_rows, replace=False, random_state=random_seed)
    )

train_sample = pd.concat(samples_per_value) if samples_per_value else data.iloc[0:0]
# The test sample is every row that was not selected for training.
test_sample = data.drop(train_sample.index)

In [33]:
# Share of cells contributed by each donor within every day of the training sample.
# size() counts the rows of each (day, donor) group directly, so the result does
# not depend on how many other columns the frame has or on missing values in them.
day_final_sample = train_sample.groupby(['day','donor']).size().reset_index(name='cells_count')
cells_per_day = day_final_sample.groupby(['day'])['cells_count'].transform('sum')
day_final_sample['cells_by_day_%'] = day_final_sample['cells_count']/cells_per_day*100
# A NaN in the table below would mean that the donor has no cells for that day.
print("Percentage of cells from each donor for every day")
day_final_sample.pivot_table(values='cells_by_day_%', index=['donor'], columns='day')

Percentage of cells from each donor for every day


day,2,3,4,7
donor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13176,20.593397,28.040117,23.64814,26.727957
27678,25.353113,23.781864,21.353949,23.217718
31800,28.72127,22.69954,28.608441,22.895947
32606,25.332219,25.478479,26.389469,27.158379


In [34]:
# Share of cells contributed by each donor within every day of the test sample.
# size() counts the rows of each (day, donor) group directly, so the result does
# not depend on how many other columns the frame has or on missing values in them.
day_remaining_data = test_sample.groupby(['day','donor']).size().reset_index(name='cells_count')
cells_per_day = day_remaining_data.groupby(['day'])['cells_count'].transform('sum')
day_remaining_data['cells_by_day_%'] = day_remaining_data['cells_count']/cells_per_day*100
# A NaN in the table below would mean that the donor has no cells for that day.
print("Percentage of cells from each donor for every day")
day_remaining_data.pivot_table(values='cells_by_day_%', index=['donor'], columns='day')

Percentage of cells from each donor for every day


day,2,3,4,7
donor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13176,20.827259,26.973114,23.458122,27.204631
27678,25.674198,23.041341,22.594837,23.527409
31800,27.733236,23.908644,27.417614,23.561457
32606,25.765306,26.076901,26.529426,25.706503


In [35]:
# The donor frequencies look good, so keep the cell_id index of each citeseq
# split as a plain Python list.
cite_train_sample, cite_test_sample = (
    list(split.index) for split in (train_sample, test_sample)
)

In [39]:
# Write each of the four id lists to its own CSV file in the working directory.
# Every file contains a single row holding all ids of that list.
import csv

id_lists_by_file = {
    'cite_train_sample.csv': cite_train_sample,
    'cite_test_sample.csv': cite_test_sample,
    'multi_train_sample.csv': multi_train_sample,
    'multi_test_sample.csv': multi_test_sample,
}

for file_name, cell_ids in id_lists_by_file.items():
    with open(file_name, 'w', newline='') as out_file:
        csv.writer(out_file).writerow(cell_ids)

In [43]:
# Number of entries in the in-memory multiome test list.
len(multi_test_sample)

32378

In [42]:
# Reload the multiome test ids from disk to check that the file reads back.
# The ids are flattened out of the CSV rows into one flat list of strings and
# assigned to multi_test_sample. Appending each parsed row to the existing
# in-memory list would nest a list inside it and grow it on every re-run.
with open('multi_test_sample.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    multi_test_sample = [cell_id for row in reader for cell_id in row]

In [44]:
# Show the first five entries of the multiome test list.
multi_test_sample[:5]

['e0bc46450106',
 'f761aff20d94',
 '651d314a28d1',
 '0f5a5fbbd3cf',
 'f16910c43d66']