In [1]:
import os
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np 
from scipy.stats import entropy
from scipy.stats import mannwhitneyu as mwu
import plotly.graph_objects as go

In [5]:
data_dir = '../Data/Clinical/'
data_dir2 = '../Result/'

# max_css = pd.read_csv( data_dir + "max_css_table.csv")
cl1 = pd.read_csv( data_dir + "clinical_data_1.csv", sep=',')
cl2 = pd.read_csv( data_dir + "clinical_data_2.csv", sep=',')
max_css = pd.read_csv(data_dir2 + "max_css_table.csv")

BCRHC01 = pd.read_csv(data_dir2 + 'aa_shannon_BCR_01_HC.csv')
BCRLC01 = pd.read_csv(data_dir2 + 'aa_shannon_BCR_01_LC.csv')
TCR01 = pd.read_csv(data_dir2 + 'aa_shannon_TCR_01.csv')
BCRHC02 = pd.read_csv(data_dir2 + 'aa_shannon_BCR_02_HC.csv')
BCRLC02 = pd.read_csv(data_dir2 + 'aa_shannon_BCR_02_LC.csv')
TCR02 = pd.read_csv(data_dir2 + 'aa_shannon_TCR_02.csv')

all = [BCRHC01, BCRLC01, TCR01, BCRHC02, BCRLC02, TCR02]
only01 = [BCRHC01, BCRLC01, TCR01]
only02 = [BCRHC02, BCRLC02, TCR02] 

BCRHC01.loc[BCRHC01['Time_point'] == 0, 'Time_point'] = 1

sev_dict = max_css['severity'].value_counts().to_dict()

print(sev_dict)
print(max_css.head())
print(BCRHC01.head())

{1: 204, 2: 165, 3: 52, 4: 36, 5: 2}
            ID  severity
0  COV-CCO-001         2
1  COV-CCO-002         2
2  COV-CCO-003         2
3  COV-CCO-004         2
4  COV-CCO-006         1
             Sample  Row_number  Total_readcount  Shannon_diversity  \
0  COV-CCO-0411_IGG        2498            19342           6.727983   
1  COV-CCO-0411_IGM        7006            11251           8.179853   
2  COV-CCO-0412_IGG         348             1523           5.104862   
3  COV-CCO-0412_IGM        1183             1850           6.769221   
4  COV-CCO-0413_IGG        2743            12102           6.672048   

       PlnP2  Squared_Shannon  Time_point Type  
0  47.012921        45.265755           1  IGG  
1  69.480620        66.909994           1  IGM  
2  27.682505        26.059611           2  IGG  
3  46.744685        45.822355           2  IGM  
4  47.145421        44.516224           3  IGG  


  cl2 = pd.read_csv( data_dir + "clinical_data_2.csv", sep=',')


In [8]:
out_dir = '../Result/'

adaptome = pd.read_csv(out_dir + 'max_css_adaptome_only.csv')
adaptome.head()

Unnamed: 0,ID,severity,from
0,COV-CCO-041,1,1
1,COV-CCO-042,1,1
2,COV-CCO-043,1,1
3,COV-CCO-044,1,1
4,COV-CCO-045,2,1


In [12]:
### sevcl1과 sevcl2은 각각 cl1과 cl2 중에서 adaptome에 있는 환자들만 추출한 데이터프레임

sevcl1 = cl1[cl1['ID'].isin(adaptome['ID'])]
sevcl2 = cl2[cl2['ID'].isin(adaptome['ID'])]

In [13]:
# YES ==1, NO == 2, NaN == 99999

# [1차 수집]
# CMD_AVIRUS__1   항바이러스제 복용 여부
# CMD_AVIRUSR__1  렘데시비르 복용 여부

# M1_AVIRUS       내원시 항바이러스제 복용 여부
# M1_AVIRUSR      내원시 렘데시비르 복용 여부
# M2_AVIRUS       항바이러스제 치료 도입 여부
# M2_AVIRUSR      렘데시비르 치료 도입 여부

class allocate_avirus:
    def __init__(self, df1, df2, adaptome):
        self.df1 = df1
        self.df2 = df2
        self.adaptome = adaptome

    def allocate(self):
        self.adaptome['avirus'] = 0
        self.adaptome['avirusr'] = 0

        for i in range(len(self.df1)):
            if 1 in self.df1.iloc[i][['CMD_AVIRUS__1']].values:
                if np.isin(self.adaptome['ID'].values, self.df1.iloc[i]['ID']).any():
                    self.adaptome.loc[i, 'avirus'] = 1

            if 1 in self.df1.iloc[i][['CMD_AVIRUSR__1']].values:
                if np.isin(self.adaptome['ID'].values, self.df1.iloc[i]['ID']).any():
                    self.adaptome.loc[i, 'avirusr'] = 1
                    
        for i in range(len(self.df2)):
            if 1 in self.df2.iloc[i][['M1_AVIRUS','M2_AVIRUS']].values:
                if np.isin(self.adaptome['ID'].values, self.df2.iloc[i]['ID']).any():
                    self.adaptome.loc[i, 'avirus'] = 1

            if 1 in self.df2.iloc[i][['M1_AVIRUSR','M2_AVIRUSR']].values:
                if np.isin(self.adaptome['ID'].values, self.df2.iloc[i]['ID']).any():
                    self.adaptome.loc[i, 'avirusr'] = 1
    
        return self.adaptome
    
adaptome = allocate_avirus(sevcl1, sevcl2, adaptome).allocate()

### adaptome에서 ID가 NaN인 행 제거
adaptome = adaptome.dropna(subset=['ID'])
print(adaptome['avirus'].value_counts(), adaptome['avirusr'].value_counts(), sep='\n')

avirus
0.0    134
1.0    131
Name: count, dtype: int64
avirusr
0.0    134
1.0    131
Name: count, dtype: int64


In [23]:
### adaptome에서 avirus = 1이면서 from = 1인 행만 print

print(adaptome[(adaptome['avirus'] == 1) & (adaptome['from'] == 1)].shape)
print(adaptome[(adaptome['avirus'] == 1) & (adaptome['from'] == 2)].shape)

(111, 5)
(20, 5)
