In [1]:
"""
Python3.9.3
OS: Debian-buster

[POINT]
・Donarをキーに各CSVから読み込んだDataFrameをmergeする。


jupyter nbconvert --execute --to html DataTable.ipynb
"""

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Path to the cell-level CSV (relative path, resolved against the working directory).
cell_data_path = "cell_data.csv"

In [3]:
# Load the cell-level table into a DataFrame.
df_cell = pd.read_csv(cell_data_path)

In [4]:
# Preview the first rows to check the columns (UID, X1, X2, CellType, Donor).
df_cell.head()

Unnamed: 0,UID,X1,X2,CellType,Donor
0,4861STDY7462257-AAACCTGCATGGGAAC,6.308161,-3.586926,T,A16
1,4861STDY7462257-AAAGATGGTGTGACGA,2.006791,-7.994239,T,A16
2,4861STDY7462257-AACTCCCGTCCAAGTT,-6.246193,-6.825125,T,A16
3,4861STDY7462257-AAGACCTCACGAGAGT,2.936845,-8.620646,T,A16
4,4861STDY7462257-AAGCCGCTCTAACTTC,-5.834578,0.225145,T,A16


In [5]:
# Show row count, column dtypes and non-null counts of the cell table.
df_cell.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5938 entries, 0 to 5937
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   UID       5938 non-null   object 
 1   X1        5938 non-null   float64
 2   X2        5938 non-null   float64
 3   CellType  5938 non-null   object 
 4   Donor     5938 non-null   object 
dtypes: float64(2), object(3)
memory usage: 232.1+ KB


In [6]:
# Count missing values per column (isna is the same check as isnull).
df_cell.isna().sum()

UID         0
X1          0
X2          0
CellType    0
Donor       0
dtype: int64

In [7]:
# Load the donor-level metadata table from its CSV.
donorpath = "donor_info.csv"
df_donor = pd.read_csv(donorpath)


In [8]:
# Preview the first rows of the donor table (Donor, Age, Gender).
df_donor.head()

Unnamed: 0,Donor,Age,Gender
0,F21,0.31,Male
1,F22,0.17,Female
2,F23,0.21,Male
3,F29,0.33,Female
4,F30,0.27,Male


In [9]:
# What we want to compare:
# the difference in the proportion of T cells between male and female donors

In [10]:
# Show row count, column dtypes and non-null counts of the donor table.
df_donor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Donor   23 non-null     object 
 1   Age     23 non-null     float64
 2   Gender  23 non-null     object 
dtypes: float64(1), object(2)
memory usage: 680.0+ bytes


In [11]:
# List the distinct donor IDs present in the donor table.
df_donor["Donor"].unique()

array(['F21', 'F22', 'F23', 'F29', 'F30', 'F38', 'F41', 'F45', 'A16',
       'P1', 'P2', 'P3', 'F64', 'F67', 'C34', 'T03', 'T06', 'T07', 'F83',
       'C40', 'C41', 'A43', 'F74'], dtype=object)

In [12]:
# List the distinct donor IDs that actually have cells in the cell table.
df_cell["Donor"].unique()

array(['A16', 'C34', 'F22', 'F64', 'F67', 'F74', 'T03', 'T06', 'T07',
       'A43', 'C40', 'C41'], dtype=object)

In [13]:
# The values of df_donor["Donor"] and df_cell["Donor"] do not match exactly:
# in the outputs above, df_cell covers only 12 of the 23 donors listed in df_donor.

In [14]:
# Merge on Donor. how="right" keeps every row of df_donor, so a donor with no
# cells in df_cell yields one row whose cell columns are NaN (removed later).
# validate="many_to_one" makes pandas raise MergeError if a Donor appears more
# than once in df_donor, which would otherwise silently duplicate cells and
# inflate the counts used downstream.
df = pd.merge(df_cell, df_donor, how="right", on="Donor", validate="many_to_one")
df.head()

Unnamed: 0,UID,X1,X2,CellType,Donor,Age,Gender
0,,,,,F21,0.31,Male
1,FCAImmP7198432-AAAGCAATCTACTATC,-5.097418,0.803003,T,F22,0.17,Female
2,FCAImmP7198432-AACCGCGGTTGTGGAG,-3.488387,0.291474,T,F22,0.17,Female
3,FCAImmP7198432-AAGACCTGTAAGGATT,-6.490744,-4.09797,T,F22,0.17,Female
4,FCAImmP7198432-ACACTGACATATACGC,-4.816401,-5.631334,T,F22,0.17,Female


In [15]:
# Plan: drop every row that contains NaN.

In [16]:
# Re-check row count, dtypes and non-null counts after the merge.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5949 entries, 0 to 5948
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   UID       5938 non-null   object 
 1   X1        5938 non-null   float64
 2   X2        5938 non-null   float64
 3   CellType  5938 non-null   object 
 4   Donor     5949 non-null   object 
 5   Age       5949 non-null   float64
 6   Gender    5949 non-null   object 
dtypes: float64(3), object(4)
memory usage: 371.8+ KB


In [17]:
# Check the number of nulls in each column (isna is the same check as isnull).
df.isna().sum()

UID         11
X1          11
X2          11
CellType    11
Donor        0
Age          0
Gender       0
dtype: int64

In [18]:
# Drop every row that contains NaN.
# NOTE: dropna returns a new DataFrame; this cell only displays that result and
# leaves df unchanged because the result is not assigned.
df.dropna(how="any")

Unnamed: 0,UID,X1,X2,CellType,Donor,Age,Gender
1,FCAImmP7198432-AAAGCAATCTACTATC,-5.097418,0.803003,T,F22,0.17,Female
2,FCAImmP7198432-AACCGCGGTTGTGGAG,-3.488387,0.291474,T,F22,0.17,Female
3,FCAImmP7198432-AAGACCTGTAAGGATT,-6.490744,-4.097970,T,F22,0.17,Female
4,FCAImmP7198432-ACACTGACATATACGC,-4.816401,-5.631334,T,F22,0.17,Female
5,FCAImmP7198432-ACAGCTATCTACTATC,-7.179873,-2.754704,T,F22,0.17,Female
...,...,...,...,...,...,...,...
5944,Human_colon_16S7985397-TGCGGGTTCGGTCTAA,-7.133275,9.076816,T,F74,0.19,Male
5945,Human_colon_16S7985397-TGGCGCACACAGCCCA,-4.059606,-5.125242,T,F74,0.19,Male
5946,Human_colon_16S7985397-TGGTTCCGTAGCCTCG,-0.004667,-5.592957,T,F74,0.19,Male
5947,Human_colon_16S7985397-TGTGGTACATCAGTAC,-6.294744,9.659031,T,F74,0.19,Male


In [19]:
# Keep only the rows without any NaN by rebinding df to the dropna result.
df = df.dropna(how="any")

In [20]:
# Number of rows for each (CellType, Gender) combination.
df.value_counts(subset=["CellType", "Gender"])

CellType  Gender
B         Male      1797
T         Male      1577
          Female    1402
B         Female    1162
dtype: int64

In [21]:
# Keep the (CellType, Gender) row counts as a Series keyed by that pair.
count = df.value_counts(subset=["CellType", "Gender"])

In [22]:
# Print each (CellType, Gender) key followed by its count.
for key, n_rows in count.items():
    print(f"{key}   {n_rows}")

('B', 'Male')   1797
('T', 'Male')   1577
('T', 'Female')   1402
('B', 'Female')   1162


In [32]:
# Zero-filled 2x2 table: rows are cell types, columns are genders.
df_table = pd.DataFrame(0, index=["B", "T"], columns=["Male", "Female"])



In [33]:
# Copy each count into the table cell addressed by its (CellType, Gender) key.
for (cell_type, gender), n_rows in count.items():
    df_table.loc[cell_type, gender] = n_rows

In [34]:
# Display the filled CellType x Gender contingency table.
df_table

Unnamed: 0,Male,Female
B,1797,1162
T,1577,1402


In [28]:
"""

カイ二乗検定
# ある生物学者が、男性と女性のドナーではT細胞とB細胞の比率に違いがあると主張しています。
# 帰無仮説: 違い無し
# 対立仮説: 違いあり

有意水準5%で検定をおこなう
"""

'\n\nカイ二乗検定\n# ある生物学者が、男性と女性のドナーではT細胞とB細胞の比率に違いがあると主張しています。\n# 帰無仮説: 違い無し\n# 対立仮説: 違いあり\n\n有意水準5%で検定をおこなう\n'

In [37]:
import scipy as sp
import scipy.stats

# Significance level for the test (5%).
ALPHA = 0.05

# Chi-square test of independence on the CellType x Gender contingency table.
# correction=True is SciPy's default and applies Yates' continuity correction
# when the table has one degree of freedom; it is spelled out here so the
# reported statistic is not mistaken for the uncorrected one.
x2, p, dof, expected = sp.stats.chi2_contingency(df_table, correction=True)

# f-strings name the values directly instead of looking them up in locals().
print(f"カイ二乗値は {x2}")
print(f"確率は {p}")
print(f"自由度は {dof}")
print(expected)

# Reject the null hypothesis (no difference) when p is below the chosen level.
if p < ALPHA:
    print("有意な差があります")
else:
    print("有意な差がありません")

カイ二乗値は 36.426010086753955
確率は 1.5857290680554595e-09
自由度は 1
[[1681.31795217 1277.68204783]
 [1692.68204783 1286.31795217]]
有意な差があります


In [29]:
# Therefore the biologist's claim holds (the null hypothesis is rejected at the 5% level).