# Preparing the data for the 0.8 clustering

In [2]:
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Load the tab-separated, header-less .uc clustering table and keep only the
# five positional columns used below.
uc_records = pd.read_csv(
    "data/RuBisCO.300-700.faa.sorted.0.8.uc",
    sep="\t",
    header=None,
).iloc[:, [0, 1, 2, 3, 8]]
uc_records.columns = ["membership", "cluster", "length", "pidentity", "sequence"]

# Keep one row per cluster: only the rows whose membership flag is "C".
# On those rows the "length" column is relabelled "size" and the "sequence"
# column is relabelled "seed" (presumably cluster size and centroid id --
# confirm against the .uc format description).
is_cluster_row = uc_records["membership"] == "C"
cluster_info = (
    uc_records.loc[is_cluster_row, ["cluster", "length", "sequence"]]
    .set_index("cluster")
    .rename(columns={"length": "size", "sequence": "seed"})
)

cluster_info

Unnamed: 0_level_0,size,seed
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,45,AAQ04822.2
1,131,WP_106857372.1
2,535,RWP03589.1
3,41106,QHN53519.1
4,4525,BBE28301.1
...,...,...
2250,2,TARA_068.SAMEA2621061.50.0-0.22_714_15
2251,1,VDD88771.1
2252,1,TARA_125.SAMEA2622837.140.0.22-3_137106_7
2253,1,TARA_038.SAMEA2620014.5.0-0.22_8457_6


In [5]:
# Per-sequence metadata keyed by sequence id. The clustering-related columns
# and the preliminary annotation are discarded; whatever columns remain in the
# CSV are kept unchanged.
sequence_info = (
    pd.read_csv("final/RuBisCO.300-700.faa.csv")
    .set_index("sequence")
    .drop(columns=["membership", "cluster", "pidentity", "annotation_preliminary"])
)
sequence_info

Unnamed: 0_level_0,length,annotation
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1
AAQ04822.2,538,II
WP_106857372.1,497,I-CD2
RWP03589.1,497,I-CD2
QHN53519.1,495,I-CD1
YP_009729450.1,492,I-CD1
...,...,...
TARA_124.SAMEA2622801.120.0.45-0.8_267714_17,305,IV-GOS
ABP82249.1,304,I-CD1
CDI73530.1,303,I-CD2
AOO78264.1,302,I-CD1


In [12]:
# Fetch the annotation of every cluster's seed sequence, in cluster order.
# The lookup result is indexed by sequence id while cluster_info is indexed by
# cluster, so it is converted to a plain list to make the assignment
# positional instead of index-aligned.
seed_annotations = sequence_info.loc[cluster_info["seed"], "annotation"]
cluster_info["annotation"] = seed_annotations.tolist()
cluster_info

Unnamed: 0_level_0,size,seed,annotation
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,45,AAQ04822.2,II
1,131,WP_106857372.1,I-CD2
2,535,RWP03589.1,I-CD2
3,41106,QHN53519.1,I-CD1
4,4525,BBE28301.1,I-CD2
...,...,...,...
2250,2,TARA_068.SAMEA2621061.50.0-0.22_714_15,IV-GOS
2251,1,VDD88771.1,III
2252,1,TARA_125.SAMEA2622837.140.0.22-3_137106_7,IV-Unknown
2253,1,TARA_038.SAMEA2620014.5.0-0.22_8457_6,IV-GOS


In [22]:
import os
from Bio.SeqRecord import SeqRecord

# Export one representative sequence per cluster for tree building.
#
# Every FASTA record whose id is a cluster seed is copied into `reps` under a
# new id of the form "<annotation>.<n>", where <n> is a running counter per
# annotation in FASTA order. The same name is stored in
# cluster_info["tree_name"] so tree labels can be mapped back to clusters.
# Clusters whose seed never appears in the FASTA keep the "Unnamed" placeholder.

# Create the output directory up front so the writes at the end succeed on a
# fresh checkout; exist_ok makes re-running the cell safe.
os.makedirs("for-raxml-tree", exist_ok=True)

# Map seed id -> cluster id once, instead of scanning cluster_info with a
# boolean mask for every FASTA record. setdefault keeps the first cluster if a
# seed id were ever listed twice, matching the previous `.index[0]` behaviour.
seed_to_cluster = {}
for cluster_id, seed_id in zip(cluster_info.index, cluster_info["seed"]):
    seed_to_cluster.setdefault(seed_id, cluster_id)

reps = []
type2count = {}
cluster_info["tree_name"] = "Unnamed"
for record in SeqIO.parse("data/RuBisCO.300-700.faa", 'fasta'):
    if record.id not in seed_to_cluster:
        # Not a cluster seed: this sequence is not a representative.
        continue
    cluster = seed_to_cluster[record.id]
    cluster_type = cluster_info.loc[cluster, "annotation"]
    type2count[cluster_type] = type2count.get(cluster_type, 0) + 1
    tree_name = f"{cluster_type}.{type2count[cluster_type]}"
    # Empty description keeps the FASTA header down to the new id only.
    new_record = SeqRecord(id = tree_name, seq = record.seq, description = "")
    reps.append(new_record)
    cluster_info.loc[cluster, "tree_name"] = tree_name

SeqIO.write(reps, "for-raxml-tree/RuBisCO.300-700.0.8.faa", "fasta")
cluster_info.to_csv("for-raxml-tree/RuBisCO.300-700.0.8.faa.csv")

ACDUNK_771_4
651
cg2_3.0_scaffold_1100_c_10
1123
cg2_3.0_scaffold_5338_c_6
1083
cg2_3.0_scaffold_6845_c_6
1103
GWA2_Elusimicrobia_66_18_gwa2_scaffold_980_18
1085
GWC2_Chloroflexi_73_18_gwc2_scaffold_12078_2
2155
GWC2_Gemmamonadetes_71_9_gwc2_scaffold_9002_13
2238
RBG_16_Elusimicrobia_66_12_RBG_16_scaffold_20757_18
1174
RBG_16_Planctomycetes_64_12_RBG_16_scaffold_63101_5
1551
RIFCSPLOWO2_02_FULL_Gammaproteobacteria_56_15_rifcsplowo2_02_scaffold_79926_1
1234
gwf2_scaffold_2732_6
1612
cg2_3.0_scaffold_1065_c_17
471
cg1_0.2_scaffold_107_c_58
1038
gwa2_scaffold_913_23
1089
cg1_0.2_scaffold_958_c_17
921
RBG_16_Archaea_50_20_RBG_16_scaffold_16389_9
513
RBG_16_Archaea_36_9_RBG_16_scaffold_36_87
2045
RBG_13_Archaea_46_16b_RBG_13_scaffold_4070_13
1998
cg1_0.2_scaffold_115_c_6
968
cg1_0.2_scaffold_3774_c_7
1515
cg_0.2_sub10_scaffold_1045_c_18
210
RBG_16_Anaeromyxobacter_69_14_RBG_16_scaffold_568_150
69
RBG_16_CP_70_13_RBG_16_scaffold_22096_8
360
RIFCSPHIGHO2_02_FULL_OP11_39_11_rifcsphigho2_02_sca

MBD3190074.1
1675
WP_168371275.1
1708
HDM91862.1
670
PKG32658.1
555
PIO05869.1
1167
RLG77261.1
1396
WP_013866882.1
1936
VDS11145.1
188
NQT49111.1
1545
HET20929.1
935
HGS80205.1
807
VDD88909.1
2159
KXA91689.1
596
KUJ93020.1
169
VDD88905.1
2088
NMB48515.1
2078
HEQ65200.1
1069
VDD89157.1
578
HIH32861.1
354
VDD89146.1
954
WP_167731531.1
168
VVB75776.1
2184
VDD88910.1
2028
VDD89243.1
63
VDD88844.1
904
HFT81502.1
1544
HEF87248.1
286
VDD88774.1
1127
NVM02173.1
2154
KHO50554.1
1142
HGQ79017.1
511
PIU62916.1
1128
VDD89250.1
1763
VDD88898.1
1750
TXT59762.1
255
VDD89183.1
638
HHQ45440.1
1647
VDD88765.1
760
WP_088885583.1
770
HGN93567.1
2090
RLI07641.1
1649
PUA34057.1
936
RLF68008.1
1501
VDD88753.1
243
OQX71398.1
2097
HHP05817.1
802
OGF34049.1
812
VDD88855.1
738
VDD89089.1
1845
NHW23919.1
27
HGN17772.1
402
MBD3285174.1
946
NMA44343.1
678
VDD89255.1
472
WP_088865093.1
21
VDD89200.1
816
VDD89029.1
1413
HID05175.1
1392
NHW88692.1
252
MBD3197374.1
1909
TSC54433.1
1793
VDD89095.1
1903
VDD88994.1
2209
H

OIQ39221.1
1110
WP_007068008.1
1186
WP_018717736.1
1048
WP_144478240.1
1090
BAC16243.2
386
AAC02930.1
166
RLB33671.1
1524
WP_150092840.1
1366
WP_161407444.1
223
NBK70185.1
1161
WP_114793339.1
870
NVM28570.1
326
WP_139664886.1
2041
WP_116323962.1
473
HFU29422.1
1418
WP_148022396.1
62
NLW67049.1
1045
MBU70118.1
751
WP_188757613.1
258
WP_185133818.1
358
KAF7800411.1
2037
WP_169128432.1
2165
PHT98747.1
1829
KYK29529.1
1014
QPC42817.1
1966
WP_132319315.1
1529
WP_144811339.1
36
WP_085019909.1
915
WP_120110234.1
591
WP_062766009.1
2193
TFF94994.1
1575
WP_104520621.1
780
1YKW_A
79
WP_109973062.1
707
WP_094049502.1
625
TVR70006.1
144
MBA3480117.1
1365
WP_128563219.1
375
WP_072336715.1
2099
WP_100133425.1
1282
WP_135161662.1
450
WP_184168458.1
142
WP_046078050.1
261
PIR25431.1
966
NYT23721.1
916
NYT80693.1
37
EMS72223.1
1949
WP_052010453.1
747
CAB1078435.1
1611
WP_019937668.1
2168
WP_088956933.1
2248
WP_012500248.1
2007
WP_090309406.1
2115
NLA41931.1
75
NMC80470.1
2050
HIG53303.1
598
HGU06449.1


WP_019941043.1
264
WP_067756147.1
1948
WP_108503113.1
380
WP_131196464.1
941
MBC8258666.1
400
WP_142823731.1
975
QHQ36049.1
2076
VDS11137.1
1621
HHS94388.1
165
WP_145863085.1
84
WP_188215660.1
517
WP_158995115.1
1187
WP_183781633.1
44
WP_153713041.1
1185
WP_127335966.1
1716
WP_114771920.1
1920
PWH13205.1
1994
MBB5790147.1
222
PKQ23435.1
74
WP_068559387.1
898
WP_158816469.1
368
MBE1608394.1
1598
NVK35207.1
1844
WP_091532692.1
303
WP_163336241.1
1859
WP_051103939.1
1231
WP_185661935.1
1075
OBX37869.1
579
TWT91049.1
1856
QDV34725.1
1802
WP_182582143.1
1345
WP_094048765.1
33
TVQ84370.1
582
WP_110032519.1
35
WP_092846784.1
2247
WP_062421145.1
2195
MBE2213659.1
890
RVT86668.1
1565
WP_061167361.1
48
WP_041154068.1
106
WP_073611641.1
1610
OYU50427.1
1548
WP_021818325.1
560
NBX60943.1
528
HCF49330.1
1061
NBR33864.1
782
WP_029010986.1
776
WP_091548945.1
783
WP_108886102.1
1021
WP_132950164.1
17
WP_140895569.1
100
WP_188701797.1
1018
WP_091844669.1
755
AFY57639.1
1991
WP_111341262.1
2149
NHJ02601

MBC5809956.1
1910
HGO60354.1
794
WP_092786836.1
1590
NIM52323.1
691
TVR32154.1
1098
WP_070113114.1
1505
WP_166936767.1
1252
MBC7634588.1
2126
WP_146953894.1
763
WP_114983720.1
1602
ASY11078.1
392
QIT55953.1
1375
PHT97483.1
771
KXK02801.1
1265
MSQ68069.1
240
KAA3627279.1
1224
WP_192533091.1
1943
WP_106572864.1
2083
WP_111385924.1
929
TVR79267.1
1665
SED07816.1
1228
NLD40919.1
1984
WP_128515260.1
736
PYP20034.1
2030
TAN55019.1
1929
RMH06091.1
1448
WP_156004867.1
1728
WP_021098855.1
274
KAF0110453.1
1748
TLY30118.1
1057
WP_066175728.1
1432
MSO43619.1
1928
WP_174511559.1
239
WP_114649166.1
880
MBA5874654.1
1276
WP_138431538.1
1851
HHW82582.1
1664
WP_150041624.1
1411
TVR34575.1
459
OYV36618.1
1170
TMH68537.1
238
MSP03938.1
1125
WP_040770881.1
1005
WP_123401205.1
1870
RKE66117.1
1191
ODS59240.1
2217
MSV91752.1
116
PIE90237.1
608
WP_049725484.1
1062
NGZ96093.1
1626
WP_136384946.1
1268
ANX05372.1
766
OIT02594.1
1101
WP_108850143.1
1404
TKB89096.1
388
NLN11867.1
1535
WP_011628912.1
1732
MSW7586

1168
TARA_141.SAMEA2623446.5.0.22-3_596447_8
1995
TARA_025.SAMEA2619779.5.0-0.22_12401_3
1774
TARA_125.SAMEA2622821.5.0.1-0.22_697_47
320
TARA_123.SAMEA2622736.150.0.1-0.22_389732_5
874
TARA_034.SAMEA2619923.60.0-0.22_119_14
1309
TARA_007.SAMEA2591074.42.0.22-1.6_417480_1
762
TARA_070.SAMEA2621085.5.0-0.22_402_18
715
TARA_152.SAMEA2623907.800.0.22-3_2673518_6
66
TARA_004.SAMEA2619376.5.0.22-1.6_5224_19
659
TARA_037.SAMEA2619974.600.0.1-0.22_25679_4
567
TARA_093.SAMEA2621812.35.0.22-3_645542_6
1281
TARA_056.SAMEA2620651.5.0.22-3_897156_22
465
TARA_122.SAMEA2622657.5.0.22-0.45_1220637_14
571
TARA_030.SAMEA2591122.70.0.22-1.6_1823072_3
660
TARA_109.SAMEA2622316.5.0.22-3_2079919_8
119
TARA_034.SAMEA2619907.60.0.22-1.6_1013533_52
1773
TARA_152.SAMEA2623886.5.0.22-3_2580620_19
2084
TARA_052.SAMEA2620542.5.0.22-1.6_449877_6
353
TARA_145.SAMEA2623627.5.0.22-3_1990852_5
2219
TARA_025.SAMEA2619779.5.0-0.22_8850_2
576
TARA_066.SAMEA2620967.30.0-0.22_7541_184
394


In [25]:
# Number of clusters per seed annotation (value_counts orders by frequency).
annotation_counts = cluster_info["annotation"].value_counts()
print(annotation_counts)

III            496
IV-NonPhoto    465
IV-Deep-Ykr    411
IV-Ykr-2       129
Unknown        104
IV-GOS          89
IV-NG3          82
IV-Photo        75
I-CD2           71
II              57
I-CD1           51
IV-NG2          47
IV-NG1          43
II-III          35
IV-Ykr          33
IV-Unknown      26
I-A             25
I-Banda          8
I-NG1            7
I-B              1
Name: annotation, dtype: int64
