## Purpose
1. To remove rows where both RefSeq ID = "-" and GENCODE ID = "-"
2. To remove all rows whose CHESS ID contains "hypothetical"
3. To remove all other theoretical | outlaws
4. To create a new file that satisfies both #1 and #2

In [18]:
import pandas as pd

In [19]:
# Load the full isoform table from CSV; the path is kept in a variable.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, i.e. the file's
# first column has no header. If it is only a saved row index, index_col=0 would
# drop it - confirm before changing, since later cells write this frame back out.
f_iso_all = "iso_all_v1.1.csv"
df_iso_all = pd.read_csv(filepath_or_buffer=f_iso_all)
df_iso_all

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
0,CHS.15.1,OR4F29,rna-NM_001005221.2,ENST00000426406.4,88.2,no_introns
1,CHS.39.7,SAMD11,rna-NM_001385641.1,ENST00000616016.5,46.4,False
2,CHS.40.3,NOC2L,rna-NM_015658.4,ENST00000327044.7,71.1,False
3,CHS.41.11,PLEKHN1,rna-NM_032129.3,ENST00000379410.8,49.4,False
4,CHS.42.6,KLHL17,rna-NM_198317.3,ENST00000338591.8,86.3,True
...,...,...,...,...,...,...
194775,hypothetical.2081.1,-,rna-NM_001005182.2,ENST00000642104.1,88.9,True
194776,hypothetical.2082.1,-,rna-NM_001348233.2,ENST00000540461.2,83.2,False
194777,hypothetical.2083.1,-,rna-NM_001005166.5,ENST00000610445.2,83.5,False
194778,hypothetical.2084.1,-,rna-NM_001005468.2,ENST00000641451.2,89.3,True


In [20]:
# Uniqueness check for CHESS ID: value_counts() sorts counts in descending order,
# so a top count of 1 means no ID occurs more than once.
df_iso_all.loc[:, "CHESS ID"].value_counts()

CHS.12322.8       1
CHS.17999.8       1
CHS.37787.12      1
CHS.17009.alt8    1
CHS.49846.13      1
                 ..
CHS.41916.10      1
CHS.20181.16      1
CHS.40697.alt7    1
CHS.24311.12      1
CHS.26516.3       1
Name: CHESS ID, Length: 194780, dtype: int64

In [21]:
 # Getting to know the data
# Number of rows per GENE value (most frequent first)
df_iso_all.loc[:, "GENE"].value_counts()

-            26704
MAPK10         129
ABI2           117
MBNL1          109
RAP1GAP        106
             ...  
KRTAP10-2        1
OR10Q1           1
EN2              1
KRTAP20-2        1
CT47A7           1
Name: GENE, Length: 16124, dtype: int64

### 1. Keep only rows where there is transcript info in at least one of RefSeq and GENCODE

In [22]:
# Show a random sample of rows where RefSeq ID exists (not "-").
# random_state is fixed so that re-running the notebook shows the same 20 rows.
df_iso_all[df_iso_all["RefSeq ID"] != "-"].sample(20, random_state=0)

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
20813,CHS.1359.alt16,-,rna-XR_002959757.1,-,68.9,False
121962,CHS.35518.8,HSCB,rna-NM_001318314.2,-,80.1,True
870,CHS.2830.2,POLR3C,rna-NM_006468.8,ENST00000334163.4,90.2,True
39929,CHS.7576.3,STK32C,rna-NM_001318879.2,ENST00000368622.5,83.2,False
162546,CHS.52159.2,EZH2,rna-NM_001203249.2,ENST00000476773.5,69.9,True
10819,CHS.40946.1,BMPR1B,rna-NM_001203.3,ENST00000515059.6,83.0,True
116672,CHS.33373.5,GGT7,rna-XM_011528783.3,-,92.6,True
117650,CHS.33648.25,SNX21,rna-XM_005260608.4,-,78.9,True
131541,CHS.38455.2,SLC35G2,rna-NM_001097599.2,-,76.3,True
47335,CHS.9644.7,CREBZF,rna-NR_165399.1,ENST00000528561.5,59.2,True


In [23]:
# Show a random sample of rows where GENCODE ID exists (not "-").
# random_state is fixed so that re-running the notebook shows the same 20 rows.
df_iso_all[df_iso_all["GENCODE ID"] != "-"].sample(20, random_state=0)

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
41677,CHS.8070.18,-,-,ENST00000529587.1,77.8,False
12603,CHS.50139.1,C7orf31,rna-NM_138811.4,ENST00000283905.8,54.6,False
132493,CHS.38760.alt1,GMPS,-,ENST00000295920.7,87.7,True
176965,CHS.57958.alt3,PORCN,-,ENST00000683923.1,89.6,False
126075,CHS.36828.15,AZI2,-,ENST00000476174.2,63.7,True
131033,CHS.38314.alt8,ACAD9,-,ENST00000679431.1,53.9,False
127455,CHS.37191.4,CSPG5,rna-NM_001206944.2,ENST00000610462.1,47.2,True
140587,CHS.41857.24,GPM6A,-,ENST00000507080.5,84.9,True
145708,CHS.44141.3,PCDHA11,-,ENST00000617408.1,43.3,True
20841,CHS.1360.18,SLFNL1,rna-NM_001377532.1,ENST00000359345.5,73.1,True


In [24]:
# Rows that have "-" in both RefSeq ID and GENCODE ID; the size of this frame is
# the number of rows with no transcript ID in either column.
df_iso_all.loc[lambda d: d["RefSeq ID"].eq("-") & d["GENCODE ID"].eq("-")]

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
16222,CHS.10.alt1,-,-,-,61.1,False
16223,CHS.10.alt2,-,-,-,66.8,False
16224,CHS.10.alt3,-,-,-,66.8,False
16225,CHS.10.alt4,-,-,-,66.8,False
16226,CHS.10.alt5,-,-,-,66.8,False
...,...,...,...,...,...,...
194448,hypothetical.1794.1,CLIC2,-,-,85.9,False
194449,hypothetical.1794.2,CLIC2,-,-,43.2,False
194452,hypothetical.1796.1,-,-,-,50.8,False
194453,hypothetical.1796.2,-,-,-,49.9,False


In [25]:
# Step 1: keep a row when at least one of RefSeq ID / GENCODE ID is not "-"
df_iso_all_RGExist = df_iso_all.loc[
    lambda d: d["RefSeq ID"].ne("-") | d["GENCODE ID"].ne("-")
]
df_iso_all_RGExist

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
0,CHS.15.1,OR4F29,rna-NM_001005221.2,ENST00000426406.4,88.2,no_introns
1,CHS.39.7,SAMD11,rna-NM_001385641.1,ENST00000616016.5,46.4,False
2,CHS.40.3,NOC2L,rna-NM_015658.4,ENST00000327044.7,71.1,False
3,CHS.41.11,PLEKHN1,rna-NM_032129.3,ENST00000379410.8,49.4,False
4,CHS.42.6,KLHL17,rna-NM_198317.3,ENST00000338591.8,86.3,True
...,...,...,...,...,...,...
194775,hypothetical.2081.1,-,rna-NM_001005182.2,ENST00000642104.1,88.9,True
194776,hypothetical.2082.1,-,rna-NM_001348233.2,ENST00000540461.2,83.2,False
194777,hypothetical.2083.1,-,rna-NM_001005166.5,ENST00000610445.2,83.5,False
194778,hypothetical.2084.1,-,rna-NM_001005468.2,ENST00000641451.2,89.3,True


In [26]:
# Sanity check on the step-1 result: select any row that still has "-" in both
# "RefSeq ID" and "GENCODE ID" (an empty frame means the filter worked)
df_iso_all_RGExist.loc[
    lambda d: d["RefSeq ID"].eq("-") & d["GENCODE ID"].eq("-")
]

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse


In [27]:
# Sanity check for step 1: rows with "-" in both ID columns plus rows kept in
# df_iso_all_RGExist must add up to the original row count. The counts are taken
# from the frames currently in memory rather than typed by hand, so a stale
# df_iso_all_RGExist (e.g. from out-of-order execution) makes this print False.
print(
    int((df_iso_all["RefSeq ID"].eq("-") & df_iso_all["GENCODE ID"].eq("-")).sum())
    + len(df_iso_all_RGExist)
    == len(df_iso_all)
)

True


### 2. Remove all hypothetical

In [28]:
# Re-display the step-1 result (rows with at least one of RefSeq ID / GENCODE ID)
# before the hypothetical entries are removed
df_iso_all_RGExist

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
0,CHS.15.1,OR4F29,rna-NM_001005221.2,ENST00000426406.4,88.2,no_introns
1,CHS.39.7,SAMD11,rna-NM_001385641.1,ENST00000616016.5,46.4,False
2,CHS.40.3,NOC2L,rna-NM_015658.4,ENST00000327044.7,71.1,False
3,CHS.41.11,PLEKHN1,rna-NM_032129.3,ENST00000379410.8,49.4,False
4,CHS.42.6,KLHL17,rna-NM_198317.3,ENST00000338591.8,86.3,True
...,...,...,...,...,...,...
194775,hypothetical.2081.1,-,rna-NM_001005182.2,ENST00000642104.1,88.9,True
194776,hypothetical.2082.1,-,rna-NM_001348233.2,ENST00000540461.2,83.2,False
194777,hypothetical.2083.1,-,rna-NM_001005166.5,ENST00000610445.2,83.5,False
194778,hypothetical.2084.1,-,rna-NM_001005468.2,ENST00000641451.2,89.3,True


In [29]:
# Rows of the step-1 result whose CHESS ID contains the substring "hypothetical";
# the size of this frame is the number of rows step 2 removes
df_iso_all_RGExist.loc[
    lambda d: d["CHESS ID"].str.contains("hypothetical", regex=False)
]

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
141,hypothetical.0.1,PRAMEF27,rna-NM_001300891.2,ENST00000436041.6,84.7,False
143,hypothetical.1.1,PRAMEF26,rna-NM_001306072.3,ENST00000624207.1,84.7,False
144,hypothetical.2.1,HNRNPCL4,rna-NM_001302551.2,ENST00000323770.8,65.2,no_introns
145,hypothetical.3.1,PRAMEF9,rna-NM_001010890.3,ENST00000415919.3,84.8,False
207,hypothetical.4.1,PLA2G2C,rna-NM_001367969.2,ENST00000679259.1,71.8,False
...,...,...,...,...,...,...
194775,hypothetical.2081.1,-,rna-NM_001005182.2,ENST00000642104.1,88.9,True
194776,hypothetical.2082.1,-,rna-NM_001348233.2,ENST00000540461.2,83.2,False
194777,hypothetical.2083.1,-,rna-NM_001005166.5,ENST00000610445.2,83.5,False
194778,hypothetical.2084.1,-,rna-NM_001005468.2,ENST00000641451.2,89.3,True


In [30]:
# Step 2: starting from the step-1 result, drop every row whose CHESS ID contains
# the substring "hypothetical"
df_iso_all_filtered = df_iso_all_RGExist.loc[
    lambda d: ~d["CHESS ID"].str.contains("hypothetical", regex=False)
]
df_iso_all_filtered

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
0,CHS.15.1,OR4F29,rna-NM_001005221.2,ENST00000426406.4,88.2,no_introns
1,CHS.39.7,SAMD11,rna-NM_001385641.1,ENST00000616016.5,46.4,False
2,CHS.40.3,NOC2L,rna-NM_015658.4,ENST00000327044.7,71.1,False
3,CHS.41.11,PLEKHN1,rna-NM_032129.3,ENST00000379410.8,49.4,False
4,CHS.42.6,KLHL17,rna-NM_198317.3,ENST00000338591.8,86.3,True
...,...,...,...,...,...,...
194662,CHS.34304.1,-,rna-NM_001386820.1,ENST00000624951.1,52.2,True
194664,CHS.613.1,-,rna-NM_152232.4,ENST00000375371.3,86.0,True
194665,CHS.10998.1,-,rna-NM_023922.1,-,82.1,no_introns
194666,CHS.11009.1,-,rna-NM_001097643.1,-,84.7,no_introns


In [34]:
# Rows of the step-1 result (hypothetical entries still included) whose GENE value is "-"
df_iso_all_RGExist.loc[lambda d: d["GENE"].eq("-")]

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
488,CHS.1462.2,-,rna-NM_001377534.1,ENST00000339355.3,70.7,False
894,CHS.2950.1,-,rna-NM_003516.3,ENST00000369159.2,89.6,no_introns
1029,CHS.3243.alt1,-,rna-NM_001394530.1,ENST00000392487.1,66.4,no_introns
1079,CHS.3339.2,-,rna-NM_182679.3,ENST00000368232.9,56.9,True
1483,CHS.4805.2,-,rna-NM_033445.3,-,89.3,no_introns
...,...,...,...,...,...,...
194775,hypothetical.2081.1,-,rna-NM_001005182.2,ENST00000642104.1,88.9,True
194776,hypothetical.2082.1,-,rna-NM_001348233.2,ENST00000540461.2,83.2,False
194777,hypothetical.2083.1,-,rna-NM_001005166.5,ENST00000610445.2,83.5,False
194778,hypothetical.2084.1,-,rna-NM_001005468.2,ENST00000641451.2,89.3,True


### Count genes without MANE transcript

In [53]:
# Number of rows per GENE value in the fully filtered table (most frequent first)
df_iso_all_filtered.loc[:, "GENE"].value_counts()

-          14706
MAPK10       104
ABI2         103
MBNL1        102
CELF1         93
           ...  
PPIAL4C        1
NGRN           1
FAM162B        1
TCERG1L        1
PDRG1          1
Name: GENE, Length: 15852, dtype: int64

In [54]:
# Distinct GENE values in the filtered table, in order of first appearance,
# excluding the "-" placeholder. Filtering (instead of list.remove) does not raise
# when the placeholder happens to be absent from the data.
unique_genes_iso_all = [
    gene for gene in df_iso_all_filtered["GENE"].unique() if gene != "-"
]
print(len(unique_genes_iso_all))

15851


In [55]:
# Load the MANE comparison table (after processing, MANE transcripts added as rows)
# and drop every row whose CHESS_ID contains the substring "hypothetical"
f_MANE_cim = "iso_MANE_cim_fixed.csv"
df_MANE_cim = pd.read_csv(filepath_or_buffer=f_MANE_cim)
df_MANE_cim_filtered = df_MANE_cim.loc[
    lambda d: ~d["CHESS_ID"].str.contains("hypothetical", regex=False)
]
df_MANE_cim_filtered

Unnamed: 0,CHESS_ID,CHESS_ID_MANE,gene,aa_length_isoform,aa_length_MANE,length_ratio,pLDDT_isoform,pLDDT_MANE,pLDDT_ratio,GTEx_samples_observed_isoform,GTEx_samples_observed_MANE,GTEx_top_tissue_name_isoform,GTEx_top_tissue_name_MANE,GTEx_top_tissue_TPM_isoform,GTEx_top_tissue_TPM_MANE,introns_conserved_in_mouse_isoform,introns_conserved_in_mouse_MANE
0,CHS.59119.2,CHS.59119.2,CLIC2,247,247,1.000000,92.6,92.6,1.000000,9405.0,9405.0,Bone_Marrow,Bone_Marrow,63.284808,63.284808,False,False
1,CHS.59109.2,CHS.59109.2,SMIM9,99,99,1.000000,66.9,66.9,1.000000,2.0,2.0,Testis,Testis,1.436480,1.436480,False,False
2,CHS.59041.5,CHS.59041.5,PNCK,343,343,1.000000,80.7,80.7,1.000000,5474.0,5474.0,Colon,Colon,112.891169,112.891169,True,True
4,CHS.58995.1,CHS.58995.1,MAGEA10,369,369,1.000000,72.3,72.3,1.000000,181.0,181.0,Testis,Testis,1.967843,1.967843,False,False
6,CHS.58681.7,CHS.58681.7,TMEM255A,325,325,1.000000,63.5,63.5,1.000000,3788.0,3788.0,Ovary,Ovary,14.303998,14.303998,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107506,CHS.59041.26,CHS.59041.5,PNCK,92,343,0.268222,91.7,80.7,1.136307,70.0,5474.0,Nerve,Colon,2.181060,112.891169,True,True
107507,CHS.59041.28,CHS.59041.5,PNCK,119,343,0.346939,71.4,80.7,0.884758,90.0,5474.0,Blood_Vessel,Colon,2.043669,112.891169,True,True
107508,CHS.59041.27,CHS.59041.5,PNCK,86,343,0.250729,91.1,80.7,1.128872,2.0,5474.0,Breast,Colon,1.350520,112.891169,True,True
107509,CHS.59109.1,CHS.59109.2,SMIM9,99,99,1.000000,66.9,66.9,1.000000,102.0,2.0,Testis,Testis,1.798313,1.436480,False,False


In [56]:
# Distinct values of the "gene" column in the filtered MANE table
# NOTE(review): unlike unique_genes_iso_all, the "-" placeholder is not removed
# here - confirm this column never contains it.
unique_genes_iso_MANE = df_MANE_cim_filtered["gene"].unique().tolist()
print(len(unique_genes_iso_MANE))

14057


#### Note: the two ways of counting genes without a MANE transcript disagree

In [60]:
# Two ways of counting genes that are in the iso_all list but not in the MANE list.
# (a) Subtract the list lengths. This equals (b) only when every gene in the MANE
#     list also appears in the iso_all list.
n_gene_all, n_gene_MANE = len(unique_genes_iso_all), len(unique_genes_iso_MANE)
n_gene_noMANE = n_gene_all - n_gene_MANE
print(n_gene_noMANE)

# (b) Set difference: genes present in the iso_all list and absent from the MANE list
difference = list(set(unique_genes_iso_all) - set(unique_genes_iso_MANE))
print(len(difference))

1794
1799


In [14]:
# Sanity check for step 2: rows whose CHESS ID contains "hypothetical" plus rows
# kept in df_iso_all_filtered must add up to the step-1 row count. The counts are
# taken from the frames currently in memory rather than typed by hand, so a stale
# df_iso_all_filtered (e.g. from out-of-order execution) makes this print False.
print(
    int(df_iso_all_RGExist["CHESS ID"].str.contains("hypothetical").sum())
    + len(df_iso_all_filtered)
    == len(df_iso_all_RGExist)
)

True


In [15]:
# Write the filtered table to CSV. The file name embeds the number of rows, so it
# is derived from the frame instead of being typed by hand (a hand-typed count goes
# stale as soon as the input or the filters change).
f_out = "iso_all_v1.1.filtered.{}.csv".format(len(df_iso_all_filtered))    # original: iso_all_v1.1.csv
# index=False: do not write the pandas row index as an extra column
df_iso_all_filtered.to_csv(f_out, index=False)  ####### I may want to remove the header

In [16]:
# Generate a text file where each line contains the CHESS ID of one row of the
# filtered table. The row count in the file name is derived from the frame so it
# cannot disagree with the file's contents.
f_out_filtered_CHESSID = "CHESS_ID_iso_all_v1.1.filtered.{}.txt".format(len(df_iso_all_filtered))
# index=False / header=False: write the bare IDs only, one per line
df_iso_all_filtered["CHESS ID"].to_csv(f_out_filtered_CHESSID, index=False, header=False)