## Purpose
1. To remove rows where both RefSeq ID = "-" and GENCODE ID = "-"
2. To remove all rows whose CHESS ID contains "hypothetical"
3. To remove all other theoretical | outlaws
4. To create a new file that satisfies both #1 and #2

In [1]:
import pandas as pd

In [2]:
# Load the isoform table from the working directory and display it.
# NOTE(review): the "Unnamed: 0" column in the displayed frame suggests the CSV
# carries a saved index that is read back as an ordinary column — confirm whether
# anything downstream relies on it before switching to index_col=0.
f_iso_all = "iso_all_v1.1_GENE.filled.csv"
df_iso_all = pd.read_csv(filepath_or_buffer=f_iso_all)
df_iso_all

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
0,CHS.15.1,OR4F29,rna-NM_001005221.2,ENST00000426406.4,88.2,no_introns
1,CHS.39.7,SAMD11,rna-NM_001385641.1,ENST00000616016.5,46.4,False
2,CHS.40.3,NOC2L,rna-NM_015658.4,ENST00000327044.7,71.1,False
3,CHS.41.11,PLEKHN1,rna-NM_032129.3,ENST00000379410.8,49.4,False
4,CHS.42.6,KLHL17,rna-NM_198317.3,ENST00000338591.8,86.3,True
...,...,...,...,...,...,...
194775,hypothetical.2081.1,OR6C1,rna-NM_001005182.2,ENST00000642104.1,88.9,True
194776,hypothetical.2082.1,OR6J1,rna-NM_001348233.2,ENST00000540461.2,83.2,False
194777,hypothetical.2083.1,OR52E5,rna-NM_001005166.5,ENST00000610445.2,83.5,False
194778,hypothetical.2084.1,OR8B2,rna-NM_001005468.2,ENST00000641451.2,89.3,True


In [3]:
# Check that all CHESS ID are unique.
# The assert makes the check explicit (it stops the run if a duplicate appears);
# the value counts are still displayed for visual inspection.
assert df_iso_all["CHESS ID"].is_unique, "Duplicate CHESS ID values found"
df_iso_all["CHESS ID"].value_counts()

CHS.27152.alt4     1
CHS.53322.alt5     1
CHS.58132.alt1     1
CHS.40023.alt5     1
CHS.57700.alt16    1
                  ..
CHS.36558.alt4     1
CHS.9272.2         1
CHS.28558.alt2     1
CHS.44887.14       1
CHS.38204.19       1
Name: CHESS ID, Length: 194780, dtype: int64

In [4]:
 # Getting to know the data: number of rows per GENE value, most frequent first
df_iso_all.loc[:, "GENE"].value_counts()

-            16233
MAPK10         129
ABI2           117
MBNL1          109
RAP1GAP        106
             ...  
ANTKMT           1
ARHGEF28         1
MYO5A            1
KRTAP22-1        1
NOTO             1
Name: GENE, Length: 19231, dtype: int64

### 1. Keep only rows where there is transcript info in at least one of RefSeq and GENCODE

In [7]:
# Show 10 sampled rows where RefSeq ID exists (not "-").
# A fixed random_state keeps the displayed sample identical across Restart & Run All.
df_iso_all[df_iso_all["RefSeq ID"] != "-"].sample(10, random_state=0)

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
68849,CHS.20155.8,TSNAXIP1,rna-NM_018430.4,ENST00000388833.7,84.3,False
53948,CHS.13772.7,RFC3,rna-XM_017020683.2,-,83.2,False
98611,CHS.31823.18,ABI2,rna-XM_017003138.2,-,65.4,True
172890,CHS.10638.8,-,rna-XR_001748635.1,-,71.3,False
99731,CHS.32279.39,SP140,rna-XM_017003253.1,-,55.2,False
81907,CHS.25320.8,CELF5,rna-NM_001172673.2,ENST00000541430.6,60.8,False
3277,CHS.11226.2,IRAG2,rna-NM_001366544.2,-,57.6,False
44526,CHS.10169.9,HINFP,rna-NM_001351959.2,-,72.8,False
162844,CHS.27297.12,VRK3,rna-XM_005258971.4,ENST00000599538.5,78.0,True
101175,CHS.32867.18,GPCPD1,rna-XM_005260761.2,-,84.9,True


In [8]:
# Show 10 sampled rows where GENCODE ID exists (not "-").
# A fixed random_state keeps the displayed sample identical across Restart & Run All.
df_iso_all[df_iso_all["GENCODE ID"] != "-"].sample(10, random_state=0)

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
89374,CHS.27458.alt8,ZNF611,-,ENST00000595798.5,45.2,False
113188,CHS.37625.7,FOXP1,rna-NM_001244815.2,ENST00000649695.2,53.4,True
138003,CHS.50538.13,CCM2,-,ENST00000488727.5,70.1,True
113441,CHS.37817.13,DCBLD2,-,ENST00000449482.1,78.8,False
189268,CHS.54311.26,-,-,ENST00000523734.1,78.0,True
55208,CHS.14590.3,PCCA,rna-NM_001127692.3,ENST00000376286.8,89.1,True
127531,CHS.44332.alt1,SPINK9,-,ENST00000511717.6,71.3,False
14714,CHS.58771.5,SLC25A14,rna-NM_001282195.2,ENST00000545805.6,68.2,True
132834,CHS.46699.alt10,LMBRD1,-,ENST00000647650.1,69.7,True
50898,CHS.12473.12,NTN4,rna-NM_001329701.2,ENST00000538383.5,84.6,True


In [9]:
# Show the rows where both RefSeq ID and GENCODE ID are "-"
# (these are the rows that step 1 removes)
df_iso_all.loc[df_iso_all["RefSeq ID"].eq("-") & df_iso_all["GENCODE ID"].eq("-")]

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
16152,CHS.39.12,SAMD11,-,-,44.1,False
16154,CHS.39.2,SAMD11,-,-,43.7,False
16156,CHS.39.23,SAMD11,-,-,55.2,True
16159,CHS.39.alt1,SAMD11,-,-,58.5,False
16161,CHS.39.alt3,SAMD11,-,-,46.5,False
...,...,...,...,...,...,...
194448,hypothetical.2209.2,-,-,-,50.3,False
194450,hypothetical.2211.1,-,-,-,55.1,False
194452,hypothetical.1796.1,-,-,-,50.8,False
194453,hypothetical.1796.2,-,-,-,49.9,False


In [10]:
# Step 1: keep a row unless BOTH RefSeq ID and GENCODE ID are the "-" placeholder,
# i.e. keep rows that have at least one of the two transcript IDs.
df_iso_all_RGExist = df_iso_all.loc[
    ~(df_iso_all["RefSeq ID"].eq("-") & df_iso_all["GENCODE ID"].eq("-"))
]
df_iso_all_RGExist

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
0,CHS.15.1,OR4F29,rna-NM_001005221.2,ENST00000426406.4,88.2,no_introns
1,CHS.39.7,SAMD11,rna-NM_001385641.1,ENST00000616016.5,46.4,False
2,CHS.40.3,NOC2L,rna-NM_015658.4,ENST00000327044.7,71.1,False
3,CHS.41.11,PLEKHN1,rna-NM_032129.3,ENST00000379410.8,49.4,False
4,CHS.42.6,KLHL17,rna-NM_198317.3,ENST00000338591.8,86.3,True
...,...,...,...,...,...,...
194775,hypothetical.2081.1,OR6C1,rna-NM_001005182.2,ENST00000642104.1,88.9,True
194776,hypothetical.2082.1,OR6J1,rna-NM_001348233.2,ENST00000540461.2,83.2,False
194777,hypothetical.2083.1,OR52E5,rna-NM_001005166.5,ENST00000610445.2,83.5,False
194778,hypothetical.2084.1,OR8B2,rna-NM_001005468.2,ENST00000641451.2,89.3,True


In [11]:
# Sanity check: after step 1 no row should have "-" in both "RefSeq ID" and
# "GENCODE ID", so the frame displayed here is expected to be empty.
df_iso_all_RGExist.loc[
    df_iso_all_RGExist["RefSeq ID"].eq("-") & df_iso_all_RGExist["GENCODE ID"].eq("-")
]

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse


In [12]:
# Sanity check: rows dropped in step 1 plus rows kept must equal the original row count.
print(
    len(df_iso_all.loc[df_iso_all["RefSeq ID"].eq("-") & df_iso_all["GENCODE ID"].eq("-")])
    + len(df_iso_all_RGExist)
    == len(df_iso_all)
)

True


### 2. Remove all hypothetical

In [13]:
# Re-display the step 1 result (rows with a RefSeq ID or a GENCODE ID) before filtering on CHESS ID
df_iso_all_RGExist

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
0,CHS.15.1,OR4F29,rna-NM_001005221.2,ENST00000426406.4,88.2,no_introns
1,CHS.39.7,SAMD11,rna-NM_001385641.1,ENST00000616016.5,46.4,False
2,CHS.40.3,NOC2L,rna-NM_015658.4,ENST00000327044.7,71.1,False
3,CHS.41.11,PLEKHN1,rna-NM_032129.3,ENST00000379410.8,49.4,False
4,CHS.42.6,KLHL17,rna-NM_198317.3,ENST00000338591.8,86.3,True
...,...,...,...,...,...,...
194775,hypothetical.2081.1,OR6C1,rna-NM_001005182.2,ENST00000642104.1,88.9,True
194776,hypothetical.2082.1,OR6J1,rna-NM_001348233.2,ENST00000540461.2,83.2,False
194777,hypothetical.2083.1,OR52E5,rna-NM_001005166.5,ENST00000610445.2,83.5,False
194778,hypothetical.2084.1,OR8B2,rna-NM_001005468.2,ENST00000641451.2,89.3,True


In [14]:
# Show the step 1 rows whose CHESS ID contains "hypothetical"
is_hypothetical = df_iso_all_RGExist["CHESS ID"].str.contains("hypothetical")
df_iso_all_RGExist.loc[is_hypothetical]

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
141,hypothetical.0.1,PRAMEF27,rna-NM_001300891.2,ENST00000436041.6,84.7,False
143,hypothetical.1.1,PRAMEF26,rna-NM_001306072.3,ENST00000624207.1,84.7,False
144,hypothetical.2.1,HNRNPCL4,rna-NM_001302551.2,ENST00000323770.8,65.2,no_introns
145,hypothetical.3.1,PRAMEF9,rna-NM_001010890.3,ENST00000415919.3,84.8,False
207,hypothetical.4.1,PLA2G2C,rna-NM_001367969.2,ENST00000679259.1,71.8,False
...,...,...,...,...,...,...
194775,hypothetical.2081.1,OR6C1,rna-NM_001005182.2,ENST00000642104.1,88.9,True
194776,hypothetical.2082.1,OR6J1,rna-NM_001348233.2,ENST00000540461.2,83.2,False
194777,hypothetical.2083.1,OR52E5,rna-NM_001005166.5,ENST00000610445.2,83.5,False
194778,hypothetical.2084.1,OR8B2,rna-NM_001005468.2,ENST00000641451.2,89.3,True


In [15]:
# Step 2: starting from the step 1 output, drop every row whose CHESS ID
# contains "hypothetical" and keep the rest as the filtered table.
not_hypothetical = ~df_iso_all_RGExist["CHESS ID"].str.contains("hypothetical")
df_iso_all_filtered = df_iso_all_RGExist.loc[not_hypothetical]
df_iso_all_filtered

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
0,CHS.15.1,OR4F29,rna-NM_001005221.2,ENST00000426406.4,88.2,no_introns
1,CHS.39.7,SAMD11,rna-NM_001385641.1,ENST00000616016.5,46.4,False
2,CHS.40.3,NOC2L,rna-NM_015658.4,ENST00000327044.7,71.1,False
3,CHS.41.11,PLEKHN1,rna-NM_032129.3,ENST00000379410.8,49.4,False
4,CHS.42.6,KLHL17,rna-NM_198317.3,ENST00000338591.8,86.3,True
...,...,...,...,...,...,...
194662,CHS.34304.1,-,rna-NM_001386820.1,ENST00000624951.1,52.2,True
194664,CHS.613.1,TAS1R2,rna-NM_152232.4,ENST00000375371.3,86.0,True
194665,CHS.10998.1,-,rna-NM_023922.1,-,82.1,no_introns
194666,CHS.11009.1,-,rna-NM_001097643.1,-,84.7,no_introns


#### Used during MTG with Sushant 7/11, 2022

In [19]:
# Step 1 rows that have no gene symbol (GENE is the "-" placeholder)
df_iso_all_RGExist.loc[df_iso_all_RGExist["GENE"].eq("-")]

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
168077,CHS.2950.1,-,rna-NM_003516.3,ENST00000369159.2,89.6,no_introns
168078,CHS.3243.alt1,-,rna-NM_001394530.1,ENST00000392487.1,66.4,no_introns
168080,CHS.4805.2,-,rna-NM_033445.3,-,89.3,no_introns
168081,CHS.4806.2,-,rna-NM_175055.3,-,84.4,no_introns
168087,hypothetical.2088.1,-,rna-NM_001365552.1,-,57.4,False
...,...,...,...,...,...,...
194756,hypothetical.2062.1,-,id-TRBVAOR9-2,-,30.9,no_introns
194764,hypothetical.2070.1,-,id-TRGV10,-,88.0,False
194765,hypothetical.2071.1,-,id-TRGV11,-,69.3,False
194766,hypothetical.2072.1,-,id-TRGVA,-,58.2,False


In [20]:
# Step 1 rows that have no gene symbol (GENE == "-") but do have a GENCODE ID
df_iso_all_RGExist.loc[
    df_iso_all_RGExist["GENE"].eq("-") & df_iso_all_RGExist["GENCODE ID"].ne("-")
]

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
168077,CHS.2950.1,-,rna-NM_003516.3,ENST00000369159.2,89.6,no_introns
168078,CHS.3243.alt1,-,rna-NM_001394530.1,ENST00000392487.1,66.4,no_introns
168101,CHS.25184.14,-,rna-NM_001393918.1,ENST00000650044.1,44.8,False
168106,CHS.33579.12,-,rna-NM_000457.6,ENST00000316099.9,70.6,True
168107,CHS.34256.3,-,rna-NM_001363770.2,ENST00000623960.4,80.4,False
...,...,...,...,...,...,...
194628,CHS.3095.1,-,rna-NM_001310142.1,ENST00000649623.1,45.8,no_introns
194629,hypothetical.2213.1,-,rna-NM_001257362.2,ENST00000616049.4,68.0,False
194631,hypothetical.1951.1,-,rna-NM_001368231.1,ENST00000621028.1,46.3,False
194632,hypothetical.1952.1,-,rna-NM_001368242.1,ENST00000613204.1,83.5,True


### Count genes without MANE transcript

In [21]:
# Number of rows per GENE value in the filtered table, most frequent first
df_iso_all_filtered.loc[:, "GENE"].value_counts()

-           5281
MAPK10       104
ABI2         103
MBNL1        102
CELF1         93
            ... 
USP17L24       1
MYOC           1
MYOG           1
RTF1           1
NOTO           1
Name: GENE, Length: 18457, dtype: int64

In [22]:
# Unique gene symbols in the filtered table, excluding the "-" placeholder.
# Filtering (instead of list.remove) avoids a ValueError when no "-" is present,
# e.g. once every missing GENE has been filled in.
unique_genes_iso_all = [
    gene for gene in df_iso_all_filtered["GENE"].unique() if gene != "-"
]
print(len(unique_genes_iso_all))

18456


In [23]:
# Load the MANE comparison table (after processing, MANE transcripts were added as rows)
# and drop the rows whose CHESS_ID contains "hypothetical".
f_MANE_cim = "iso_MANE_cim_fixed.csv"
df_MANE_cim = pd.read_csv(filepath_or_buffer=f_MANE_cim)
is_hypothetical_MANE = df_MANE_cim["CHESS_ID"].str.contains("hypothetical")
df_MANE_cim_filtered = df_MANE_cim.loc[~is_hypothetical_MANE]
df_MANE_cim_filtered

Unnamed: 0,CHESS_ID,CHESS_ID_MANE,gene,aa_length_isoform,aa_length_MANE,length_ratio,pLDDT_isoform,pLDDT_MANE,pLDDT_ratio,GTEx_samples_observed_isoform,GTEx_samples_observed_MANE,GTEx_top_tissue_name_isoform,GTEx_top_tissue_name_MANE,GTEx_top_tissue_TPM_isoform,GTEx_top_tissue_TPM_MANE,introns_conserved_in_mouse_isoform,introns_conserved_in_mouse_MANE
0,CHS.59119.2,CHS.59119.2,CLIC2,247,247,1.000000,92.6,92.6,1.000000,9405.0,9405.0,Bone_Marrow,Bone_Marrow,63.284808,63.284808,False,False
1,CHS.59109.2,CHS.59109.2,SMIM9,99,99,1.000000,66.9,66.9,1.000000,2.0,2.0,Testis,Testis,1.436480,1.436480,False,False
2,CHS.59041.5,CHS.59041.5,PNCK,343,343,1.000000,80.7,80.7,1.000000,5474.0,5474.0,Colon,Colon,112.891169,112.891169,True,True
4,CHS.58995.1,CHS.58995.1,MAGEA10,369,369,1.000000,72.3,72.3,1.000000,181.0,181.0,Testis,Testis,1.967843,1.967843,False,False
6,CHS.58681.7,CHS.58681.7,TMEM255A,325,325,1.000000,63.5,63.5,1.000000,3788.0,3788.0,Ovary,Ovary,14.303998,14.303998,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107506,CHS.59041.26,CHS.59041.5,PNCK,92,343,0.268222,91.7,80.7,1.136307,70.0,5474.0,Nerve,Colon,2.181060,112.891169,True,True
107507,CHS.59041.28,CHS.59041.5,PNCK,119,343,0.346939,71.4,80.7,0.884758,90.0,5474.0,Blood_Vessel,Colon,2.043669,112.891169,True,True
107508,CHS.59041.27,CHS.59041.5,PNCK,86,343,0.250729,91.1,80.7,1.128872,2.0,5474.0,Breast,Colon,1.350520,112.891169,True,True
107509,CHS.59109.1,CHS.59109.2,SMIM9,99,99,1.000000,66.9,66.9,1.000000,102.0,2.0,Testis,Testis,1.798313,1.436480,False,False


In [24]:
# Unique gene symbols present in the filtered MANE table.
# NOTE(review): unlike unique_genes_iso_all, no "-" placeholder is excluded here —
# confirm the "gene" column never contains "-".
unique_genes_iso_MANE = df_MANE_cim_filtered["gene"].unique().tolist()
print(len(unique_genes_iso_MANE))

14057


#### Perhaps something to note

In [25]:
# Count-based estimate: subtracting the list lengths only equals the number of
# genes without a MANE entry if every MANE gene also appears in the all-isoform list.
n_gene_all, n_gene_MANE = len(unique_genes_iso_all), len(unique_genes_iso_MANE)
n_gene_noMANE = n_gene_all - n_gene_MANE
print(n_gene_noMANE)

# Set-based count: genes in the all-isoform list that are absent from the MANE list.
# With unique values in both lists, this exceeds the number above by exactly the
# number of MANE genes that do not occur in the all-isoform list.
difference = list(set(unique_genes_iso_all) - set(unique_genes_iso_MANE))
print(len(difference))

4399
4401


In [26]:
# Arithmetic sanity check on hard-coded row counts.
# NOTE(review): presumably 3943 rows removed + 132151 rows kept = 136094 rows
# entering step 2 — confirm against the actual frame sizes.
print(3943 + 132151 == 136094)

True


In [27]:
# Write the filtered table to CSV without the DataFrame index.
# Original (unfiltered) file name: iso_all_v1.1.csv
f_out = "iso_all_v1.1.filtered.132151.csv"
# TODO: I may want to remove the header
df_iso_all_filtered.to_csv(f_out, index=False)

In [28]:
# Generate a text file where each line contains the CHESS ID of one of the
# remaining 132151 samples (written without index and without header row)
f_out_filtered_CHESSID = "CHESS_ID_iso_all_v1.1.filtered.132151.txt"
chess_ids_filtered = df_iso_all_filtered["CHESS ID"]
chess_ids_filtered.to_csv(f_out_filtered_CHESSID, index=False, header=False)

The dimensions of the resulting dfs in this file should not change even after filling in missing GENE values with mart_export.txt, because GENE is not used in filtering.