In [114]:
import pandas as pd
# Display limits used when rendering DataFrames in this notebook.
pd.set_option('display.max_columns', 45)
pd.set_option('max_colwidth', 100)
pd.set_option('display.max_rows', 50)

# Hard-coded list of histone gene symbols (the "_old" list).
# The symbols are written as one whitespace-separated string, lower-cased,
# and split into a list of individual symbols.
histone_genes_list_old = (
    'HIST1H2BB HIST1H2BC HIST1H2BE HIST1H2BF HIST1H2BG HIST1H2BI HIST1H2BD HIST1H2BH HIST1H2BJ HIST1H2BK HIST1H2BL '
    'HIST1H2BM HIST1H2BN HIST1H2BO HIST2H2BE HIST2H2BF HIST3H2BB H2BFS H2BFM H2BFWT HIST1H2BA HIST1H2AG HIST1H2AI '
    'HIST1H2AK HIST1H2AL HIST1H2AM HIST1H2AC HIST1H2AD HIST1H2AE HIST1H2AH HIST1H2AJ HIST2H2AB HIST2H2AC HIST3H2A '
    'HIST1H2APS4 H2AFJ H2AFX H2AFZ H2AFV H2AFY H2AFY2 HIST1H2AA H2AFB1 H2AFB2 H2AFB3 HYPM HIST1H3A HIST1H3B HIST1H3C '
    'HIST1H3D HIST1H3E HIST1H3F HIST1H3G HIST1H3H HIST1H3I HIST1H3J HIST2H3A HIST2H3C HIST2H3D HIST2H3PS2 H3F3A H3F3B '
    'H3F3C CENPA HIST3H3 HIST1H4A HIST1H4B HIST1H4C HIST1H4D HIST1H4E HIST1H4F HIST1H4H HIST1H4I HIST1H4J HIST1H4K '
    'HIST1H4L HIST2H4A HIST2H4B HIST4H4'
).lower().split()

In [115]:
# Load the two input tables from the working directory:
#   histone_genes.csv - comma-separated histone gene table
#   rawdata.txt       - tab-separated raw interaction table
# low_memory=False makes pandas parse each file in one pass instead of chunks.
hist_genes = pd.read_csv(
    'histone_genes.csv',
    sep=',',
    low_memory=False,
)
rawdata = pd.read_csv(
    'rawdata.txt',
    sep='\t',
    low_memory=False,
)

In [120]:
# Retrieve only histone interactions
# NOTE(review): this reads the same file that the loading cell stores in `hist_genes`.
histone_genes_table = pd.read_csv('histone_genes.csv', sep=',', low_memory=False)
# Lower-cased HGNC symbols; entries whose string form is 'nan' (missing symbols) are skipped.
histone_genes_list = [
    symbol.lower()
    for symbol in histone_genes_table.loc[:, 'HGNC Symbol'].tolist()
    if str(symbol) != 'nan'
]

# Lower-case both alias columns (in place on `rawdata`) so they can be compared
# with the lower-cased symbols above.
rawdata['Alias(es) interactor B'] = rawdata['Alias(es) interactor B'].str.lower()
rawdata['Alias(es) interactor A'] = rawdata['Alias(es) interactor A'].str.lower()

# Regex alternation of every symbol, built once and used for both columns.
# This is a substring search: a symbol matches anywhere inside the alias text.
histone_pattern = '|'.join(histone_genes_list)

# na=False turns a missing alias into "no match" instead of NaN, so a missing
# value on one side cannot leak into the combined mask and hide a hit on the other side.
boolean = (
    rawdata['Alias(es) interactor A'].str.contains(histone_pattern, na=False)
    | rawdata['Alias(es) interactor B'].str.contains(histone_pattern, na=False)
)
rawdata1 = rawdata[boolean]

# Filter out expanded co-complexes: keep only rows whose interaction type mentions
# 'direct interaction' or 'physical association'. na=False keeps the mask strictly
# boolean when the interaction type is missing (boolean indexing rejects NaN masks).
data = rawdata1[
    rawdata1['Interaction type(s)'].str.contains('direct interaction|physical association', na=False)
]

In [121]:
# Delete rows with no HGNC identifiers:
# keep a row only when BOTH alias columns contain an entry of the form
# "uniprotkb:<up to 12 characters>(gene name)".
gene_name_regex = r'.*uniprotkb:.{0,12}\(gene name\).*'
has_gene_name_b = data['Alias(es) interactor B'].str.contains(gene_name_regex)
has_gene_name_a = data['Alias(es) interactor A'].str.contains(gene_name_regex)
data = data[has_gene_name_b & has_gene_name_a]

In [122]:
# Extract HGNC identifiers from 'Alias(es) interactor A' and 'Alias(es) interactor B' columns >> *|uniprotkb:HGNC(gene name)|*
#
# The pattern is anchored to the whole value and the surrounding "|" separators are
# optional, so the gene-name entry is also extracted when it is the first or the last
# alias in the list. Requiring a "|" on both sides would leave such values untouched,
# i.e. the full alias string would end up as the protein name.
# When a value holds several "(gene name)" entries, the last one is used.
gene_name_pattern = r'^(?:.*\|)?uniprotkb:(.{0,12})\(gene name\)(?:\|.*)?$'
alias_to_protein = {
    'Alias(es) interactor A': 'protein1',
    'Alias(es) interactor B': 'protein2',
}

# Work on an independent copy: `data` is the result of a row filter, and assigning
# columns on such a frame triggers pandas' SettingWithCopyWarning.
data = data.copy()
for alias_column in alias_to_protein:
    data[alias_column] = data[alias_column].replace({gene_name_pattern: r'\1'}, regex=True)

# Change names of columns
data = data.rename(columns=alias_to_protein)

In [123]:
# Human genes only: both interactors must carry exactly this taxonomy label.
human_taxid = 'taxid:9606(human)|taxid:9606(Homo sapiens)'
interactor_a_is_human = data['Taxid interactor A'] == human_taxid
interactor_b_is_human = data['Taxid interactor B'] == human_taxid
data = data.loc[interactor_a_is_human & interactor_b_is_human]

In [124]:
# Protein interactions: both interactors must have exactly this interactor type.
protein_type = 'psi-mi:"MI:0326"(protein)'
interactor_a_is_protein = data['Type(s) interactor A'] == protein_type
interactor_b_is_protein = data['Type(s) interactor B'] == protein_type
data = data.loc[interactor_a_is_protein & interactor_b_is_protein]

In [125]:
# Select only needed columns and reduce their values to the human-readable part.
selected_columns = [
    'protein1',
    'protein2',
    'Interaction detection method(s)',
    'Interaction type(s)',
    'Source database(s)',
    'Confidence value(s)',
    'Experimental role(s) interactor A',
    'Experimental role(s) interactor B',
]
# Columns whose values look like psi-mi:"MI:0326"(protein): keep only the text inside
# the parentheses that follow the closing double quote (-> protein).
parenthesised_label_columns = [
    'Interaction detection method(s)',
    'Interaction type(s)',
    'Source database(s)',
    'Experimental role(s) interactor A',
    'Experimental role(s) interactor B',
]
parenthesised_label_pattern = r'(.*(?<="\()(.*?)(?=\)).*)'

# Take an independent copy of the selected columns first: `data` comes from a row
# filter, and assigning columns on such a frame triggers SettingWithCopyWarning.
data = data[selected_columns].copy()

for label_column in parenthesised_label_columns:
    data[label_column] = data[label_column].replace({parenthesised_label_pattern: r'\2'}, regex=True)

# Strip everything up to and including the last colon; the remainder stays a string.
data['Confidence value(s)'] = data['Confidence value(s)'].replace({r'(.*\:)(.*?)': r'\2'}, regex=True)

In [131]:
# Annotate each interaction with the histone type and canonicity of protein1.
# protein1 holds lower-case gene names (it was extracted from lower-cased alias text),
# so the lookup key is lower-cased as well; merging on the symbols in their original
# case would not match unless the CSV already stores them in lower case.
# NOTE(review): if 'HGNC Symbol' contains repeated symbols, the left merge will
# repeat the matching interaction rows - confirm the table has one row per symbol.
histone_annotations = hist_genes[['Histone type', 'HGNC Symbol', 'Canonicity']].copy()
histone_annotations['HGNC Symbol'] = histone_annotations['HGNC Symbol'].str.lower()

data = pd.merge(data, histone_annotations, how='left', left_on='protein1', right_on='HGNC Symbol')
# The merge key duplicates protein1, so drop it again.
data = data.drop(columns=['HGNC Symbol'])

In [132]:
# Keep only interactions whose partner (protein2) is not itself listed in the histone table.
# protein2 is lower-case, so the symbols are lower-cased before the membership test;
# comparing against the symbols in their original case would exclude nothing unless
# the CSV already stores them in lower case.
histone_symbols_lower = hist_genes['HGNC Symbol'].str.lower()
data_2 = data.loc[~data['protein2'].isin(histone_symbols_lower)]

In [133]:
# Unique interaction partners (protein2 column), first occurrence kept:
#   intact_interactors   - taken from `data`
#   intact_interactors_2 - taken from `data_2`
intact_interactors = data['protein2'].drop_duplicates()
intact_interactors_2 = data_2['protein2'].drop_duplicates()

# Write each list as a one-column CSV with a header row and no index column.
intact_interactors.to_csv('intact_interactors.csv', index=False, header=True)
intact_interactors_2.to_csv('intact_interactors_2.csv', index=False, header=True)

In [134]:
# Export the interaction table built from `data_2`.
# Project to the exported columns BEFORE dropping duplicates: de-duplicating on all
# columns first leaves rows that differ only in a column that is not exported, and
# those become identical rows in the written file.
export_columns = [
    'protein1',
    'protein2',
    'Interaction detection method(s)',
    'Confidence value(s)',
    'Source database(s)',
]
intact_interaction = data_2[export_columns].drop_duplicates()
intact_interaction.to_csv('intact_interaction.csv', index=False, header=True)

In [37]:
# Processing ends here (cell 31). The analysis starts at cell 107.