# Protein Complexes in Ovarian Cancer

In [5]:
import pandas as pd
import numpy as np
import re

### Import patient data as well as a list of protein complexes from CORUM (http://mips.helmholtz-muenchen.de/corum/#download 'Complete Complexes')

In [6]:
# CORUM complex listing (tab-separated); filtered and reshaped in a later cell.
complexData = pd.read_csv('allComplexes.txt', delimiter='\t')

# Per-sample protein table (tab-separated): one row per Gene_Name, one column
# per sample ID.
data = pd.read_csv('proteinGroups_cleaned.txt', delimiter='\t')

# Last expression of the cell: render the patient table.
data

Unnamed: 0,Gene_Name,01OV007,01OV007_NM,01OV008_NM,01OV010_NM,01OV013_NM,01OV017,01OV017_NM,01OV018,01OV019_NM,...,17OV001_NM,17OV002,17OV002_NM,17OV003_NM,17OV004_NM,17OV005_NM,17OV014,17OV015,17OV018,17OV026
0,RBM47,6.632300e+07,0.0,0.0,0.000000e+00,0.000000e+00,2.547900e+08,0.000000e+00,5.532400e+07,0.000000e+00,...,6.270100e+07,2.525300e+08,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,1.579100e+08,5.738500e+07
1,A1CF,6.632300e+07,0.0,0.0,0.000000e+00,0.000000e+00,2.547900e+08,0.000000e+00,5.532400e+07,0.000000e+00,...,6.270100e+07,2.525300e+08,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,1.579100e+08,5.738500e+07
2,UBA6,1.191900e+09,0.0,404400000.0,0.000000e+00,0.000000e+00,1.436700e+09,1.148600e+09,2.047900e+09,1.343800e+09,...,9.756600e+08,2.103900e+09,0.0,225790000.0,0.0,0.0,1.092300e+09,1.640500e+09,1.831100e+09,1.045900e+09
3,ESYT2,8.069400e+08,0.0,0.0,0.000000e+00,2.732400e+08,4.981100e+08,2.522700e+08,3.686300e+08,6.054700e+08,...,4.602200e+08,4.069700e+08,0.0,623870000.0,0.0,0.0,6.994500e+08,4.626500e+08,5.260800e+08,4.205100e+08
4,MED19,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,1.914400e+07,0.000000e+00
5,IGLC7,1.087100e+08,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,1.187900e+08,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.0,188180000.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
6,SHTN1,5.944200e+08,0.0,0.0,0.000000e+00,0.000000e+00,2.390600e+08,0.000000e+00,1.552000e+08,9.006500e+07,...,3.132300e+08,2.271200e+08,0.0,0.0,0.0,0.0,2.068300e+08,8.116700e+07,3.273900e+08,4.013900e+08
7,DAPL1,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
8,MEX3A,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.450800e+07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
9,ILVBL,4.620300e+08,134350000.0,0.0,1.628500e+08,0.000000e+00,6.270200e+08,2.709000e+08,6.957900e+08,3.987300e+08,...,6.178200e+08,4.313800e+08,0.0,95345000.0,263050000.0,96172000.0,2.951100e+09,2.927800e+08,6.055900e+08,4.140900e+08


### Create a workable dictionary of protein complex information

In [7]:
# Select for human complexes
complexData = complexData.loc[complexData['Organism'] == 'Human']

# Split each complex's ';'-separated subunit string into a list of gene names
# (one list per complex, so no None padding is ever introduced).
subunitLists = complexData['subunits(Gene name)'].str.split(';')

# Create a dictionary (key = complex name, value = list of proteins in complex).
# Building it directly from the per-row lists avoids transposing a wide frame
# whose column labels (complex names) are not unique, which made pandas warn
# "DataFrame columns are not unique, some columns will be omitted".
# NOTE(review): when several rows share a ComplexName, the last row wins --
# the same result the transpose/to_dict route produced. Confirm whether the
# duplicates should instead be merged or keyed by a unique complex identifier.
# Rows whose subunit field is missing (not a list after the split) map to [].
subunitNames = {
    complexName: list(genes) if isinstance(genes, list) else []
    for complexName, genes in zip(complexData['ComplexName'], subunitLists)
}
subunitNames

  # Remove the CWD from sys.path while we load stuff.


{'BCL6-HDAC4 complex': ['BCL6', 'HDAC4'],
 'BCL6-HDAC5 complex': ['BCL6', 'HDAC5'],
 'BCL6-HDAC7 complex': ['BCL6', 'HDAC7'],
 'Multisubunit ACTR coactivator complex': ['EP300',
  'CREBBP',
  'KAT2B',
  'NCOA3'],
 'Condensin I complex': ['SMC2', 'NCAPH', 'NCAPD2', 'NCAPG', 'SMC4'],
 'BLOC-3 (biogenesis of lysosome-related organelles complex 3)': ['HPS1',
  'HPS4'],
 'BLOC-2 (biogenesis of lysosome-related organelles complex 2)': ['HPS6',
  'HPS3',
  'HPS5'],
 'MUS81-CDS1 complex': ['CDS1', 'MUS81'],
 'NCOR complex': ['HDAC3', 'TBL1X', 'NCOR1', 'GPS2', 'CORO2A', 'TBL1XR1'],
 'BLOC-1 (biogenesis of lysosome-related organelles complex 1)': ['SNAPIN',
  'BLOC1S1',
  'BLOC1S3',
  'BLOC1S2',
  'BLOC1S5',
  'DTNBP1',
  'BLOC1S4',
  'BLOC1S6'],
 'Arp2/3 protein complex': ['ARPC1B',
  'ARPC2',
  'ARPC3',
  'ARPC5',
  'ARPC4',
  'ACTR3',
  'ACTR2'],
 'PA28gamma complex': ['PSME3'],
 'PA28 complex': ['PSME1', 'PSME2'],
 'PA700 complex': ['PSMD11',
  'PSMD12',
  'PSMD9',
  'PSMD14',
  'PSMD3',
  '

### Define basic analysis functions

In [68]:
def find_ratios(protein_list, patient_id, data):
    """Return each protein's share of the summed value, for tumor and normal.

    For the column ``patient_id`` and for the column ``patient_id + '_NM'``,
    every protein's value is divided by the sum of the values of all proteins
    in ``protein_list`` within that same column.

    Parameters
    ----------
    protein_list : list
        Row labels of ``data`` to include (presumably gene names -- this
        requires ``data`` to be indexed by those labels, not by position).
    patient_id : str
        Column label of the patient's sample; the paired column is looked up
        as ``patient_id + '_NM'``.
    data : pandas.DataFrame
        Table addressed as ``data.at[protein, column]``.

    Returns
    -------
    tuple of (list, list)
        ``(cancer_ratios, normal_ratios)``, each in the order of
        ``protein_list``. When a column's total is zero (e.g. none of the
        proteins were detected in that sample) every ratio in that list is
        NaN, instead of relying on a 0/0 division that emits a RuntimeWarning.

    Raises
    ------
    KeyError
        If a protein or one of the two columns is not present in ``data``.
    """
    patient_normal_id = patient_id + '_NM'

    def _ratios(column):
        # Read each value once; the same values feed both the total and the ratios.
        values = [data.at[protein, column] for protein in protein_list]
        total = sum(values)
        if total == 0:
            # Undefined share: nothing to normalise against.
            return [float('nan')] * len(values)
        return [value / total for value in values]

    cancer_ratios = _ratios(patient_id)
    normal_ratios = _ratios(patient_normal_id)
    return cancer_ratios, normal_ratios