## Imports and Setup

In [2]:
import pandas as pd 
import numpy as np

In [3]:
raw_x = pd.read_csv("data/X_train.csv")
raw_y = pd.read_csv("data/y_train.csv")

In [4]:
raw_data = raw_x.copy()
raw_data["Regulated"] = raw_y["Regulated"]

In [5]:
#data is initially split it x and y, combine it into one df for cleaning
raw_data.head(5)

Unnamed: 0,chr,start,end,name,chrTSS,startTSS,endTSS,TargetGene,ReferenceID,CellType,...,normalizedDNase_enh,normalizedDNase_prom,numNearbyEnhancers,sumNearbyEnhancers,ubiquitousExpressedGene,3DContact,3DContact_squared,normalizedDNase_enh_squared,ABC.Score,Regulated
0,chr8,60284882,60285382,CA8|chr8:61197441-61197941:.,chr8,60281411.0,60281412.0,CA8,2f957f7a,K562,...,9.255186,0.691692,46,215.634522,0,11482.33236,131844000.0,85.658468,0.229877,True
1,chr9,99075179,99075797,TGFBR1|chr9:101837461-101838079:.,chr9,99104038.0,99104039.0,TGFBR1,2f957f7a,K562,...,18.483246,0.798897,98,1196.610723,0,1833.345234,3361155.0,341.630383,0.04443,True
2,chr7,134639833,134640333,BPGM|chr7:134324585-134325085:.,chr7,134646808.0,134646809.0,BPGM,2f957f7a,K562,...,6.521952,0.992838,48,65.289744,0,10744.95036,115454000.0,42.535858,0.053048,True
3,chr10,5472235,5472735,NET1|chr10:5514198-5514698:.,chr10,5412551.0,5412552.0,NET1,2f957f7a,K562,...,2.925616,0.755541,33,94.653009,0,1419.128316,2013925.0,8.559229,0.004903,True
4,chr4,55723369,55723869,NMU|chr4:56589536-56590036:.,chr4,55636697.0,55636698.0,NMU,2f957f7a,K562,...,0.979523,0.598229,29,21.614425,0,793.318224,629353.8,0.959465,0.001483,True


In [6]:
(raw_data['TargetGene'].value_counts() > 1).sum()

1045

In [7]:
raw_data.iloc[0]

chr                                                    chr8
start                                              60284882
end                                                60285382
name                           CA8|chr8:61197441-61197941:.
chrTSS                                                 chr8
startTSS                                        6.02814e+07
endTSS                                          6.02814e+07
TargetGene                                              CA8
ReferenceID                                        2f957f7a
CellType                                               K562
EffectSize                                        -0.140258
pValueAdjusted                                     1.53e-06
Significant                                            True
PowerAtEffectSize25                                    0.65
PowerAtEffectSize10                                     0.2
PowerAtEffectSize15                                    0.25
PowerAtEffectSize20                     

In [8]:
(raw_data['chr'] == raw_data['chrTSS']).sum()

8434

## Data Cleaning

In [9]:
#check each columb for missing values
raw_data.isna().sum()
#when checking which class the missing values belong too, almost all of them belong to the regulated == False category
#this indicates that they could be features with high importance

chr                               0
start                             0
end                               0
name                              0
chrTSS                            0
startTSS                         70
endTSS                           70
TargetGene                        0
ReferenceID                       0
CellType                          0
EffectSize                        0
pValueAdjusted                   15
Significant                       0
PowerAtEffectSize25               6
PowerAtEffectSize10            2896
PowerAtEffectSize15            2896
PowerAtEffectSize20            2896
PowerAtEffectSize50            2896
distanceToTSS                     0
numTSSEnhGene                     0
numCandidateEnhGene               0
normalizedDNase_enh               0
normalizedDNase_prom              0
numNearbyEnhancers                0
sumNearbyEnhancers                0
ubiquitousExpressedGene           0
3DContact                         0
3DContact_squared           

In [10]:
#Checking for the prescence of missing values in "Regulated" = True
raw_data[raw_data["Regulated"]==True].isna().sum()

chr                             0
start                           0
end                             0
name                            0
chrTSS                          0
startTSS                        0
endTSS                          0
TargetGene                      0
ReferenceID                     0
CellType                        0
EffectSize                      0
pValueAdjusted                 14
Significant                     0
PowerAtEffectSize25             6
PowerAtEffectSize10            71
PowerAtEffectSize15            71
PowerAtEffectSize20            71
PowerAtEffectSize50            71
distanceToTSS                   0
numTSSEnhGene                   0
numCandidateEnhGene             0
normalizedDNase_enh             0
normalizedDNase_prom            0
numNearbyEnhancers              0
sumNearbyEnhancers              0
ubiquitousExpressedGene         0
3DContact                       0
3DContact_squared               0
normalizedDNase_enh_squared     0
ABC.Score     

In [11]:
#Checking for the prescence of missing values in "Regulated" = False
raw_data[raw_data["Regulated"]==False].isna().sum()

chr                               0
start                             0
end                               0
name                              0
chrTSS                            0
startTSS                         70
endTSS                           70
TargetGene                        0
ReferenceID                       0
CellType                          0
EffectSize                        0
pValueAdjusted                    1
Significant                       0
PowerAtEffectSize25               0
PowerAtEffectSize10            2825
PowerAtEffectSize15            2825
PowerAtEffectSize20            2825
PowerAtEffectSize50            2825
distanceToTSS                     0
numTSSEnhGene                     0
numCandidateEnhGene               0
normalizedDNase_enh               0
normalizedDNase_prom              0
numNearbyEnhancers                0
sumNearbyEnhancers                0
ubiquitousExpressedGene           0
3DContact                         0
3DContact_squared           

In [12]:
#based off this we are taking the deicison to drop columns missing startTSS, endTSS and pValueAdjusted
raw_data= raw_data.dropna(subset=['startTSS', 'endTSS', 'pValueAdjusted'])

In [13]:
# according to the compeition write up the PowerAtEffectSize__ Columns 15-19
#were "used to filter the dataset and are most likely not relevant", hence we will drop those columns
raw_data_cleaned = raw_data.drop(columns=['name', 'CellType', "PowerAtEffectSize10",'PowerAtEffectSize25', 'PowerAtEffectSize15', 'PowerAtEffectSize20', 'PowerAtEffectSize50'])

## Feature Engineering

In [14]:
raw_data_cleaned.head()

Unnamed: 0,chr,start,end,chrTSS,startTSS,endTSS,TargetGene,ReferenceID,EffectSize,pValueAdjusted,...,normalizedDNase_enh,normalizedDNase_prom,numNearbyEnhancers,sumNearbyEnhancers,ubiquitousExpressedGene,3DContact,3DContact_squared,normalizedDNase_enh_squared,ABC.Score,Regulated
0,chr8,60284882,60285382,chr8,60281411.0,60281412.0,CA8,2f957f7a,-0.140258,2e-06,...,9.255186,0.691692,46,215.634522,0,11482.33236,131844000.0,85.658468,0.229877,True
1,chr9,99075179,99075797,chr9,99104038.0,99104039.0,TGFBR1,2f957f7a,-0.046138,0.00487,...,18.483246,0.798897,98,1196.610723,0,1833.345234,3361155.0,341.630383,0.04443,True
2,chr7,134639833,134640333,chr7,134646808.0,134646809.0,BPGM,2f957f7a,-0.08218,0.007175,...,6.521952,0.992838,48,65.289744,0,10744.95036,115454000.0,42.535858,0.053048,True
3,chr10,5472235,5472735,chr10,5412551.0,5412552.0,NET1,2f957f7a,-0.100327,0.040277,...,2.925616,0.755541,33,94.653009,0,1419.128316,2013925.0,8.559229,0.004903,True
4,chr4,55723369,55723869,chr4,55636697.0,55636698.0,NMU,2f957f7a,-0.163492,1.1e-05,...,0.979523,0.598229,29,21.614425,0,793.318224,629353.8,0.959465,0.001483,True


In [15]:
raw_data_cleaned[raw_data_cleaned['chr'] == 'chr8']

Unnamed: 0,chr,start,end,chrTSS,startTSS,endTSS,TargetGene,ReferenceID,EffectSize,pValueAdjusted,...,normalizedDNase_enh,normalizedDNase_prom,numNearbyEnhancers,sumNearbyEnhancers,ubiquitousExpressedGene,3DContact,3DContact_squared,normalizedDNase_enh_squared,ABC.Score,Regulated
0,chr8,60284882,60285382,chr8,60281411.0,60281412.0,CA8,2f957f7a,-0.140258,1.530000e-06,...,9.255186,0.691692,46,215.634522,0,11482.332360,1.318440e+08,85.658468,0.229877,True
15,chr8,127898896,127898963,chr8,127735434.0,127735435.0,MYC,39b33148,-0.358840,2.440000e-11,...,33.018915,0.826261,66,506.155122,0,882.612580,7.790050e+05,1090.248748,0.033564,True
19,chr8,129696899,129699399,chr8,127735434.0,127735435.0,MYC,0c4fc18d,-0.047932,2.189944e-02,...,5.626645,0.826261,76,687.960873,0,520.179979,2.705872e+05,32.570174,0.006742,True
67,chr8,11879271,11880174,chr8,11869447.0,11869448.0,CTSB,2f957f7a,-0.072544,1.678477e-02,...,18.524897,0.538687,44,130.174176,0,4693.795549,2.203172e+07,343.171809,0.109024,True
83,chr8,124637449,124637892,chr8,125091679.0,125091680.0,NSMCE2,ebda246b,-0.142877,3.721736e-02,...,7.031228,0.745355,107,401.896238,1,347.195141,1.205445e+05,49.438167,0.003920,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8396,chr8,98612764,98613320,chr8,98046469.0,98046470.0,RPL30,ebda246b,-0.017201,1.000000e+00,...,1.111162,0.693990,0,0.000000,1,111.969522,1.253717e+04,1.234681,0.000249,False
8407,chr8,124626550,124627024,chr8,124539101.0,124539102.0,NDUFB9,ebda246b,-0.017164,1.000000e+00,...,2.685397,0.807341,72,576.919332,1,565.680071,3.199939e+05,7.211357,0.001858,False
8418,chr8,103007764,103008230,chr8,102893864.0,102893865.0,AZIN1,ebda246b,-0.043117,1.000000e+00,...,0.665236,0.811405,93,144.396388,1,921.318060,8.488270e+05,0.442539,0.000548,False
8424,chr8,100395272,100395928,chr8,100150584.0,100150585.0,POLR2K,ebda246b,-0.058607,1.000000e+00,...,4.544567,0.712620,36,182.588436,0,634.785491,4.029526e+05,20.931166,0.009587,False


In [16]:
raw_data_cleaned.iloc[0]

chr                                   chr8
start                             60284882
end                               60285382
chrTSS                                chr8
startTSS                       6.02814e+07
endTSS                         6.02814e+07
TargetGene                             CA8
ReferenceID                       2f957f7a
EffectSize                       -0.140258
pValueAdjusted                    1.53e-06
Significant                           True
distanceToTSS                       3767.5
numTSSEnhGene                            1
numCandidateEnhGene                      1
normalizedDNase_enh                9.25519
normalizedDNase_prom              0.691692
numNearbyEnhancers                      46
sumNearbyEnhancers                 215.635
ubiquitousExpressedGene                  0
3DContact                          11482.3
3DContact_squared              1.31844e+08
normalizedDNase_enh_squared        85.6585
ABC.Score                         0.229877
Regulated  

In [17]:
raw_data_cleaned.isna().sum()

chr                            0
start                          0
end                            0
chrTSS                         0
startTSS                       0
endTSS                         0
TargetGene                     0
ReferenceID                    0
EffectSize                     0
pValueAdjusted                 0
Significant                    0
distanceToTSS                  0
numTSSEnhGene                  0
numCandidateEnhGene            0
normalizedDNase_enh            0
normalizedDNase_prom           0
numNearbyEnhancers             0
sumNearbyEnhancers             0
ubiquitousExpressedGene        0
3DContact                      0
3DContact_squared              0
normalizedDNase_enh_squared    0
ABC.Score                      0
Regulated                      0
dtype: int64

In [18]:
raw_data_cleaned['chr'].value_counts()

chr19    1826
chr11    1128
chr1      704
chr3      673
chrX      620
chr8      601
chr12     473
chr6      444
chr17     275
chr2      262
chr7      254
chr5      216
chr16     145
chr14     117
chr10      98
chr20      98
chr4       95
chr22      84
chr9       83
chr18      62
chr15      54
chr21      27
chr13      10
Name: chr, dtype: int64

In [19]:
raw_data_cleaned['TargetGene'].nunique()

1963

In [20]:
one_hot_encoded_target_gene = pd.get_dummies(raw_data_cleaned, columns = ['TargetGene'])
ref_id = pd.get_dummies(one_hot_encoded_target_gene, columns = ['ReferenceID'])
raw_data_cleaned = pd.get_dummies(ref_id, columns = ['chr'])

In [21]:
list(raw_data_cleaned.columns)[1900:]

['TargetGene_VAMP8',
 'TargetGene_VAPA',
 'TargetGene_VARS',
 'TargetGene_VASP',
 'TargetGene_VCL',
 'TargetGene_VDAC1',
 'TargetGene_VDAC3',
 'TargetGene_VKORC1',
 'TargetGene_VKORC1L1',
 'TargetGene_VMA21',
 'TargetGene_VPS25',
 'TargetGene_VPS29',
 'TargetGene_VPS72',
 'TargetGene_VRK1',
 'TargetGene_VTI1B',
 'TargetGene_WARS',
 'TargetGene_WASF2',
 'TargetGene_WBP11',
 'TargetGene_WDR13',
 'TargetGene_WDR18',
 'TargetGene_WDR3',
 'TargetGene_WDR46',
 'TargetGene_WDR61',
 'TargetGene_WDR74',
 'TargetGene_WDR77',
 'TargetGene_WDR83OS',
 'TargetGene_WIPF3',
 'TargetGene_WIPI2',
 'TargetGene_WNK1',
 'TargetGene_WSB1',
 'TargetGene_WTAP',
 'TargetGene_XPR1',
 'TargetGene_XRCC6',
 'TargetGene_YARS',
 'TargetGene_YBX1',
 'TargetGene_YBX3',
 'TargetGene_YIF1A',
 'TargetGene_YIPF3',
 'TargetGene_YIPF6',
 'TargetGene_YKT6',
 'TargetGene_YRDC',
 'TargetGene_YTHDC1',
 'TargetGene_YTHDF2',
 'TargetGene_YWHAE',
 'TargetGene_YWHAG',
 'TargetGene_YWHAH',
 'TargetGene_YWHAQ',
 'TargetGene_YWHAZ',
 

In [22]:
# raw_data_cleaned = raw_data_cleaned.drop(columns=['TargetGene'])
# raw_data_cleaned

## LOG REG

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score

In [24]:
all_chrosomome_ids = list(range(0, 23))
auprc_test = [0]*len(all_chrosomome_ids)
auprc_train = [0]*len(all_chrosomome_ids)

for k in all_chrosomome_ids:
    if k == 22:
        test_idx = 'X'
    else:
        test_idx = int(k) + 1
    
    train = raw_data_cleaned[raw_data_cleaned['chrTSS'] != f'chr{test_idx}']
    test = raw_data_cleaned[raw_data_cleaned['chrTSS'] == f'chr{test_idx}']
    
    X_train = train[['distanceToTSS', 'numTSSEnhGene',
       'numCandidateEnhGene', 'normalizedDNase_enh', 'normalizedDNase_prom',
       'numNearbyEnhancers', 'sumNearbyEnhancers', 'ubiquitousExpressedGene',
       '3DContact', '3DContact_squared', 'normalizedDNase_enh_squared',
       'ABC.Score']]
    y_train = train['Significant']
    
    X_test = test[['distanceToTSS', 'numTSSEnhGene',
       'numCandidateEnhGene', 'normalizedDNase_enh', 'normalizedDNase_prom',
       'numNearbyEnhancers', 'sumNearbyEnhancers', 'ubiquitousExpressedGene',
       '3DContact', '3DContact_squared', 'normalizedDNase_enh_squared',
       'ABC.Score']]
    y_test = test['Significant']
    
    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    train_yscore = model.predict_proba(X_train)[:, 1]
    test_yscore = model.predict_proba(X_test)[:, 1]
    
    auprc_train[k] = average_precision_score(y_train, train_yscore)
    auprc_test[k] = average_precision_score(y_test, test_yscore)
    
train_score = np.mean(auprc_train)
test_score = np.mean(auprc_test)

print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.36336956067073506
Test Score: 0.49808936484435157


# XGBoost

In [23]:
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score
from xgboost import XGBClassifier

In [24]:
all_chrosomome_ids = list(range(0, 23))
auprc_test = [0]*len(all_chrosomome_ids)
auprc_train = [0]*len(all_chrosomome_ids)

for k in all_chrosomome_ids:
    if k == 22:
        test_idx = 'X'
    else:
        test_idx = int(k) + 1
    
    train = raw_data_cleaned[raw_data_cleaned['chrTSS'] != f'chr{test_idx}']
    test = raw_data_cleaned[raw_data_cleaned['chrTSS'] == f'chr{test_idx}']
    
    X_train = train[['distanceToTSS', 'numTSSEnhGene',
       'numCandidateEnhGene', 'normalizedDNase_enh', 'normalizedDNase_prom',
       'numNearbyEnhancers', 'sumNearbyEnhancers', 'ubiquitousExpressedGene',
       '3DContact', '3DContact_squared', 'normalizedDNase_enh_squared',
       'ABC.Score']]
    y_train = train['Significant']
    
    X_test = test[['distanceToTSS', 'numTSSEnhGene',
       'numCandidateEnhGene', 'normalizedDNase_enh', 'normalizedDNase_prom',
       'numNearbyEnhancers', 'sumNearbyEnhancers', 'ubiquitousExpressedGene',
       '3DContact', '3DContact_squared', 'normalizedDNase_enh_squared',
       'ABC.Score']]
    y_test = test['Significant']
    
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train, y_train)
    
    train_yscore = model.predict_proba(X_train)[:, 1]
    test_yscore = model.predict_proba(X_test)[:, 1]
    
    auprc_train[k] = average_precision_score(y_train, train_yscore)
    auprc_test[k] = average_precision_score(y_test, test_yscore)
    
train_score = np.mean(auprc_train)
test_score = np.mean(auprc_test)

print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 1.0
Test Score: 0.6461810231733236


In [25]:
pd.DataFrame(auprc_train)

Unnamed: 0,0
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
5,1.0
6,1.0
7,1.0
8,1.0
9,1.0


In [26]:
pd.DataFrame(auprc_test)

Unnamed: 0,0
0,0.785834
1,0.594089
2,0.316301
3,0.737733
4,0.52045
5,0.593337
6,0.774251
7,0.361262
8,0.739206
9,0.838299
