In [1]:
import pandas as pd

df = pd.read_pickle('df.pkl')

agg_functions = {
    'gene_id': 'first',
    'combined nucleotides': 'first',
    'dwelling_time1': 'median',
    'sd1': 'median',
    'mean1': 'median',
    'dwelling_time2': 'median',
    'sd2': 'median',
    'mean2': 'median',
    'dwelling_time3': 'median',
    'sd3': 'median',
    'mean3': 'median',
    'label': 'sum'
}

summary_df = df.groupby(['transcript_id', 'transcript_position']).agg(agg_functions).reset_index()
summary_df['count'] = df.groupby(['transcript_id', 'transcript_position']).size().reset_index(name='count')['count']
summary_df['label_percentage'] = summary_df['label'] / summary_df['count']
summary_df.drop('label', axis = 'columns', inplace = True)
summary_df['label'] = df['label']

In [2]:
summary_df

Unnamed: 0,transcript_id,transcript_position,gene_id,combined nucleotides,dwelling_time1,sd1,mean1,dwelling_time2,sd2,mean2,dwelling_time3,sd3,mean3,count,label_percentage,label
0,ENST00000000233,244,ENSG00000004059,AAGACCA,0.00697,3.73,125.0,0.007970,6.650,126.0,0.005980,3.440,80.50,185,0.0,0
1,ENST00000000233,261,ENSG00000004059,CAAACTG,0.00564,2.88,110.0,0.005885,3.000,108.0,0.006790,2.660,94.10,172,0.0,0
2,ENST00000000233,316,ENSG00000004059,GAAACAG,0.00631,2.65,106.0,0.006310,3.780,99.5,0.006310,1.910,89.20,185,0.0,0
3,ENST00000000233,332,ENSG00000004059,AGAACAT,0.00902,5.73,130.0,0.007320,2.635,97.5,0.004980,2.135,89.90,200,0.0,0
4,ENST00000000233,368,ENSG00000004059,AGGACAA,0.00896,6.52,118.0,0.010500,5.660,122.0,0.008695,4.160,85.40,198,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENST00000641834,1348,ENSG00000167747,GGGACAT,0.00817,3.20,118.0,0.005310,4.580,116.0,0.004650,3.640,82.10,73,1.0,0
121834,ENST00000641834,1429,ENSG00000167747,CTGACAC,0.00618,3.69,112.0,0.009600,9.140,116.0,0.004820,3.440,80.60,69,0.0,0
121835,ENST00000641834,1531,ENSG00000167747,TGGACAC,0.00697,3.83,114.0,0.005725,4.440,114.0,0.005790,1.925,84.50,64,1.0,0
121836,ENST00000641834,1537,ENSG00000167747,CTGACCA,0.00660,3.16,110.0,0.006810,5.790,124.0,0.006310,2.330,82.00,57,0.0,0


In [3]:
# ===============================================================================================================================================================
# Using rus to deal with imbalanced dataset, Train and test data on summary_df ==================================================================================
# ===============================================================================================================================================================
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

combined = summary_df
label_encoder = LabelEncoder()
combined['transcript_id'] = label_encoder.fit_transform(combined['transcript_id'])
combined['gene_id'] = label_encoder.fit_transform(combined['gene_id'])
combined = pd.get_dummies(combined, columns=['combined nucleotides'], prefix='nucleotide', drop_first=True)


X = combined.drop('label', axis=1)
y = combined['label']
X_train_summary, X_test_summary, y_train_summary, y_test_summary = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


rus = RandomUnderSampler(random_state=42)
X_usampled_summary, y_usampled_summary = rus.fit_resample(X_train_summary, y_train_summary)
df_usampled_summary = pd.concat([pd.DataFrame(X_usampled_summary, columns=X_train_summary.columns), pd.Series(y_usampled_summary, name='label')], axis=1)

## Baseline model for comparison

In [5]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, precision_recall_curve

# Cross-validation
xgb_model_cv = xgb.XGBClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(xgb_model_cv, X_usampled_summary, y_usampled_summary, cv=5, scoring='roc_auc')

print("Cross-Validation ROC AUC Scores:", cv_scores)
print("Mean ROC AUC Score:", cv_scores.mean())
print("Standard Deviation of ROC AUC Scores:", cv_scores.std())

# Train and evaluate on the test set
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_usampled_summary, y_usampled_summary)

y_pred_gb = xgb_model.predict_proba(X_test_summary)[:, 1]

threshold = 0.9
y_pred_gb_binary = (y_pred_gb >= threshold).astype(int)

roc_auc_gb_summary = roc_auc_score(y_test_summary, y_pred_gb)
accuracy = accuracy_score(y_test_summary, y_pred_gb_binary)
confusion = confusion_matrix(y_test_summary, y_pred_gb_binary)
classification_report_str = classification_report(y_test_summary, y_pred_gb_binary)

print("\nAccuracy_lg:", accuracy)
print("Confusion Matrix_lg:\n", confusion)
print("Classification Report_lg:\n", classification_report_str)
print("ROC AUC Score of Gradient Boosting (XGBoost) on Test Set:", roc_auc_gb_summary)


Cross-Validation ROC AUC Scores: [0.99092452 0.99246065 0.98608106 0.98779038 0.98490916]
Mean ROC AUC Score: 0.9884331521282924
Standard Deviation of ROC AUC Scores: 0.0028562779208328054

Accuracy_lg: 0.9797275114904793
Confusion Matrix_lg:
 [[23114   305]
 [  189   760]]
Classification Report_lg:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     23419
           1       0.71      0.80      0.75       949

    accuracy                           0.98     24368
   macro avg       0.85      0.89      0.87     24368
weighted avg       0.98      0.98      0.98     24368

ROC AUC Score of Gradient Boosting (XGBoost) on Test Set: 0.991193329599038


## Feature Engineering

### 1. Convert necleotides to 28 features (factors), each feature indicating if a certain position of the nucleotides is a certain letter

In [6]:
# function to convert features
def sequence_to_features(seq):
    """
    Convert a 7-letter sequence into 28 binary features.
    
    Parameters:
    - seq: A string of 7 letters (A, T, C, G)
    
    Returns:
    - A dictionary with 28 binary features.
    """
    letters = ['A', 'T', 'C', 'G']
    features = {}
    
    for i, char in enumerate(seq, start=1):
        for letter in letters:
            feature_name = f"{i}{letter}"
            features[feature_name] = char == letter
            
    return features

# Example
seq = "TTGACAT"
sequence_to_features(seq)


{'1A': False,
 '1T': True,
 '1C': False,
 '1G': False,
 '2A': False,
 '2T': True,
 '2C': False,
 '2G': False,
 '3A': False,
 '3T': False,
 '3C': False,
 '3G': True,
 '4A': True,
 '4T': False,
 '4C': False,
 '4G': False,
 '5A': False,
 '5T': False,
 '5C': True,
 '5G': False,
 '6A': True,
 '6T': False,
 '6C': False,
 '6G': False,
 '7A': False,
 '7T': True,
 '7C': False,
 '7G': False}

In [7]:
new_df = summary_df

feature_dicts = new_df['combined nucleotides'].apply(sequence_to_features)
features_df = feature_dicts.apply(pd.Series)
new_df = pd.concat([new_df, features_df], axis=1)

new_df.drop('combined nucleotides', axis='columns', inplace=True)
new_df

Unnamed: 0,transcript_id,transcript_position,gene_id,dwelling_time1,sd1,mean1,dwelling_time2,sd2,mean2,dwelling_time3,...,5C,5G,6A,6T,6C,6G,7A,7T,7C,7G
0,0,244,10,0.00697,3.73,125.0,0.007970,6.650,126.0,0.005980,...,True,False,False,False,True,False,True,False,False,False
1,0,261,10,0.00564,2.88,110.0,0.005885,3.000,108.0,0.006790,...,True,False,False,True,False,False,False,False,False,True
2,0,316,10,0.00631,2.65,106.0,0.006310,3.780,99.5,0.006310,...,True,False,True,False,False,False,False,False,False,True
3,0,332,10,0.00902,5.73,130.0,0.007320,2.635,97.5,0.004980,...,True,False,True,False,False,False,False,True,False,False
4,0,368,10,0.00896,6.52,118.0,0.010500,5.660,122.0,0.008695,...,True,False,True,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,5332,1348,2764,0.00817,3.20,118.0,0.005310,4.580,116.0,0.004650,...,True,False,True,False,False,False,False,True,False,False
121834,5332,1429,2764,0.00618,3.69,112.0,0.009600,9.140,116.0,0.004820,...,True,False,True,False,False,False,False,False,True,False
121835,5332,1531,2764,0.00697,3.83,114.0,0.005725,4.440,114.0,0.005790,...,True,False,True,False,False,False,False,False,True,False
121836,5332,1537,2764,0.00660,3.16,110.0,0.006810,5.790,124.0,0.006310,...,True,False,False,False,True,False,True,False,False,False


In [8]:
# train test split and undersampling
new_X = new_df.drop('label', axis=1)
new_y = new_df['label']
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(new_X, new_y, test_size=0.2, random_state=42, stratify=new_y)


rus = RandomUnderSampler(random_state=42)
new_X_usampled, new_y_usampled = rus.fit_resample(new_X_train, new_y_train)
new_df_usampled = pd.concat([pd.DataFrame(new_X_usampled, columns=new_X_train.columns), pd.Series(new_y_usampled, name='label')], axis=1)

### Model

In [10]:
# Cross-validation
new_xgb_model_cv = xgb.XGBClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(new_xgb_model_cv, new_X_usampled, new_y_usampled, cv=5, scoring='roc_auc')

print("Cross-Validation ROC AUC Scores:", cv_scores)
print("Mean ROC AUC Score:", cv_scores.mean())
print("Standard Deviation of ROC AUC Scores:", cv_scores.std())

# Train and evaluate on the test set
new_xgb_model.fit(new_X_usampled, new_y_usampled)

new_y_pred_gb = new_xgb_model.predict_proba(new_X_test)[:, 1]

threshold = 0.9
new_y_pred_gb_binary = (new_y_pred_gb >= threshold).astype(int)

new_roc_auc_gb = roc_auc_score(new_y_test, new_y_pred_gb)
accuracy = accuracy_score(new_y_test, new_y_pred_gb_binary)
confusion = confusion_matrix(new_y_test, new_y_pred_gb_binary)
classification_report_str = classification_report(new_y_test, new_y_pred_gb_binary)

print("\nAccuracy_lg:", accuracy)
print("Confusion Matrix_lg:\n", confusion)
print("Classification Report_lg:\n", classification_report_str)
print("ROC AUC Score of Gradient Boosting (XGBoost) on Test Set:", new_roc_auc_gb)

Cross-Validation ROC AUC Scores: [0.99177285 0.99294605 0.98519173 0.98842487 0.98540323]
Mean ROC AUC Score: 0.988747747254552
Standard Deviation of ROC AUC Scores: 0.003184722467037067

Accuracy_lg: 0.9793992120814182
Confusion Matrix_lg:
 [[23073   346]
 [  156   793]]
Classification Report_lg:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     23419
           1       0.70      0.84      0.76       949

    accuracy                           0.98     24368
   macro avg       0.84      0.91      0.87     24368
weighted avg       0.98      0.98      0.98     24368

ROC AUC Score of Gradient Boosting (XGBoost) on Test Set: 0.9912510583415312


### 2. Standardize all features 

### Standard Scalar

In [11]:
# Standard Scalar
from sklearn.preprocessing import StandardScaler

std_df = new_df.copy()

numeric_cols = std_df.select_dtypes(include=['float64', 'int64']).columns
cols_to_exclude = ['transcript_id', 'gene_id','label', 'label_percentage']
cols_to_scale = [col for col in numeric_cols if col not in cols_to_exclude]

scaler = StandardScaler()
std_df[cols_to_scale] = scaler.fit_transform(std_df[cols_to_scale])
std_df

Unnamed: 0,transcript_id,transcript_position,gene_id,dwelling_time1,sd1,mean1,dwelling_time2,sd2,mean2,dwelling_time3,...,5C,5G,6A,6T,6C,6G,7A,7T,7C,7G
0,0,-0.983325,10,0.063341,-0.190594,1.225121,0.685594,0.765774,1.183161,-0.038206,...,True,False,False,False,True,False,True,False,False,False
1,0,-0.971140,10,-0.731625,-0.610858,-0.088154,-0.636286,-0.894011,-0.246699,0.658440,...,True,False,False,True,False,False,False,False,False,True
2,0,-0.931717,10,-0.331154,-0.724576,-0.438361,-0.366838,-0.539317,-0.921911,0.245613,...,True,False,True,False,False,False,False,False,False,True
3,0,-0.920249,10,1.288663,0.798262,1.662880,0.273497,-1.059990,-1.080784,-0.898263,...,True,False,True,False,False,False,False,True,False,False
4,0,-0.894445,10,1.252800,1.188860,0.612259,2.289601,0.315586,0.865414,2.296848,...,True,False,True,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,5332,-0.192002,2764,0.780603,-0.452641,0.612259,-1.000833,-0.175529,0.388794,-1.182082,...,True,False,True,False,False,False,False,True,False,False
121834,5332,-0.133943,2764,-0.408857,-0.210371,0.086949,1.719006,1.898066,0.388794,-1.035872,...,True,False,True,False,False,False,False,False,True,False
121835,5332,-0.060832,2764,0.063341,-0.141151,0.262053,-0.737725,-0.239192,0.229921,-0.201617,...,True,False,True,False,False,False,False,False,True,False
121836,5332,-0.056531,2764,-0.157815,-0.472418,-0.088154,-0.049840,0.374701,1.024288,0.245613,...,True,False,False,False,True,False,True,False,False,False


In [13]:
# train test split and undersampling
new_X = std_df.drop('label', axis=1)
new_y = std_df['label']
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(new_X, new_y, test_size=0.2, random_state=42, stratify=new_y)


rus = RandomUnderSampler(random_state=42)
new_X_usampled, new_y_usampled = rus.fit_resample(new_X_train, new_y_train)
new_df_usampled = pd.concat([pd.DataFrame(new_X_usampled, columns=new_X_train.columns), pd.Series(new_y_usampled, name='label')], axis=1)

In [15]:
# Cross-validation
new_xgb_model_cv = xgb.XGBClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(new_xgb_model_cv, new_X_usampled, new_y_usampled, cv=5, scoring='roc_auc')

print("Cross-Validation ROC AUC Scores:", cv_scores)
print("Mean ROC AUC Score:", cv_scores.mean())
print("Standard Deviation of ROC AUC Scores:", cv_scores.std())

# Train and evaluate on the test set
new_xgb_model.fit(new_X_usampled, new_y_usampled)

new_y_pred_gb = new_xgb_model.predict_proba(new_X_test)[:, 1]

threshold = 0.9
new_y_pred_gb_binary = (new_y_pred_gb >= threshold).astype(int)

new_roc_auc_gb = roc_auc_score(new_y_test, new_y_pred_gb)
accuracy = accuracy_score(new_y_test, new_y_pred_gb_binary)
confusion = confusion_matrix(new_y_test, new_y_pred_gb_binary)
classification_report_str = classification_report(new_y_test, new_y_pred_gb_binary)

print("\nAccuracy_lg:", accuracy)
print("Confusion Matrix_lg:\n", confusion)
print("Classification Report_lg:\n", classification_report_str)
print("ROC AUC Score of Gradient Boosting (XGBoost) on Test Set:", new_roc_auc_gb)

Cross-Validation ROC AUC Scores: [0.99177285 0.99294605 0.98519173 0.98842487 0.98540323]
Mean ROC AUC Score: 0.988747747254552
Standard Deviation of ROC AUC Scores: 0.003184722467037067

Accuracy_lg: 0.9793992120814182
Confusion Matrix_lg:
 [[23073   346]
 [  156   793]]
Classification Report_lg:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     23419
           1       0.70      0.84      0.76       949

    accuracy                           0.98     24368
   macro avg       0.84      0.91      0.87     24368
weighted avg       0.98      0.98      0.98     24368

ROC AUC Score of Gradient Boosting (XGBoost) on Test Set: 0.9912510583415312


### MinMax Scalar

In [16]:
# MinMax Scalar
from sklearn.preprocessing import MinMaxScaler

std_df = new_df.copy()

numeric_cols = std_df.select_dtypes(include=['float64', 'int64']).columns
cols_to_exclude = ['transcript_id', 'gene_id','label', 'label_percentage']
cols_to_scale = [col for col in numeric_cols if col not in cols_to_exclude]

scaler = MinMaxScaler()
std_df[cols_to_scale] = scaler.fit_transform(std_df[cols_to_scale])

In [17]:
# train test split and undersampling
new_X = std_df.drop('label', axis=1)
new_y = std_df['label']
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(new_X, new_y, test_size=0.2, random_state=42, stratify=new_y)


rus = RandomUnderSampler(random_state=42)
new_X_usampled, new_y_usampled = rus.fit_resample(new_X_train, new_y_train)
new_df_usampled = pd.concat([pd.DataFrame(new_X_usampled, columns=new_X_train.columns), pd.Series(new_y_usampled, name='label')], axis=1)
new_df_usampled

Unnamed: 0,transcript_id,transcript_position,gene_id,dwelling_time1,sd1,mean1,dwelling_time2,sd2,mean2,dwelling_time3,...,5G,6A,6T,6C,6G,7A,7T,7C,7G,label
26369,1008,0.068564,1397,0.147966,0.324866,0.535902,0.426431,0.318821,0.781312,0.215067,...,False,True,False,False,False,False,True,False,False,0
74317,2947,0.080045,2221,0.173885,0.090865,0.527145,0.308243,0.063325,0.443340,0.138113,...,False,False,True,False,False,False,True,False,False,0
87127,3467,0.005820,305,0.177165,0.583781,0.667250,0.289510,0.230431,0.143141,0.282301,...,False,True,False,False,False,False,False,False,True,0
60399,2379,0.030437,1273,0.395013,0.262335,0.343257,0.148161,0.680739,0.582505,0.309032,...,False,False,False,True,False,True,False,False,False,0
42589,1659,0.118332,62,0.359252,0.150953,0.684764,0.258856,0.346526,0.662028,0.201296,...,False,False,False,True,False,False,False,True,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94067,3791,0.053025,1743,0.334318,0.107474,0.422067,0.163488,0.164468,0.235586,0.147023,...,False,True,False,False,False,False,True,False,False,1
24315,934,0.037059,1978,0.599738,0.291646,0.684764,0.331403,0.516711,0.662028,0.145403,...,False,False,False,True,False,False,False,True,False,1
49428,1945,0.028088,1497,0.300853,0.157792,0.492119,0.174728,0.540018,0.691849,0.121102,...,False,False,True,False,False,False,True,False,False,1
3337,128,0.041918,589,0.305446,0.207621,0.719790,0.335490,0.421724,0.741551,0.174565,...,False,False,True,False,False,False,True,False,False,1


In [19]:
# Cross-validation
new_xgb_model_cv = xgb.XGBClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(new_xgb_model_cv, new_X_usampled, new_y_usampled, cv=5, scoring='roc_auc')

print("Cross-Validation ROC AUC Scores:", cv_scores)
print("Mean ROC AUC Score:", cv_scores.mean())
print("Standard Deviation of ROC AUC Scores:", cv_scores.std())

# Train and evaluate on the test set
new_xgb_model.fit(new_X_usampled, new_y_usampled)

new_y_pred_gb = new_xgb_model.predict_proba(new_X_test)[:, 1]

threshold = 0.9
new_y_pred_gb_binary = (new_y_pred_gb >= threshold).astype(int)

new_roc_auc_gb = roc_auc_score(new_y_test, new_y_pred_gb)
accuracy = accuracy_score(new_y_test, new_y_pred_gb_binary)
confusion = confusion_matrix(new_y_test, new_y_pred_gb_binary)
classification_report_str = classification_report(new_y_test, new_y_pred_gb_binary)

print("\nAccuracy_lg:", accuracy)
print("Confusion Matrix_lg:\n", confusion)
print("Classification Report_lg:\n", classification_report_str)
print("ROC AUC Score of Gradient Boosting (XGBoost) on Test Set:", new_roc_auc_gb)

Cross-Validation ROC AUC Scores: [0.99177285 0.99294605 0.98519173 0.98842487 0.98540323]
Mean ROC AUC Score: 0.988747747254552
Standard Deviation of ROC AUC Scores: 0.003184722467037067

Accuracy_lg: 0.9793992120814182
Confusion Matrix_lg:
 [[23073   346]
 [  156   793]]
Classification Report_lg:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     23419
           1       0.70      0.84      0.76       949

    accuracy                           0.98     24368
   macro avg       0.84      0.91      0.87     24368
weighted avg       0.98      0.98      0.98     24368

ROC AUC Score of Gradient Boosting (XGBoost) on Test Set: 0.9912510583415312


## Hyperparameter Tuning via GridSearchCV

In [23]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1, 0.3],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.5, 0.8, 0.9, 1]
}

grid_search = GridSearchCV(estimator=xgb.XGBClassifier(random_state=42),
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=5, 
                           verbose=0,
                           n_jobs=-1) 

grid_search.fit(new_X_usampled, new_y_usampled)

best_xgb_model = grid_search.best_estimator_
best_index = grid_search.best_index_
std_roc_auc = grid_search.cv_results_['std_test_score'][best_index]

In [24]:
print("Best Hyperparameters:", grid_search.best_params_)
print("Best ROC AUC Score:", grid_search.best_score_)
print("Standard Deviation of ROC AUC Score:", std_roc_auc)

Best Hyperparameters: {'gamma': 0.1, 'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1}
Best ROC AUC Score: 0.9894421708655872
Standard Deviation of ROC AUC Score: 0.0030810583625889023
