In [1]:
import pandas as pd

df = pd.read_pickle('df.pkl')

agg_functions = {
    'gene_id': 'first',
    'combined nucleotides': 'first',
    'dwelling_time1': 'mean',
    'sd1': 'mean',
    'mean1': 'mean',
    'dwelling_time2': 'mean',
    'sd2': 'mean',
    'mean2': 'mean',
    'dwelling_time3': 'mean',
    'sd3': 'mean',
    'mean3': 'mean',
    'label': 'sum'
}

summary_df = df.groupby(['transcript_id', 'transcript_position']).agg(agg_functions).reset_index()
summary_df['count'] = df.groupby(['transcript_id', 'transcript_position']).size().reset_index(name='count')['count']
summary_df['label_percentage'] = summary_df['label'] / summary_df['count']
summary_df = summary_df.rename(columns={'label': 'label_sum'})
summary_df['label'] = summary_df['label_percentage'].apply(lambda x: 0 if x == 0 else 1)
summary_df.drop('label_percentage', axis=1, inplace=True)
summary_df.drop('label_sum', axis=1, inplace=True)

### Tokenization of sequences and is_DRACH feature

In [2]:
# function to convert features
def sequence_to_features(seq):
    """
    Convert a 7-letter sequence into 28 binary features.
    
    Parameters:
    - seq: A string of 7 letters (A, T, C, G)
    
    Returns:
    - A dictionary with 28 binary features.
    """
    letters = ['A', 'T', 'C', 'G']
    features = {}
    
    for i, char in enumerate(seq, start=1):
        for letter in letters:
            feature_name = f"{i}{letter}"
            features[feature_name] = char == letter
            
    return features

In [3]:
new_df = summary_df.copy()

feature_dicts = new_df['combined nucleotides'].apply(sequence_to_features)
features_df = feature_dicts.apply(pd.Series)
new_df = pd.concat([new_df, features_df], axis=1)

In [4]:
def is_drach_motif(sequence):
    # Extract middle 5 nucleotides
    motif = sequence[1:6]
    
    # Check the DRACH rules
    D = motif[0] in ['A', 'G', 'T']
    R = motif[1] in ['A', 'G']
    A = motif[2] == 'A'
    C = motif[3] == 'C'
    H = motif[4] in ['A', 'C', 'T']

    return D and R and A and C and H

In [5]:
# Create a new column indicating whether the sequence satisfies the DRACH motif
new_df['is_DRACH'] = new_df['combined nucleotides'].apply(is_drach_motif)

new_df.drop('combined nucleotides', axis='columns', inplace=True)
new_df

Unnamed: 0,transcript_id,transcript_position,gene_id,dwelling_time1,sd1,mean1,dwelling_time2,sd2,mean2,dwelling_time3,...,5G,6A,6T,6C,6G,7A,7T,7C,7G,is_DRACH
0,ENST00000000233,244,ENSG00000004059,0.008264,4.223784,123.702703,0.009373,7.382162,125.913514,0.007345,...,False,False,False,True,False,True,False,False,False,True
1,ENST00000000233,261,ENSG00000004059,0.006609,3.216424,109.681395,0.006813,3.226535,107.889535,0.007710,...,False,False,True,False,False,False,False,False,True,True
2,ENST00000000233,316,ENSG00000004059,0.007570,2.940541,105.475676,0.007416,3.642703,98.947027,0.007555,...,False,True,False,False,False,False,False,False,True,True
3,ENST00000000233,332,ENSG00000004059,0.010620,6.476350,129.355000,0.008632,2.899200,97.836500,0.006101,...,False,True,False,False,False,False,True,False,False,True
4,ENST00000000233,368,ENSG00000004059,0.010701,6.415051,117.924242,0.011479,5.870303,121.954545,0.010019,...,False,True,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENST00000641834,1348,ENSG00000167747,0.009594,3.294164,118.232877,0.007300,4.929726,116.342466,0.006555,...,False,True,False,False,False,False,True,False,False,True
121834,ENST00000641834,1429,ENSG00000167747,0.008393,4.511014,110.969565,0.010305,9.105797,114.927536,0.005568,...,False,True,False,False,False,False,False,True,False,True
121835,ENST00000641834,1531,ENSG00000167747,0.008161,3.918438,113.968750,0.006877,4.759688,113.562500,0.006410,...,False,True,False,False,False,False,False,True,False,True
121836,ENST00000641834,1537,ENSG00000167747,0.008044,3.191228,109.354386,0.007419,6.552982,123.263158,0.006472,...,False,False,False,True,False,True,False,False,False,True


### Label Encoder

In [6]:
from sklearn.preprocessing import LabelEncoder

combined = new_df.copy()

label_encoder = LabelEncoder()
combined['transcript_id'] = label_encoder.fit_transform(combined['transcript_id'])
combined['gene_id'] = label_encoder.fit_transform(combined['gene_id'])
combined

Unnamed: 0,transcript_id,transcript_position,gene_id,dwelling_time1,sd1,mean1,dwelling_time2,sd2,mean2,dwelling_time3,...,5G,6A,6T,6C,6G,7A,7T,7C,7G,is_DRACH
0,0,244,10,0.008264,4.223784,123.702703,0.009373,7.382162,125.913514,0.007345,...,False,False,False,True,False,True,False,False,False,True
1,0,261,10,0.006609,3.216424,109.681395,0.006813,3.226535,107.889535,0.007710,...,False,False,True,False,False,False,False,False,True,True
2,0,316,10,0.007570,2.940541,105.475676,0.007416,3.642703,98.947027,0.007555,...,False,True,False,False,False,False,False,False,True,True
3,0,332,10,0.010620,6.476350,129.355000,0.008632,2.899200,97.836500,0.006101,...,False,True,False,False,False,False,True,False,False,True
4,0,368,10,0.010701,6.415051,117.924242,0.011479,5.870303,121.954545,0.010019,...,False,True,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,5332,1348,2764,0.009594,3.294164,118.232877,0.007300,4.929726,116.342466,0.006555,...,False,True,False,False,False,False,True,False,False,True
121834,5332,1429,2764,0.008393,4.511014,110.969565,0.010305,9.105797,114.927536,0.005568,...,False,True,False,False,False,False,False,True,False,True
121835,5332,1531,2764,0.008161,3.918438,113.968750,0.006877,4.759688,113.562500,0.006410,...,False,True,False,False,False,False,False,True,False,True
121836,5332,1537,2764,0.008044,3.191228,109.354386,0.007419,6.552982,123.263158,0.006472,...,False,False,False,True,False,True,False,False,False,True


### MinMax Scalar

In [7]:
# MinMax Scalar
from sklearn.preprocessing import MinMaxScaler

numeric_cols = combined.select_dtypes(include=['float64', 'int64']).columns
cols_to_exclude = ['label', 'transcript_id', 'gene_id']
cols_to_scale = [col for col in numeric_cols if col not in cols_to_exclude]

scaler = MinMaxScaler()
combined[cols_to_scale] = scaler.fit_transform(combined[cols_to_scale])
combined

Unnamed: 0,transcript_id,transcript_position,gene_id,dwelling_time1,sd1,mean1,dwelling_time2,sd2,mean2,dwelling_time3,...,5G,6A,6T,6C,6G,7A,7T,7C,7G,is_DRACH
0,0,0.012869,10,0.299828,0.278362,0.774965,0.313303,0.555867,0.776087,0.297366,...,False,False,False,True,False,True,False,False,False,True
1,0,0.013777,10,0.193061,0.180401,0.524207,0.169286,0.167033,0.415957,0.325895,...,False,False,True,False,False,False,False,False,True,True
2,0,0.016714,10,0.255036,0.153573,0.448991,0.203182,0.205973,0.237280,0.313785,...,False,True,False,False,False,False,False,False,True,True
3,0,0.017568,10,0.451798,0.497412,0.876051,0.271625,0.136405,0.215091,0.200279,...,False,True,False,False,False,False,True,False,False,True
4,0,0.019491,10,0.456975,0.491451,0.671622,0.431763,0.414405,0.696985,0.506161,...,False,True,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,5332,0.071821,2764,0.385587,0.187961,0.677142,0.196661,0.326397,0.584852,0.235651,...,False,True,False,False,False,False,True,False,False,True
121834,5332,0.076147,2764,0.308109,0.306293,0.547244,0.365725,0.717144,0.556581,0.158586,...,False,True,False,False,False,False,False,True,False,True
121835,5332,0.081593,2764,0.293135,0.248668,0.600882,0.172875,0.310487,0.529307,0.224355,...,False,True,False,False,False,False,False,True,False,True
121836,5332,0.081914,2764,0.285603,0.177951,0.518359,0.203405,0.478282,0.723132,0.229216,...,False,False,False,True,False,True,False,False,False,True


### Train Test Split

In [8]:
from sklearn.model_selection import train_test_split

X = combined.drop('label', axis=1)
y = combined['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
X_train

Unnamed: 0,transcript_id,transcript_position,gene_id,dwelling_time1,sd1,mean1,dwelling_time2,sd2,mean2,dwelling_time3,...,5G,6A,6T,6C,6G,7A,7T,7C,7G,is_DRACH
111580,4783,0.133604,3256,0.333251,0.655443,0.722797,0.169705,0.121075,0.205585,0.188254,...,False,True,False,False,False,False,True,False,False,True
69119,2740,0.017194,518,0.296290,0.362077,0.522274,0.182752,0.173370,0.250714,0.556151,...,False,False,True,False,False,False,False,True,False,True
26583,1018,0.043734,2148,0.244116,0.198991,0.655945,0.243344,0.538043,0.612262,0.186795,...,False,True,False,False,False,False,False,True,False,True
25994,995,0.009932,2117,0.194738,0.314576,0.804979,0.253657,0.199928,0.957641,0.151231,...,False,False,True,False,False,False,True,False,False,True
97869,3991,0.022267,3615,0.338162,0.206187,0.740855,0.193846,0.730254,0.644763,0.343725,...,False,False,False,True,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55198,2176,0.040850,3306,0.232459,0.274114,0.401101,0.231302,0.447584,0.731478,0.270559,...,False,False,True,False,False,False,False,True,False,True
66396,2627,0.044855,2980,0.425619,0.191521,0.500063,0.296932,0.163256,0.401007,0.182271,...,False,False,True,False,False,False,True,False,False,True
8813,351,0.070540,169,0.279533,0.220955,0.451259,0.222990,0.648586,0.618464,0.232377,...,False,False,False,True,False,False,True,False,False,True
79926,3168,0.026860,654,0.253483,0.188945,0.766777,0.314980,0.531490,0.789641,0.301237,...,False,False,False,True,False,False,True,False,False,True


### Random Undersampling 

In [10]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_usampled, y_usampled = rus.fit_resample(X_train, y_train)

In [11]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, precision_recall_curve

xgb_model_cv = xgb.XGBClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(xgb_model_cv, X_usampled, y_usampled, cv=5, scoring='roc_auc')

print("Cross-Validation ROC AUC Scores:", cv_scores)
print("Mean ROC AUC Score:", cv_scores.mean())
print("Standard Deviation of ROC AUC Scores:", cv_scores.std())

Cross-Validation ROC AUC Scores: [0.90090516 0.89890093 0.89867288 0.90990597 0.89655918]
Mean ROC AUC Score: 0.9009888242530391
Standard Deviation of ROC AUC Scores: 0.00466619803590722


In [12]:
# Train and evaluate on the test set
xgb_model_cv.fit(X_usampled, y_usampled)

y_pred = xgb_model_cv.predict_proba(X_test)[:, 1]

threshold = 0.9
y_pred_binary = (y_pred >= threshold).astype(int)

roc_auc = roc_auc_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred_binary)
confusion = confusion_matrix(y_test, y_pred_binary)
classification_report_str = classification_report(y_test, y_pred_binary)

print("\nAccuracy_lg:", accuracy)
print("Confusion Matrix_lg:\n", confusion)
print("Classification Report_lg:\n", classification_report_str)
print("ROC AUC Score of Gradient Boosting (XGBoost) on Test Set:", roc_auc)


Accuracy_lg: 0.9497701904136573
Confusion Matrix_lg:
 [[22568   705]
 [  519   576]]
Classification Report_lg:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97     23273
           1       0.45      0.53      0.48      1095

    accuracy                           0.95     24368
   macro avg       0.71      0.75      0.73     24368
weighted avg       0.95      0.95      0.95     24368

ROC AUC Score of Gradient Boosting (XGBoost) on Test Set: 0.9116487308573029


### Undersampling with Tomek Links

In [13]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks()
X_resampled_tl, y_resampled_tl = tl.fit_resample(X_train, y_train)

In [14]:
df_tl = pd.concat([pd.DataFrame(X_resampled_tl, columns=X_train.columns), pd.Series(y_resampled_tl, name='label')], axis=1)
df_tl

Unnamed: 0,transcript_id,transcript_position,gene_id,dwelling_time1,sd1,mean1,dwelling_time2,sd2,mean2,dwelling_time3,...,6A,6T,6C,6G,7A,7T,7C,7G,is_DRACH,label
0,4783,0.133604,3256,0.333251,0.655443,0.722797,0.169705,0.121075,0.205585,0.188254,...,True,False,False,False,False,True,False,False,True,0
1,2740,0.017194,518,0.296290,0.362077,0.522274,0.182752,0.173370,0.250714,0.556151,...,False,True,False,False,False,False,True,False,True,0
2,1018,0.043734,2148,0.244116,0.198991,0.655945,0.243344,0.538043,0.612262,0.186795,...,True,False,False,False,False,False,True,False,True,0
3,995,0.009932,2117,0.194738,0.314576,0.804979,0.253657,0.199928,0.957641,0.151231,...,False,True,False,False,False,True,False,False,True,0
4,3991,0.022267,3615,0.338162,0.206187,0.740855,0.193846,0.730254,0.644763,0.343725,...,False,False,True,False,False,False,False,True,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95739,2176,0.040850,3306,0.232459,0.274114,0.401101,0.231302,0.447584,0.731478,0.270559,...,False,True,False,False,False,False,True,False,True,0
95740,2627,0.044855,2980,0.425619,0.191521,0.500063,0.296932,0.163256,0.401007,0.182271,...,False,True,False,False,False,True,False,False,True,0
95741,351,0.070540,169,0.279533,0.220955,0.451259,0.222990,0.648586,0.618464,0.232377,...,False,False,True,False,False,True,False,False,True,0
95742,3168,0.026860,654,0.253483,0.188945,0.766777,0.314980,0.531490,0.789641,0.301237,...,False,False,True,False,False,True,False,False,True,0


In [15]:
xgb_model_tl = xgb.XGBClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(xgb_model_tl, X_resampled_tl, y_resampled_tl, cv=5, scoring='roc_auc')

print("Cross-Validation ROC AUC Scores:", cv_scores)
print("Mean ROC AUC Score:", cv_scores.mean())
print("Standard Deviation of ROC AUC Scores:", cv_scores.std())

Cross-Validation ROC AUC Scores: [0.91849141 0.91587671 0.91458806 0.91286024 0.91867301]
Mean ROC AUC Score: 0.9160978864886179
Standard Deviation of ROC AUC Scores: 0.002243703850765357


In [16]:
# Train and evaluate on the test set
xgb_model_cv.fit(X_resampled_tl, y_resampled_tl)

y_pred = xgb_model_cv.predict_proba(X_test)[:, 1]

threshold = 0.1
y_pred_binary = (y_pred >= threshold).astype(int)

roc_auc = roc_auc_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred_binary)
confusion = confusion_matrix(y_test, y_pred_binary)
classification_report_str = classification_report(y_test, y_pred_binary)

print("\nAccuracy_lg:", accuracy)
print("Confusion Matrix_lg:\n", confusion)
print("Classification Report_lg:\n", classification_report_str)
print("ROC AUC Score of Gradient Boosting (XGBoost) on Test Set:", roc_auc)


Accuracy_lg: 0.9274868680236376
Confusion Matrix_lg:
 [[21853  1420]
 [  347   748]]
Classification Report_lg:
               precision    recall  f1-score   support

           0       0.98      0.94      0.96     23273
           1       0.35      0.68      0.46      1095

    accuracy                           0.93     24368
   macro avg       0.66      0.81      0.71     24368
weighted avg       0.96      0.93      0.94     24368

ROC AUC Score of Gradient Boosting (XGBoost) on Test Set: 0.9201356854818535


### Oversampling with SMOTE 

In [17]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)

In [18]:
df_smote = pd.concat([pd.DataFrame(X_resampled_smote, columns=X_train.columns), pd.Series(y_resampled_smote, name='label')], axis=1)
df_smote

Unnamed: 0,transcript_id,transcript_position,gene_id,dwelling_time1,sd1,mean1,dwelling_time2,sd2,mean2,dwelling_time3,...,6A,6T,6C,6G,7A,7T,7C,7G,is_DRACH,label
0,4783,0.133604,3256,0.333251,0.655443,0.722797,0.169705,0.121075,0.205585,0.188254,...,True,False,False,False,False,True,False,False,True,0
1,2740,0.017194,518,0.296290,0.362077,0.522274,0.182752,0.173370,0.250714,0.556151,...,False,True,False,False,False,False,True,False,True,0
2,1018,0.043734,2148,0.244116,0.198991,0.655945,0.243344,0.538043,0.612262,0.186795,...,True,False,False,False,False,False,True,False,True,0
3,995,0.009932,2117,0.194738,0.314576,0.804979,0.253657,0.199928,0.957641,0.151231,...,False,True,False,False,False,True,False,False,True,0
4,3991,0.022267,3615,0.338162,0.206187,0.740855,0.193846,0.730254,0.644763,0.343725,...,False,False,True,False,False,False,False,True,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186175,957,0.027106,2026,0.373112,0.270092,0.691555,0.263800,0.452150,0.719758,0.279769,...,False,True,False,False,True,False,False,True,True,1
186176,988,0.073877,2109,0.296978,0.146665,0.669301,0.266775,0.290995,0.633450,0.188372,...,False,False,True,False,False,False,True,False,True,1
186177,3871,0.074154,169,0.272188,0.188389,0.678554,0.165045,0.271283,0.609799,0.340097,...,False,True,False,False,False,False,True,True,True,1
186178,2851,0.076158,1258,0.378065,0.272735,0.576292,0.214564,0.207230,0.494933,0.253926,...,False,True,False,False,False,False,False,True,True,1


In [19]:
xgb_model_smote = xgb.XGBClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(xgb_model_smote, X_resampled_smote, y_resampled_smote, cv=5, scoring='roc_auc')

print("Cross-Validation ROC AUC Scores:", cv_scores)
print("Mean ROC AUC Score:", cv_scores.mean())
print("Standard Deviation of ROC AUC Scores:", cv_scores.std())

Cross-Validation ROC AUC Scores: [0.97701138 0.99971798 0.999756   0.99973645 0.99981889]
Mean ROC AUC Score: 0.9952081422145194
Standard Deviation of ROC AUC Scores: 0.009098442996712997


In [20]:
# Train and evaluate on the test set
xgb_model_smote.fit(X_resampled_smote, y_resampled_smote)
y_pred = xgb_model_smote.predict_proba(X_test)[:, 1]

threshold = 0.4
y_pred_binary = (y_pred >= threshold).astype(int)

roc_auc = roc_auc_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred_binary)
confusion = confusion_matrix(y_test, y_pred_binary)
classification_report_str = classification_report(y_test, y_pred_binary)

print("\nAccuracy_lg:", accuracy)
print("Confusion Matrix_lg:\n", confusion)
print("Classification Report_lg:\n", classification_report_str)
print("ROC AUC Score of Gradient Boosting (XGBoost) on Test Set:", roc_auc)


Accuracy_lg: 0.959413985554826
Confusion Matrix_lg:
 [[22887   386]
 [  603   492]]
Classification Report_lg:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98     23273
           1       0.56      0.45      0.50      1095

    accuracy                           0.96     24368
   macro avg       0.77      0.72      0.74     24368
weighted avg       0.96      0.96      0.96     24368

ROC AUC Score of Gradient Boosting (XGBoost) on Test Set: 0.9188104388117455


### Oversampling with ADASYN

In [21]:
from imblearn.over_sampling import ADASYN

adasyn = ADASYN()
X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(X_train, y_train)

In [22]:
df_adasyn = pd.concat([pd.DataFrame(X_resampled_adasyn, columns=X_train.columns), pd.Series(y_resampled_adasyn, name='label')], axis=1)
df_adasyn

Unnamed: 0,transcript_id,transcript_position,gene_id,dwelling_time1,sd1,mean1,dwelling_time2,sd2,mean2,dwelling_time3,...,6A,6T,6C,6G,7A,7T,7C,7G,is_DRACH,label
0,4783,0.133604,3256,0.333251,0.655443,0.722797,0.169705,0.121075,0.205585,0.188254,...,True,False,False,False,False,True,False,False,True,0
1,2740,0.017194,518,0.296290,0.362077,0.522274,0.182752,0.173370,0.250714,0.556151,...,False,True,False,False,False,False,True,False,True,0
2,1018,0.043734,2148,0.244116,0.198991,0.655945,0.243344,0.538043,0.612262,0.186795,...,True,False,False,False,False,False,True,False,True,0
3,995,0.009932,2117,0.194738,0.314576,0.804979,0.253657,0.199928,0.957641,0.151231,...,False,True,False,False,False,True,False,False,True,0
4,3991,0.022267,3615,0.338162,0.206187,0.740855,0.193846,0.730254,0.644763,0.343725,...,False,False,True,False,False,False,False,True,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185457,4559,0.126832,67,0.480170,0.197537,0.658435,0.234225,0.338659,0.536705,0.245277,...,False,True,False,False,False,False,True,True,True,1
185458,4551,0.136186,78,0.504547,0.157091,0.636851,0.252826,0.379422,0.579297,0.258087,...,False,True,False,False,False,False,True,True,True,1
185459,4605,0.087422,11,0.280622,0.180997,0.671132,0.299898,0.554888,0.533289,0.393048,...,True,True,False,False,False,True,False,True,True,1
185460,4605,0.077386,11,0.339460,0.388474,0.762957,0.153519,0.170638,0.333094,0.202242,...,True,True,False,False,False,True,False,True,True,1


In [23]:
xgb_model_adasyn = xgb.XGBClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(xgb_model_adasyn, X_resampled_adasyn, y_resampled_adasyn, cv=5, scoring='roc_auc')

print("Cross-Validation ROC AUC Scores:", cv_scores)
print("Mean ROC AUC Score:", cv_scores.mean())
print("Standard Deviation of ROC AUC Scores:", cv_scores.std())

Cross-Validation ROC AUC Scores: [0.97616547 0.99960861 0.99966042 0.99959704 0.99952506]
Mean ROC AUC Score: 0.9949113201713095
Standard Deviation of ROC AUC Scores: 0.009373023331951202


In [24]:
# Train and evaluate on the test set
xgb_model_adasyn.fit(X_resampled_adasyn, y_resampled_adasyn)
y_pred = xgb_model_adasyn.predict_proba(X_test)[:, 1]

threshold = 0.4
y_pred_binary = (y_pred >= threshold).astype(int)

roc_auc = roc_auc_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred_binary)
confusion = confusion_matrix(y_test, y_pred_binary)
classification_report_str = classification_report(y_test, y_pred_binary)

print("\nAccuracy_lg:", accuracy)
print("Confusion Matrix_lg:\n", confusion)
print("Classification Report_lg:\n", classification_report_str)
print("ROC AUC Score of Gradient Boosting (XGBoost) on Test Set:", roc_auc)


Accuracy_lg: 0.9602757715036113
Confusion Matrix_lg:
 [[22907   366]
 [  602   493]]
Classification Report_lg:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98     23273
           1       0.57      0.45      0.50      1095

    accuracy                           0.96     24368
   macro avg       0.77      0.72      0.74     24368
weighted avg       0.96      0.96      0.96     24368

ROC AUC Score of Gradient Boosting (XGBoost) on Test Set: 0.9193587842693839


### Combination of ADASYN and Tomek Links

In [25]:
from imblearn.pipeline import Pipeline

# Define the resampling pipeline
resampling = Pipeline([
    ('adasyn', ADASYN()),
    ('tomek', TomekLinks())
])

X_resampled_hybrid, y_resampled_hybrid = resampling.fit_resample(X_train, y_train)

In [26]:
df_hybrid = pd.concat([pd.DataFrame(X_resampled_hybrid, columns=X_train.columns), pd.Series(y_resampled_hybrid, name='label')], axis=1)
df_hybrid

Unnamed: 0,transcript_id,transcript_position,gene_id,dwelling_time1,sd1,mean1,dwelling_time2,sd2,mean2,dwelling_time3,...,6A,6T,6C,6G,7A,7T,7C,7G,is_DRACH,label
0,4783,0.133604,3256,0.333251,0.655443,0.722797,0.169705,0.121075,0.205585,0.188254,...,True,False,False,False,False,True,False,False,True,0
1,2740,0.017194,518,0.296290,0.362077,0.522274,0.182752,0.173370,0.250714,0.556151,...,False,True,False,False,False,False,True,False,True,0
2,1018,0.043734,2148,0.244116,0.198991,0.655945,0.243344,0.538043,0.612262,0.186795,...,True,False,False,False,False,False,True,False,True,0
3,995,0.009932,2117,0.194738,0.314576,0.804979,0.253657,0.199928,0.957641,0.151231,...,False,True,False,False,False,True,False,False,True,0
4,3991,0.022267,3615,0.338162,0.206187,0.740855,0.193846,0.730254,0.644763,0.343725,...,False,False,True,False,False,False,False,True,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184634,4578,0.105942,44,0.425728,0.287867,0.706637,0.192684,0.247621,0.441585,0.216669,...,False,True,False,False,False,False,True,True,True,1
184635,4602,0.079307,13,0.355439,0.411854,0.771281,0.136407,0.125375,0.319784,0.178197,...,False,True,False,False,False,True,False,True,True,1
184636,4573,0.111453,50,0.440090,0.264037,0.693921,0.203643,0.271638,0.466679,0.224216,...,False,True,False,False,False,False,True,True,True,1
184637,4604,0.076518,11,0.348905,0.416508,0.775041,0.133642,0.118406,0.307524,0.176054,...,False,True,False,False,False,True,False,True,True,1


In [27]:
xgb_model_hybrid = xgb.XGBClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(xgb_model_hybrid, X_resampled_hybrid, y_resampled_hybrid, cv=5, scoring='roc_auc')

print("Cross-Validation ROC AUC Scores:", cv_scores)
print("Mean ROC AUC Score:", cv_scores.mean())
print("Standard Deviation of ROC AUC Scores:", cv_scores.std())

Cross-Validation ROC AUC Scores: [0.97630728 0.99954014 0.99968281 0.99959748 0.99962867]
Mean ROC AUC Score: 0.9949512767388538
Standard Deviation of ROC AUC Scores: 0.00932211196828446


In [28]:
# Train and evaluate on the test set
xgb_model_hybrid.fit(X_resampled_hybrid, y_resampled_hybrid)
y_pred = xgb_model_hybrid.predict_proba(X_test)[:, 1]

threshold = 0.4
y_pred_binary = (y_pred >= threshold).astype(int)

roc_auc = roc_auc_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred_binary)
confusion = confusion_matrix(y_test, y_pred_binary)
classification_report_str = classification_report(y_test, y_pred_binary)

print("\nAccuracy_lg:", accuracy)
print("Confusion Matrix_lg:\n", confusion)
print("Classification Report_lg:\n", classification_report_str)
print("ROC AUC Score of Gradient Boosting (XGBoost) on Test Set:", roc_auc)


Accuracy_lg: 0.9600705843729481
Confusion Matrix_lg:
 [[22886   387]
 [  586   509]]
Classification Report_lg:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98     23273
           1       0.57      0.46      0.51      1095

    accuracy                           0.96     24368
   macro avg       0.77      0.72      0.75     24368
weighted avg       0.96      0.96      0.96     24368

ROC AUC Score of Gradient Boosting (XGBoost) on Test Set: 0.920086517250966
