In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Read the dataset directly
file_path = 'df.csv'
df = pd.read_csv(file_path)

In [2]:
def get_summary_df(df):
    count_df1 = df.groupby(['transcript_id', 'transcript_position']).size().reset_index(name='read_count')
    count_df2 = df.groupby(['transcript_id']).size().reset_index(name='expression_count') #most genes are likely to produce unique transcripts
    df = df.merge(count_df1, on=['transcript_id', 'transcript_position'])
    df = df.merge(count_df2, on=['transcript_id'])
    
    summary_df = df.groupby(['transcript_id', 'transcript_position']).agg({
        'gene_id': 'first',
        'combined nucleotides': 'first',
        'dwelling_time1': ['mean'],
        'sd1': ['mean'],
        'mean1': ['mean'],
        'dwelling_time2': ['mean'],
        'sd2': ['mean'],
        'mean2': ['mean'],
        'dwelling_time3': ['mean'],
        'sd3': ['mean'],
        'mean3': ['mean'],
        'label': lambda x: x.mode()[0] if not x.mode().empty else None
    }).reset_index()
    summary_df.columns = summary_df.columns.map('_'.join)
    summary_df = summary_df.rename(columns={'label_<lambda>': 'label',
                                           'transcript_id_': 'transcript_id',
                                           'transcript_position_': 'transcript_position',
                                           'gene_id_first': 'gene_id',
                                           'combined nucleotides_first': 'combined nucleotides'})
    
    return summary_df

In [3]:
summary_df = get_summary_df(df)

In [4]:
summary_df

Unnamed: 0,transcript_id,transcript_position,gene_id,combined nucleotides,dwelling_time1_mean,sd1_mean,mean1_mean,dwelling_time2_mean,sd2_mean,mean2_mean,dwelling_time3_mean,sd3_mean,mean3_mean,label
0,ENST00000000233,244,ENSG00000004059,AAGACCA,0.008264,4.223784,123.702703,0.009373,7.382162,125.913514,0.007345,4.386989,80.570270,0
1,ENST00000000233,261,ENSG00000004059,CAAACTG,0.006609,3.216424,109.681395,0.006813,3.226535,107.889535,0.007710,3.016599,94.290698,0
2,ENST00000000233,316,ENSG00000004059,GAAACAG,0.007570,2.940541,105.475676,0.007416,3.642703,98.947027,0.007555,2.087146,89.364324,0
3,ENST00000000233,332,ENSG00000004059,AGAACAT,0.010620,6.476350,129.355000,0.008632,2.899200,97.836500,0.006101,2.236520,89.154000,0
4,ENST00000000233,368,ENSG00000004059,AGGACAA,0.010701,6.415051,117.924242,0.011479,5.870303,121.954545,0.010019,4.260253,85.178788,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENST00000641834,1348,ENSG00000167747,GGGACAT,0.009594,3.294164,118.232877,0.007300,4.929726,116.342466,0.006555,4.005616,82.004110,1
121834,ENST00000641834,1429,ENSG00000167747,CTGACAC,0.008393,4.511014,110.969565,0.010305,9.105797,114.927536,0.005568,3.644638,80.497101,0
121835,ENST00000641834,1531,ENSG00000167747,TGGACAC,0.008161,3.918438,113.968750,0.006877,4.759688,113.562500,0.006410,2.181563,84.190625,1
121836,ENST00000641834,1537,ENSG00000167747,CTGACCA,0.008044,3.191228,109.354386,0.007419,6.552982,123.263158,0.006472,2.540877,82.289474,0


In [5]:
X = summary_df.drop(['label','transcript_id','combined nucleotides','gene_id'], axis=1)
y = summary_df['label']

### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, roc_auc_score, average_precision_score

# Create LogisticRegression instance
log_reg_model = LogisticRegression(random_state=42, max_iter=1000)

# Define scorers
scorers = {
    'roc_auc': 'roc_auc',  # Uses the ROC AUC scoring
    'pr_auc': make_scorer(average_precision_score)  # Uses the Precision-Recall AUC scoring
}

# Cross-validation with multiple scoring metrics
cv_results = cross_validate(log_reg_model, X, y, cv=5, scoring=scorers)

# Calculate averages of ROC AUC and PR AUC for each fold
average_scores = (cv_results['test_roc_auc'] + cv_results['test_pr_auc']) / 2

print("Cross-Validation ROC AUC Scores:", cv_results['test_roc_auc'])
print("Mean ROC AUC Score:", cv_results['test_roc_auc'].mean())
print("Standard Deviation of ROC AUC Scores:", cv_results['test_roc_auc'].std())
print('-' * 100)
print("Cross-Validation PR AUC Scores:", cv_results['test_pr_auc'])
print("Mean PR AUC Score:", cv_results['test_pr_auc'].mean())
print("Standard Deviation of PR AUC Scores:", cv_results['test_pr_auc'].std())
print('-' * 100)
print("Average of ROC AUC and PR AUC Scores for Each Fold:", average_scores)
print("Mean of Average Scores:", average_scores.mean())
print("Standard Deviation of Average Scores:", average_scores.std())

Cross-Validation ROC AUC Scores: [0.68612394 0.66941754 0.66690584 0.70394234 0.64875752]
Mean ROC AUC Score: 0.6750294348682209
Standard Deviation of ROC AUC Scores: 0.018690875307401118
----------------------------------------------------------------------------------------------------
Cross-Validation PR AUC Scores: [0.04493598 0.04493598 0.04493598 0.04493783 0.04493783]
Mean PR AUC Score: 0.04493671926831497
Standard Deviation of PR AUC Scores: 9.034368288930283e-07
----------------------------------------------------------------------------------------------------
Average of ROC AUC and PR AUC Scores for Each Fold: [0.36552996 0.35717676 0.35592091 0.37444008 0.34684767]
Mean of Average Scores: 0.359983077068268
Standard Deviation of Average Scores: 0.009345463721837399


### XGBoost

In [10]:
import xgboost as xgb
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, roc_auc_score, average_precision_score

# Create XGBClassifier instance
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)

# Define scorers
scorers = {
    'roc_auc': 'roc_auc',  # Uses the ROC AUC scoring
    'pr_auc': make_scorer(average_precision_score)  # Uses the Precision-Recall AUC scoring
}

# Cross-validation with multiple scoring metrics
cv_results = cross_validate(xgb_model, X, y, cv=5, scoring=scorers)

# Calculate averages of ROC AUC and PR AUC for each fold
average_scores = (cv_results['test_roc_auc'] + cv_results['test_pr_auc']) / 2

print("Cross-Validation ROC AUC Scores:", cv_results['test_roc_auc'])
print("Mean ROC AUC Score:", cv_results['test_roc_auc'].mean())
print("Standard Deviation of ROC AUC Scores:", cv_results['test_roc_auc'].std())
print('-' * 100)
print("Cross-Validation PR AUC Scores:", cv_results['test_pr_auc'])
print("Mean PR AUC Score:", cv_results['test_pr_auc'].mean())
print("Standard Deviation of PR AUC Scores:", cv_results['test_pr_auc'].std())
print('-' * 100)
print("Average of ROC AUC and PR AUC Scores for Each Fold:", average_scores)
print("Mean of Average Scores:", average_scores.mean())
print("Standard Deviation of Average Scores:", average_scores.std())

Cross-Validation ROC AUC Scores: [0.89004002 0.87733013 0.87966446 0.88480393 0.88732616]
Mean ROC AUC Score: 0.8838329401047893
Standard Deviation of ROC AUC Scores: 0.004718812695889388
----------------------------------------------------------------------------------------------------
Cross-Validation PR AUC Scores: [0.22111677 0.21106019 0.19523733 0.20722406 0.1636434 ]
Mean PR AUC Score: 0.19965635108707397
Standard Deviation of PR AUC Scores: 0.019820837290330188
----------------------------------------------------------------------------------------------------
Average of ROC AUC and PR AUC Scores for Each Fold: [0.5555784  0.54419516 0.5374509  0.546014   0.52548478]
Mean of Average Scores: 0.5417446455959316
Standard Deviation of Average Scores: 0.009984233545896618
