# Analysis of predictions by DNABERT

In [1]:
import pandas as pd

  from pandas.core import (


In [2]:
# Load the DHS sequence table and keep only the identifier and raw sequence columns.
sequences_path = "../DNABERT/examples/dhs2/heart_specific_dhs_sequences.tsv"
df_sequences = pd.read_csv(sequences_path, sep="\t")
df_sequences = df_sequences[["dhs_id", "sequence"]]
df_sequences

Unnamed: 0,dhs_id,sequence
0,chr1_181400_181564_181490,cgcccaggggaggaggcgtggcgcaggcgcagagaggcgcgccgtg...
1,chr1_629160_629310_629230,CACAAACATTATTATAATAAACACCCTCACCACTACAATCTTCCTA...
2,chr1_629520_629596_629590,ccatccctgagaatccaaaattctccgtgccacctatcacacccca...
3,chr1_629870_630020_629930,CAATATACTCTCCGGACAATGAACCATAACCAATACCACCAATCAA...
4,chr1_630181_630319_630270,ACTCCTCAATTACCCACATAGGATGAATAACAGCAGTTCTACCGTA...
...,...,...
93901,chrY_19354160_19354375_19354270,tgtgagctgttctgaaaaacttgtgactatgcgtggcctgggacct...
93902,chrY_19567050_19567360_19567210,CTAGAACGTTGCATTACGCTCCAATCCCGAGCAGGTCCAGAGAGCC...
93903,chrY_19744660_19745060_19744810,TAGCTGCTTACCAATCGTCAGGGATCCTAGTTTTACAGCCACCATC...
93904,chrY_20575532_20575800_20575670,GGGCCCCGCCCATTTCATCCTTGACTCCACCTTCTCCATGCTGAGT...


In [3]:
# Load the 6-mer table (one row per DHS, with its label).
kmers_path = "../DNABERT/examples/dhs2/ft/6/heart_specific_dhs_6mers_with_dhs_ids.tsv"
df_kmers = pd.read_csv(kmers_path, sep="\t")
# NOTE(review): the displayed frame has a 'sequence_6mers' column and no 'sequence'
# column, so this rename appears to be a no-op — confirm the intended source name.
df_kmers = df_kmers.rename(columns={'sequence': 'kmers'})
df_kmers

Unnamed: 0,dhs_id,sequence_6mers,label
0,chr1_181400_181564_181490,cgccca gcccag cccagg ccaggg cagggg agggga gggg...,0
1,chr1_629160_629310_629230,CACAAA ACAAAC CAAACA AAACAT AACATT ACATTA CATT...,0
2,chr1_629520_629596_629590,ccatcc catccc atccct tccctg ccctga cctgag ctga...,0
3,chr1_629870_630020_629930,CAATAT AATATA ATATAC TATACT ATACTC TACTCT ACTC...,0
4,chr1_630181_630319_630270,ACTCCT CTCCTC TCCTCA CCTCAA CTCAAT TCAATT CAAT...,0
...,...,...,...
93901,chrY_19354160_19354375_19354270,tgtgag gtgagc tgagct gagctg agctgt gctgtt ctgt...,1
93902,chrY_19567050_19567360_19567210,CTAGAA TAGAAC AGAACG GAACGT AACGTT ACGTTG CGTT...,0
93903,chrY_19744660_19745060_19744810,TAGCTG AGCTGC GCTGCT CTGCTT TGCTTA GCTTAC CTTA...,0
93904,chrY_20575532_20575800_20575670,GGGCCC GGCCCC GCCCCG CCCCGC CCCGCC CCGCCC CGCC...,0


In [4]:
import numpy as np

# Load the prediction scores stored in pred_results.npy and wrap them in a
# DataFrame with a single column named 'preds'.
preds_path = "../DNABERT/examples/dhs2/result/6/pred_results.npy"
array = np.load(preds_path)
df_preds = pd.DataFrame(data=array, columns=['preds'])
df_preds


Unnamed: 0,preds
0,0.005358
1,0.002742
2,0.000048
3,0.073776
4,0.063507
...,...
16165,0.016648
16166,0.003751
16167,0.004595
16168,0.000902


In [None]:
# Attach the k-mer/label columns to each sequence via the shared 'dhs_id' key
# (inner join, which is the merge default).
df_preds_final = df_sequences.merge(df_kmers, on="dhs_id")


In [None]:
# The prediction frame carries no identifier, so scores can only be attached to
# the labelled rows by position. pd.concat(axis=1) on frames of different
# lengths does not fail: it pads the shorter one with NaN, leaving rows without
# scores and giving no guarantee that the paired rows belong together. Fail
# loudly here instead of propagating NaN into the metrics below.
if len(df_preds_final) != len(df_preds):
    raise ValueError(
        f"Cannot align predictions by position: {len(df_preds_final)} labelled rows "
        f"vs {len(df_preds)} predictions. Load the sequence/k-mer table for the same "
        "split the predictions were generated from."
    )

# Reset both indexes so the column-wise concat pairs row i with prediction i.
df_combined = pd.concat(
    [df_preds_final.reset_index(drop=True), df_preds.reset_index(drop=True)],
    axis=1,
)
df_combined

In [None]:
# Show the column names of the combined frame.
df_combined.columns

# Analysis of results 

## 1. Accuracy and Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Turn the scores into hard class calls by rounding to the nearest integer
# (for scores in [0, 1] this is a 0.5 cut-off).
predicted_labels = df_combined['preds'].round()
cm = confusion_matrix(df_combined['label'], predicted_labels)

# Accuracy is the share of samples on the matrix diagonal (correct calls).
accuracy = cm.trace() / cm.sum()

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='cividis', cbar=False, square=True, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title(f'Confusion Matrix (accuracy = {accuracy:.3f})')
plt.show()

## 2. ROC Curve and AUC for Model Performance

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC points computed from the raw scores, and the area under that curve.
fpr, tpr, thresholds = roc_curve(df_combined['label'], df_combined['preds'])
roc_auc = auc(fpr, tpr)

# Draw the curve against the diagonal reference line.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()

## 3. Analysis of Top Predicted DHSs

In [None]:
# Keep only the rows labelled 1 whose score is 0.9 or higher.
is_confident = df_combined['preds'] >= 0.9
is_positive = df_combined['label'] == 1
top_predicted_dhs = df_combined.loc[is_confident & is_positive]

print("Top Predicted DHSs with preds >= 0.9 and label == 1:")
print(top_predicted_dhs[['dhs_id', 'label', 'preds']])

## 4. Visualize the Genomic Distribution of Top Predicted DHSs

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Parse 'dhs_id' into 'Chr', 'start', 'end', 'summit'. The pattern is anchored
# on the three trailing numeric fields, so a chromosome/contig name that itself
# contains underscores stays intact in 'Chr' (a plain split on '_' would yield
# more than four columns and break the assignment). Values remain strings.
dhs_coords = top_predicted_dhs['dhs_id'].str.extract(
    r'^(?P<Chr>.+)_(?P<start>\d+)_(?P<end>\d+)_(?P<summit>\d+)$'
)

# assign() builds a new frame instead of writing into the filtered slice, which
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
top_predicted_dhs = top_predicted_dhs.assign(
    **{col: dhs_coords[col] for col in dhs_coords.columns}
)

# Count the number of DHSs per chromosome ('Chr')
dhs_per_chr = top_predicted_dhs['Chr'].value_counts()

fig, ax = plt.subplots(figsize=(12, 8))
dhs_per_chr.plot(kind='bar', ax=ax, rot=45)
ax.set_title('Distribution of Highly Confident True Positive DHSs Across Chromosomes')
ax.set_xlabel('Chromosome')
ax.set_ylabel('Number of DHSs')
plt.show()


## 5. Prepare data for Functional Enrichment Analysis

In [None]:
# Take an independent copy of the top hits for export. Nothing is stripped here:
# the 'Chr' values (including any 'chr' prefix) are left exactly as parsed.
top_hits=top_predicted_dhs.copy()

In [None]:
# Write the region coordinates as a headerless, tab-separated file and show them.
# NOTE(review): this writes under 'examples/dhs/' while the inputs in this notebook
# are read from 'examples/dhs2/' — confirm the output directory is intended.
region_columns = ['Chr', 'start', 'end']
top_hit_regions = top_hits[region_columns]
top_hit_regions.to_csv('../DNABERT/examples/dhs/top_dhs_regions.txt', sep='\t', index=False, header=False)
top_hit_regions

## 6. Submit the top results to the [GREAT Functional Enrichment site](https://great.stanford.edu/great/public/html/)

## 7. Analyse results from GREAT

In [None]:
import pandas as pd

# Read the GREAT export as tab-separated text, skipping the first three lines
# of the file before the header row.
great_export_path = '../DNABERT/examples/dhs2/greatExportAll.tsv'
great_output_df = pd.read_csv(great_export_path, sep='\t', skiprows=3)

# Print the frame to verify that it parsed as expected.
print(great_output_df)

In [None]:
# Define heart-related keywords
heart_keywords = ['heart', 'cardiac', 'cardiovascular', 'atrium', 'ventricle', 'myocardial', 'coronary']

# Filter rows where any of the heart_keywords appear in the 'Desc' column
heart_related_rows = great_output_df[great_output_df['Desc'].str.contains('|'.join(heart_keywords), case=False, na=False)]

# Display heart-related rows
print(heart_related_rows)

In [None]:
# List the distinct 'Desc' values among the heart-related rows.
heart_related_rows["Desc"].unique()