In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score, precision_score, recall_score, f1_score



# Train Organism Promoter Data

In [2]:
import os
import pandas as pd

# Define the directory where your .txt files are located
directory = "D:\OHE Data\Promoter data\Training Data"

# Initialize an empty list to store DataFrames from each file
dfs = []

# Loop through all .txt files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(directory, filename), "r") as file:
            sequence_data = file.read()

        num_columns = 100

        # Filter out newline characters from the sequence data
        sequence_data = sequence_data.replace('\n', '')

        # Split the sequence data into dinucleotides
        dinucleotides = [sequence_data[i:i+2] for i in range(0, len(sequence_data), 2)]

        # Split dinucleotides into rows of 50 columns
        rows = [dinucleotides[i:i+50] for i in range(0, len(dinucleotides), 50)]

        # Create a DataFrame for the current file
        df = pd.DataFrame(rows)

        # Modify the column names to start from -80 and increment by 2
        column_names = [str(i) for i in range(-80, 20, 2)]
        df.columns = column_names

        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames into one
combined_df = pd.concat(dfs, ignore_index=True)

# Now, combined_df contains the data with dinucleotides in each row and columns starting from -80
combined_df


Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,0,2,4,6,8,10,12,14,16,18
0,CA,TT,TC,GC,CA,AG,CG,TT,CA,TG,...,TC,AA,GA,GA,AA,GC,GA,GC,AA,GA
1,TA,AT,TG,CA,TG,AA,AG,CC,CT,TT,...,GT,TT,TA,AA,AA,CA,AA,CG,AA,TT
2,CT,TT,CA,CT,CT,TT,AA,CC,CT,TA,...,CA,TT,TC,TT,TT,TG,TT,AT,AA,TG
3,AA,AC,GC,GC,AA,AA,AA,TG,CA,AA,...,TA,AT,AC,AT,TC,TT,AC,TT,AA,TG
4,GA,CT,TA,AT,AA,TC,CT,TA,TA,GT,...,CG,CT,TA,AT,AA,CA,AT,AA,GC,GC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14664,AC,TG,CA,CG,GC,TT,GC,TT,GT,GG,...,GA,AA,AG,AT,AT,GA,CT,GA,TA,TT
14665,TC,AT,TC,AT,TT,CC,GA,TT,TA,AT,...,GA,AT,TA,TT,GC,TC,CT,TG,CC,CA
14666,TG,GA,AA,AA,AG,TA,TA,AT,GC,TC,...,AT,GA,AG,AG,AG,GT,GA,AG,GG,TC
14667,TT,CA,CT,TG,CT,GA,TA,AA,CA,AA,...,GT,TC,AT,GT,GA,AG,GT,AG,GT,GA


In [3]:
combined_df['label']=1
combined_df

Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,2,4,6,8,10,12,14,16,18,label
0,CA,TT,TC,GC,CA,AG,CG,TT,CA,TG,...,AA,GA,GA,AA,GC,GA,GC,AA,GA,1
1,TA,AT,TG,CA,TG,AA,AG,CC,CT,TT,...,TT,TA,AA,AA,CA,AA,CG,AA,TT,1
2,CT,TT,CA,CT,CT,TT,AA,CC,CT,TA,...,TT,TC,TT,TT,TG,TT,AT,AA,TG,1
3,AA,AC,GC,GC,AA,AA,AA,TG,CA,AA,...,AT,AC,AT,TC,TT,AC,TT,AA,TG,1
4,GA,CT,TA,AT,AA,TC,CT,TA,TA,GT,...,CT,TA,AT,AA,CA,AT,AA,GC,GC,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14664,AC,TG,CA,CG,GC,TT,GC,TT,GT,GG,...,AA,AG,AT,AT,GA,CT,GA,TA,TT,1
14665,TC,AT,TC,AT,TT,CC,GA,TT,TA,AT,...,AT,TA,TT,GC,TC,CT,TG,CC,CA,1
14666,TG,GA,AA,AA,AG,TA,TA,AT,GC,TC,...,GA,AG,AG,AG,GT,GA,AG,GG,TC,1
14667,TT,CA,CT,TG,CT,GA,TA,AA,CA,AA,...,TC,AT,GT,GA,AG,GT,AG,GT,GA,1


# Train Organism Downstream Data

In [4]:
downstream_directory = "D:\OHE Data\Downstream data\Training Data"

# Initialize an empty list to store DataFrames from each file
downstream_dfs = []

# Loop through all .txt files in the directory
for filename in os.listdir(downstream_directory):
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(downstream_directory, filename), "r") as file:
            downstream_sequence_data = file.read()

        num_columns = 100

        # Filter out newline characters from the sequence data
        downstream_sequence_data = downstream_sequence_data.replace('\n', '')

        # Split the sequence data into dinucleotides
        downstream_dinucleotides = [downstream_sequence_data[i:i+2] for i in range(0, len(downstream_sequence_data), 2)]

        # Split dinucleotides into rows of 50 columns
        downstream_rows = [downstream_dinucleotides[i:i+50] for i in range(0, len(downstream_dinucleotides), 50)]

        # Create a DataFrame for the current file
        downstream_df = pd.DataFrame(downstream_rows)

        # Modify the column names to start from -80 and increment by 2
        column_names = [str(i) for i in range(-80, 20, 2)]
        downstream_df.columns = column_names

        # Append the DataFrame to the list
        downstream_dfs.append(downstream_df)

# Concatenate all DataFrames into one
downstream_dataframe = pd.concat(downstream_dfs, ignore_index=True)

# Now, combined_df contains the data with dinucleotides in each row and columns starting from -80
downstream_dataframe


Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,0,2,4,6,8,10,12,14,16,18
0,AG,TA,TC,AT,TT,AC,AG,GA,AG,CG,...,CG,AT,CG,AT,CA,GC,CG,GG,AA,GT
1,GG,AG,AA,GG,AA,CC,GT,TC,AG,GC,...,AC,CG,GT,AA,TT,GC,CG,CA,TA,CG
2,TG,GA,AG,AC,AA,CC,CG,TA,TC,TC,...,CA,CA,GG,TA,TA,AA,CA,GC,TT,GA
3,GT,GA,AT,TT,GT,TC,GG,CG,GG,AG,...,CC,CC,GG,CA,TC,GC,TG,CT,TT,AA
4,AA,AG,AC,GC,TC,TT,GA,AA,TT,TA,...,TG,GT,GA,AG,TC,TC,TG,AT,GA,TA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14664,AT,TG,CT,GG,TT,GG,TT,GT,TT,GT,...,GA,CC,CG,TC,AA,GA,GT,TG,CC,CA
14665,GA,AG,TC,AC,CA,GT,CT,GC,CC,CA,...,TC,TA,CA,GC,AA,TT,AC,TC,CT,AC
14666,TT,TT,TG,GC,GT,AA,AT,TA,GC,AA,...,AA,GA,AG,AT,TT,TA,GA,CA,GT,TG
14667,CG,AA,TG,GA,AG,AT,CT,TT,TA,CA,...,AT,TA,GA,CC,GT,AT,TT,TT,AC,AC


In [5]:
downstream_dataframe['label']=0
downstream_dataframe

Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,2,4,6,8,10,12,14,16,18,label
0,AG,TA,TC,AT,TT,AC,AG,GA,AG,CG,...,AT,CG,AT,CA,GC,CG,GG,AA,GT,0
1,GG,AG,AA,GG,AA,CC,GT,TC,AG,GC,...,CG,GT,AA,TT,GC,CG,CA,TA,CG,0
2,TG,GA,AG,AC,AA,CC,CG,TA,TC,TC,...,CA,GG,TA,TA,AA,CA,GC,TT,GA,0
3,GT,GA,AT,TT,GT,TC,GG,CG,GG,AG,...,CC,GG,CA,TC,GC,TG,CT,TT,AA,0
4,AA,AG,AC,GC,TC,TT,GA,AA,TT,TA,...,GT,GA,AG,TC,TC,TG,AT,GA,TA,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14664,AT,TG,CT,GG,TT,GG,TT,GT,TT,GT,...,CC,CG,TC,AA,GA,GT,TG,CC,CA,0
14665,GA,AG,TC,AC,CA,GT,CT,GC,CC,CA,...,TA,CA,GC,AA,TT,AC,TC,CT,AC,0
14666,TT,TT,TG,GC,GT,AA,AT,TA,GC,AA,...,GA,AG,AT,TT,TA,GA,CA,GT,TG,0
14667,CG,AA,TG,GA,AG,AT,CT,TT,TA,CA,...,TA,GA,CC,GT,AT,TT,TT,AC,AC,0


In [6]:
# Concatenate the two DataFrames vertically
data = pd.concat([combined_df, downstream_dataframe], ignore_index=True)
data

Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,2,4,6,8,10,12,14,16,18,label
0,CA,TT,TC,GC,CA,AG,CG,TT,CA,TG,...,AA,GA,GA,AA,GC,GA,GC,AA,GA,1
1,TA,AT,TG,CA,TG,AA,AG,CC,CT,TT,...,TT,TA,AA,AA,CA,AA,CG,AA,TT,1
2,CT,TT,CA,CT,CT,TT,AA,CC,CT,TA,...,TT,TC,TT,TT,TG,TT,AT,AA,TG,1
3,AA,AC,GC,GC,AA,AA,AA,TG,CA,AA,...,AT,AC,AT,TC,TT,AC,TT,AA,TG,1
4,GA,CT,TA,AT,AA,TC,CT,TA,TA,GT,...,CT,TA,AT,AA,CA,AT,AA,GC,GC,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29333,AT,TG,CT,GG,TT,GG,TT,GT,TT,GT,...,CC,CG,TC,AA,GA,GT,TG,CC,CA,0
29334,GA,AG,TC,AC,CA,GT,CT,GC,CC,CA,...,TA,CA,GC,AA,TT,AC,TC,CT,AC,0
29335,TT,TT,TG,GC,GT,AA,AT,TA,GC,AA,...,GA,AG,AT,TT,TA,GA,CA,GT,TG,0
29336,CG,AA,TG,GA,AG,AT,CT,TT,TA,CA,...,TA,GA,CC,GT,AT,TT,TT,AC,AC,0


# Model Training

In [7]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, KFold
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix

# Define features and target
X = data.drop('label', axis=1)
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)

# Create a pipeline with encoding and model
pipeline = Pipeline([
    ('encoder', OneHotEncoder(sparse=False, dtype=int)),
    ('model', XGBClassifier(
        colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_depth=None, min_child_weight=5,
        n_estimators=300, subsample=1.0, n_jobs=-1, random_state=101
    ))
])

# Create KFold cross-validation iterator
kf = KFold(n_splits=10, shuffle=True, random_state=101)

# Initialize lists to store evaluation results
accuracy_list = []
precision_list = []
recall_list = []
f1_score_list = []
mcc_list = []
specificity_list = []

# Perform KFold cross-validation
fold_no = 1
for train_idx, test_idx in kf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # Fit the pipeline on the training fold
    pipeline.fit(X_train_fold, y_train_fold)

    # Predict on the validation fold
    y_pred = pipeline.predict(X_val_fold)

    # Calculate metrics and store in lists
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred)
    recall = recall_score(y_val_fold, y_pred)
    f1 = f1_score(y_val_fold, y_pred)
    mcc = matthews_corrcoef(y_val_fold, y_pred)

    tn, fp, fn, tp = confusion_matrix(y_val_fold, y_pred).ravel()
    specificity = tn / (tn + fp)

    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_score_list.append(f1)
    mcc_list.append(mcc)
    specificity_list.append(specificity)

    # Print the results for this fold
    print("Fold {}: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}, MCC: {:.4f}, Specificity: {:.4f}".format(
        fold_no, accuracy, precision, recall, f1, mcc, specificity))

    fold_no += 1

# Calculate and print the mean and standard deviation of each metric
print("Mean Accuracy: {:.4f}, Std Accuracy: {:.4f}".format(np.mean(accuracy_list), np.std(accuracy_list)))
print("Mean Precision: {:.4f}, Std Precision: {:.4f}".format(np.mean(precision_list), np.std(precision_list)))
print("Mean Recall: {:.4f}, Std Recall: {:.4f}".format(np.mean(recall_list), np.std(recall_list)))
print("Mean F1 Score: {:.4f}, Std F1 Score: {:.4f}".format(np.mean(f1_score_list), np.std(f1_score_list)))
print("Mean MCC: {:.4f}, Std MCC: {:.4f}".format(np.mean(mcc_list), np.std(mcc_list)))
print("Mean Specificity: {:.4f}, Std Specificity: {:.4f}".format(np.mean(specificity_list), np.std(specificity_list)))


Fold 1: Accuracy: 0.8716, Precision: 0.8912, Recall: 0.8551, F1 Score: 0.8728, MCC: 0.7440, Specificity: 0.8891
Fold 2: Accuracy: 0.8754, Precision: 0.8840, Recall: 0.8588, F1 Score: 0.8712, MCC: 0.7509, Specificity: 0.8914
Fold 3: Accuracy: 0.8811, Precision: 0.8974, Recall: 0.8638, F1 Score: 0.8802, MCC: 0.7628, Specificity: 0.8989
Fold 4: Accuracy: 0.8648, Precision: 0.8745, Recall: 0.8513, F1 Score: 0.8627, MCC: 0.7299, Specificity: 0.8783
Fold 5: Accuracy: 0.8708, Precision: 0.8812, Recall: 0.8579, F1 Score: 0.8694, MCC: 0.7419, Specificity: 0.8838
Fold 6: Accuracy: 0.8807, Precision: 0.8924, Recall: 0.8705, F1 Score: 0.8814, MCC: 0.7616, Specificity: 0.8912
Fold 7: Accuracy: 0.8674, Precision: 0.8869, Recall: 0.8472, F1 Score: 0.8666, MCC: 0.7357, Specificity: 0.8883
Fold 8: Accuracy: 0.8773, Precision: 0.8749, Recall: 0.8708, F1 Score: 0.8728, MCC: 0.7543, Specificity: 0.8833
Fold 9: Accuracy: 0.8633, Precision: 0.8765, Recall: 0.8438, F1 Score: 0.8598, MCC: 0.7269, Specificity:

In [8]:
import pickle

# Save the trained model as a pickle file
with open('ohe_dinucleotide.pickle', 'wb') as file:
    pickle.dump(pipeline, file)


## Test Organism Promoter data

In [23]:
test_directory = "D:\OHE Data\Promoter data\Test Data"

# Initialize an empty list to store DataFrames from each file
test_dfs = []

# Loop through all .txt files in the directory
for filename in os.listdir(test_directory):
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(test_directory, filename), "r") as file:
            test_sequence_data = file.read()

        num_columns = 100

        # Filter out newline characters from the sequence data
        test_sequence_data = test_sequence_data.replace('\n', '')

        # Split the sequence data into dinucleotides
        test_dinucleotides = [test_sequence_data[i:i+2] for i in range(0, len(test_sequence_data), 2)]

        # Split dinucleotides into rows of 50 columns
        test_rows = [test_dinucleotides[i:i+50] for i in range(0, len(test_dinucleotides), 50)]

        # Create a DataFrame for the current file
        test_df = pd.DataFrame(test_rows)

        # Modify the column names to start from -80 and increment by 2
        column_names = [str(i) for i in range(-80, 20, 2)]
        test_df.columns = column_names

        # Append the DataFrame to the list
        test_dfs.append(test_df)

# Concatenate all DataFrames into one
test_df = pd.concat(test_dfs, ignore_index=True)

# Now, combined_df contains the data
test_df


Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,0,2,4,6,8,10,12,14,16,18
0,TC,CA,TA,TC,AG,TA,AA,AT,TT,CT,...,TA,CG,GA,TT,GT,CC,TC,TT,TG,CC
1,CC,TG,GA,AT,GA,AT,TG,CC,GC,GC,...,GA,AA,TT,AC,CT,GC,AA,GG,AC,TG
2,CG,CA,GA,GT,GT,GA,TA,AA,CT,AT,...,TG,CC,GG,CA,TG,GA,CA,AG,AG,GA
3,TA,AT,CT,GC,AT,AG,TT,TA,TC,AC,...,AG,AC,CG,GT,TA,CA,TC,CC,CT,TA
4,TG,CT,GT,AA,TC,GC,GC,AG,GT,GG,...,TA,AC,GC,CA,CA,GT,TT,GA,AC,GG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2305,CT,TT,CT,AA,AC,TC,AT,TC,TA,AA,...,GA,AA,TT,TT,TT,CC,TG,CC,AG,CG
2306,AC,GA,TC,TG,AA,AG,TT,CA,GT,AA,...,TA,TC,GC,GA,AT,GA,GT,AG,CC,TG
2307,TT,AA,AT,AT,CT,CT,GA,TC,CA,GA,...,GA,CT,AA,TT,CT,GA,TG,CC,AG,CA
2308,GC,CG,CC,GG,AA,GG,AA,TG,CT,GG,...,TA,TG,TA,GT,GA,TG,TT,GT,AT,AA


In [25]:
test_df['label']=1
test_df

Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,2,4,6,8,10,12,14,16,18,label
0,TC,CA,TA,TC,AG,TA,AA,AT,TT,CT,...,CG,GA,TT,GT,CC,TC,TT,TG,CC,1
1,CC,TG,GA,AT,GA,AT,TG,CC,GC,GC,...,AA,TT,AC,CT,GC,AA,GG,AC,TG,1
2,CG,CA,GA,GT,GT,GA,TA,AA,CT,AT,...,CC,GG,CA,TG,GA,CA,AG,AG,GA,1
3,TA,AT,CT,GC,AT,AG,TT,TA,TC,AC,...,AC,CG,GT,TA,CA,TC,CC,CT,TA,1
4,TG,CT,GT,AA,TC,GC,GC,AG,GT,GG,...,AC,GC,CA,CA,GT,TT,GA,AC,GG,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2305,CT,TT,CT,AA,AC,TC,AT,TC,TA,AA,...,AA,TT,TT,TT,CC,TG,CC,AG,CG,1
2306,AC,GA,TC,TG,AA,AG,TT,CA,GT,AA,...,TC,GC,GA,AT,GA,GT,AG,CC,TG,1
2307,TT,AA,AT,AT,CT,CT,GA,TC,CA,GA,...,CT,AA,TT,CT,GA,TG,CC,AG,CA,1
2308,GC,CG,CC,GG,AA,GG,AA,TG,CT,GG,...,TG,TA,GT,GA,TG,TT,GT,AT,AA,1


## Test downstream data

In [24]:
downstream_test_directory = "D:\OHE Data\Downstream data\Test Data"

# Initialize an empty list to store DataFrames from each file
downstream_test_dfs = []

# Loop through all .txt files in the directory
for filename in os.listdir(downstream_test_directory):
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(downstream_test_directory, filename), "r") as file:
            downstream_test_sequence_data = file.read()

        num_columns = 100

        # Filter out newline characters from the sequence data
        downstream_test_sequence_data = downstream_test_sequence_data.replace('\n', '')

        # Split the sequence data into dinucleotides
        downstream_test_dinucleotides = [downstream_test_sequence_data[i:i+2] for i in range(0, len(downstream_test_sequence_data), 2)]

        # Split dinucleotides into rows of 50 columns
        downstream_test_rows = [downstream_test_dinucleotides[i:i+50] for i in range(0, len(downstream_test_dinucleotides), 50)]

        # Create a DataFrame for the current file
        downstream_test_df = pd.DataFrame(downstream_test_rows)

        # Modify the column names to start from -80 and increment by 2
        column_names = [str(i) for i in range(-80, 20, 2)]
        downstream_test_df.columns = column_names

        # Append the DataFrame to the list
        downstream_test_dfs.append(downstream_test_df)

# Concatenate all DataFrames into one
downstream_test_df = pd.concat(downstream_test_dfs, ignore_index=True)

# Now, combined_df contains the data
downstream_test_df


Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,0,2,4,6,8,10,12,14,16,18
0,GG,AG,GA,AA,CA,TG,CG,TA,TT,TT,...,AT,GA,AG,AT,AA,AT,TA,AC,GC,CA
1,CA,CC,GC,AA,AT,CG,GC,AA,GC,TG,...,GA,CT,TT,AC,GC,CG,CA,AA,AT,GC
2,AA,TT,CG,GC,CG,CA,GT,CT,AA,AA,...,GC,AG,CC,CG,GG,TC,GG,CA,GC,GG
3,CT,TC,CC,TG,AT,TC,TC,GG,CG,CG,...,AG,CA,GC,GA,TC,GC,GC,GC,AG,CA
4,CT,GC,AA,AC,CC,GT,CT,GA,TT,CC,...,TC,TG,GT,GC,TG,AC,CA,CC,GG,CG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2305,CT,GG,GA,CT,TT,TT,GT,CA,TT,CT,...,TT,AT,AC,GC,TC,AC,GA,AT,AG,TC
2306,CC,GG,CG,CG,GC,GA,AA,AT,CG,TC,...,CA,GG,CA,TC,TG,TA,AA,GA,TA,TA
2307,TA,AT,GA,AT,GT,TA,AA,AA,CA,CC,...,AA,AC,TG,GA,TG,TC,AC,TC,AA,CC
2308,CG,TT,CG,AC,TA,TC,TC,AA,AA,CC,...,GG,AT,AT,CT,GA,GT,CA,GT,TA,CT


In [26]:
downstream_test_df['label']=0
downstream_test_df

Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,2,4,6,8,10,12,14,16,18,label
0,GG,AG,GA,AA,CA,TG,CG,TA,TT,TT,...,GA,AG,AT,AA,AT,TA,AC,GC,CA,0
1,CA,CC,GC,AA,AT,CG,GC,AA,GC,TG,...,CT,TT,AC,GC,CG,CA,AA,AT,GC,0
2,AA,TT,CG,GC,CG,CA,GT,CT,AA,AA,...,AG,CC,CG,GG,TC,GG,CA,GC,GG,0
3,CT,TC,CC,TG,AT,TC,TC,GG,CG,CG,...,CA,GC,GA,TC,GC,GC,GC,AG,CA,0
4,CT,GC,AA,AC,CC,GT,CT,GA,TT,CC,...,TG,GT,GC,TG,AC,CA,CC,GG,CG,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2305,CT,GG,GA,CT,TT,TT,GT,CA,TT,CT,...,AT,AC,GC,TC,AC,GA,AT,AG,TC,0
2306,CC,GG,CG,CG,GC,GA,AA,AT,CG,TC,...,GG,CA,TC,TG,TA,AA,GA,TA,TA,0
2307,TA,AT,GA,AT,GT,TA,AA,AA,CA,CC,...,AC,TG,GA,TG,TC,AC,TC,AA,CC,0
2308,CG,TT,CG,AC,TA,TC,TC,AA,AA,CC,...,AT,AT,CT,GA,GT,CA,GT,TA,CT,0


In [27]:
test= pd.concat([test_df, downstream_test_df], ignore_index= True)
test

Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,2,4,6,8,10,12,14,16,18,label
0,TC,CA,TA,TC,AG,TA,AA,AT,TT,CT,...,CG,GA,TT,GT,CC,TC,TT,TG,CC,1
1,CC,TG,GA,AT,GA,AT,TG,CC,GC,GC,...,AA,TT,AC,CT,GC,AA,GG,AC,TG,1
2,CG,CA,GA,GT,GT,GA,TA,AA,CT,AT,...,CC,GG,CA,TG,GA,CA,AG,AG,GA,1
3,TA,AT,CT,GC,AT,AG,TT,TA,TC,AC,...,AC,CG,GT,TA,CA,TC,CC,CT,TA,1
4,TG,CT,GT,AA,TC,GC,GC,AG,GT,GG,...,AC,GC,CA,CA,GT,TT,GA,AC,GG,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4615,CT,GG,GA,CT,TT,TT,GT,CA,TT,CT,...,AT,AC,GC,TC,AC,GA,AT,AG,TC,0
4616,CC,GG,CG,CG,GC,GA,AA,AT,CG,TC,...,GG,CA,TC,TG,TA,AA,GA,TA,TA,0
4617,TA,AT,GA,AT,GT,TA,AA,AA,CA,CC,...,AC,TG,GA,TG,TC,AC,TC,AA,CC,0
4618,CG,TT,CG,AC,TA,TC,TC,AA,AA,CC,...,AT,AT,CT,GA,GT,CA,GT,TA,CT,0


## Test Predictions

In [28]:
X_test = test.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_test = pipeline.predict(X_test)

# Predictions
y_true_test = test['label']

# Calculate evaluation metrics for "Bacillus" data
accuracy_test = accuracy_score(y_true_test, y_pred_test)
precision_test = precision_score(y_true_test, y_pred_test)
recall_test = recall_score(y_true_test, y_pred_test)
f1_score_test = f1_score(y_true_test, y_pred_test)
mcc_test = matthews_corrcoef(y_true_test, y_pred_test)
tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_true_test, y_pred_test).ravel()
specificity_test = tn_test / (tn_test + fp_test)

# Print the results for "Test" data
print("Accuracy on Test Data:", accuracy_test)
print("Precision on Test Data:", precision_test)
print("Recall on Test Data:", recall_test)
print("F1 Score on Test Data:", f1_score_test)
print("MCC on Test Data:", mcc_test)
print("Specificity on Test Data:", specificity_test)


Accuracy on Test Data: 0.9136363636363637
Precision on Test Data: 0.8924024640657084
Recall on Test Data: 0.9406926406926407
F1 Score on Test Data: 0.9159114857744994
MCC on Test Data: 0.8284865906973271
Specificity on Test Data: 0.8865800865800866


# Bacillus

In [31]:

Bacillus_file_path = "D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequence_Bacillus_amyloliquefaciens.txt"
Bacillus_downstream_file_path = "D:\OHE Data\Downstream data\Training Data\Bacillus amyloliquefaciens.txt"

#Promoter seq data transformation
with open(Bacillus_file_path, "r") as Bacillus_file:
    Bacillus_sequence_data = Bacillus_file.read()

Bacillus_sequence_data = Bacillus_sequence_data.replace('\n', '')

Bacillus_dinucleotides = [Bacillus_sequence_data[i:i+2] for i in range(0, len(Bacillus_sequence_data), 2)]

Bacillus_rows = [Bacillus_dinucleotides[i:i+50] for i in range(0, len(Bacillus_dinucleotides), 50)]

Bacillus_df = pd.DataFrame(Bacillus_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Bacillus_df.columns = column_names
Bacillus_df['label'] = 1


#Downstream data transformation: 

with open(Bacillus_downstream_file_path, "r") as Bacillus_downstream_file:
    Bacillus_downstream_sequence_data = Bacillus_downstream_file.read()

Bacillus_downstream_sequence_data = Bacillus_downstream_sequence_data.replace('\n', '')

Bacillus_downstream_dinucleotides = [Bacillus_downstream_sequence_data[i:i+2] for i in range(0, len(Bacillus_downstream_sequence_data), 2)]

Bacillus_downstream_rows = [Bacillus_downstream_dinucleotides[i:i+50] for i in range(0, len(Bacillus_downstream_dinucleotides), 50)]

Bacillus_downstream_df = pd.DataFrame(Bacillus_downstream_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Bacillus_downstream_df.columns = column_names
Bacillus_downstream_df['label'] = 0

Bacillus = pd.concat([Bacillus_df, Bacillus_downstream_df], ignore_index=True)
X_Bac = Bacillus.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Bac = pipeline.predict(X_Bac)

# Predictions
y_true_Bac = Bacillus['label']

# Calculate evaluation metrics for "Bacillus" data
accuracy_Bac = accuracy_score(y_true_Bac, y_pred_Bac)
precision_Bac = precision_score(y_true_Bac, y_pred_Bac)
recall_Bac = recall_score(y_true_Bac, y_pred_Bac)
f1_score_Bac = f1_score(y_true_Bac, y_pred_Bac)
mcc_Bac = matthews_corrcoef(y_true_Bac, y_pred_Bac)
tn_Bac, fp_Bac, fn_Bac, tp_Bac = confusion_matrix(y_true_Bac, y_pred_Bac).ravel()
specificity_Bac = tn_Bac / (tn_Bac + fp_Bac)

# Print the results for "Bacillus" data
print("Accuracy on Bacillus Data:", accuracy_Bac)
print("Precision on Bacillus Data:", precision_Bac)
print("Recall on Bacillus Data:", recall_Bac)
print("F1 Score on Bacillus Data:", f1_score_Bac)
print("MCC on Bacillus Data:", mcc_Bac)
print("Specificity on Bacillus Data:", specificity_Bac)


Accuracy on Bacillus Data: 0.779508970727101
Precision on Bacillus Data: 0.8737373737373737
Recall on Bacillus Data: 0.6534466477809254
F1 Score on Bacillus Data: 0.7477039438141544
MCC on Bacillus Data: 0.5776800291399706
Specificity on Bacillus Data: 0.9055712936732767


# Chlamydia pneumoniae

In [32]:
Chlamydia_file_path = "D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequences Chlamydia pneumoniae.txt"
Chlamydia_downstream_file_path = "D:\OHE Data\Downstream data\Training Data\Chlamydia pneumoniae.txt"

# Promoter seq data transformation
with open(Chlamydia_file_path, "r") as Chlamydia_file:
    Chlamydia_sequence_data = Chlamydia_file.read()

Chlamydia_sequence_data = Chlamydia_sequence_data.replace('\n', '')

Chlamydia_dinucleotides = [Chlamydia_sequence_data[i:i+2] for i in range(0, len(Chlamydia_sequence_data), 2)]

Chlamydia_rows = [Chlamydia_dinucleotides[i:i+50] for i in range(0, len(Chlamydia_dinucleotides), 50)]

Chlamydia_df = pd.DataFrame(Chlamydia_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Chlamydia_df.columns = column_names
Chlamydia_df['label'] = 1

# Downstream data transformation:
with open(Chlamydia_downstream_file_path, "r") as Chlamydia_downstream_file:
    Chlamydia_downstream_sequence_data = Chlamydia_downstream_file.read()

Chlamydia_downstream_sequence_data = Chlamydia_downstream_sequence_data.replace('\n', '')

Chlamydia_downstream_dinucleotides = [Chlamydia_downstream_sequence_data[i:i+2] for i in range(0, len(Chlamydia_downstream_sequence_data), 2)]

Chlamydia_downstream_rows = [Chlamydia_downstream_dinucleotides[i:i+50] for i in range(0, len(Chlamydia_downstream_dinucleotides), 50)]

Chlamydia_downstream_df = pd.DataFrame(Chlamydia_downstream_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Chlamydia_downstream_df.columns = column_names
Chlamydia_downstream_df['label'] = 0

Chlamydia = pd.concat([Chlamydia_df, Chlamydia_downstream_df], ignore_index=True)
X_Chlm = Chlamydia.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Chlm = pipeline.predict(X_Chlm)

# Predictions
y_true_Chlm = Chlamydia['label']

# Calculate evaluation metrics for "Chlamydia" data
accuracy_Chlm = accuracy_score(y_true_Chlm, y_pred_Chlm)
precision_Chlm = precision_score(y_true_Chlm, y_pred_Chlm)
recall_Chlm = recall_score(y_true_Chlm, y_pred_Chlm)
f1_score_Chlm = f1_score(y_true_Chlm, y_pred_Chlm)
mcc_Chlm = matthews_corrcoef(y_true_Chlm, y_pred_Chlm)
tn_Chlm, fp_Chlm, fn_Chlm, tp_Chlm = confusion_matrix(y_true_Chlm, y_pred_Chlm).ravel()
specificity_Chlm = tn_Chlm / (tn_Chlm + fp_Chlm)

# Print the results for "Chlamydia" data
print("Accuracy on Chlamydia Data:", accuracy_Chlm)
print("Precision on Chlamydia Data:", precision_Chlm)
print("Recall on Chlamydia Data:", recall_Chlm)
print("F1 Score on Chlamydia Data:", f1_score_Chlm)
print("MCC on Chlamydia Data:", mcc_Chlm)
print("Specificity on Chlamydia Data:", specificity_Chlm)


Accuracy on Chlamydia Data: 0.8886138613861386
Precision on Chlamydia Data: 0.8536036036036037
Recall on Chlamydia Data: 0.9381188118811881
F1 Score on Chlamydia Data: 0.8938679245283019
MCC on Chlamydia Data: 0.7810655286615585
Specificity on Chlamydia Data: 0.8391089108910891


# Corynebacterium

In [34]:

Corynebacterium_file_path = "D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequences Corynebacterium glutamicum.txt"
Corynebacterium_downstream_file_path = "D:\OHE Data\Downstream data\Training Data\Corynebacterium glutamicum.txt"

# Promoter seq data transformation
with open(Corynebacterium_file_path, "r") as Corynebacterium_file:
    Corynebacterium_sequence_data = Corynebacterium_file.read()

Corynebacterium_sequence_data = Corynebacterium_sequence_data.replace('\n', '')

Corynebacterium_dinucleotides = [Corynebacterium_sequence_data[i:i+2] for i in range(0, len(Corynebacterium_sequence_data), 2)]

Corynebacterium_rows = [Corynebacterium_dinucleotides[i:i+50] for i in range(0, len(Corynebacterium_dinucleotides), 50)]

Corynebacterium_df = pd.DataFrame(Corynebacterium_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Corynebacterium_df.columns = column_names
Corynebacterium_df['label'] = 1

# Downstream data transformation:

with open(Corynebacterium_downstream_file_path, "r") as Corynebacterium_downstream_file:
    Corynebacterium_downstream_sequence_data = Corynebacterium_downstream_file.read()

Corynebacterium_downstream_sequence_data = Corynebacterium_downstream_sequence_data.replace('\n', '')

Corynebacterium_downstream_dinucleotides = [Corynebacterium_downstream_sequence_data[i:i+2] for i in range(0, len(Corynebacterium_downstream_sequence_data), 2)]

Corynebacterium_downstream_rows = [Corynebacterium_downstream_dinucleotides[i:i+50] for i in range(0, len(Corynebacterium_downstream_dinucleotides), 50)]

Corynebacterium_downstream_df = pd.DataFrame(Corynebacterium_downstream_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Corynebacterium_downstream_df.columns = column_names
Corynebacterium_downstream_df['label'] = 0

Corynebacterium = pd.concat([Corynebacterium_df, Corynebacterium_downstream_df], ignore_index=True)
X_Corn = Corynebacterium.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Corn = pipeline.predict(X_Corn)

# Predictions
y_true_Corn = Corynebacterium['label']

# Calculate evaluation metrics for "Corynebacterium" data
accuracy_Corn = accuracy_score(y_true_Corn, y_pred_Corn)
precision_Corn = precision_score(y_true_Corn, y_pred_Corn)
recall_Corn = recall_score(y_true_Corn, y_pred_Corn)
f1_score_Corn = f1_score(y_true_Corn, y_pred_Corn)
mcc_Corn = matthews_corrcoef(y_true_Corn, y_pred_Corn)
tn_Corn, fp_Corn, fn_Corn, tp_Corn = confusion_matrix(y_true_Corn, y_pred_Corn).ravel()
specificity_Corn = tn_Corn / (tn_Corn + fp_Corn)

# Print the results for "Corynebacterium" data
print("Accuracy on Corynebacterium Data:", accuracy_Corn)
print("Precision on Corynebacterium Data:", precision_Corn)
print("Recall on Corynebacterium Data:", recall_Corn)
print("F1 Score on Corynebacterium Data:", f1_score_Corn)
print("MCC on Corynebacterium Data:", mcc_Corn)
print("Specificity on Corynebacterium Data:", specificity_Corn)


Accuracy on Corynebacterium Data: 0.9460869565217391
Precision on Corynebacterium Data: 0.9492119089316988
Recall on Corynebacterium Data: 0.9426086956521739
F1 Score on Corynebacterium Data: 0.9458987783595112
MCC on Corynebacterium Data: 0.8921955013999718
Specificity on Corynebacterium Data: 0.9495652173913044


# E.coli

In [35]:
# Define file paths with "Ecoli" instead of "Bacillus"
Ecoli_file_path = "D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequences E.coli.txt"
Ecoli_downstream_file_path = "D:\OHE Data\Downstream data\Training Data\E.coli.txt"

# Promoter seq data transformation
with open(Ecoli_file_path, "r") as Ecoli_file:
    Ecoli_sequence_data = Ecoli_file.read()

Ecoli_sequence_data = Ecoli_sequence_data.replace('\n', '')

Ecoli_dinucleotides = [Ecoli_sequence_data[i:i+2] for i in range(0, len(Ecoli_sequence_data), 2)]

Ecoli_rows = [Ecoli_dinucleotides[i:i+50] for i in range(0, len(Ecoli_dinucleotides), 50)]

Ecoli_df = pd.DataFrame(Ecoli_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Ecoli_df.columns = column_names
Ecoli_df['label'] = 1

# Downstream data transformation:
with open(Ecoli_downstream_file_path, "r") as Ecoli_downstream_file:
    Ecoli_downstream_sequence_data = Ecoli_downstream_file.read()

Ecoli_downstream_sequence_data = Ecoli_downstream_sequence_data.replace('\n', '')

Ecoli_downstream_dinucleotides = [Ecoli_downstream_sequence_data[i:i+2] for i in range(0, len(Ecoli_downstream_sequence_data), 2)]

Ecoli_downstream_rows = [Ecoli_downstream_dinucleotides[i:i+50] for i in range(0, len(Ecoli_downstream_dinucleotides), 50)]

Ecoli_downstream_df = pd.DataFrame(Ecoli_downstream_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Ecoli_downstream_df.columns = column_names
Ecoli_downstream_df['label'] = 0

Ecoli = pd.concat([Ecoli_df, Ecoli_downstream_df], ignore_index=True)
X_Ecoli = Ecoli.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Ecoli = pipeline.predict(X_Ecoli)

# Predictions
y_true_Ecoli = Ecoli['label']

# Calculate evaluation metrics for "Ecoli" data
accuracy_Ecoli = accuracy_score(y_true_Ecoli, y_pred_Ecoli)
precision_Ecoli = precision_score(y_true_Ecoli, y_pred_Ecoli)
recall_Ecoli = recall_score(y_true_Ecoli, y_pred_Ecoli)
f1_score_Ecoli = f1_score(y_true_Ecoli, y_pred_Ecoli)
mcc_Ecoli = matthews_corrcoef(y_true_Ecoli, y_pred_Ecoli)
tn_Ecoli, fp_Ecoli, fn_Ecoli, tp_Ecoli = confusion_matrix(y_true_Ecoli, y_pred_Ecoli).ravel()
specificity_Ecoli = tn_Ecoli / (tn_Ecoli + fp_Ecoli)

# Print the results for "Ecoli" data
print("Accuracy on Ecoli Data:", accuracy_Ecoli)
print("Precision on Ecoli Data:", precision_Ecoli)
print("Recall on Ecoli Data:", recall_Ecoli)
print("F1 Score on Ecoli Data:", f1_score_Ecoli)
print("MCC on Ecoli Data:", mcc_Ecoli)
print("Specificity on Ecoli Data:", specificity_Ecoli)


Accuracy on Ecoli Data: 0.9249812453113279
Precision on Ecoli Data: 0.9144111192392099
Recall on Ecoli Data: 0.9377344336084021
F1 Score on Ecoli Data: 0.9259259259259259
MCC on Ecoli Data: 0.8502391078788532
Specificity on Ecoli Data: 0.9122280570142536


# Helicobactor

In [41]:
Helicobactor_file_path = r"D:\OHE Data\Promoter data\Training Data\any except ATGC Helicobacter_pylori.txt"
Helicobactor_downstream_file_path = "D:\OHE Data\Downstream data\Training Data\Helicobacter pylori.txt"

#Promoter seq data transformation
with open(Helicobactor_file_path, "r") as Helicobactor_file:
    Helicobactor_sequence_data = Helicobactor_file.read()

Helicobactor_sequence_data = Helicobactor_sequence_data.replace('\n', '')

Helicobactor_dinucleotides = [Helicobactor_sequence_data[i:i+2] for i in range(0, len(Helicobactor_sequence_data), 2)]

Helicobactor_rows = [Helicobactor_dinucleotides[i:i+50] for i in range(0, len(Helicobactor_dinucleotides), 50)]

Helicobactor_df = pd.DataFrame(Helicobactor_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Helicobactor_df.columns = column_names
Helicobactor_df['label'] = 1


#Downstream data transformation: 

with open(Helicobactor_downstream_file_path, "r") as Helicobactor_downstream_file:
    Helicobactor_downstream_sequence_data = Helicobactor_downstream_file.read()

Helicobactor_downstream_sequence_data = Helicobactor_downstream_sequence_data.replace('\n', '')

Helicobactor_downstream_dinucleotides = [Helicobactor_downstream_sequence_data[i:i+2] for i in range(0, len(Helicobactor_downstream_sequence_data), 2)]

Helicobactor_downstream_rows = [Helicobactor_downstream_dinucleotides[i:i+50] for i in range(0, len(Helicobactor_downstream_dinucleotides), 50)]

Helicobactor_downstream_df = pd.DataFrame(Helicobactor_downstream_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Helicobactor_downstream_df.columns = column_names
Helicobactor_downstream_df['label'] = 0

Helicobactor = pd.concat([Helicobactor_df, Helicobactor_downstream_df], ignore_index=True)
X_Helico = Helicobactor.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Helico = pipeline.predict(X_Helico)

# Predictions
y_true_Helico = Helicobactor['label']

# Calculate evaluation metrics for "Helicobactor" data
accuracy_Helico = accuracy_score(y_true_Helico, y_pred_Helico)
precision_Helico = precision_score(y_true_Helico, y_pred_Helico)
recall_Helico = recall_score(y_true_Helico, y_pred_Helico)
f1_score_Helico = f1_score(y_true_Helico, y_pred_Helico)
mcc_Helico = matthews_corrcoef(y_true_Helico, y_pred_Helico)
tn_Helico, fp_Helico, fn_Helico, tp_Helico = confusion_matrix(y_true_Helico, y_pred_Helico).ravel()
specificity_Helico = tn_Helico / (tn_Helico + fp_Helico)

# Print the results for "Helicobactor" data
print("Accuracy on Helicobactor Data:", accuracy_Helico)
print("Precision on Helicobactor Data:", precision_Helico)
print("Recall on Helicobactor Data:", recall_Helico)
print("F1 Score on Helicobactor Data:", f1_score_Helico)
print("MCC on Helicobactor Data:", mcc_Helico)
print("Specificity on Helicobactor Data:", specificity_Helico)


Accuracy on Helicobactor Data: 0.9192634560906515
Precision on Helicobactor Data: 0.8727959697732998
Recall on Helicobactor Data: 0.9815864022662889
F1 Score on Helicobactor Data: 0.924
MCC on Helicobactor Data: 0.8451177396492643
Specificity on Helicobactor Data: 0.8569405099150141


# Mycobacterium

In [37]:
Mycobacterium_file_path = "D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequences Mycobacterium tuberculosis.txt"
Mycobacterium_downstream_file_path = "D:\OHE Data\Downstream data\Training Data\Mycobacterium tuberculosis.txt"

# Promoter seq data transformation
with open(Mycobacterium_file_path, "r") as Mycobacterium_file:
    Mycobacterium_sequence_data = Mycobacterium_file.read()

Mycobacterium_sequence_data = Mycobacterium_sequence_data.replace('\n', '')

Mycobacterium_dinucleotides = [Mycobacterium_sequence_data[i:i+2] for i in range(0, len(Mycobacterium_sequence_data), 2)]

Mycobacterium_rows = [Mycobacterium_dinucleotides[i:i+50] for i in range(0, len(Mycobacterium_dinucleotides), 50)]

Mycobacterium_df = pd.DataFrame(Mycobacterium_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Mycobacterium_df.columns = column_names
Mycobacterium_df['label'] = 1

# Downstream data transformation:
with open(Mycobacterium_downstream_file_path, "r") as Mycobacterium_downstream_file:
    Mycobacterium_downstream_sequence_data = Mycobacterium_downstream_file.read()

Mycobacterium_downstream_sequence_data = Mycobacterium_downstream_sequence_data.replace('\n', '')

Mycobacterium_downstream_dinucleotides = [Mycobacterium_downstream_sequence_data[i:i+2] for i in range(0, len(Mycobacterium_downstream_sequence_data), 2)]

Mycobacterium_downstream_rows = [Mycobacterium_downstream_dinucleotides[i:i+50] for i in range(0, len(Mycobacterium_downstream_dinucleotides), 50)]

Mycobacterium_downstream_df = pd.DataFrame(Mycobacterium_downstream_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Mycobacterium_downstream_df.columns = column_names
Mycobacterium_downstream_df['label'] = 0

Mycobacterium = pd.concat([Mycobacterium_df, Mycobacterium_downstream_df], ignore_index=True)
X_Myco = Mycobacterium.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Myco = pipeline.predict(X_Myco)

# Predictions
y_true_Myco = Mycobacterium['label']

# Calculate evaluation metrics for "Mycobacterium" data
accuracy_Myco = accuracy_score(y_true_Myco, y_pred_Myco)
precision_Myco = precision_score(y_true_Myco, y_pred_Myco)
recall_Myco = recall_score(y_true_Myco, y_pred_Myco)
f1_score_Myco = f1_score(y_true_Myco, y_pred_Myco)
mcc_Myco = matthews_corrcoef(y_true_Myco, y_pred_Myco)
tn_Myco, fp_Myco, fn_Myco, tp_Myco = confusion_matrix(y_true_Myco, y_pred_Myco).ravel()
specificity_Myco = tn_Myco / (tn_Myco + fp_Myco)

# Print the results for "Mycobacterium" data
print("Accuracy on Mycobacterium Data:", accuracy_Myco)
print("Precision on Mycobacterium Data:", precision_Myco)
print("Recall on Mycobacterium Data:", recall_Myco)
print("F1 Score on Mycobacterium Data:", f1_score_Myco)
print("MCC on Mycobacterium Data:", mcc_Myco)
print("Specificity on Mycobacterium Data:", specificity_Myco)


Accuracy on Mycobacterium Data: 0.9446952595936795
Precision on Mycobacterium Data: 0.9696066746126341
Recall on Mycobacterium Data: 0.9181715575620768
F1 Score on Mycobacterium Data: 0.9431884057971015
MCC on Mycobacterium Data: 0.8906445509703125
Specificity on Mycobacterium Data: 0.9712189616252822


# Nostoc

In [44]:
Nostoc_file_path = "D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequences Nostoc sp.txt"
Nostoc_downstream_file_path = r"D:\OHE Data\Downstream data\Training Data\Nostoc sp.txt"

# Promoter seq data transformation
with open(Nostoc_file_path, "r") as Nostoc_file:
    Nostoc_sequence_data = Nostoc_file.read()

Nostoc_sequence_data = Nostoc_sequence_data.replace('\n', '')

Nostoc_dinucleotides = [Nostoc_sequence_data[i:i+2] for i in range(0, len(Nostoc_sequence_data), 2)]

Nostoc_rows = [Nostoc_dinucleotides[i:i+50] for i in range(0, len(Nostoc_dinucleotides), 50)]

Nostoc_df = pd.DataFrame(Nostoc_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Nostoc_df.columns = column_names
Nostoc_df['label'] = 1

# Downstream data transformation
with open(Nostoc_downstream_file_path, "r") as Nostoc_downstream_file:
    Nostoc_downstream_sequence_data = Nostoc_downstream_file.read()

Nostoc_downstream_sequence_data = Nostoc_downstream_sequence_data.replace('\n', '')

Nostoc_downstream_dinucleotides = [Nostoc_downstream_sequence_data[i:i+2] for i in range(0, len(Nostoc_downstream_sequence_data), 2)]

Nostoc_downstream_rows = [Nostoc_downstream_dinucleotides[i:i+50] for i in range(0, len(Nostoc_downstream_dinucleotides), 50)]

Nostoc_downstream_df = pd.DataFrame(Nostoc_downstream_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Nostoc_downstream_df.columns = column_names
Nostoc_downstream_df['label'] = 0

Nostoc = pd.concat([Nostoc_df, Nostoc_downstream_df], ignore_index=True)
X_Nos = Nostoc.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Nos = pipeline.predict(X_Nos)

# Predictions
y_true_Nos = Nostoc['label']

# Calculate evaluation metrics for "Nostoc" data
accuracy_Nos = accuracy_score(y_true_Nos, y_pred_Nos)
precision_Nos = precision_score(y_true_Nos, y_pred_Nos)
recall_Nos = recall_score(y_true_Nos, y_pred_Nos)
f1_score_Nos = f1_score(y_true_Nos, y_pred_Nos)
mcc_Nos = matthews_corrcoef(y_true_Nos, y_pred_Nos)
tn_Nos, fp_Nos, fn_Nos, tp_Nos = confusion_matrix(y_true_Nos, y_pred_Nos).ravel()
specificity_Nos = tn_Nos / (tn_Nos + fp_Nos)

# Print the results for "Nostoc" data
print("Accuracy on Nostoc Data:", accuracy_Nos)
print("Precision on Nostoc Data:", precision_Nos)
print("Recall on Nostoc Data:", recall_Nos)
print("F1 Score on Nostoc Data:", f1_score_Nos)
print("MCC on Nostoc Data:", mcc_Nos)
print("Specificity on Nostoc Data:", specificity_Nos)


Accuracy on Nostoc Data: 0.9278903456495828
Precision on Nostoc Data: 0.8939283101682517
Recall on Nostoc Data: 0.9709972189114024
F1 Score on Nostoc Data: 0.9308703104170634
MCC on Nostoc Data: 0.858978959150321
Specificity on Nostoc Data: 0.8847834723877632


# Pseudomonas

In [46]:
Pseudomonas_file_path = r"D:\OHE Data\Promoter data\Training Data\any except ATGC Pseudomonas aeruginosa.txt"
Pseudomonas_downstream_file_path = "D:\OHE Data\Downstream data\Training Data\Pseudomonas aeruginosa.txt"

# Promoter seq data transformation
with open(Pseudomonas_file_path, "r") as Pseudomonas_file:
    Pseudomonas_sequence_data = Pseudomonas_file.read()

Pseudomonas_sequence_data = Pseudomonas_sequence_data.replace('\n', '')

Pseudomonas_dinucleotides = [Pseudomonas_sequence_data[i:i+2] for i in range(0, len(Pseudomonas_sequence_data), 2)]

Pseudomonas_rows = [Pseudomonas_dinucleotides[i:i+50] for i in range(0, len(Pseudomonas_dinucleotides), 50)]

Pseudomonas_df = pd.DataFrame(Pseudomonas_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Pseudomonas_df.columns = column_names
Pseudomonas_df['label'] = 1

# Downstream data transformation:

with open(Pseudomonas_downstream_file_path, "r") as Pseudomonas_downstream_file:
    Pseudomonas_downstream_sequence_data = Pseudomonas_downstream_file.read()

Pseudomonas_downstream_sequence_data = Pseudomonas_downstream_sequence_data.replace('\n', '')

Pseudomonas_downstream_dinucleotides = [Pseudomonas_downstream_sequence_data[i:i+2] for i in range(0, len(Pseudomonas_downstream_sequence_data), 2)]

Pseudomonas_downstream_rows = [Pseudomonas_downstream_dinucleotides[i:i+50] for i in range(0, len(Pseudomonas_downstream_dinucleotides), 50)]

Pseudomonas_downstream_df = pd.DataFrame(Pseudomonas_downstream_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Pseudomonas_downstream_df.columns = column_names
Pseudomonas_downstream_df['label'] = 0

Pseudomonas = pd.concat([Pseudomonas_df, Pseudomonas_downstream_df], ignore_index=True)
X_Pseudo = Pseudomonas.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Pseudo = pipeline.predict(X_Pseudo)

# Predictions
y_true_Pseudo = Pseudomonas['label']

# Calculate evaluation metrics for "Pseudomonas" data
accuracy_Pseudo = accuracy_score(y_true_Pseudo, y_pred_Pseudo)
precision_Pseudo = precision_score(y_true_Pseudo, y_pred_Pseudo)
recall_Pseudo = recall_score(y_true_Pseudo, y_pred_Pseudo)
f1_score_Pseudo = f1_score(y_true_Pseudo, y_pred_Pseudo)
mcc_Pseudo = matthews_corrcoef(y_true_Pseudo, y_pred_Pseudo)
tn_Pseudo, fp_Pseudo, fn_Pseudo, tp_Pseudo = confusion_matrix(y_true_Pseudo, y_pred_Pseudo).ravel()
specificity_Pseudo = tn_Pseudo / (tn_Pseudo + fp_Pseudo)

# Print the results for "Pseudomonas" data
print("Accuracy on Pseudomonas Data:", accuracy_Pseudo)
print("Precision on Pseudomonas Data:", precision_Pseudo)
print("Recall on Pseudomonas Data:", recall_Pseudo)
print("F1 Score on Pseudomonas Data:", f1_score_Pseudo)
print("MCC on Pseudomonas Data:", mcc_Pseudo)
print("Specificity on Pseudomonas Data:", specificity_Pseudo)


Accuracy on Pseudomonas Data: 0.8860204578665368
Precision on Pseudomonas Data: 0.9794313369630974
Recall on Pseudomonas Data: 0.7886020457866537
F1 Score on Pseudomonas Data: 0.8737182946573124
MCC on Pseudomonas Data: 0.7871256513028005
Specificity on Pseudomonas Data: 0.9834388699464198


# Streptomyces

In [47]:
Streptomyces_file_path = "D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequences Streptomyces coelicolor.txt"
Streptomyces_downstream_file_path = "D:\OHE Data\Downstream data\Training Data\Streptomyces coelicolor.txt"

# Promoter seq data transformation
with open(Streptomyces_file_path, "r") as Streptomyces_file:
    Streptomyces_sequence_data = Streptomyces_file.read()

Streptomyces_sequence_data = Streptomyces_sequence_data.replace('\n', '')

Streptomyces_dinucleotides = [Streptomyces_sequence_data[i:i+2] for i in range(0, len(Streptomyces_sequence_data), 2)]

Streptomyces_rows = [Streptomyces_dinucleotides[i:i+50] for i in range(0, len(Streptomyces_dinucleotides), 50)]

Streptomyces_df = pd.DataFrame(Streptomyces_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Streptomyces_df.columns = column_names
Streptomyces_df['label'] = 1

# Downstream data transformation:
with open(Streptomyces_downstream_file_path, "r") as Streptomyces_downstream_file:
    Streptomyces_downstream_sequence_data = Streptomyces_downstream_file.read()

Streptomyces_downstream_sequence_data = Streptomyces_downstream_sequence_data.replace('\n', '')

Streptomyces_downstream_dinucleotides = [Streptomyces_downstream_sequence_data[i:i+2] for i in range(0, len(Streptomyces_downstream_sequence_data), 2)]

Streptomyces_downstream_rows = [Streptomyces_downstream_dinucleotides[i:i+50] for i in range(0, len(Streptomyces_downstream_dinucleotides), 50)]

Streptomyces_downstream_df = pd.DataFrame(Streptomyces_downstream_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Streptomyces_downstream_df.columns = column_names
Streptomyces_downstream_df['label'] = 0

Streptomyces = pd.concat([Streptomyces_df, Streptomyces_downstream_df], ignore_index=True)
X_Strep = Streptomyces.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Strep = pipeline.predict(X_Strep)

# Predictions
y_true_Strep = Streptomyces['label']

# Calculate evaluation metrics for "Streptomyces" data
accuracy_Strep = accuracy_score(y_true_Strep, y_pred_Strep)
precision_Strep = precision_score(y_true_Strep, y_pred_Strep)
recall_Strep = recall_score(y_true_Strep, y_pred_Strep)
f1_score_Strep = f1_score(y_true_Strep, y_pred_Strep)
mcc_Strep = matthews_corrcoef(y_true_Strep, y_pred_Strep)
tn_Strep, fp_Strep, fn_Strep, tp_Strep = confusion_matrix(y_true_Strep, y_pred_Strep).ravel()
specificity_Strep = tn_Strep / (tn_Strep + fp_Strep)

# Print the results for "Streptomyces" data
print("Accuracy on Streptomyces Data:", accuracy_Strep)
print("Precision on Streptomyces Data:", precision_Strep)
print("Recall on Streptomyces Data:", recall_Strep)
print("F1 Score on Streptomyces Data:", f1_score_Strep)
print("MCC on Streptomyces Data:", mcc_Strep)
print("Specificity on Streptomyces Data:", specificity_Strep)


Accuracy on Streptomyces Data: 0.9350823439295289
Precision on Streptomyces Data: 0.9850555081127241
Recall on Streptomyces Data: 0.8835695135963233
F1 Score on Streptomyces Data: 0.931556632344034
MCC on Streptomyces Data: 0.8748198681379868
Specificity on Streptomyces Data: 0.9865951742627346


# Synechocystis

In [48]:
# Define file paths with "Synechocystis" and "Sync" instead of "Bacillus" and "Bac"
Synechocystis_file_path = "D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequences Synechocystis sp.txt"
Synechocystis_downstream_file_path = "D:\OHE Data\Downstream data\Training Data\Synechocystis sp.txt"

# Promoter seq data transformation
with open(Synechocystis_file_path, "r") as Synechocystis_file:
    Synechocystis_sequence_data = Synechocystis_file.read()

Synechocystis_sequence_data = Synechocystis_sequence_data.replace('\n', '')

Synechocystis_dinucleotides = [Synechocystis_sequence_data[i:i+2] for i in range(0, len(Synechocystis_sequence_data), 2)]

Synechocystis_rows = [Synechocystis_dinucleotides[i:i+50] for i in range(0, len(Synechocystis_dinucleotides), 50)]

Synechocystis_df = pd.DataFrame(Synechocystis_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Synechocystis_df.columns = column_names
Synechocystis_df['label'] = 1

# Downstream data transformation:

with open(Synechocystis_downstream_file_path, "r") as Synechocystis_downstream_file:
    Synechocystis_downstream_sequence_data = Synechocystis_downstream_file.read()

Synechocystis_downstream_sequence_data = Synechocystis_downstream_sequence_data.replace('\n', '')

Synechocystis_downstream_dinucleotides = [Synechocystis_downstream_sequence_data[i:i+2] for i in range(0, len(Synechocystis_downstream_sequence_data), 2)]

Synechocystis_downstream_rows = [Synechocystis_downstream_dinucleotides[i:i+50] for i in range(0, len(Synechocystis_downstream_dinucleotides), 50)]

Synechocystis_downstream_df = pd.DataFrame(Synechocystis_downstream_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Synechocystis_downstream_df.columns = column_names
Synechocystis_downstream_df['label'] = 0

Synechocystis = pd.concat([Synechocystis_df, Synechocystis_downstream_df], ignore_index=True)
X_Sync = Synechocystis.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Sync = pipeline.predict(X_Sync)

# Predictions
y_true_Sync = Synechocystis['label']

# Calculate evaluation metrics for "Synechocystis" data
accuracy_Sync = accuracy_score(y_true_Sync, y_pred_Sync)
precision_Sync = precision_score(y_true_Sync, y_pred_Sync)
recall_Sync = recall_score(y_true_Sync, y_pred_Sync)
f1_score_Sync = f1_score(y_true_Sync, y_pred_Sync)
mcc_Sync = matthews_corrcoef(y_true_Sync, y_pred_Sync)
tn_Sync, fp_Sync, fn_Sync, tp_Sync = confusion_matrix(y_true_Sync, y_pred_Sync).ravel()
specificity_Sync = tn_Sync / (tn_Sync + fp_Sync)

# Print the results for "Synechocystis" data
print("Accuracy on Synechocystis Data:", accuracy_Sync)
print("Precision on Synechocystis Data:", precision_Sync)
print("Recall on Synechocystis Data:", recall_Sync)
print("F1 Score on Synechocystis Data:", f1_score_Sync)
print("MCC on Synechocystis Data:", mcc_Sync)
print("Specificity on Synechocystis Data:", specificity_Sync)


Accuracy on Synechocystis Data: 0.9304453935326419
Precision on Synechocystis Data: 0.914756025867137
Recall on Synechocystis Data: 0.949359365466748
F1 Score on Synechocystis Data: 0.9317365269461079
MCC on Synechocystis Data: 0.8615073961707943
Specificity on Synechocystis Data: 0.9115314215985357


# Klebsiella

In [49]:
Klebsiella_file_path = "D:\OHE Data\Promoter data\Test Data\Sequences_80-20_sequences Klebsiella pneumoniae.txt"
Klebsiella_downstream_file_path = "D:\OHE Data\Downstream data\Test Data\Klebsiella pneumoniae.txt"

# Promoter seq data transformation
with open(Klebsiella_file_path, "r") as Klebsiella_file:
    Klebsiella_sequence_data = Klebsiella_file.read()

Klebsiella_sequence_data = Klebsiella_sequence_data.replace('\n', '')

Klebsiella_dinucleotides = [Klebsiella_sequence_data[i:i+2] for i in range(0, len(Klebsiella_sequence_data), 2)]

Klebsiella_rows = [Klebsiella_dinucleotides[i:i+50] for i in range(0, len(Klebsiella_dinucleotides), 50)]

Klebsiella_df = pd.DataFrame(Klebsiella_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Klebsiella_df.columns = column_names
Klebsiella_df['label'] = 1

# Downstream data transformation

with open(Klebsiella_downstream_file_path, "r") as Klebsiella_downstream_file:
    Klebsiella_downstream_sequence_data = Klebsiella_downstream_file.read()

Klebsiella_downstream_sequence_data = Klebsiella_downstream_sequence_data.replace('\n', '')

Klebsiella_downstream_dinucleotides = [Klebsiella_downstream_sequence_data[i:i+2] for i in range(0, len(Klebsiella_downstream_sequence_data), 2)]

Klebsiella_downstream_rows = [Klebsiella_downstream_dinucleotides[i:i+50] for i in range(0, len(Klebsiella_downstream_dinucleotides), 50)]

Klebsiella_downstream_df = pd.DataFrame(Klebsiella_downstream_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Klebsiella_downstream_df.columns = column_names
Klebsiella_downstream_df['label'] = 0

Klebsiella = pd.concat([Klebsiella_df, Klebsiella_downstream_df], ignore_index=True)
X_Kleb = Klebsiella.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Kleb = pipeline.predict(X_Kleb)

# Predictions
y_true_Kleb = Klebsiella['label']

# Calculate evaluation metrics for "Klebsiella" data
accuracy_Kleb = accuracy_score(y_true_Kleb, y_pred_Kleb)
precision_Kleb = precision_score(y_true_Kleb, y_pred_Kleb)
recall_Kleb = recall_score(y_true_Kleb, y_pred_Kleb)
f1_score_Kleb = f1_score(y_true_Kleb, y_pred_Kleb)
mcc_Kleb = matthews_corrcoef(y_true_Kleb, y_pred_Kleb)
tn_Kleb, fp_Kleb, fn_Kleb, tp_Kleb = confusion_matrix(y_true_Kleb, y_pred_Kleb).ravel()
specificity_Kleb = tn_Kleb / (tn_Kleb + fp_Kleb)

# Print the results for "Klebsiella" data
print("Accuracy on Klebsiella Data:", accuracy_Kleb)
print("Precision on Klebsiella Data:", precision_Kleb)
print("Recall on Klebsiella Data:", recall_Kleb)
print("F1 Score on Klebsiella Data:", f1_score_Kleb)
print("MCC on Klebsiella Data:", mcc_Kleb)
print("Specificity on Klebsiella Data:", specificity_Kleb)


Accuracy on Klebsiella Data: 0.9161023325808879
Precision on Klebsiella Data: 0.9072164948453608
Recall on Klebsiella Data: 0.927012791572611
F1 Score on Klebsiella Data: 0.9170078154075177
MCC on Klebsiella Data: 0.8324028640942197
Specificity on Klebsiella Data: 0.9051918735891648


# Salmonella

In [50]:
Salmonella_file_path = "D:\OHE Data\Promoter data\Test Data\Sequences_80-20_sequences Salmonella enterica.txt"
Salmonella_downstream_file_path = "D:\OHE Data\Downstream data\Test Data\Salmonella enterica.txt"

# Promoter seq data transformation
with open(Salmonella_file_path, "r") as Salmonella_file:
    Salmonella_sequence_data = Salmonella_file.read()

Salmonella_sequence_data = Salmonella_sequence_data.replace('\n', '')

Salmonella_dinucleotides = [Salmonella_sequence_data[i:i+2] for i in range(0, len(Salmonella_sequence_data), 2)]

Salmonella_rows = [Salmonella_dinucleotides[i:i+50] for i in range(0, len(Salmonella_dinucleotides), 50)]

Salmonella_df = pd.DataFrame(Salmonella_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Salmonella_df.columns = column_names
Salmonella_df['label'] = 1

# Downstream data transformation

with open(Salmonella_downstream_file_path, "r") as Salmonella_downstream_file:
    Salmonella_downstream_sequence_data = Salmonella_downstream_file.read()

Salmonella_downstream_sequence_data = Salmonella_downstream_sequence_data.replace('\n', '')

Salmonella_downstream_dinucleotides = [Salmonella_downstream_sequence_data[i:i+2] for i in range(0, len(Salmonella_downstream_sequence_data), 2)]

Salmonella_downstream_rows = [Salmonella_downstream_dinucleotides[i:i+50] for i in range(0, len(Salmonella_downstream_dinucleotides), 50)]

Salmonella_downstream_df = pd.DataFrame(Salmonella_downstream_rows)
column_names = [str(i) for i in range(-80, 20, 2)]
Salmonella_downstream_df.columns = column_names
Salmonella_downstream_df['label'] = 0

Salmonella = pd.concat([Salmonella_df, Salmonella_downstream_df], ignore_index=True)
X_Salm = Salmonella.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Salm = pipeline.predict(X_Salm)

# Predictions
y_true_Salm = Salmonella['label']

# Calculate evaluation metrics for "Salmonella" data
accuracy_Salm = accuracy_score(y_true_Salm, y_pred_Salm)
precision_Salm = precision_score(y_true_Salm, y_pred_Salm)
recall_Salm = recall_score(y_true_Salm, y_pred_Salm)
f1_score_Salm = f1_score(y_true_Salm, y_pred_Salm)
mcc_Salm = matthews_corrcoef(y_true_Salm, y_pred_Salm)
tn_Salm, fp_Salm, fn_Salm, tp_Salm = confusion_matrix(y_true_Salm, y_pred_Salm).ravel()
specificity_Salm = tn_Salm / (tn_Salm + fp_Salm)

# Print the results for "Salmonella" data
print("Accuracy on Salmonella Data:", accuracy_Salm)
print("Precision on Salmonella Data:", precision_Salm)
print("Recall on Salmonella Data:", recall_Salm)
print("F1 Score on Salmonella Data:", f1_score_Salm)
print("MCC on Salmonella Data:", mcc_Salm)
print("Specificity on Salmonella Data:", specificity_Salm)


Accuracy on Salmonella Data: 0.910295616717635
Precision on Salmonella Data: 0.8737233054781801
Recall on Salmonella Data: 0.9592252803261978
F1 Score on Salmonella Data: 0.914480077745384
MCC on Salmonella Data: 0.8245488558088431
Specificity on Salmonella Data: 0.8613659531090724


## Antisense Promoters with Downstream

In [8]:
# Define the directory where your .txt files are located
antisense_d = "D:\Other promoters\Sequence 80-20\Antisense"

# Initialize an empty list to store DataFrames from each file
dfs = []

# Loop through all .txt files in the directory
for filename in os.listdir(antisense_d):
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(antisense_d, filename), "r") as file:
            antisense_data = file.read()

        num_columns = 100

        # Filter out newline characters from the antisense data
        antisense_data = antisense_data.replace('\n', '')

        # Split the antisense data into dinucleotides
        dinucleotides = [antisense_data[i:i+2] for i in range(0, len(antisense_data), 2)]

        # Split dinucleotides into rows of 50 columns
        rows = [dinucleotides[i:i+50] for i in range(0, len(dinucleotides), 50)]

        # Create a DataFrame for the current file
        df = pd.DataFrame(rows)

        # Modify the column names to start from -80 and increment by 2
        column_names = [str(i) for i in range(-80, 20, 2)]
        df.columns = column_names

        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames into one
antisense_df = pd.concat(dfs, ignore_index=True)

# Now, antisense_df contains the data with dinucleotides in each row and columns starting from -80
antisense_df


Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,0,2,4,6,8,10,12,14,16,18
0,TA,GG,AT,TC,TT,TC,CG,TA,AA,TG,...,GT,CC,CC,AG,AT,CC,AA,AA,GA,AG
1,GA,AG,CG,AT,AA,GA,AG,TC,GC,AG,...,AA,GC,TA,AG,GT,AA,GC,AT,CC,AA
2,TT,CA,TG,AT,AT,AG,AA,CC,TA,GA,...,TG,TC,CC,TT,CG,AG,CT,CG,TA,AA
3,TC,TA,GA,AG,CT,GC,TC,CG,TC,TT,...,TG,TA,AG,GT,CA,TG,GA,GA,AA,TG
4,AG,GT,AA,AT,AC,CC,GC,AC,GA,CT,...,CC,TC,AT,GG,AT,CA,GG,CC,CT,CT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6079,GA,CA,AA,AA,CT,TG,AT,TG,CA,GA,...,TA,CC,AT,TG,AG,CA,AG,AA,GG,GA
6080,GT,GA,AT,TT,CC,CC,TT,CC,AG,AG,...,TA,TA,GG,TG,CG,GA,TT,TT,CT,CC
6081,AG,CT,CA,TA,GA,AG,TA,GG,GC,AA,...,CA,GA,GC,CA,AC,AA,AG,GA,AA,GA
6082,CA,CA,TA,GT,TC,AG,GG,CA,GA,CC,...,GG,TT,TA,GC,AA,CG,AC,CA,GT,AA


In [10]:
antisense_df['label']=1
antisense_df

Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,2,4,6,8,10,12,14,16,18,label
0,TA,GG,AT,TC,TT,TC,CG,TA,AA,TG,...,CC,CC,AG,AT,CC,AA,AA,GA,AG,1
1,GA,AG,CG,AT,AA,GA,AG,TC,GC,AG,...,GC,TA,AG,GT,AA,GC,AT,CC,AA,1
2,TT,CA,TG,AT,AT,AG,AA,CC,TA,GA,...,TC,CC,TT,CG,AG,CT,CG,TA,AA,1
3,TC,TA,GA,AG,CT,GC,TC,CG,TC,TT,...,TA,AG,GT,CA,TG,GA,GA,AA,TG,1
4,AG,GT,AA,AT,AC,CC,GC,AC,GA,CT,...,TC,AT,GG,AT,CA,GG,CC,CT,CT,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6079,GA,CA,AA,AA,CT,TG,AT,TG,CA,GA,...,CC,AT,TG,AG,CA,AG,AA,GG,GA,1
6080,GT,GA,AT,TT,CC,CC,TT,CC,AG,AG,...,TA,GG,TG,CG,GA,TT,TT,CT,CC,1
6081,AG,CT,CA,TA,GA,AG,TA,GG,GC,AA,...,GA,GC,CA,AC,AA,AG,GA,AA,GA,1
6082,CA,CA,TA,GT,TC,AG,GG,CA,GA,CC,...,TT,TA,GC,AA,CG,AC,CA,GT,AA,1


In [9]:
downstream_antisense = "D:\Other promoters\Downstream sequences\Antisense"

# Initialize an empty list to store DataFrames from each file
downstream_antisense_dfs = []

# Loop through all .txt files in the directory
for filename in os.listdir(downstream_antisense):
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(downstream_antisense, filename), "r") as file:
            downstream_antisense_data = file.read()

        num_columns = 100

        # Filter out newline characters from the sequence data
        downstream_antisense_data = downstream_antisense_data.replace('\n', '')

        # Split the sequence data into dinucleotides
        downstream_antisense_dinucleotides = [downstream_antisense_data[i:i+2] for i in range(0, len(downstream_antisense_data), 2)]

        # Split dinucleotides into rows of 50 columns
        downstream_antisense_rows = [downstream_antisense_dinucleotides[i:i+50] for i in range(0, len(downstream_antisense_dinucleotides), 50)]

        # Create a DataFrame for the current file
        downstream_antisense_df = pd.DataFrame(downstream_antisense_rows)

        # Modify the column names to start from -80 and increment by 2
        column_names = [str(i) for i in range(-80, 20, 2)]
        downstream_antisense_df.columns = column_names

        # Append the DataFrame to the list
        downstream_antisense_dfs.append(downstream_antisense_df)

# Concatenate all DataFrames into one
downstream_antisense_dataframe = pd.concat(downstream_antisense_dfs, ignore_index=True)

# Now, combined_df contains the data with dinucleotides in each row and columns starting from -80
downstream_antisense_dataframe


Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,0,2,4,6,8,10,12,14,16,18
0,TC,GA,CA,AT,CT,CT,CT,CC,AG,AA,...,CT,TG,CT,CC,CG,TA,TC,GT,TG,GT
1,AA,CA,AT,GA,TG,GG,GA,GA,AT,GG,...,TG,AT,TG,GC,AG,AA,CA,AT,AA,AG
2,AA,AG,TC,CC,TG,AT,AT,AG,GG,CC,...,GA,GC,CA,TT,GT,TT,AG,GG,TT,GC
3,GT,GT,CG,GG,AG,AG,AG,AG,AT,TT,...,TG,CA,CG,GG,CG,AT,CG,GC,GC,CA
4,AT,CT,TT,CA,GT,TT,CC,CT,AA,AC,...,AA,AT,TT,GC,TT,TG,CT,TT,TT,GC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6079,CG,TT,TC,CC,CT,TC,AA,AG,CG,GA,...,AA,TA,TT,CC,TG,TT,CA,TC,GT,CC
6080,TT,CT,TC,GG,TG,CA,AA,AA,AT,CC,...,CA,CC,GG,CT,CC,AC,CG,GA,GC,GA
6081,CT,GA,TG,GG,GA,GC,TT,GA,TT,TT,...,AA,AG,TT,AA,TC,AC,TA,AC,AG,CC
6082,AA,GA,TT,GC,GG,GG,CG,GC,GC,CG,...,CT,AT,GT,AA,GA,GG,GA,CT,TT,AG


In [11]:
downstream_antisense_dataframe['label']=0
downstream_antisense_dataframe

Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,2,4,6,8,10,12,14,16,18,label
0,TC,GA,CA,AT,CT,CT,CT,CC,AG,AA,...,TG,CT,CC,CG,TA,TC,GT,TG,GT,0
1,AA,CA,AT,GA,TG,GG,GA,GA,AT,GG,...,AT,TG,GC,AG,AA,CA,AT,AA,AG,0
2,AA,AG,TC,CC,TG,AT,AT,AG,GG,CC,...,GC,CA,TT,GT,TT,AG,GG,TT,GC,0
3,GT,GT,CG,GG,AG,AG,AG,AG,AT,TT,...,CA,CG,GG,CG,AT,CG,GC,GC,CA,0
4,AT,CT,TT,CA,GT,TT,CC,CT,AA,AC,...,AT,TT,GC,TT,TG,CT,TT,TT,GC,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6079,CG,TT,TC,CC,CT,TC,AA,AG,CG,GA,...,TA,TT,CC,TG,TT,CA,TC,GT,CC,0
6080,TT,CT,TC,GG,TG,CA,AA,AA,AT,CC,...,CC,GG,CT,CC,AC,CG,GA,GC,GA,0
6081,CT,GA,TG,GG,GA,GC,TT,GA,TT,TT,...,AG,TT,AA,TC,AC,TA,AC,AG,CC,0
6082,AA,GA,TT,GC,GG,GG,CG,GC,GC,CG,...,AT,GT,AA,GA,GG,GA,CT,TT,AG,0


In [12]:
antisense = pd.concat([antisense_df, downstream_antisense_dataframe], ignore_index=True)
antisense

Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,2,4,6,8,10,12,14,16,18,label
0,TA,GG,AT,TC,TT,TC,CG,TA,AA,TG,...,CC,CC,AG,AT,CC,AA,AA,GA,AG,1
1,GA,AG,CG,AT,AA,GA,AG,TC,GC,AG,...,GC,TA,AG,GT,AA,GC,AT,CC,AA,1
2,TT,CA,TG,AT,AT,AG,AA,CC,TA,GA,...,TC,CC,TT,CG,AG,CT,CG,TA,AA,1
3,TC,TA,GA,AG,CT,GC,TC,CG,TC,TT,...,TA,AG,GT,CA,TG,GA,GA,AA,TG,1
4,AG,GT,AA,AT,AC,CC,GC,AC,GA,CT,...,TC,AT,GG,AT,CA,GG,CC,CT,CT,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12163,CG,TT,TC,CC,CT,TC,AA,AG,CG,GA,...,TA,TT,CC,TG,TT,CA,TC,GT,CC,0
12164,TT,CT,TC,GG,TG,CA,AA,AA,AT,CC,...,CC,GG,CT,CC,AC,CG,GA,GC,GA,0
12165,CT,GA,TG,GG,GA,GC,TT,GA,TT,TT,...,AG,TT,AA,TC,AC,TA,AC,AG,CC,0
12166,AA,GA,TT,GC,GG,GG,CG,GC,GC,CG,...,AT,GT,AA,GA,GG,GA,CT,TT,AG,0


In [13]:
X_anti = antisense.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_anti = pipeline.predict(X_anti)

# Predictions
y_true_anti = antisense['label']

# Calculate evaluation metrics for "anitisense" data
accuracy_anti = accuracy_score(y_true_anti, y_pred_anti)
precision_anti = precision_score(y_true_anti, y_pred_anti)
recall_anti = recall_score(y_true_anti, y_pred_anti)
f1_score_anti = f1_score(y_true_anti, y_pred_anti)
mcc_anti = matthews_corrcoef(y_true_anti, y_pred_anti)
tn_anti, fp_anti, fn_anti, tp_anti = confusion_matrix(y_true_anti, y_pred_anti).ravel()
specificity_anti = tn_anti / (tn_anti + fp_anti)

# Print the results for "anitisense" data
print("Accuracy on anitisense Data:", accuracy_anti)
print("Precision on anitisense Data:", precision_anti)
print("Recall on anitisense Data:", recall_anti)
print("F1 Score on anitisense Data:", f1_score_anti)
print("MCC on anitisense Data:", mcc_anti)
print("Specificity on anitisense Data:", specificity_anti)


Accuracy on anitisense Data: 0.8141847468770546
Precision on anitisense Data: 0.8058889422307569
Recall on anitisense Data: 0.8277449046679816
F1 Score on anitisense Data: 0.8166707208302927
MCC on anitisense Data: 0.6286007078072542
Specificity on anitisense Data: 0.8006245890861275


## Internal promoter with downstream

In [15]:
# Define the directory where your .txt files are located
internal_d = "D:\Other promoters\Sequence 80-20\Induced"

# Initialize an empty list to store DataFrames from each file
dfs = []

# Loop through all .txt files in the directory
for filename in os.listdir(internal_d):
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(internal_d, filename), "r") as file:
            internal_data = file.read()

        num_columns = 100

        # Filter out newline characters from the internal data
        internal_data = internal_data.replace('\n', '')

        # Split the internal data into dinucleotides
        dinucleotides = [internal_data[i:i+2] for i in range(0, len(internal_data), 2)]

        # Split dinucleotides into rows of 50 columns
        rows = [dinucleotides[i:i+50] for i in range(0, len(dinucleotides), 50)]

        # Create a DataFrame for the current file
        df = pd.DataFrame(rows)

        # Modify the column names to start from -80 and increment by 2
        column_names = [str(i) for i in range(-80, 20, 2)]
        df.columns = column_names

        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames into one
internal_df = pd.concat(dfs, ignore_index=True)

# Now, internal_df contains the data with dinucleotides in each row and columns starting from -80
internal_df

Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,0,2,4,6,8,10,12,14,16,18
0,CT,CG,TA,CG,GG,AA,GC,TA,CG,AA,...,TA,TG,GA,AG,CT,AA,GA,AA,AT,CA
1,AT,CA,AA,TT,TA,CT,CT,GA,AG,AA,...,AA,CT,AT,TG,CG,CA,GC,TT,AT,GC
2,GG,GC,CA,TG,CA,GT,AT,AG,TG,CT,...,CA,AC,AG,AT,TC,AG,CA,AT,TG,CT
3,AC,AA,CT,GA,TT,AT,CA,TC,AA,TC,...,TT,CA,CA,GC,TA,AA,AC,GA,AA,TC
4,CT,CT,TT,AA,AT,CC,GA,AA,AA,TG,...,TA,GA,CA,GC,TT,AC,AA,GA,AA,AT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6060,GC,TG,GC,CA,CC,GG,CG,AC,AT,CG,...,CC,TC,GA,CC,GG,CA,AG,AA,CG,TC
6061,GG,AG,GC,GA,TC,TC,CC,GC,TC,GC,...,AG,AC,GC,TG,TC,CC,GC,AG,AC,TC
6062,CT,GG,GC,AT,GT,TC,CT,CG,CC,GC,...,GG,GC,CC,GA,TC,CC,GG,GC,GC,GA
6063,AC,TC,AC,GC,TC,GC,CG,GC,AC,GG,...,AG,TA,CT,TC,AC,CA,TC,GT,AC,CG


In [16]:
internal_df['label'] = 1
internal_df

Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,2,4,6,8,10,12,14,16,18,label
0,CT,CG,TA,CG,GG,AA,GC,TA,CG,AA,...,TG,GA,AG,CT,AA,GA,AA,AT,CA,1
1,AT,CA,AA,TT,TA,CT,CT,GA,AG,AA,...,CT,AT,TG,CG,CA,GC,TT,AT,GC,1
2,GG,GC,CA,TG,CA,GT,AT,AG,TG,CT,...,AC,AG,AT,TC,AG,CA,AT,TG,CT,1
3,AC,AA,CT,GA,TT,AT,CA,TC,AA,TC,...,CA,CA,GC,TA,AA,AC,GA,AA,TC,1
4,CT,CT,TT,AA,AT,CC,GA,AA,AA,TG,...,GA,CA,GC,TT,AC,AA,GA,AA,AT,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6060,GC,TG,GC,CA,CC,GG,CG,AC,AT,CG,...,TC,GA,CC,GG,CA,AG,AA,CG,TC,1
6061,GG,AG,GC,GA,TC,TC,CC,GC,TC,GC,...,AC,GC,TG,TC,CC,GC,AG,AC,TC,1
6062,CT,GG,GC,AT,GT,TC,CT,CG,CC,GC,...,GC,CC,GA,TC,CC,GG,GC,GC,GA,1
6063,AC,TC,AC,GC,TC,GC,CG,GC,AC,GG,...,TA,CT,TC,AC,CA,TC,GT,AC,CG,1


In [18]:
downstream_internal = "D:\Other promoters\Downstream sequences\Induced"

# Initialize an empty list to store DataFrames from each file
downstream_internal_dfs = []

# Loop through all .txt files in the directory
for filename in os.listdir(downstream_internal):
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(downstream_internal, filename), "r") as file:
            downstream_internal_data = file.read()

        num_columns = 100

        # Filter out newline characters from the sequence data
        downstream_internal_data = downstream_internal_data.replace('\n', '')

        # Split the sequence data into dinucleotides
        downstream_internal_dinucleotides = [downstream_internal_data[i:i+2] for i in range(0, len(downstream_internal_data), 2)]

        # Split dinucleotides into rows of 50 columns
        downstream_internal_rows = [downstream_internal_dinucleotides[i:i+50] for i in range(0, len(downstream_internal_dinucleotides), 50)]

        # Create a DataFrame for the current file
        downstream_internal_df = pd.DataFrame(downstream_internal_rows)

        # Modify the column names to start from -80 and increment by 2
        column_names = [str(i) for i in range(-80, 20, 2)]
        downstream_internal_df.columns = column_names

        # Append the DataFrame to the list
        downstream_internal_dfs.append(downstream_internal_df)

# Concatenate all DataFrames into one
downstream_internal_dataframe = pd.concat(downstream_internal_dfs, ignore_index=True)

# Now, combined_df contains the data with dinucleotides in each row and columns starting from -80
downstream_internal_dataframe

Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,0,2,4,6,8,10,12,14,16,18
0,AA,AG,AA,GA,GC,GC,GT,AG,AA,AC,...,AT,CT,TC,CT,GC,AT,CA,TT,CT,AT
1,CC,AC,AA,CC,GT,GA,CA,GC,AG,TA,...,GG,TG,GA,GA,TT,TC,CC,TG,CC,TT
2,CC,GC,TT,TT,TT,AG,AT,TT,AA,GT,...,TC,CC,TG,TG,AT,GA,TC,TT,TA,TG
3,TC,TC,TT,TA,CG,GG,GA,GG,GA,AA,...,TT,AG,GA,TT,AT,CC,GA,TT,CA,GA
4,AA,TT,GT,TG,AC,CT,TA,TG,AA,TC,...,GT,AT,TT,GT,GA,AT,CT,CG,TA,TT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6060,GG,CC,CA,CG,GC,GT,CG,AG,GC,GG,...,TG,GC,CG,CC,GC,CA,AC,TC,CC,GG
6061,CC,GA,CG,CA,AG,CT,GC,TG,GC,CA,...,CC,GC,CG,CA,GC,GG,CG,AA,CC,GC
6062,AA,GA,AG,AC,CG,AC,AA,GC,TG,CC,...,CC,GC,CT,GG,AA,CA,CG,CC,AC,CA
6063,GC,CG,AA,GA,CC,AG,TG,CC,GC,CT,...,CC,TG,GC,TG,GT,GC,GG,AT,CG,GC


In [19]:
downstream_internal_dataframe['label'] = 0
downstream_internal_dataframe

Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,2,4,6,8,10,12,14,16,18,label
0,AA,AG,AA,GA,GC,GC,GT,AG,AA,AC,...,CT,TC,CT,GC,AT,CA,TT,CT,AT,0
1,CC,AC,AA,CC,GT,GA,CA,GC,AG,TA,...,TG,GA,GA,TT,TC,CC,TG,CC,TT,0
2,CC,GC,TT,TT,TT,AG,AT,TT,AA,GT,...,CC,TG,TG,AT,GA,TC,TT,TA,TG,0
3,TC,TC,TT,TA,CG,GG,GA,GG,GA,AA,...,AG,GA,TT,AT,CC,GA,TT,CA,GA,0
4,AA,TT,GT,TG,AC,CT,TA,TG,AA,TC,...,AT,TT,GT,GA,AT,CT,CG,TA,TT,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6060,GG,CC,CA,CG,GC,GT,CG,AG,GC,GG,...,GC,CG,CC,GC,CA,AC,TC,CC,GG,0
6061,CC,GA,CG,CA,AG,CT,GC,TG,GC,CA,...,GC,CG,CA,GC,GG,CG,AA,CC,GC,0
6062,AA,GA,AG,AC,CG,AC,AA,GC,TG,CC,...,GC,CT,GG,AA,CA,CG,CC,AC,CA,0
6063,GC,CG,AA,GA,CC,AG,TG,CC,GC,CT,...,TG,GC,TG,GT,GC,GG,AT,CG,GC,0


In [20]:
internal = pd.concat([internal_df, downstream_internal_dataframe], ignore_index=True)
internal

Unnamed: 0,-80,-78,-76,-74,-72,-70,-68,-66,-64,-62,...,2,4,6,8,10,12,14,16,18,label
0,CT,CG,TA,CG,GG,AA,GC,TA,CG,AA,...,TG,GA,AG,CT,AA,GA,AA,AT,CA,1
1,AT,CA,AA,TT,TA,CT,CT,GA,AG,AA,...,CT,AT,TG,CG,CA,GC,TT,AT,GC,1
2,GG,GC,CA,TG,CA,GT,AT,AG,TG,CT,...,AC,AG,AT,TC,AG,CA,AT,TG,CT,1
3,AC,AA,CT,GA,TT,AT,CA,TC,AA,TC,...,CA,CA,GC,TA,AA,AC,GA,AA,TC,1
4,CT,CT,TT,AA,AT,CC,GA,AA,AA,TG,...,GA,CA,GC,TT,AC,AA,GA,AA,AT,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12125,GG,CC,CA,CG,GC,GT,CG,AG,GC,GG,...,GC,CG,CC,GC,CA,AC,TC,CC,GG,0
12126,CC,GA,CG,CA,AG,CT,GC,TG,GC,CA,...,GC,CG,CA,GC,GG,CG,AA,CC,GC,0
12127,AA,GA,AG,AC,CG,AC,AA,GC,TG,CC,...,GC,CT,GG,AA,CA,CG,CC,AC,CA,0
12128,GC,CG,AA,GA,CC,AG,TG,CC,GC,CT,...,TG,GC,TG,GT,GC,GG,AT,CG,GC,0


In [21]:
X_inter = internal.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_inter = pipeline.predict(X_inter)

# Predictions
y_true_inter = internal['label']

# Calculate evaluation metrics for "internal" data
accuracy_inter = accuracy_score(y_true_inter, y_pred_inter)
precision_inter = precision_score(y_true_inter, y_pred_inter)
recall_inter = recall_score(y_true_inter, y_pred_inter)
f1_score_inter = f1_score(y_true_inter, y_pred_inter)
mcc_inter = matthews_corrcoef(y_true_inter, y_pred_inter)
tn_inter, fp_inter, fn_inter, tp_inter = confusion_matrix(y_true_inter, y_pred_inter).ravel()
specificity_inter = tn_inter / (tn_inter + fp_inter)

# Print the results for "internal" data
print("Accuracy on internal Data:", accuracy_inter)
print("Precision on internal Data:", precision_inter)
print("Recall on internal Data:", recall_inter)
print("F1 Score on internal Data:", f1_score_inter)
print("MCC on internal Data:", mcc_inter)
print("Specificity on internal Data:", specificity_inter)

Accuracy on internal Data: 0.8113767518549052
Precision on internal Data: 0.8271262774987008
Recall on internal Data: 0.7873042044517725
F1 Score on internal Data: 0.8067241088021626
MCC on internal Data: 0.6234765166737768
Specificity on internal Data: 0.8354492992580379
