In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score, precision_score, recall_score, f1_score



# Train Organism Promoter Data

In [2]:
import os
import pandas as pd

# Define the directory where your .txt files are located
directory = "D:\OHE Data\Promoter data\Training Data"

# Initialize an empty list to store DataFrames from each file
dfs = []

# Loop through all .txt files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(directory, filename), "r") as file:
            sequence_data = file.read()

        num_columns = 100

        # Filter out newline characters from the sequence data
        sequence_data = sequence_data.replace('\n', '')

        data = [list(sequence_data[i:i+num_columns]) for i in range(0, len(sequence_data), num_columns)]

        # Create a list of column names ranging from -80 to 19
        column_names = [str(i) for i in range(-80, 20)]

        # Create a DataFrame for the current file
        df = pd.DataFrame(data, columns=column_names)
        
        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames into one
combined_df = pd.concat(dfs, ignore_index=True)

# Now, combined_df contains the data from all .txt files in one DataFrame
combined_df

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,10,11,12,13,14,15,16,17,18,19
0,C,A,T,T,T,C,G,C,C,A,...,G,C,G,A,G,C,A,A,G,A
1,T,A,A,T,T,G,C,A,T,G,...,C,A,A,A,C,G,A,A,T,T
2,C,T,T,T,C,A,C,T,C,T,...,T,G,T,T,A,T,A,A,T,G
3,A,A,A,C,G,C,G,C,A,A,...,T,T,A,C,T,T,A,A,T,G
4,G,A,C,T,T,A,A,T,A,A,...,C,A,A,T,A,A,G,C,G,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14664,A,C,T,G,C,A,C,G,G,C,...,G,A,C,T,G,A,T,A,T,T
14665,T,C,A,T,T,C,A,T,T,T,...,T,C,C,T,T,G,C,C,C,A
14666,T,G,G,A,A,A,A,A,A,G,...,G,T,G,A,A,G,G,G,T,C
14667,T,T,C,A,C,T,T,G,C,T,...,A,G,G,T,A,G,G,T,G,A


In [3]:
combined_df['label']=1
combined_df

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,C,A,T,T,T,C,G,C,C,A,...,C,G,A,G,C,A,A,G,A,1
1,T,A,A,T,T,G,C,A,T,G,...,A,A,A,C,G,A,A,T,T,1
2,C,T,T,T,C,A,C,T,C,T,...,G,T,T,A,T,A,A,T,G,1
3,A,A,A,C,G,C,G,C,A,A,...,T,A,C,T,T,A,A,T,G,1
4,G,A,C,T,T,A,A,T,A,A,...,A,A,T,A,A,G,C,G,C,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14664,A,C,T,G,C,A,C,G,G,C,...,A,C,T,G,A,T,A,T,T,1
14665,T,C,A,T,T,C,A,T,T,T,...,C,C,T,T,G,C,C,C,A,1
14666,T,G,G,A,A,A,A,A,A,G,...,T,G,A,A,G,G,G,T,C,1
14667,T,T,C,A,C,T,T,G,C,T,...,G,G,T,A,G,G,T,G,A,1


# Train Organism Downstream Data

In [4]:

downstream_directory = "D:\OHE Data\Downstream data\Training Data"

# Initialize an empty list to store DataFrames from each file
downstream_dfs = []

# Loop through all .txt files in the directory
for filename in os.listdir(downstream_directory):
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(downstream_directory, filename), "r") as file:
            downstream_sequence_data = file.read()

        num_columns = 100

        # Filter out newline characters from the sequence data
        downstream_sequence_data = downstream_sequence_data.replace('\n', '')

        downstream_data = [list(downstream_sequence_data[i:i+num_columns]) for i in range(0, len(downstream_sequence_data), num_columns)]

        # Create a list of column names ranging from -80 to 19
        column_names = [str(i) for i in range(-80, 20)]

        # Create a DataFrame for the current file
        downstream_df = pd.DataFrame(downstream_data, columns=column_names)
        
        # Append the DataFrame to the list
        downstream_dfs.append(downstream_df)

# Concatenate all DataFrames into one
downstream_dataframe = pd.concat(downstream_dfs, ignore_index=True)

# Now, combined_df contains the data from all .txt files in one 
downstream_dataframe

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,10,11,12,13,14,15,16,17,18,19
0,A,G,T,A,T,C,A,T,T,T,...,G,C,C,G,G,G,A,A,G,T
1,G,G,A,G,A,A,G,G,A,A,...,G,C,C,G,C,A,T,A,C,G
2,T,G,G,A,A,G,A,C,A,A,...,A,A,C,A,G,C,T,T,G,A
3,G,T,G,A,A,T,T,T,G,T,...,G,C,T,G,C,T,T,T,A,A
4,A,A,A,G,A,C,G,C,T,C,...,T,C,T,G,A,T,G,A,T,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14664,A,T,T,G,C,T,G,G,T,T,...,G,A,G,T,T,G,C,C,C,A
14665,G,A,A,G,T,C,A,C,C,A,...,T,T,A,C,T,C,C,T,A,C
14666,T,T,T,T,T,G,G,C,G,T,...,T,A,G,A,C,A,G,T,T,G
14667,C,G,A,A,T,G,G,A,A,G,...,A,T,T,T,T,T,A,C,A,C


In [5]:
downstream_dataframe['label']=0
downstream_dataframe

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,A,G,T,A,T,C,A,T,T,T,...,C,C,G,G,G,A,A,G,T,0
1,G,G,A,G,A,A,G,G,A,A,...,C,C,G,C,A,T,A,C,G,0
2,T,G,G,A,A,G,A,C,A,A,...,A,C,A,G,C,T,T,G,A,0
3,G,T,G,A,A,T,T,T,G,T,...,C,T,G,C,T,T,T,A,A,0
4,A,A,A,G,A,C,G,C,T,C,...,C,T,G,A,T,G,A,T,A,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14664,A,T,T,G,C,T,G,G,T,T,...,A,G,T,T,G,C,C,C,A,0
14665,G,A,A,G,T,C,A,C,C,A,...,T,A,C,T,C,C,T,A,C,0
14666,T,T,T,T,T,G,G,C,G,T,...,A,G,A,C,A,G,T,T,G,0
14667,C,G,A,A,T,G,G,A,A,G,...,T,T,T,T,T,A,C,A,C,0


In [6]:
# Concatenate the two DataFrames vertically
data = pd.concat([combined_df, downstream_dataframe], ignore_index=True)
data

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,C,A,T,T,T,C,G,C,C,A,...,C,G,A,G,C,A,A,G,A,1
1,T,A,A,T,T,G,C,A,T,G,...,A,A,A,C,G,A,A,T,T,1
2,C,T,T,T,C,A,C,T,C,T,...,G,T,T,A,T,A,A,T,G,1
3,A,A,A,C,G,C,G,C,A,A,...,T,A,C,T,T,A,A,T,G,1
4,G,A,C,T,T,A,A,T,A,A,...,A,A,T,A,A,G,C,G,C,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29333,A,T,T,G,C,T,G,G,T,T,...,A,G,T,T,G,C,C,C,A,0
29334,G,A,A,G,T,C,A,C,C,A,...,T,A,C,T,C,C,T,A,C,0
29335,T,T,T,T,T,G,G,C,G,T,...,A,G,A,C,A,G,T,T,G,0
29336,C,G,A,A,T,G,G,A,A,G,...,T,T,T,T,T,A,C,A,C,0


# Model Training

In [7]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, KFold
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix

# Define features and target
X = data.drop('label', axis=1)
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)

# Create a pipeline with encoding and model
pipeline = Pipeline([
    ('encoder', OneHotEncoder(sparse=False, dtype=int)),
    ('model', XGBClassifier(
        colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_depth=None, min_child_weight=5,
        n_estimators=300, subsample=1.0, n_jobs=-1, random_state=101
    ))
])

# Create KFold cross-validation iterator
kf = KFold(n_splits=10, shuffle=True, random_state=101)

# Initialize lists to store evaluation results
accuracy_list = []
precision_list = []
recall_list = []
f1_score_list = []
mcc_list = []
specificity_list = []

# Perform KFold cross-validation
fold_no = 1
for train_idx, test_idx in kf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # Fit the pipeline on the training fold
    pipeline.fit(X_train_fold, y_train_fold)

    # Predict on the validation fold
    y_pred = pipeline.predict(X_val_fold)

    # Calculate metrics and store in lists
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred)
    recall = recall_score(y_val_fold, y_pred)
    f1 = f1_score(y_val_fold, y_pred)
    mcc = matthews_corrcoef(y_val_fold, y_pred)

    tn, fp, fn, tp = confusion_matrix(y_val_fold, y_pred).ravel()
    specificity = tn / (tn + fp)

    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_score_list.append(f1)
    mcc_list.append(mcc)
    specificity_list.append(specificity)

    # Print the results for this fold
    print("Fold {}: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}, MCC: {:.4f}, Specificity: {:.4f}".format(
        fold_no, accuracy, precision, recall, f1, mcc, specificity))

    fold_no += 1

# Calculate and print the mean and standard deviation of each metric
print("Mean Accuracy: {:.4f}, Std Accuracy: {:.4f}".format(np.mean(accuracy_list), np.std(accuracy_list)))
print("Mean Precision: {:.4f}, Std Precision: {:.4f}".format(np.mean(precision_list), np.std(precision_list)))
print("Mean Recall: {:.4f}, Std Recall: {:.4f}".format(np.mean(recall_list), np.std(recall_list)))
print("Mean F1 Score: {:.4f}, Std F1 Score: {:.4f}".format(np.mean(f1_score_list), np.std(f1_score_list)))
print("Mean MCC: {:.4f}, Std MCC: {:.4f}".format(np.mean(mcc_list), np.std(mcc_list)))
print("Mean Specificity: {:.4f}, Std Specificity: {:.4f}".format(np.mean(specificity_list), np.std(specificity_list)))


Fold 1: Accuracy: 0.8966, Precision: 0.9230, Recall: 0.8721, F1 Score: 0.8968, MCC: 0.7947, Specificity: 0.9227
Fold 2: Accuracy: 0.8936, Precision: 0.9096, Recall: 0.8696, F1 Score: 0.8892, MCC: 0.7877, Specificity: 0.9167
Fold 3: Accuracy: 0.8898, Precision: 0.9098, Recall: 0.8683, F1 Score: 0.8885, MCC: 0.7806, Specificity: 0.9119
Fold 4: Accuracy: 0.8822, Precision: 0.9031, Recall: 0.8558, F1 Score: 0.8788, MCC: 0.7655, Specificity: 0.9085
Fold 5: Accuracy: 0.8943, Precision: 0.9110, Recall: 0.8745, F1 Score: 0.8924, MCC: 0.7893, Specificity: 0.9142
Fold 6: Accuracy: 0.8981, Precision: 0.9203, Recall: 0.8757, F1 Score: 0.8974, MCC: 0.7973, Specificity: 0.9213
Fold 7: Accuracy: 0.8890, Precision: 0.9082, Recall: 0.8696, F1 Score: 0.8885, MCC: 0.7789, Specificity: 0.9091
Fold 8: Accuracy: 0.8966, Precision: 0.9035, Recall: 0.8802, F1 Score: 0.8917, MCC: 0.7930, Specificity: 0.9120
Fold 9: Accuracy: 0.8833, Precision: 0.9042, Recall: 0.8559, F1 Score: 0.8794, MCC: 0.7677, Specificity:

In [8]:
import pickle

# Save the trained model as a pickle file
with open('ohe_mononucleotide.pickle', 'wb') as file:
    pickle.dump(pipeline, file)


# Bacillus

In [10]:
with open("D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequence_Bacillus_amyloliquefaciens.txt", "r") as file:
    Bacillus_seq = file.read()
    
with open("D:\OHE Data\Downstream data\Training Data\Bacillus amyloliquefaciens.txt", "r") as file:
    Bacillus_downstream_seq = file.read()
    
num_columns = 100

# Filter out newline characters from the sequence data
Bacillus_seq = Bacillus_seq.replace('\n', '')

Bacillus_data = [list(Bacillus_seq[i:i+num_columns]) for i in range(0, len(Bacillus_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Bacillus_df = pd.DataFrame(Bacillus_data, columns=column_names)

Bacillus_df['label'] = 1


num_columns = 100

# Filter out newline characters from the sequence data
Bacillus_downstream_seq = Bacillus_downstream_seq.replace('\n', '')

Bacillus_downstream_data = [list(Bacillus_downstream_seq[i:i+num_columns]) for i in range(0, len(Bacillus_downstream_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Bacillus_downstream_df = pd.DataFrame(Bacillus_downstream_data, columns=column_names)

Bacillus_downstream_df['label'] = 0

Bacillus = pd.concat([Bacillus_df, Bacillus_downstream_df], ignore_index=True)

# Predictions:

X_Bac = Bacillus.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Bac = pipeline.predict(X_Bac)

y_true_Bac = Bacillus['label']  # Replace 'new_data' with the actual variable containing your new data

# Calculate accuracy
accuracy_new = accuracy_score(y_true_Bac, y_pred_Bac)

# Calculate precision
precision_new = precision_score(y_true_Bac, y_pred_Bac)

# Calculate recall
recall_new = recall_score(y_true_Bac, y_pred_Bac)

# Calculate F1 score
f1_score_new = f1_score(y_true_Bac, y_pred_Bac)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_new = matthews_corrcoef(y_true_Bac, y_pred_Bac)

# Calculate specificity
tn_new, fp_new, fn_new, tp_new = confusion_matrix(y_true_Bac, y_pred_Bac).ravel()
specificity_new = tn_new / (tn_new + fp_new)

# Print the results
print("Accuracy on Bacillus Data:", accuracy_new)
print("Precision on Bacillus Data:", precision_new)
print("Recall on Bacillus Data:", recall_new)
print("F1 Score on Bacillus Data:", f1_score_new)
print("MCC on Bacillus Data:", mcc_new)
print("Specificity on Bacillus Data:", specificity_new)


Accuracy on Bacillus Data: 0.9216241737488197
Precision on Bacillus Data: 0.9724867724867725
Recall on Bacillus Data: 0.8677998111425873
F1 Score on Bacillus Data: 0.9171656686626746
MCC on Bacillus Data: 0.8481771114151446
Specificity on Bacillus Data: 0.9754485363550519


# Chlamydia

In [12]:
with open("D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequences Chlamydia pneumoniae.txt", "r") as file:
    Chlamydia_seq = file.read()
    
with open("D:\OHE Data\Downstream data\Training Data\Chlamydia pneumoniae.txt", "r") as file:
    Chlamydia_downstream_seq = file.read()
    
num_columns = 100

# Filter out newline characters from the sequence data
Chlamydia_seq = Chlamydia_seq.replace('\n', '')

Chlamydia_data = [list(Chlamydia_seq[i:i+num_columns]) for i in range(0, len(Chlamydia_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Chlamydia_df = pd.DataFrame(Chlamydia_data, columns=column_names)

Chlamydia_df['label'] = 1


num_columns = 100

# Filter out newline characters from the sequence data
Chlamydia_downstream_seq = Chlamydia_downstream_seq.replace('\n', '')

Chlamydia_downstream_data = [list(Chlamydia_downstream_seq[i:i+num_columns]) for i in range(0, len(Chlamydia_downstream_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Chlamydia_downstream_df = pd.DataFrame(Chlamydia_downstream_data, columns=column_names)

Chlamydia_downstream_df['label'] = 0

Chlamydia = pd.concat([Chlamydia_df, Chlamydia_downstream_df], ignore_index=True)

# Predictions:

X_Chlm = Chlamydia.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Chlm = pipeline.predict(X_Chlm)

y_true_Chlm = Chlamydia['label']  # Replace 'new_data' with the actual variable containing your new data

# Calculate accuracy
accuracy_new = accuracy_score(y_true_Chlm, y_pred_Chlm)

# Calculate precision
precision_new = precision_score(y_true_Chlm, y_pred_Chlm)

# Calculate recall
recall_new = recall_score(y_true_Chlm, y_pred_Chlm)

# Calculate F1 score
f1_score_new = f1_score(y_true_Chlm, y_pred_Chlm)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_new = matthews_corrcoef(y_true_Chlm, y_pred_Chlm)

# Calculate specificity
tn_new, fp_new, fn_new, tp_new = confusion_matrix(y_true_Chlm, y_pred_Chlm).ravel()
specificity_new = tn_new / (tn_new + fp_new)

# Print the results
print("Accuracy on Chlamydia Data:", accuracy_new)
print("Precision on Chlamydia Data:", precision_new)
print("Recall on Chlamydia Data:", recall_new)
print("F1 Score on Chlamydia Data:", f1_score_new)
print("MCC on Chlamydia Data:", mcc_new)
print("Specificity on Chlamydia Data:", specificity_new)


Accuracy on Chlamydia Data: 0.9542079207920792
Precision on Chlamydia Data: 0.9379474940334129
Recall on Chlamydia Data: 0.9727722772277227
F1 Score on Chlamydia Data: 0.9550425273390036
MCC on Chlamydia Data: 0.9090426340896282
Specificity on Chlamydia Data: 0.9356435643564357


# Corynebacterium

In [13]:
with open("D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequences Corynebacterium glutamicum.txt", "r") as file:
    Corynebacterium_seq = file.read()

with open("D:\OHE Data\Downstream data\Training Data\Corynebacterium glutamicum.txt", "r") as file:
    Corynebacterium_downstream_seq = file.read()

num_columns = 100

# Filter out newline characters from the sequence data
Corynebacterium_seq = Corynebacterium_seq.replace('\n', '')

Corynebacterium_data = [list(Corynebacterium_seq[i:i+num_columns]) for i in range(0, len(Corynebacterium_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Corynebacterium_df = pd.DataFrame(Corynebacterium_data, columns=column_names)

Corynebacterium_df['label'] = 1

num_columns = 100

# Filter out newline characters from the sequence data
Corynebacterium_downstream_seq = Corynebacterium_downstream_seq.replace('\n', '')

Corynebacterium_downstream_data = [list(Corynebacterium_downstream_seq[i:i+num_columns]) for i in range(0, len(Corynebacterium_downstream_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Corynebacterium_downstream_df = pd.DataFrame(Corynebacterium_downstream_data, columns=column_names)

Corynebacterium_downstream_df['label'] = 0

Corynebacterium = pd.concat([Corynebacterium_df, Corynebacterium_downstream_df], ignore_index=True)

# Predictions:

X_Corn = Corynebacterium.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Corn = pipeline.predict(X_Corn)

y_true_Corn = Corynebacterium['label']  # Replace 'new_data' with the actual variable containing your new data

# Calculate accuracy
accuracy_new = accuracy_score(y_true_Corn, y_pred_Corn)

# Calculate precision
precision_new = precision_score(y_true_Corn, y_pred_Corn)

# Calculate recall
recall_new = recall_score(y_true_Corn, y_pred_Corn)

# Calculate F1 score
f1_score_new = f1_score(y_true_Corn, y_pred_Corn)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_new = matthews_corrcoef(y_true_Corn, y_pred_Corn)

# Calculate specificity
tn_new, fp_new, fn_new, tp_new = confusion_matrix(y_true_Corn, y_pred_Corn).ravel()
specificity_new = tn_new / (tn_new + fp_new)

# Print the results
print("Accuracy on Corynebacterium Data:", accuracy_new)
print("Precision on Corynebacterium Data:", precision_new)
print("Recall on Corynebacterium Data:", recall_new)
print("F1 Score on Corynebacterium Data:", f1_score_new)
print("MCC on Corynebacterium Data:", mcc_new)
print("Specificity on Corynebacterium Data:", specificity_new)


Accuracy on Corynebacterium Data: 0.9852173913043478
Precision on Corynebacterium Data: 0.984375
Recall on Corynebacterium Data: 0.9860869565217392
F1 Score on Corynebacterium Data: 0.9852302345786274
MCC on Corynebacterium Data: 0.9704362501882555
Specificity on Corynebacterium Data: 0.9843478260869565


# E.coli

In [14]:
with open("D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequences E.coli.txt", "r") as file:
    Ecoli_seq = file.read()
    
with open("D:\OHE Data\Downstream data\Training Data\E.coli.txt", "r") as file:
    Ecoli_downstream_seq = file.read()
    
num_columns = 100

# Filter out newline characters from the sequence data
Ecoli_seq = Ecoli_seq.replace('\n', '')

Ecoli_data = [list(Ecoli_seq[i:i+num_columns]) for i in range(0, len(Ecoli_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Ecoli_df = pd.DataFrame(Ecoli_data, columns=column_names)

Ecoli_df['label'] = 1

num_columns = 100

# Filter out newline characters from the sequence data
Ecoli_downstream_seq = Ecoli_downstream_seq.replace('\n', '')

Ecoli_downstream_data = [list(Ecoli_downstream_seq[i:i+num_columns]) for i in range(0, len(Ecoli_downstream_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Ecoli_downstream_df = pd.DataFrame(Ecoli_downstream_data, columns=column_names)

Ecoli_downstream_df['label'] = 0

Ecoli = pd.concat([Ecoli_df, Ecoli_downstream_df], ignore_index=True)

# Predictions:

X_Ecoli = Ecoli.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Ecoli = pipeline.predict(X_Ecoli)

y_true_Ecoli = Ecoli['label']

# Calculate accuracy
accuracy_new = accuracy_score(y_true_Ecoli, y_pred_Ecoli)

# Calculate precision
precision_new = precision_score(y_true_Ecoli, y_pred_Ecoli)

# Calculate recall
recall_new = recall_score(y_true_Ecoli, y_pred_Ecoli)

# Calculate F1 score
f1_score_new = f1_score(y_true_Ecoli, y_pred_Ecoli)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_new = matthews_corrcoef(y_true_Ecoli, y_pred_Ecoli)

# Calculate specificity
tn_new, fp_new, fn_new, tp_new = confusion_matrix(y_true_Ecoli, y_pred_Ecoli).ravel()
specificity_new = tn_new / (tn_new + fp_new)

# Print the results
print("Accuracy on Ecoli Data:", accuracy_new)
print("Precision on Ecoli Data:", precision_new)
print("Recall on Ecoli Data:", recall_new)
print("F1 Score on Ecoli Data:", f1_score_new)
print("MCC on Ecoli Data:", mcc_new)
print("Specificity on Ecoli Data:", specificity_new)


Accuracy on Ecoli Data: 0.9726181545386346
Precision on Ecoli Data: 0.9701492537313433
Recall on Ecoli Data: 0.9752438109527382
F1 Score on Ecoli Data: 0.9726898615787505
MCC on Ecoli Data: 0.9452493424004246
Specificity on Ecoli Data: 0.9699924981245311


# Helicobactoer

In [16]:
with open(r"D:\OHE Data\Promoter data\Training Data\any except ATGC Helicobacter_pylori.txt", "r") as file:
    Helico_seq = file.read()
    
with open("D:\OHE Data\Downstream data\Training Data\Helicobacter pylori.txt", "r") as file:
    Helico_downstream_seq = file.read()
    
num_columns = 100

# Filter out newline characters from the sequence data
Helico_seq = Helico_seq.replace('\n', '')

Helico_data = [list(Helico_seq[i:i+num_columns]) for i in range(0, len(Helico_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Helico_df = pd.DataFrame(Helico_data, columns=column_names)

Helico_df['label'] = 1


num_columns = 100

# Filter out newline characters from the sequence data
Helico_downstream_seq = Helico_downstream_seq.replace('\n', '')

Helico_downstream_data = [list(Helico_downstream_seq[i:i+num_columns]) for i in range(0, len(Helico_downstream_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Helico_downstream_df = pd.DataFrame(Helico_downstream_data, columns=column_names)

Helico_downstream_df['label'] = 0

Helico = pd.concat([Helico_df, Helico_downstream_df], ignore_index=True)

# Predictions:

X_Helico = Helico.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Helico = pipeline.predict(X_Helico)

y_true_Helico = Helico['label']  # Replace 'new_data' with the actual variable containing your new data

# Calculate accuracy
accuracy_new = accuracy_score(y_true_Helico, y_pred_Helico)

# Calculate precision
precision_new = precision_score(y_true_Helico, y_pred_Helico)

# Calculate recall
recall_new = recall_score(y_true_Helico, y_pred_Helico)

# Calculate F1 score
f1_score_new = f1_score(y_true_Helico, y_pred_Helico)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_new = matthews_corrcoef(y_true_Helico, y_pred_Helico)

# Calculate specificity
tn_new, fp_new, fn_new, tp_new = confusion_matrix(y_true_Helico, y_pred_Helico).ravel()
specificity_new = tn_new / (tn_new + fp_new)

# Print the results
print("Accuracy on Helicobacter Data:", accuracy_new)
print("Precision on Helicobacter Data:", precision_new)
print("Recall on Helicobacter Data:", recall_new)
print("F1 Score on Helicobacter Data:", f1_score_new)
print("MCC on Helicobacter Data:", mcc_new)
print("Specificity on Helicobacter Data:", specificity_new)


Accuracy on Helicobacter Data: 0.976628895184136
Precision on Helicobacter Data: 0.960328317373461
Recall on Helicobacter Data: 0.9943342776203966
F1 Score on Helicobacter Data: 0.9770354906054279
MCC on Helicobacter Data: 0.9538560086021217
Specificity on Helicobacter Data: 0.9589235127478754


# Mycobacterium

In [17]:
with open("D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequences Mycobacterium tuberculosis.txt", "r") as file:
    Mycobacterium_seq = file.read()
    
with open("D:\OHE Data\Downstream data\Training Data\Mycobacterium tuberculosis.txt", "r") as file:
    Mycobacterium_downstream_seq = file.read()
    
num_columns = 100

# Filter out newline characters from the sequence data
Mycobacterium_seq = Mycobacterium_seq.replace('\n', '')

Mycobacterium_data = [list(Mycobacterium_seq[i:i+num_columns]) for i in range(0, len(Mycobacterium_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Mycobacterium_df = pd.DataFrame(Mycobacterium_data, columns=column_names)

Mycobacterium_df['label'] = 1


num_columns = 100

# Filter out newline characters from the sequence data
Mycobacterium_downstream_seq = Mycobacterium_downstream_seq.replace('\n', '')

Mycobacterium_downstream_data = [list(Mycobacterium_downstream_seq[i:i+num_columns]) for i in range(0, len(Mycobacterium_downstream_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Mycobacterium_downstream_df = pd.DataFrame(Mycobacterium_downstream_data, columns=column_names)

Mycobacterium_downstream_df['label'] = 0

Mycobacterium = pd.concat([Mycobacterium_df, Mycobacterium_downstream_df], ignore_index=True)

# Predictions:

X_Myco = Mycobacterium.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Myco = pipeline.predict(X_Myco)

y_true_Myco = Mycobacterium['label']  # Replace 'new_data' with the actual variable containing your new data

# Calculate accuracy
accuracy_new = accuracy_score(y_true_Myco, y_pred_Myco)

# Calculate precision
precision_new = precision_score(y_true_Myco, y_pred_Myco)

# Calculate recall
recall_new = recall_score(y_true_Myco, y_pred_Myco)

# Calculate F1 score
f1_score_new = f1_score(y_true_Myco, y_pred_Myco)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_new = matthews_corrcoef(y_true_Myco, y_pred_Myco)

# Calculate specificity
tn_new, fp_new, fn_new, tp_new = confusion_matrix(y_true_Myco, y_pred_Myco).ravel()
specificity_new = tn_new / (tn_new + fp_new)

# Print the results
print("Accuracy on Mycobacterium Data:", accuracy_new)
print("Precision on Mycobacterium Data:", precision_new)
print("Recall on Mycobacterium Data:", recall_new)
print("F1 Score on Mycobacterium Data:", f1_score_new)
print("MCC on Mycobacterium Data:", mcc_new)
print("Specificity on Mycobacterium Data:", specificity_new)


Accuracy on Mycobacterium Data: 0.981941309255079
Precision on Mycobacterium Data: 0.9913693901035673
Recall on Mycobacterium Data: 0.9723476297968398
F1 Score on Mycobacterium Data: 0.9817663817663818
MCC on Mycobacterium Data: 0.9640600964946423
Specificity on Mycobacterium Data: 0.9915349887133182


# Nostoc

In [19]:
with open("D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequences Nostoc sp.txt", "r") as file:
    Nostoc_seq = file.read()

with open(r"D:\OHE Data\Downstream data\Training Data\Nostoc sp.txt", "r") as file:
    Nostoc_downstream_seq = file.read()

num_columns = 100

# Filter out newline characters from the sequence data
Nostoc_seq = Nostoc_seq.replace('\n', '')

Nostoc_data = [list(Nostoc_seq[i:i+num_columns]) for i in range(0, len(Nostoc_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Nostoc_df = pd.DataFrame(Nostoc_data, columns=column_names)

Nostoc_df['label'] = 1

num_columns = 100

# Filter out newline characters from the sequence data
Nostoc_downstream_seq = Nostoc_downstream_seq.replace('\n', '')

Nostoc_downstream_data = [list(Nostoc_downstream_seq[i:i+num_columns]) for i in range(0, len(Nostoc_downstream_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Nostoc_downstream_df = pd.DataFrame(Nostoc_downstream_data, columns=column_names)

Nostoc_downstream_df['label'] = 0

Nostoc = pd.concat([Nostoc_df, Nostoc_downstream_df], ignore_index=True)

# Predictions:

X_Nost = Nostoc.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Nost = pipeline.predict(X_Nost)

y_true_Nost = Nostoc['label']  # Replace 'new_data' with the actual variable containing your new data

# Calculate accuracy
accuracy_new = accuracy_score(y_true_Nost, y_pred_Nost)

# Calculate precision
precision_new = precision_score(y_true_Nost, y_pred_Nost)

# Calculate recall
recall_new = recall_score(y_true_Nost, y_pred_Nost)

# Calculate F1 score
f1_score_new = f1_score(y_true_Nost, y_pred_Nost)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_new = matthews_corrcoef(y_true_Nost, y_pred_Nost)

# Calculate specificity
tn_new, fp_new, fn_new, tp_new = confusion_matrix(y_true_Nost, y_pred_Nost).ravel()
specificity_new = tn_new / (tn_new + fp_new)

# Print the results
print("Accuracy on Nostoc Data:", accuracy_new)
print("Precision on Nostoc Data:", precision_new)
print("Recall on Nostoc Data:", recall_new)
print("F1 Score on Nostoc Data:", f1_score_new)
print("MCC on Nostoc Data:", mcc_new)
print("Specificity on Nostoc Data:", specificity_new)


Accuracy on Nostoc Data: 0.9751688518077076
Precision on Nostoc Data: 0.9624903325599381
Recall on Nostoc Data: 0.988875645609853
F1 Score on Nostoc Data: 0.9755046051342349
MCC on Nostoc Data: 0.9506949966745913
Specificity on Nostoc Data: 0.9614620580055622


# Pseudomonas

In [21]:
with open(r"D:\OHE Data\Promoter data\Training Data\any except ATGC Pseudomonas aeruginosa.txt", "r") as file:
    Pseudomonas_seq = file.read()

with open("D:\OHE Data\Downstream data\Training Data\Pseudomonas aeruginosa.txt", "r") as file:
    Pseudomonas_downstream_seq = file.read()

num_columns = 100

# Filter out newline characters from the sequence data
Pseudomonas_seq = Pseudomonas_seq.replace('\n', '')

Pseudomonas_data = [list(Pseudomonas_seq[i:i+num_columns]) for i in range(0, len(Pseudomonas_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Pseudomonas_df = pd.DataFrame(Pseudomonas_data, columns=column_names)

Pseudomonas_df['label'] = 1

num_columns = 100

# Filter out newline characters from the sequence data
Pseudomonas_downstream_seq = Pseudomonas_downstream_seq.replace('\n', '')

Pseudomonas_downstream_data = [list(Pseudomonas_downstream_seq[i:i+num_columns]) for i in range(0, len(Pseudomonas_downstream_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Pseudomonas_downstream_df = pd.DataFrame(Pseudomonas_downstream_data, columns=column_names)

Pseudomonas_downstream_df['label'] = 0

Pseudomonas = pd.concat([Pseudomonas_df, Pseudomonas_downstream_df], ignore_index=True)

# Predictions:

X_Pseudo = Pseudomonas.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Pseudo = pipeline.predict(X_Pseudo)

y_true_Pseudo = Pseudomonas['label']  # Replace 'new_data' with the actual variable containing your new data

# Calculate accuracy
accuracy_new = accuracy_score(y_true_Pseudo, y_pred_Pseudo)

# Calculate precision
precision_new = precision_score(y_true_Pseudo, y_pred_Pseudo)

# Calculate recall
recall_new = recall_score(y_true_Pseudo, y_pred_Pseudo)

# Calculate F1 score
f1_score_new = f1_score(y_true_Pseudo, y_pred_Pseudo)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_new = matthews_corrcoef(y_true_Pseudo, y_pred_Pseudo)

# Calculate specificity
tn_new, fp_new, fn_new, tp_new = confusion_matrix(y_true_Pseudo, y_pred_Pseudo).ravel()
specificity_new = tn_new / (tn_new + fp_new)

# Print the results
print("Accuracy on Pseudomonas Data:", accuracy_new)
print("Precision on Pseudomonas Data:", precision_new)
print("Recall on Pseudomonas Data:", recall_new)
print("F1 Score on Pseudomonas Data:", f1_score_new)
print("MCC on Pseudomonas Data:", mcc_new)
print("Specificity on Pseudomonas Data:", specificity_new)


Accuracy on Pseudomonas Data: 0.9622503653190453
Precision on Pseudomonas Data: 0.9932432432432432
Recall on Pseudomonas Data: 0.930832927423283
F1 Score on Pseudomonas Data: 0.9610258989187831
MCC on Pseudomonas Data: 0.9263312196915968
Specificity on Pseudomonas Data: 0.9936678032148076


# Streptomyces

In [22]:
with open("D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequences Streptomyces coelicolor.txt", "r") as file:
    Streptomyces_seq = file.read()
    
with open("D:\OHE Data\Downstream data\Training Data\Streptomyces coelicolor.txt", "r") as file:
    Streptomyces_downstream_seq = file.read()
    
num_columns = 100

# Filter out newline characters from the sequence data
Streptomyces_seq = Streptomyces_seq.replace('\n', '')

Streptomyces_data = [list(Streptomyces_seq[i:i+num_columns]) for i in range(0, len(Streptomyces_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Streptomyces_df = pd.DataFrame(Streptomyces_data, columns=column_names)

Streptomyces_df['label'] = 1

num_columns = 100

# Filter out newline characters from the sequence data
Streptomyces_downstream_seq = Streptomyces_downstream_seq.replace('\n', '')

Streptomyces_downstream_data = [list(Streptomyces_downstream_seq[i:i+num_columns]) for i in range(0, len(Streptomyces_downstream_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Streptomyces_downstream_df = pd.DataFrame(Streptomyces_downstream_data, columns=column_names)

Streptomyces_downstream_df['label'] = 0

Streptomyces = pd.concat([Streptomyces_df, Streptomyces_downstream_df], ignore_index=True)

# Predictions:

X_Strep = Streptomyces.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Strep = pipeline.predict(X_Strep)

y_true_Strep = Streptomyces['label']  # Replace 'new_data' with the actual variable containing your new data

# Calculate accuracy
accuracy_new = accuracy_score(y_true_Strep, y_pred_Strep)

# Calculate precision
precision_new = precision_score(y_true_Strep, y_pred_Strep)

# Calculate recall
recall_new = recall_score(y_true_Strep, y_pred_Strep)

# Calculate F1 score
f1_score_new = f1_score(y_true_Strep, y_pred_Strep)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_new = matthews_corrcoef(y_true_Strep, y_pred_Strep)

# Calculate specificity
tn_new, fp_new, fn_new, tp_new = confusion_matrix(y_true_Strep, y_pred_Strep).ravel()
specificity_new = tn_new / (tn_new + fp_new)

# Print the results
print("Accuracy on Streptomyces Data:", accuracy_new)
print("Precision on Streptomyces Data:", precision_new)
print("Recall on Streptomyces Data:", recall_new)
print("F1 Score on Streptomyces Data:", f1_score_new)
print("MCC on Streptomyces Data:", mcc_new)
print("Specificity on Streptomyces Data:", specificity_new)


Accuracy on Streptomyces Data: 0.976254308693987
Precision on Streptomyces Data: 0.9968038353975229
Recall on Streptomyces Data: 0.9555725775564917
F1 Score on Streptomyces Data: 0.9757528353539303
MCC on Streptomyces Data: 0.9533245051334843
Specificity on Streptomyces Data: 0.9969360398314822


# Synechocystis

In [23]:
with open("D:\OHE Data\Promoter data\Training Data\Sequences_80-20_sequences Synechocystis sp.txt", "r") as file:
    Synechocystis_seq = file.read()

with open("D:\OHE Data\Downstream data\Training Data\Synechocystis sp.txt", "r") as file:
    Synechocystis_downstream_seq = file.read()

num_columns = 100

# Filter out newline characters from the sequence data
Synechocystis_seq = Synechocystis_seq.replace('\n', '')

Synechocystis_data = [list(Synechocystis_seq[i:i+num_columns]) for i in range(0, len(Synechocystis_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Synechocystis_df = pd.DataFrame(Synechocystis_data, columns=column_names)

Synechocystis_df['label'] = 1

num_columns = 100

# Filter out newline characters from the sequence data
Synechocystis_downstream_seq = Synechocystis_downstream_seq.replace('\n', '')

Synechocystis_downstream_data = [list(Synechocystis_downstream_seq[i:i+num_columns]) for i in range(0, len(Synechocystis_downstream_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Synechocystis_downstream_df = pd.DataFrame(Synechocystis_downstream_data, columns=column_names)

Synechocystis_downstream_df['label'] = 0

Synechocystis = pd.concat([Synechocystis_df, Synechocystis_downstream_df], ignore_index=True)

# Predictions:

X_Sync = Synechocystis.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Sync = pipeline.predict(X_Sync)

y_true_Sync = Synechocystis['label']

# Calculate accuracy
accuracy_new = accuracy_score(y_true_Sync, y_pred_Sync)

# Calculate precision
precision_new = precision_score(y_true_Sync, y_pred_Sync)

# Calculate recall
recall_new = recall_score(y_true_Sync, y_pred_Sync)

# Calculate F1 score
f1_score_new = f1_score(y_true_Sync, y_pred_Sync)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_new = matthews_corrcoef(y_true_Sync, y_pred_Sync)

# Calculate specificity
tn_new, fp_new, fn_new, tp_new = confusion_matrix(y_true_Sync, y_pred_Sync).ravel()
specificity_new = tn_new / (tn_new + fp_new)

# Print the results
print("Accuracy on Synechocystis Data:", accuracy_new)
print("Precision on Synechocystis Data:", precision_new)
print("Recall on Synechocystis Data:", recall_new)
print("F1 Score on Synechocystis Data:", f1_score_new)
print("MCC on Synechocystis Data:", mcc_new)
print("Specificity on Synechocystis Data:", specificity_new)


Accuracy on Synechocystis Data: 0.9850518608907871
Precision on Synechocystis Data: 0.9812348668280871
Recall on Synechocystis Data: 0.9890176937156803
F1 Score on Synechocystis Data: 0.9851109085384382
MCC on Synechocystis Data: 0.9701342384742931
Specificity on Synechocystis Data: 0.9810860280658938


# Klebsiella

In [24]:
with open("D:\OHE Data\Promoter data\Test Data\Sequences_80-20_sequences Klebsiella pneumoniae.txt", "r") as file:
    Klebsiella_seq = file.read()
    
with open("D:\OHE Data\Downstream data\Test Data\Klebsiella pneumoniae.txt", "r") as file:
    Klebsiella_downstream_seq = file.read()
    
num_columns = 100

# Filter out newline characters from the sequence data
Klebsiella_seq = Klebsiella_seq.replace('\n', '')

Klebsiella_data = [list(Klebsiella_seq[i:i+num_columns]) for i in range(0, len(Klebsiella_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Klebsiella_df = pd.DataFrame(Klebsiella_data, columns=column_names)

Klebsiella_df['label'] = 1

num_columns = 100

# Filter out newline characters from the sequence data
Klebsiella_downstream_seq = Klebsiella_downstream_seq.replace('\n', '')

Klebsiella_downstream_data = [list(Klebsiella_downstream_seq[i:i+num_columns]) for i in range(0, len(Klebsiella_downstream_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Klebsiella_downstream_df = pd.DataFrame(Klebsiella_downstream_data, columns=column_names)

Klebsiella_downstream_df['label'] = 0

Klebsiella = pd.concat([Klebsiella_df, Klebsiella_downstream_df], ignore_index=True)

# Predictions:

X_Kleb = Klebsiella.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Kleb = pipeline.predict(X_Kleb)

y_true_Kleb = Klebsiella['label']  # Replace 'new_data' with the actual variable containing your new data

# Calculate accuracy
accuracy_new = accuracy_score(y_true_Kleb, y_pred_Kleb)

# Calculate precision
precision_new = precision_score(y_true_Kleb, y_pred_Kleb)

# Calculate recall
recall_new = recall_score(y_true_Kleb, y_pred_Kleb)

# Calculate F1 score
f1_score_new = f1_score(y_true_Kleb, y_pred_Kleb)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_new = matthews_corrcoef(y_true_Kleb, y_pred_Kleb)

# Calculate specificity
tn_new, fp_new, fn_new, tp_new = confusion_matrix(y_true_Kleb, y_pred_Kleb).ravel()
specificity_new = tn_new / (tn_new + fp_new)

# Print the results
print("Accuracy on Klebsiella Data:", accuracy_new)
print("Precision on Klebsiella Data:", precision_new)
print("Recall on Klebsiella Data:", recall_new)
print("F1 Score on Klebsiella Data:", f1_score_new)
print("MCC on Klebsiella Data:", mcc_new)
print("Specificity on Klebsiella Data:", specificity_new)


Accuracy on Klebsiella Data: 0.9273890142964635
Precision on Klebsiella Data: 0.9264264264264265
Recall on Klebsiella Data: 0.928517682468021
F1 Score on Klebsiella Data: 0.9274708756106728
MCC on Klebsiella Data: 0.8547802063907637
Specificity on Klebsiella Data: 0.9262603461249059


# Salmonella

In [25]:
with open("D:\OHE Data\Promoter data\Test Data\Sequences_80-20_sequences Salmonella enterica.txt", "r") as file:
    Salmonella_seq = file.read()

with open("D:\OHE Data\Downstream data\Test Data\Salmonella enterica.txt", "r") as file:
    Salmonella_downstream_seq = file.read()

num_columns = 100

# Filter out newline characters from the sequence data
Salmonella_seq = Salmonella_seq.replace('\n', '')

Salmonella_data = [list(Salmonella_seq[i:i+num_columns]) for i in range(0, len(Salmonella_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Salmonella_df = pd.DataFrame(Salmonella_data, columns=column_names)

Salmonella_df['label'] = 1

num_columns = 100

# Filter out newline characters from the sequence data
Salmonella_downstream_seq = Salmonella_downstream_seq.replace('\n', '')

Salmonella_downstream_data = [list(Salmonella_downstream_seq[i:i+num_columns]) for i in range(0, len(Salmonella_downstream_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Salmonella_downstream_df = pd.DataFrame(Salmonella_downstream_data, columns=column_names)

Salmonella_downstream_df['label'] = 0

Salmonella = pd.concat([Salmonella_df, Salmonella_downstream_df], ignore_index=True)

# Predictions:

X_Salm = Salmonella.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Salm = pipeline.predict(X_Salm)

y_true_Salm = Salmonella['label']  # Replace 'new_data' with the actual variable containing your new data

# Calculate accuracy
accuracy_new = accuracy_score(y_true_Salm, y_pred_Salm)

# Calculate precision
precision_new = precision_score(y_true_Salm, y_pred_Salm)

# Calculate recall
recall_new = recall_score(y_true_Salm, y_pred_Salm)

# Calculate F1 score
f1_score_new = f1_score(y_true_Salm, y_pred_Salm)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_new = matthews_corrcoef(y_true_Salm, y_pred_Salm)

# Calculate specificity
tn_new, fp_new, fn_new, tp_new = confusion_matrix(y_true_Salm, y_pred_Salm).ravel()
specificity_new = tn_new / (tn_new + fp_new)

# Print the results
print("Accuracy on Salmonella Data:", accuracy_new)
print("Precision on Salmonella Data:", precision_new)
print("Recall on Salmonella Data:", recall_new)
print("F1 Score on Salmonella Data:", f1_score_new)
print("MCC on Salmonella Data:", mcc_new)
print("Specificity on Salmonella Data:", specificity_new)


Accuracy on Salmonella Data: 0.9327217125382263
Precision on Salmonella Data: 0.9054441260744985
Recall on Salmonella Data: 0.9663608562691132
F1 Score on Salmonella Data: 0.9349112426035503
MCC on Salmonella Data: 0.8674087571577701
Specificity on Salmonella Data: 0.8990825688073395


# Test Promoter Data

In [15]:
test_d = "D:\OHE Data\Promoter data\Test Data"

# Initialize an empty list to store DataFrames from each file
test_dfs = []

# Loop through all .txt files in the directory
for filename in os.listdir(test_d):
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(test_d, filename), "r") as file:
            test_sequence = file.read()

        num_columns = 100

        # Filter out newline characters from the sequence data
        test_sequence = test_sequence.replace('\n', '')

        test_data = [list(test_sequence[i:i+num_columns]) for i in range(0, len(test_sequence), num_columns)]

        # Create a list of column names ranging from -80 to 19
        column_names = [str(i) for i in range(-80, 20)]

        # Create a DataFrame for the current file
        test_df = pd.DataFrame(test_data, columns=column_names)
        
        # Append the DataFrame to the list
        test_dfs.append(test_df)

# Concatenate all DataFrames into one
test_data = pd.concat(test_dfs, ignore_index=True)

# Now, combined_df contains the data from all .txt files in one 
test_data

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,10,11,12,13,14,15,16,17,18,19
0,T,C,C,A,T,A,T,C,A,G,...,C,C,T,C,T,T,T,G,C,C
1,C,C,T,G,G,A,A,T,G,A,...,G,C,A,A,G,G,A,C,T,G
2,C,G,C,A,G,A,G,T,G,T,...,G,A,C,A,A,G,A,G,G,A
3,T,A,A,T,C,T,G,C,A,T,...,C,A,T,C,C,C,C,T,T,A
4,T,G,C,T,G,T,A,A,T,C,...,G,T,T,T,G,A,A,C,G,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2305,C,T,T,T,C,T,A,A,A,C,...,C,C,T,G,C,C,A,G,C,G
2306,A,C,G,A,T,C,T,G,A,A,...,G,A,G,T,A,G,C,C,T,G
2307,T,T,A,A,A,T,A,T,C,T,...,G,A,T,G,C,C,A,G,C,A
2308,G,C,C,G,C,C,G,G,A,A,...,T,G,T,T,G,T,A,T,A,A


In [18]:
test_data['label']=1
test_data

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,T,C,C,A,T,A,T,C,A,G,...,C,T,C,T,T,T,G,C,C,1
1,C,C,T,G,G,A,A,T,G,A,...,C,A,A,G,G,A,C,T,G,1
2,C,G,C,A,G,A,G,T,G,T,...,A,C,A,A,G,A,G,G,A,1
3,T,A,A,T,C,T,G,C,A,T,...,A,T,C,C,C,C,T,T,A,1
4,T,G,C,T,G,T,A,A,T,C,...,T,T,T,G,A,A,C,G,G,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2305,C,T,T,T,C,T,A,A,A,C,...,C,T,G,C,C,A,G,C,G,1
2306,A,C,G,A,T,C,T,G,A,A,...,A,G,T,A,G,C,C,T,G,1
2307,T,T,A,A,A,T,A,T,C,T,...,A,T,G,C,C,A,G,C,A,1
2308,G,C,C,G,C,C,G,G,A,A,...,G,T,T,G,T,A,T,A,A,1


# Test Downstream data

In [16]:

downstream_test_d = "D:\OHE Data\Downstream data\Test Data"

# Initialize an empty list to store DataFrames from each file
downstream_test_dfs = []

# Loop through all .txt files in the directory
for filename in os.listdir(downstream_test_d):
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(downstream_test_d, filename), "r") as file:
            downstream_test_sequence = file.read()

        num_columns = 100

        # Filter out newline characters from the sequence data
        downstream_test_sequence = downstream_test_sequence.replace('\n', '')

        downstream_test_data = [list(downstream_test_sequence[i:i+num_columns]) for i in range(0, len(downstream_test_sequence), num_columns)]

        # Create a list of column names ranging from -80 to 19
        column_names = [str(i) for i in range(-80, 20)]

        # Create a DataFrame for the current file
        downstream_test_df = pd.DataFrame(downstream_test_data, columns=column_names)
        
        # Append the DataFrame to the list
        downstream_test_dfs.append(downstream_test_df)

# Concatenate all DataFrames into one
downstream_test_data = pd.concat(downstream_test_dfs, ignore_index=True)

# Now, combined_df contains the data from all .txt files in one 
downstream_test_data

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,10,11,12,13,14,15,16,17,18,19
0,G,G,A,G,G,A,A,A,C,A,...,A,T,T,A,A,C,G,C,C,A
1,C,A,C,C,G,C,A,A,A,T,...,C,G,C,A,A,A,A,T,G,C
2,A,A,T,T,C,G,G,C,C,G,...,T,C,G,G,C,A,G,C,G,G
3,C,T,T,C,C,C,T,G,A,T,...,G,C,G,C,G,C,A,G,C,A
4,C,T,G,C,A,A,A,C,C,C,...,A,C,C,A,C,C,G,G,C,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2305,C,T,G,G,G,A,C,T,T,T,...,A,C,G,A,A,T,A,G,T,C
2306,C,C,G,G,C,G,C,G,G,C,...,T,A,A,A,G,A,T,A,T,A
2307,T,A,A,T,G,A,A,T,G,T,...,T,C,A,C,T,C,A,A,C,C
2308,C,G,T,T,C,G,A,C,T,A,...,G,T,C,A,G,T,T,A,C,T


In [17]:
downstream_test_data['label']=0
downstream_test_data

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,G,G,A,G,G,A,A,A,C,A,...,T,T,A,A,C,G,C,C,A,0
1,C,A,C,C,G,C,A,A,A,T,...,G,C,A,A,A,A,T,G,C,0
2,A,A,T,T,C,G,G,C,C,G,...,C,G,G,C,A,G,C,G,G,0
3,C,T,T,C,C,C,T,G,A,T,...,C,G,C,G,C,A,G,C,A,0
4,C,T,G,C,A,A,A,C,C,C,...,C,C,A,C,C,G,G,C,G,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2305,C,T,G,G,G,A,C,T,T,T,...,C,G,A,A,T,A,G,T,C,0
2306,C,C,G,G,C,G,C,G,G,C,...,A,A,A,G,A,T,A,T,A,0
2307,T,A,A,T,G,A,A,T,G,T,...,C,A,C,T,C,A,A,C,C,0
2308,C,G,T,T,C,G,A,C,T,A,...,T,C,A,G,T,T,A,C,T,0


In [19]:
# Concatenate the two DataFrames vertically
test = pd.concat([test_data, downstream_test_data], ignore_index=True)
test

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,T,C,C,A,T,A,T,C,A,G,...,C,T,C,T,T,T,G,C,C,1
1,C,C,T,G,G,A,A,T,G,A,...,C,A,A,G,G,A,C,T,G,1
2,C,G,C,A,G,A,G,T,G,T,...,A,C,A,A,G,A,G,G,A,1
3,T,A,A,T,C,T,G,C,A,T,...,A,T,C,C,C,C,T,T,A,1
4,T,G,C,T,G,T,A,A,T,C,...,T,T,T,G,A,A,C,G,G,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4615,C,T,G,G,G,A,C,T,T,T,...,C,G,A,A,T,A,G,T,C,0
4616,C,C,G,G,C,G,C,G,G,C,...,A,A,A,G,A,T,A,T,A,0
4617,T,A,A,T,G,A,A,T,G,T,...,C,A,C,T,C,A,A,C,C,0
4618,C,G,T,T,C,G,A,C,T,A,...,T,C,A,G,T,T,A,C,T,0


In [22]:
# Extract features from the new dataset (assuming the features are in the same format as the training data)
X_new = test.drop('label', axis=1)

# Use the same encoder that was fit on the training data to one-hot encode the features
X_encoded_new = encoder.transform(X_new)

# Use the trained XGBoost model for prediction
y_pred_new = pipeline.predict(X_encoded_new)

In [23]:
print(y_pred_new)

[1 1 1 ... 0 0 0]


In [24]:

y_true_new = test['label']  # Replace 'new_data' with the actual variable containing your new data

# Calculate accuracy
accuracy_new = accuracy_score(y_true_new, y_pred_new)

# Calculate precision
precision_new = precision_score(y_true_new, y_pred_new)

# Calculate recall
recall_new = recall_score(y_true_new, y_pred_new)

# Calculate F1 score
f1_score_new = f1_score(y_true_new, y_pred_new)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_new = matthews_corrcoef(y_true_new, y_pred_new)

# Calculate specificity
tn_new, fp_new, fn_new, tp_new = confusion_matrix(y_true_new, y_pred_new).ravel()
specificity_new = tn_new / (tn_new + fp_new)

# Print the results
print("Accuracy on New Data:", accuracy_new)
print("Precision on New Data:", precision_new)
print("Recall on New Data:", recall_new)
print("F1 Score on New Data:", f1_score_new)
print("MCC on New Data:", mcc_new)
print("Specificity on New Data:", specificity_new)


Accuracy on New Data: 0.9300865800865801
Precision on New Data: 0.9207962727657772
Recall on New Data: 0.9411255411255411
F1 Score on New Data: 0.9308499250695782
MCC on New Data: 0.8603828759420669
Specificity on New Data: 0.919047619047619


# Klebsiella

In [18]:
with open("D:\OHE Data\Promoter data\Test Data\Sequences_80-20_sequences Klebsiella pneumoniae.txt", "r") as file:
    Klebsiella_seq = file.read()

num_columns = 100

# Filter out newline characters from the sequence data
Klebsiella_seq = Klebsiella_seq.replace('\n', '')

Klebsiella_data = [list(Klebsiella_seq[i:i+num_columns]) for i in range(0, len(Klebsiella_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Klebsiella_df = pd.DataFrame(Klebsiella_data, columns=column_names)
Klebsiella_df


Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,10,11,12,13,14,15,16,17,18,19
0,T,C,C,A,T,A,T,C,A,G,...,C,C,T,C,T,T,T,G,C,C
1,C,C,T,G,G,A,A,T,G,A,...,G,C,A,A,G,G,A,C,T,G
2,C,G,C,A,G,A,G,T,G,T,...,G,A,C,A,A,G,A,G,G,A
3,T,A,A,T,C,T,G,C,A,T,...,C,A,T,C,C,C,C,T,T,A
4,T,G,C,T,G,T,A,A,T,C,...,G,T,T,T,G,A,A,C,G,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1324,C,C,T,G,A,C,C,T,A,C,...,T,G,T,T,T,A,A,G,G,A
1325,T,C,A,G,G,C,T,T,T,G,...,A,T,G,A,T,T,T,C,A,C
1326,C,A,G,G,T,T,G,T,T,T,...,T,C,A,G,G,T,C,A,A,A
1327,T,A,C,A,G,T,T,T,T,T,...,T,G,G,C,T,A,T,T,C,C


In [19]:
Klebsiella_df['label']=1
Klebsiella_df

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,T,C,C,A,T,A,T,C,A,G,...,C,T,C,T,T,T,G,C,C,1
1,C,C,T,G,G,A,A,T,G,A,...,C,A,A,G,G,A,C,T,G,1
2,C,G,C,A,G,A,G,T,G,T,...,A,C,A,A,G,A,G,G,A,1
3,T,A,A,T,C,T,G,C,A,T,...,A,T,C,C,C,C,T,T,A,1
4,T,G,C,T,G,T,A,A,T,C,...,T,T,T,G,A,A,C,G,G,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1324,C,C,T,G,A,C,C,T,A,C,...,G,T,T,T,A,A,G,G,A,1
1325,T,C,A,G,G,C,T,T,T,G,...,T,G,A,T,T,T,C,A,C,1
1326,C,A,G,G,T,T,G,T,T,T,...,C,A,G,G,T,C,A,A,A,1
1327,T,A,C,A,G,T,T,T,T,T,...,G,G,C,T,A,T,T,C,C,1


In [20]:
with open("D:\OHE Data\Downstream data\Test Data\Klebsiella pneumoniae.txt", "r") as file:
    Klebsiella_downstream_seq = file.read()

num_columns = 100

# Filter out newline characters from the sequence data
Klebsiella_downstream_seq = Klebsiella_downstream_seq.replace('\n', '')

Klebsiella_downstream_data = [list(Klebsiella_downstream_seq[i:i+num_columns]) for i in range(0, len(Klebsiella_downstream_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Klebsiella_downstream_df = pd.DataFrame(Klebsiella_downstream_data, columns=column_names)
Klebsiella_downstream_df


Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,10,11,12,13,14,15,16,17,18,19
0,G,G,A,G,G,A,A,A,C,A,...,A,T,T,A,A,C,G,C,C,A
1,C,A,C,C,G,C,A,A,A,T,...,C,G,C,A,A,A,A,T,G,C
2,A,A,T,T,C,G,G,C,C,G,...,T,C,G,G,C,A,G,C,G,G
3,C,T,T,C,C,C,T,G,A,T,...,G,C,G,C,G,C,A,G,C,A
4,C,T,G,C,A,A,A,C,C,C,...,A,C,C,A,C,C,G,G,C,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1324,G,A,A,T,G,T,T,T,A,A,...,C,C,C,G,T,C,C,A,A,T
1325,C,G,G,G,T,C,C,A,G,A,...,C,A,T,G,T,T,A,C,G,C
1326,G,C,A,A,C,A,T,G,A,T,...,T,C,C,A,G,T,G,G,C,T
1327,A,G,C,G,G,T,A,T,T,A,...,G,A,G,C,G,T,T,A,A,T


In [21]:
Klebsiella_downstream_df['label']=0
Klebsiella_downstream_df

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,G,G,A,G,G,A,A,A,C,A,...,T,T,A,A,C,G,C,C,A,0
1,C,A,C,C,G,C,A,A,A,T,...,G,C,A,A,A,A,T,G,C,0
2,A,A,T,T,C,G,G,C,C,G,...,C,G,G,C,A,G,C,G,G,0
3,C,T,T,C,C,C,T,G,A,T,...,C,G,C,G,C,A,G,C,A,0
4,C,T,G,C,A,A,A,C,C,C,...,C,C,A,C,C,G,G,C,G,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1324,G,A,A,T,G,T,T,T,A,A,...,C,C,G,T,C,C,A,A,T,0
1325,C,G,G,G,T,C,C,A,G,A,...,A,T,G,T,T,A,C,G,C,0
1326,G,C,A,A,C,A,T,G,A,T,...,C,C,A,G,T,G,G,C,T,0
1327,A,G,C,G,G,T,A,T,T,A,...,A,G,C,G,T,T,A,A,T,0


In [22]:
# Concatenate the two DataFrames vertically
Klebsiella = pd.concat([Klebsiella_df, Klebsiella_downstream_df], ignore_index=True)
Klebsiella

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,T,C,C,A,T,A,T,C,A,G,...,C,T,C,T,T,T,G,C,C,1
1,C,C,T,G,G,A,A,T,G,A,...,C,A,A,G,G,A,C,T,G,1
2,C,G,C,A,G,A,G,T,G,T,...,A,C,A,A,G,A,G,G,A,1
3,T,A,A,T,C,T,G,C,A,T,...,A,T,C,C,C,C,T,T,A,1
4,T,G,C,T,G,T,A,A,T,C,...,T,T,T,G,A,A,C,G,G,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2653,G,A,A,T,G,T,T,T,A,A,...,C,C,G,T,C,C,A,A,T,0
2654,C,G,G,G,T,C,C,A,G,A,...,A,T,G,T,T,A,C,G,C,0
2655,G,C,A,A,C,A,T,G,A,T,...,C,C,A,G,T,G,G,C,T,0
2656,A,G,C,G,G,T,A,T,T,A,...,A,G,C,G,T,T,A,A,T,0


In [25]:
# Extract features from the new dataset (assuming the features are in the same format as the training data)
X_Kleb = Klebsiella.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Kleb = pipeline.predict(X_Kleb)

In [27]:
y_pred_Kleb

array([1, 1, 1, ..., 0, 1, 0])

In [30]:

y_true_Kleb = Klebsiella['label']  # Replace 'new_data' with the actual variable containing your new data

# Calculate accuracy
accuracy_new = accuracy_score(y_true_Kleb, y_pred_Kleb)

# Calculate precision
precision_new = precision_score(y_true_Kleb, y_pred_Kleb)

# Calculate recall
recall_new = recall_score(y_true_Kleb, y_pred_Kleb)

# Calculate F1 score
f1_score_new = f1_score(y_true_Kleb, y_pred_Kleb)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_new = matthews_corrcoef(y_true_Kleb, y_pred_Kleb)

# Calculate specificity
tn_new, fp_new, fn_new, tp_new = confusion_matrix(y_true_Kleb, y_pred_Kleb).ravel()
specificity_new = tn_new / (tn_new + fp_new)

# Print the results
print("Accuracy on Klebsiell Data:", accuracy_new)
print("Precision on Klebsiell Data:", precision_new)
print("Recall on Klebsiell Data:", recall_new)
print("F1 Score on Klebsiell Data:", f1_score_new)
print("MCC on Klebsiell Data:", mcc_new)
print("Specificity on Klebsiella Data:", specificity_new)


Accuracy on Klebsiell Data: 0.9273890142964635
Precision on Klebsiell Data: 0.9264264264264265
Recall on Klebsiell Data: 0.928517682468021
F1 Score on Klebsiell Data: 0.9274708756106728
MCC on Klebsiell Data: 0.8547802063907637
Specificity on Klebsiella Data: 0.9262603461249059


# Salmonella

In [31]:
with open("D:\OHE Data\Promoter data\Test Data\Sequences_80-20_sequences Salmonella enterica.txt", "r") as file:
    Salmonella_seq = file.read()

num_columns = 100

# Filter out newline characters from the sequence data
Salmonella_seq = Salmonella_seq.replace('\n', '')

Salmonella_data = [list(Salmonella_seq[i:i+num_columns]) for i in range(0, len(Salmonella_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Salmonella_df = pd.DataFrame(Salmonella_data, columns=column_names)
Salmonella_df


Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,10,11,12,13,14,15,16,17,18,19
0,G,C,C,C,T,T,G,T,G,C,...,C,T,G,A,C,A,A,G,G,T
1,G,C,A,G,A,T,A,C,G,C,...,A,C,A,T,C,C,C,C,T,A
2,T,T,A,A,C,G,C,G,T,G,...,G,T,T,T,A,A,A,C,G,G
3,G,G,A,A,A,A,C,C,A,G,...,C,T,G,A,A,T,C,G,T,T
4,T,T,T,T,C,C,A,T,C,G,...,G,G,T,G,A,G,T,C,T,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,C,T,T,T,C,T,A,A,A,C,...,C,C,T,G,C,C,A,G,C,G
977,A,C,G,A,T,C,T,G,A,A,...,G,A,G,T,A,G,C,C,T,G
978,T,T,A,A,A,T,A,T,C,T,...,G,A,T,G,C,C,A,G,C,A
979,G,C,C,G,C,C,G,G,A,A,...,T,G,T,T,G,T,A,T,A,A


In [32]:
Salmonella_df['label']=1
Salmonella_df

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,G,C,C,C,T,T,G,T,G,C,...,T,G,A,C,A,A,G,G,T,1
1,G,C,A,G,A,T,A,C,G,C,...,C,A,T,C,C,C,C,T,A,1
2,T,T,A,A,C,G,C,G,T,G,...,T,T,T,A,A,A,C,G,G,1
3,G,G,A,A,A,A,C,C,A,G,...,T,G,A,A,T,C,G,T,T,1
4,T,T,T,T,C,C,A,T,C,G,...,G,T,G,A,G,T,C,T,G,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,C,T,T,T,C,T,A,A,A,C,...,C,T,G,C,C,A,G,C,G,1
977,A,C,G,A,T,C,T,G,A,A,...,A,G,T,A,G,C,C,T,G,1
978,T,T,A,A,A,T,A,T,C,T,...,A,T,G,C,C,A,G,C,A,1
979,G,C,C,G,C,C,G,G,A,A,...,G,T,T,G,T,A,T,A,A,1


In [33]:
with open("D:\OHE Data\Downstream data\Test Data\Salmonella enterica.txt", "r") as file:
    Salmonella_downstream_seq = file.read()

num_columns = 100

# Filter out newline characters from the sequence data
Salmonella_downstream_seq = Salmonella_downstream_seq.replace('\n', '')

Salmonella_downstream_data = [list(Salmonella_downstream_seq[i:i+num_columns]) for i in range(0, len(Salmonella_downstream_seq), num_columns)]

# Create a list of column names ranging from -80 to 19
column_names = [str(i) for i in range(-80, 20)]

Salmonella_downstream_df = pd.DataFrame(Salmonella_downstream_data, columns=column_names)
Salmonella_downstream_df


Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,10,11,12,13,14,15,16,17,18,19
0,G,C,T,C,G,G,C,G,C,A,...,T,A,G,C,C,C,G,C,A,G
1,A,C,C,C,T,T,C,T,C,T,...,A,C,C,G,C,G,C,G,C,A
2,G,T,C,C,A,A,C,G,G,C,...,A,C,C,A,C,T,G,G,C,G
3,G,A,C,C,A,T,A,A,A,A,...,A,T,T,C,A,A,C,T,C,A
4,C,A,A,C,T,C,T,T,G,T,...,T,A,C,C,C,A,G,G,A,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,C,T,G,G,G,A,C,T,T,T,...,A,C,G,A,A,T,A,G,T,C
977,C,C,G,G,C,G,C,G,G,C,...,T,A,A,A,G,A,T,A,T,A
978,T,A,A,T,G,A,A,T,G,T,...,T,C,A,C,T,C,A,A,C,C
979,C,G,T,T,C,G,A,C,T,A,...,G,T,C,A,G,T,T,A,C,T


In [34]:
Salmonella_downstream_df['label']=0
Salmonella_downstream_df

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,G,C,T,C,G,G,C,G,C,A,...,A,G,C,C,C,G,C,A,G,0
1,A,C,C,C,T,T,C,T,C,T,...,C,C,G,C,G,C,G,C,A,0
2,G,T,C,C,A,A,C,G,G,C,...,C,C,A,C,T,G,G,C,G,0
3,G,A,C,C,A,T,A,A,A,A,...,T,T,C,A,A,C,T,C,A,0
4,C,A,A,C,T,C,T,T,G,T,...,A,C,C,C,A,G,G,A,T,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,C,T,G,G,G,A,C,T,T,T,...,C,G,A,A,T,A,G,T,C,0
977,C,C,G,G,C,G,C,G,G,C,...,A,A,A,G,A,T,A,T,A,0
978,T,A,A,T,G,A,A,T,G,T,...,C,A,C,T,C,A,A,C,C,0
979,C,G,T,T,C,G,A,C,T,A,...,T,C,A,G,T,T,A,C,T,0


In [35]:
Salmonella= pd.concat([Salmonella_df, Salmonella_downstream_df], ignore_index=True)
Salmonella

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,G,C,C,C,T,T,G,T,G,C,...,T,G,A,C,A,A,G,G,T,1
1,G,C,A,G,A,T,A,C,G,C,...,C,A,T,C,C,C,C,T,A,1
2,T,T,A,A,C,G,C,G,T,G,...,T,T,T,A,A,A,C,G,G,1
3,G,G,A,A,A,A,C,C,A,G,...,T,G,A,A,T,C,G,T,T,1
4,T,T,T,T,C,C,A,T,C,G,...,G,T,G,A,G,T,C,T,G,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1957,C,T,G,G,G,A,C,T,T,T,...,C,G,A,A,T,A,G,T,C,0
1958,C,C,G,G,C,G,C,G,G,C,...,A,A,A,G,A,T,A,T,A,0
1959,T,A,A,T,G,A,A,T,G,T,...,C,A,C,T,C,A,A,C,C,0
1960,C,G,T,T,C,G,A,C,T,A,...,T,C,A,G,T,T,A,C,T,0


In [36]:
# Extract features from the new dataset (assuming the features are in the same format as the training data)
X_Sal = Salmonella.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_Sal = pipeline.predict(X_Sal)

In [37]:
y_true_Sal = Salmonella['label']  # Replace 'new_data' with the actual variable containing your new data

# Calculate accuracy
accuracy_new = accuracy_score(y_true_Sal, y_pred_Sal)

# Calculate precision
precision_new = precision_score(y_true_Sal, y_pred_Sal)

# Calculate recall
recall_new = recall_score(y_true_Sal, y_pred_Sal)

# Calculate F1 score
f1_score_new = f1_score(y_true_Sal, y_pred_Sal)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_new = matthews_corrcoef(y_true_Sal, y_pred_Sal)

# Calculate specificity
tn_new, fp_new, fn_new, tp_new = confusion_matrix(y_true_Sal, y_pred_Sal).ravel()
specificity_new = tn_new / (tn_new + fp_new)

# Print the results
print("Accuracy on Salmonella Data:", accuracy_new)
print("Precision on Salmonella Data:", precision_new)
print("Recall on Salmonella Data:", recall_new)
print("F1 Score on Salmonella Data:", f1_score_new)
print("MCC on Salmonella Data:", mcc_new)
print("Specificity on Salmonella Data:", specificity_new)


Accuracy on Salmonella Data: 0.9327217125382263
Precision on Salmonella Data: 0.9054441260744985
Recall on Salmonella Data: 0.9663608562691132
F1 Score on Salmonella Data: 0.9349112426035503
MCC on Salmonella Data: 0.8674087571577701
Specificity on Salmonella Data: 0.8990825688073395


In [None]:
X_promoters= promoter.drop(columns=['label']) #X.loc[:14668]
X_promoters

## Antisense Promoters with Downstream

###### Bacteria Names
- Chlamydia pneumoniae
- Corynebacterium glutamicum
- Helicobactor pylori
- Mycobacterium tuberculosis
- Nostoc sp
- Pseudomonas aeruginosa
- Salmonella enterica
- Streptomyces coelicolor
- Synechocystis sp

In [11]:
antisense_d = "D:\Other promoters\Sequence 80-20\Antisense"

# Initialize an empty list to store DataFrames from each file
antisense_dfs = []

# Loop through all .txt files in the directory
for filename in os.listdir(antisense_d):
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(antisense_d, filename), "r") as file:
            antisense_sequence = file.read()

        num_columns = 100

        # Filter out newline characters from the sequence data
        antisense_sequence = antisense_sequence.replace('\n', '')

        antisense_data = [list(antisense_sequence[i:i+num_columns]) for i in range(0, len(antisense_sequence), num_columns)]

        # Create a list of column names ranging from -80 to 19
        column_names = [str(i) for i in range(-80, 20)]

        # Create a DataFrame for the current file
        antisense_df = pd.DataFrame(antisense_data, columns=column_names)
        
        # Append the DataFrame to the list
        antisense_dfs.append(antisense_df)

# Concatenate all DataFrames into one
antisense_data = pd.concat(antisense_dfs, ignore_index=True)

# Now, combined_df contains the data from all .txt files in one 
antisense_data


Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,10,11,12,13,14,15,16,17,18,19
0,T,A,G,G,A,T,T,C,T,T,...,C,C,A,A,A,A,G,A,A,G
1,G,A,A,G,C,G,A,T,A,A,...,A,A,G,C,A,T,C,C,A,A
2,T,T,C,A,T,G,A,T,A,T,...,A,G,C,T,C,G,T,A,A,A
3,T,C,T,A,G,A,A,G,C,T,...,T,G,G,A,G,A,A,A,T,G
4,A,G,G,T,A,A,A,T,A,C,...,C,A,G,G,C,C,C,T,C,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6079,G,A,C,A,A,A,A,A,C,T,...,C,A,A,G,A,A,G,G,G,A
6080,G,T,G,A,A,T,T,T,C,C,...,G,A,T,T,T,T,C,T,C,C
6081,A,G,C,T,C,A,T,A,G,A,...,A,A,A,G,G,A,A,A,G,A
6082,C,A,C,A,T,A,G,T,T,C,...,C,G,A,C,C,A,G,T,A,A


In [12]:
antisense_data['label']=1
antisense_data

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,T,A,G,G,A,T,T,C,T,T,...,C,A,A,A,A,G,A,A,G,1
1,G,A,A,G,C,G,A,T,A,A,...,A,G,C,A,T,C,C,A,A,1
2,T,T,C,A,T,G,A,T,A,T,...,G,C,T,C,G,T,A,A,A,1
3,T,C,T,A,G,A,A,G,C,T,...,G,G,A,G,A,A,A,T,G,1
4,A,G,G,T,A,A,A,T,A,C,...,A,G,G,C,C,C,T,C,T,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6079,G,A,C,A,A,A,A,A,C,T,...,A,A,G,A,A,G,G,G,A,1
6080,G,T,G,A,A,T,T,T,C,C,...,A,T,T,T,T,C,T,C,C,1
6081,A,G,C,T,C,A,T,A,G,A,...,A,A,G,G,A,A,A,G,A,1
6082,C,A,C,A,T,A,G,T,T,C,...,G,A,C,C,A,G,T,A,A,1


In [15]:
downstream_antisense_d = "D:\Other promoters\Downstream sequences\Antisense" 

# Initialize an empty list to store DataFrames from each file
downstream_antisense_dfs = []  

# Loop through all .txt files in the directory
for filename in os.listdir(downstream_antisense_d):  
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(downstream_antisense_d, filename), "r") as file:  
            downstream_antisense_sequence = file.read()  

        num_columns = 100

        # Filter out newline characters from the sequence data
        downstream_antisense_sequence = downstream_antisense_sequence.replace('\n', '')  

        downstream_antisense_data = [list(downstream_antisense_sequence[i:i+num_columns]) for i in range(0, len(downstream_antisense_sequence), num_columns)]  # Updated variable name

        # Create a list of column names ranging from -80 to 19
        column_names = [str(i) for i in range(-80, 20)]

        # Create a DataFrame for the current file
        downstream_antisense_df = pd.DataFrame(downstream_antisense_data, columns=column_names)  # Updated variable name

        # Append the DataFrame to the list
        downstream_antisense_dfs.append(downstream_antisense_df)  

# Concatenate all DataFrames into one
downstream_antisense_data = pd.concat(downstream_antisense_dfs, ignore_index=True)  

# Now, combined_df contains the data from all .txt files in one 
downstream_antisense_data


Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,10,11,12,13,14,15,16,17,18,19
0,T,C,G,A,C,A,A,T,C,T,...,T,A,T,C,G,T,T,G,G,T
1,A,A,C,A,A,T,G,A,T,G,...,A,A,C,A,A,T,A,A,A,G
2,A,A,A,G,T,C,C,C,T,G,...,T,T,A,G,G,G,T,T,G,C
3,G,T,G,T,C,G,G,G,A,G,...,A,T,C,G,G,C,G,C,C,A
4,A,T,C,T,T,T,C,A,G,T,...,T,G,C,T,T,T,T,T,G,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6079,C,G,T,T,T,C,C,C,C,T,...,T,T,C,A,T,C,G,T,C,C
6080,T,T,C,T,T,C,G,G,T,G,...,A,C,C,G,G,A,G,C,G,A
6081,C,T,G,A,T,G,G,G,G,A,...,A,C,T,A,A,C,A,G,C,C
6082,A,A,G,A,T,T,G,C,G,G,...,G,G,G,A,C,T,T,T,A,G


In [16]:
downstream_antisense_data['label']=0
downstream_antisense_data

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,T,C,G,A,C,A,A,T,C,T,...,A,T,C,G,T,T,G,G,T,0
1,A,A,C,A,A,T,G,A,T,G,...,A,C,A,A,T,A,A,A,G,0
2,A,A,A,G,T,C,C,C,T,G,...,T,A,G,G,G,T,T,G,C,0
3,G,T,G,T,C,G,G,G,A,G,...,T,C,G,G,C,G,C,C,A,0
4,A,T,C,T,T,T,C,A,G,T,...,G,C,T,T,T,T,T,G,C,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6079,C,G,T,T,T,C,C,C,C,T,...,T,C,A,T,C,G,T,C,C,0
6080,T,T,C,T,T,C,G,G,T,G,...,C,C,G,G,A,G,C,G,A,0
6081,C,T,G,A,T,G,G,G,G,A,...,C,T,A,A,C,A,G,C,C,0
6082,A,A,G,A,T,T,G,C,G,G,...,G,G,A,C,T,T,T,A,G,0


In [17]:
# Concatenate the two DataFrames vertically
antisense = pd.concat([antisense_data, downstream_antisense_data], ignore_index=True)
antisense

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,T,A,G,G,A,T,T,C,T,T,...,C,A,A,A,A,G,A,A,G,1
1,G,A,A,G,C,G,A,T,A,A,...,A,G,C,A,T,C,C,A,A,1
2,T,T,C,A,T,G,A,T,A,T,...,G,C,T,C,G,T,A,A,A,1
3,T,C,T,A,G,A,A,G,C,T,...,G,G,A,G,A,A,A,T,G,1
4,A,G,G,T,A,A,A,T,A,C,...,A,G,G,C,C,C,T,C,T,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12163,C,G,T,T,T,C,C,C,C,T,...,T,C,A,T,C,G,T,C,C,0
12164,T,T,C,T,T,C,G,G,T,G,...,C,C,G,G,A,G,C,G,A,0
12165,C,T,G,A,T,G,G,G,G,A,...,C,T,A,A,C,A,G,C,C,0
12166,A,A,G,A,T,T,G,C,G,G,...,G,G,A,C,T,T,T,A,G,0


In [20]:
X_anti = antisense.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_antisense = pipeline.predict(X_anti)

In [22]:
y_true_antisense = antisense['label']  

# Calculate accuracy
accuracy_anti = accuracy_score(y_true_antisense, y_pred_antisense) 

# Calculate precision
precision_anti = precision_score(y_true_antisense, y_pred_antisense) 

# Calculate recall
recall_anti = recall_score(y_true_antisense, y_pred_antisense)  

# Calculate F1 score
f1_score_anti = f1_score(y_true_antisense, y_pred_antisense)  

# Calculate MCC (Matthews Correlation Coefficient)
mcc_anti = matthews_corrcoef(y_true_antisense, y_pred_antisense)  

# Calculate specificity
tn_anti, fp_anti, fn_anti, tp_anti = confusion_matrix(y_true_antisense, y_pred_antisense).ravel()  
specificity_anti = tn_anti / (tn_anti + fp_anti)

# Print the results
print("Accuracy on Antisense Data:", accuracy_anti)
print("Precision on Antisense Data:", precision_anti)
print("Recall on Antisense Data:", recall_anti)
print("F1 Score on Antisense Data:", f1_score_anti)
print("MCC on Antisense Data:", mcc_anti)
print("Specificity on Antisense Data:", specificity_anti)


Accuracy on Antisense Data: 0.8375246548323472
Precision on Antisense Data: 0.8408298755186722
Recall on Antisense Data: 0.8326758711374096
F1 Score on Antisense Data: 0.8367330085060698
MCC on Antisense Data: 0.6750810536717758
Specificity on Antisense Data: 0.8423734385272846


## Internal Promoter with downstream

In [25]:
internal_d = "D:\Other promoters\Sequence 80-20\Induced"

# Initialize an empty list to store DataFrames from each file
internal_dfs = []

# Loop through all .txt files in the directory
for filename in os.listdir(internal_d):
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(internal_d, filename), "r") as file:
            internal_sequence = file.read()

        num_columns = 100

        # Filter out newline characters from the sequence data
        internal_sequence = internal_sequence.replace('\n', '')

        internal_data = [list(internal_sequence[i:i+num_columns]) for i in range(0, len(internal_sequence), num_columns)]

        # Create a list of column names ranging from -80 to 19
        column_names = [str(i) for i in range(-80, 20)]

        # Create a DataFrame for the current file
        internal_df = pd.DataFrame(internal_data, columns=column_names)
        
        # Append the DataFrame to the list
        internal_dfs.append(internal_df)

# Concatenate all DataFrames into one
internal_data = pd.concat(internal_dfs, ignore_index=True)

# Now, combined_df contains the data from all .txt files in one 
internal_data

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,10,11,12,13,14,15,16,17,18,19
0,C,T,C,G,T,A,C,G,G,G,...,A,A,G,A,A,A,A,T,C,A
1,A,T,C,A,A,A,T,T,T,A,...,C,A,G,C,T,T,A,T,G,C
2,G,G,G,C,C,A,T,G,C,A,...,A,G,C,A,A,T,T,G,C,T
3,A,C,A,A,C,T,G,A,T,T,...,A,A,A,C,G,A,A,A,T,C
4,C,T,C,T,T,T,A,A,A,T,...,A,C,A,A,G,A,A,A,A,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6060,G,C,T,G,G,C,C,A,C,C,...,C,A,A,G,A,A,C,G,T,C
6061,G,G,A,G,G,C,G,A,T,C,...,C,C,G,C,A,G,A,C,T,C
6062,C,T,G,G,G,C,A,T,G,T,...,C,C,G,G,G,C,G,C,G,A
6063,A,C,T,C,A,C,G,C,T,C,...,C,A,T,C,G,T,A,C,C,G


In [26]:
internal_data['label'] = 1
internal_data

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,C,T,C,G,T,A,C,G,G,G,...,A,G,A,A,A,A,T,C,A,1
1,A,T,C,A,A,A,T,T,T,A,...,A,G,C,T,T,A,T,G,C,1
2,G,G,G,C,C,A,T,G,C,A,...,G,C,A,A,T,T,G,C,T,1
3,A,C,A,A,C,T,G,A,T,T,...,A,A,C,G,A,A,A,T,C,1
4,C,T,C,T,T,T,A,A,A,T,...,C,A,A,G,A,A,A,A,T,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6060,G,C,T,G,G,C,C,A,C,C,...,A,A,G,A,A,C,G,T,C,1
6061,G,G,A,G,G,C,G,A,T,C,...,C,G,C,A,G,A,C,T,C,1
6062,C,T,G,G,G,C,A,T,G,T,...,C,G,G,G,C,G,C,G,A,1
6063,A,C,T,C,A,C,G,C,T,C,...,A,T,C,G,T,A,C,C,G,1


In [28]:
downstream_internal_d = "D:\Other promoters\Downstream sequences\Induced"

# Initialize an empty list to store DataFrames from each file
downstream_internal_dfs = []

# Loop through all .txt files in the directory
for filename in os.listdir(downstream_internal_d):
    if filename.endswith(".txt"):
        # Read the current file
        with open(os.path.join(downstream_internal_d, filename), "r") as file:
            downstream_internal_sequence = file.read()

        num_columns = 100

        # Filter out newline characters from the sequence data
        downstream_internal_sequence = downstream_internal_sequence.replace('\n', '')

        downstream_internal_data = [list(downstream_internal_sequence[i:i+num_columns]) for i in range(0, len(downstream_internal_sequence), num_columns)]

        # Create a list of column names ranging from -80 to 19
        column_names = [str(i) for i in range(-80, 20)]

        # Create a DataFrame for the current file
        downstream_internal_df = pd.DataFrame(downstream_internal_data, columns=column_names)

        # Append the DataFrame to the list
        downstream_internal_dfs.append(downstream_internal_df)

# Concatenate all DataFrames into one
downstream_internal_data = pd.concat(downstream_internal_dfs, ignore_index=True)

# Now, combined_df contains the data from all .txt files in one 
downstream_internal_data

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,10,11,12,13,14,15,16,17,18,19
0,A,A,A,G,A,A,G,A,G,C,...,A,T,C,A,T,T,C,T,A,T
1,C,C,A,C,A,A,C,C,G,T,...,T,C,C,C,T,G,C,C,T,T
2,C,C,G,C,T,T,T,T,T,T,...,G,A,T,C,T,T,T,A,T,G
3,T,C,T,C,T,T,T,A,C,G,...,C,C,G,A,T,T,C,A,G,A
4,A,A,T,T,G,T,T,G,A,C,...,A,T,C,T,C,G,T,A,T,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6060,G,G,C,C,C,A,C,G,G,C,...,C,A,A,C,T,C,C,C,G,G
6061,C,C,G,A,C,G,C,A,A,G,...,G,G,C,G,A,A,C,C,G,C
6062,A,A,G,A,A,G,A,C,C,G,...,C,A,C,G,C,C,A,C,C,A
6063,G,C,C,G,A,A,G,A,C,C,...,G,C,G,G,A,T,C,G,G,C


In [29]:
downstream_internal_data['label'] = 0
downstream_internal_data

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,A,A,A,G,A,A,G,A,G,C,...,T,C,A,T,T,C,T,A,T,0
1,C,C,A,C,A,A,C,C,G,T,...,C,C,C,T,G,C,C,T,T,0
2,C,C,G,C,T,T,T,T,T,T,...,A,T,C,T,T,T,A,T,G,0
3,T,C,T,C,T,T,T,A,C,G,...,C,G,A,T,T,C,A,G,A,0
4,A,A,T,T,G,T,T,G,A,C,...,T,C,T,C,G,T,A,T,T,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6060,G,G,C,C,C,A,C,G,G,C,...,A,A,C,T,C,C,C,G,G,0
6061,C,C,G,A,C,G,C,A,A,G,...,G,C,G,A,A,C,C,G,C,0
6062,A,A,G,A,A,G,A,C,C,G,...,A,C,G,C,C,A,C,C,A,0
6063,G,C,C,G,A,A,G,A,C,C,...,C,G,G,A,T,C,G,G,C,0


In [30]:
# Concatenate the two DataFrames vertically
internal = pd.concat([internal_data, downstream_internal_data], ignore_index=True)
internal

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,11,12,13,14,15,16,17,18,19,label
0,C,T,C,G,T,A,C,G,G,G,...,A,G,A,A,A,A,T,C,A,1
1,A,T,C,A,A,A,T,T,T,A,...,A,G,C,T,T,A,T,G,C,1
2,G,G,G,C,C,A,T,G,C,A,...,G,C,A,A,T,T,G,C,T,1
3,A,C,A,A,C,T,G,A,T,T,...,A,A,C,G,A,A,A,T,C,1
4,C,T,C,T,T,T,A,A,A,T,...,C,A,A,G,A,A,A,A,T,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12125,G,G,C,C,C,A,C,G,G,C,...,A,A,C,T,C,C,C,G,G,0
12126,C,C,G,A,C,G,C,A,A,G,...,G,C,G,A,A,C,C,G,C,0
12127,A,A,G,A,A,G,A,C,C,G,...,A,C,G,C,C,A,C,C,A,0
12128,G,C,C,G,A,A,G,A,C,C,...,C,G,G,A,T,C,G,G,C,0


In [31]:

X_internal = internal.drop('label', axis=1)

# Use the trained XGBoost model for prediction
y_pred_internal = pipeline.predict(X_internal)

y_true_internal = internal['label']


In [32]:
# Calculate accuracy
accuracy_internal = accuracy_score(y_true_internal, y_pred_internal)

# Calculate precision
precision_internal = precision_score(y_true_internal, y_pred_internal)

# Calculate recall
recall_internal = recall_score(y_true_internal, y_pred_internal)

# Calculate F1 score
f1_score_internal = f1_score(y_true_internal, y_pred_internal)

# Calculate MCC (Matthews Correlation Coefficient)
mcc_internal = matthews_corrcoef(y_true_internal, y_pred_internal)

# Calculate specificity
tn_internal, fp_internal, fn_internal, tp_internal = confusion_matrix(y_true_internal, y_pred_internal).ravel()
specificity_internal = tn_internal / (tn_internal + fp_internal)

# Print the results
print("Accuracy on Internal Data:", accuracy_internal)
print("Precision on Internal Data:", precision_internal)
print("Recall on Internal Data:", recall_internal)
print("F1 Score on Internal Data:", f1_score_internal)
print("MCC on Internal Data:", mcc_internal)
print("Specificity on Internal Data:", specificity_internal)


Accuracy on Internal Data: 0.8403132728771641
Precision on Internal Data: 0.8662171753016323
Recall on Internal Data: 0.8049464138499588
F1 Score on Internal Data: 0.8344585932826254
MCC on Internal Data: 0.6823356371058573
Specificity on Internal Data: 0.8756801319043693
