In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix
from xgboost import XGBClassifier

def load_sequence_data(file_path, label):
    """Load sequence data from a file and label it."""
    with open(file_path, "r") as file:
        sequences = file.read().split("\n")
    # Assuming each line in the file is a sequence
    data = [list(sequence) for sequence in sequences if sequence]  # Exclude any empty lines
    df = pd.DataFrame(data)
    df['label'] = label
    return df

# Load and label positive and negative data
pos_data = load_sequence_data("oripos.txt", 1)
neg_data = load_sequence_data("orineg.txt", 0)

# Combine the data
combined_data = pd.concat([pos_data, neg_data], ignore_index=True)

# Define features and target
X = combined_data.drop('label', axis=1)
y = combined_data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)

# One-hot encoding and XGBoost model pipeline

pipeline = Pipeline([
    ('encoder', OneHotEncoder(sparse=False, dtype=int, handle_unknown='ignore')),  # Set handle_unknown='ignore'
    ('model', XGBClassifier(
        colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_depth=None,
        min_child_weight=5, n_estimators=300, subsample=1.0, n_jobs=-1, random_state=101
    ))
])



# Perform KFold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=101)
results = []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    pipeline.fit(X_train_fold, y_train_fold)
    y_pred = pipeline.predict(X_val_fold)

    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred)
    recall = recall_score(y_val_fold, y_pred)
    f1 = f1_score(y_val_fold, y_pred)
    mcc = matthews_corrcoef(y_val_fold, y_pred)

    results.append([accuracy, precision, recall, f1, mcc])

# Convert results to a DataFrame for easier analysis
results_df = pd.DataFrame(results, columns=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'MCC'])

# Calculate mean and standard deviation of metrics
mean_results = results_df.mean()
std_results = results_df.std()

print("Cross-Validation Results:\n", mean_results)
print("\nStandard Deviations:\n", std_results)




Cross-Validation Results:
 Accuracy     0.761499
Precision    0.749036
Recall       0.792237
F1 Score     0.768806
MCC          0.526134
dtype: float64

Standard Deviations:
 Accuracy     0.021702
Precision    0.036836
Recall       0.039751
F1 Score     0.020419
MCC          0.043882
dtype: float64


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

def load_sequence_data(file_path, label):
    """Load sequence data from a file and label it."""
    with open(file_path, "r") as file:
        sequences = file.read().split("\n")
    data = [list(sequence) for sequence in sequences if sequence]  # Exclude any empty lines
    df = pd.DataFrame(data)
    df['label'] = label
    return df


# Load and label positive and negative data
pos_data = load_sequence_data("oripos.txt", 1)
neg_data = load_sequence_data("orineg.txt", 0)

# Combine the data
combined_data = pd.concat([pos_data, neg_data], ignore_index=True)

# Define features and target
X = combined_data.drop('label', axis=1)
y = combined_data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)


classifiers = {
    'SVM': lambda: SVC(probability=True, random_state=101),
    'Random Forest': lambda: RandomForestClassifier(n_estimators=100, random_state=101),
    'Logistic Regression': lambda: LogisticRegression(random_state=101),
    'Naive Bayes': lambda: GaussianNB(),
    'K-NN': lambda: KNeighborsClassifier(n_neighbors=5),
    'GBM': lambda: GradientBoostingClassifier(n_estimators=100, random_state=101),
    'AdaBoost': lambda: AdaBoostClassifier(n_estimators=100, random_state=101),
    'Decision Tree': lambda: DecisionTreeClassifier(random_state=101)
}

def evaluate_classifier(X_train, y_train, classifier_func):
    kf = KFold(n_splits=10, shuffle=True, random_state=101)
    results = []
    
    for train_index, val_index in kf.split(X_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
        pipeline = Pipeline([
            ('encoder', OneHotEncoder(sparse=False, dtype=int, handle_unknown='ignore')),
            ('model', classifier_func())
        ])
        
        pipeline.fit(X_train_fold, y_train_fold)
        y_pred = pipeline.predict(X_val_fold)
        
        metrics = {
            'Accuracy': accuracy_score(y_val_fold, y_pred),
            'Precision': precision_score(y_val_fold, y_pred),
            'Recall': recall_score(y_val_fold, y_pred),
            'F1 Score': f1_score(y_val_fold, y_pred),
            'MCC': matthews_corrcoef(y_val_fold, y_pred)
        }
        
        results.append(metrics)
    
    return results


final_results = {}

for name, classifier_func in classifiers.items():
    results = evaluate_classifier(X_train, y_train, classifier_func)
    results_df = pd.DataFrame(results)
    mean_results = results_df.mean().to_dict()
    final_results[name] = mean_results

# Print final results for all classifiers
for classifier, metrics in final_results.items():
    print(f"Results for {classifier}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    print("\n")




Results for SVM:
Accuracy: 0.6528
Precision: 0.6578
Recall: 0.6441
F1 Score: 0.6500
MCC: 0.3068


Results for Random Forest:
Accuracy: 0.5883
Precision: 0.5949
Recall: 0.5675
F1 Score: 0.5797
MCC: 0.1787


Results for Logistic Regression:
Accuracy: 0.6096
Precision: 0.6136
Recall: 0.6010
F1 Score: 0.6063
MCC: 0.2206


Results for Naive Bayes:
Accuracy: 0.6524
Precision: 0.6586
Recall: 0.6383
F1 Score: 0.6476
MCC: 0.3057


Results for K-NN:
Accuracy: 0.5846
Precision: 0.6146
Recall: 0.4648
F1 Score: 0.5279
MCC: 0.1763


Results for GBM:
Accuracy: 0.6245
Precision: 0.6302
Recall: 0.6075
F1 Score: 0.6180
MCC: 0.2497


Results for AdaBoost:
Accuracy: 0.6042
Precision: 0.6050
Recall: 0.6007
F1 Score: 0.6021
MCC: 0.2088


Results for Decision Tree:
Accuracy: 0.5245
Precision: 0.5264
Recall: 0.5177
F1 Score: 0.5209
MCC: 0.0505




In [3]:
!pip install catboost lightgbm

Collecting catboost
  Downloading catboost-1.2.3-cp311-cp311-win_amd64.whl.metadata (1.2 kB)
Collecting lightgbm
  Downloading lightgbm-4.3.0-py3-none-win_amd64.whl.metadata (19 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.3-cp311-cp311-win_amd64.whl (101.1 MB)
   ---------------------------------------- 0.0/101.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.1 MB 991.0 kB/s eta 0:01:42
   ---------------------------------------- 0.1/101.1 MB 1.7 MB/s eta 0:01:02
   ---------------------------------------- 0.2/101.1 MB 1.4 MB/s eta 0:01:12
   ---------------------------------------- 0.2/101.1 MB 1.3 MB/s eta 0:01:21
   ---------------------------------------- 0.3/101.1 MB 1.2 MB/s eta 0:01:24
   ---------------------------------------- 0.4/101.1 MB 1.4 MB/s eta 0:01:13
   ---------------------------------------- 0.4/101.1 MB 1.4 MB/s eta 0:01:13
   --------------------------------

In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Perceptron, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Define the function to load and preprocess the data
def load_sequence_data(file_path, label):
    """Load sequence data from a file and label it."""
    with open(file_path, "r") as file:
        sequences = file.read().split("\n")
    data = [list(sequence) for sequence in sequences if sequence]  # Exclude any empty lines
    df = pd.DataFrame(data)
    df['label'] = label
    return df


# Load and label positive and negative data
pos_data = load_sequence_data("oripos.txt", 1)
neg_data = load_sequence_data("orineg.txt", 0)

# Combine the data
combined_data = pd.concat([pos_data, neg_data], ignore_index=True)

# Define features and target
X = combined_data.drop('label', axis=1)
y = combined_data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)


# Define classifiers in a dictionary
classifiers = {
    'SVM': SVC(probability=True, random_state=101),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=101),
    'Logistic Regression': LogisticRegression(random_state=101),
    'Gaussian Naive Bayes': GaussianNB(),
    'Multinomial Naive Bayes': MultinomialNB(),
    'K-NN': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=101),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=101),
    'Decision Tree': DecisionTreeClassifier(random_state=101),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis(),
    'Ridge Classifier': RidgeClassifier(random_state=101),
    'Perceptron': Perceptron(random_state=101),
    'SGD Classifier': SGDClassifier(random_state=101),
    'Bagging Classifier': BaggingClassifier(base_estimator=ExtraTreeClassifier(random_state=101), n_estimators=100, random_state=101),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=101),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=101),
    'LightGBM': LGBMClassifier(random_state=101),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=101)
}


def evaluate_classifier(X_train, y_train, classifier_name, classifier):
    kf = KFold(n_splits=10, shuffle=True, random_state=101)
    results = []
    
    for train_index, val_index in kf.split(X_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
        if classifier_name == 'Multinomial Naive Bayes':
            encoder = OneHotEncoder(sparse=False, dtype=int, handle_unknown='ignore')
            X_train_fold_enc = encoder.fit_transform(X_train_fold)
            X_val_fold_enc = encoder.transform(X_val_fold)
            classifier.fit(X_train_fold_enc, y_train_fold)
            y_pred = classifier.predict(X_val_fold_enc)
        else:
            pipeline = Pipeline([
                ('encoder', OneHotEncoder(sparse=False, dtype=int, handle_unknown='ignore')),
                ('model', classifier)
            ])
            pipeline.fit(X_train_fold, y_train_fold)
            y_pred = pipeline.predict(X_val_fold)
        
        metrics = {
            'Accuracy': accuracy_score(y_val_fold, y_pred),
            'Precision': precision_score(y_val_fold, y_pred, zero_division=0),
            'Recall': recall_score(y_val_fold, y_pred, zero_division=0),
            'F1 Score': f1_score(y_val_fold, y_pred, zero_division=0),
            'MCC': matthews_corrcoef(y_val_fold, y_pred)
        }
        
        results.append(metrics)
    
    return results

# Execute evaluation for each classifier and print results
for name, clf in classifiers.items():
    print(f"Evaluating {name}...")
    results = evaluate_classifier(X_train, y_train, name, clf)
    results_df = pd.DataFrame(results)
    mean_results = results_df.mean()
    print(f"Results for {name}:")
    for metric, value in mean_results.items():
        print(f"{metric}: {value:.4f}")
    print("\n")


Evaluating SVM...




Results for SVM:
Accuracy: 0.6528
Precision: 0.6578
Recall: 0.6441
F1 Score: 0.6500
MCC: 0.3068


Evaluating Random Forest...




Results for Random Forest:
Accuracy: 0.5883
Precision: 0.5949
Recall: 0.5675
F1 Score: 0.5797
MCC: 0.1787


Evaluating Logistic Regression...




Results for Logistic Regression:
Accuracy: 0.6096
Precision: 0.6136
Recall: 0.6010
F1 Score: 0.6063
MCC: 0.2206


Evaluating Gaussian Naive Bayes...




Results for Gaussian Naive Bayes:
Accuracy: 0.6524
Precision: 0.6586
Recall: 0.6383
F1 Score: 0.6476
MCC: 0.3057


Evaluating Multinomial Naive Bayes...




Results for Multinomial Naive Bayes:
Accuracy: 0.6546
Precision: 0.6628
Recall: 0.6355
F1 Score: 0.6480
MCC: 0.3104


Evaluating K-NN...




Results for K-NN:
Accuracy: 0.5846
Precision: 0.6146
Recall: 0.4648
F1 Score: 0.5279
MCC: 0.1763


Evaluating Gradient Boosting...




Results for Gradient Boosting:
Accuracy: 0.6245
Precision: 0.6302
Recall: 0.6075
F1 Score: 0.6180
MCC: 0.2497


Evaluating AdaBoost...




Results for AdaBoost:
Accuracy: 0.6042
Precision: 0.6050
Recall: 0.6007
F1 Score: 0.6021
MCC: 0.2088


Evaluating Decision Tree...




Results for Decision Tree:
Accuracy: 0.5245
Precision: 0.5264
Recall: 0.5177
F1 Score: 0.5209
MCC: 0.0505


Evaluating Linear Discriminant Analysis...




Results for Linear Discriminant Analysis:
Accuracy: 0.6096
Precision: 0.6119
Recall: 0.6053
F1 Score: 0.6078
MCC: 0.2203


Evaluating Quadratic Discriminant Analysis...




Results for Quadratic Discriminant Analysis:
Accuracy: 0.5187
Precision: 0.5280
Recall: 0.6466
F1 Score: 0.5512
MCC: 0.0434


Evaluating Ridge Classifier...




Results for Ridge Classifier:
Accuracy: 0.6104
Precision: 0.6128
Recall: 0.6053
F1 Score: 0.6082
MCC: 0.2218


Evaluating Perceptron...




Results for Perceptron:
Accuracy: 0.5846
Precision: 0.5869
Recall: 0.6380
F1 Score: 0.5918
MCC: 0.1798


Evaluating SGD Classifier...




Results for SGD Classifier:
Accuracy: 0.6006
Precision: 0.6044
Recall: 0.5921
F1 Score: 0.5943
MCC: 0.2029


Evaluating Bagging Classifier...




Results for Bagging Classifier:
Accuracy: 0.5890
Precision: 0.5994
Recall: 0.5467
F1 Score: 0.5702
MCC: 0.1800


Evaluating Extra Trees...




Results for Extra Trees:
Accuracy: 0.6165
Precision: 0.6269
Recall: 0.5865
F1 Score: 0.6048
MCC: 0.2357


Evaluating CatBoost...




Results for CatBoost:
Accuracy: 0.6292
Precision: 0.6332
Recall: 0.6221
F1 Score: 0.6267
MCC: 0.2595


Evaluating LightGBM...




[LightGBM] [Info] Number of positive: 1244, number of negative: 1239
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021912 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1608
[LightGBM] [Info] Number of data points in the train set: 2483, number of used features: 804
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501007 -> initscore=0.004027
[LightGBM] [Info] Start training from score 0.004027




[LightGBM] [Info] Number of positive: 1245, number of negative: 1238
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022027 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1608
[LightGBM] [Info] Number of data points in the train set: 2483, number of used features: 804
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501410 -> initscore=0.005638
[LightGBM] [Info] Start training from score 0.005638




[LightGBM] [Info] Number of positive: 1244, number of negative: 1239
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024510 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1608
[LightGBM] [Info] Number of data points in the train set: 2483, number of used features: 804
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501007 -> initscore=0.004027
[LightGBM] [Info] Start training from score 0.004027




[LightGBM] [Info] Number of positive: 1243, number of negative: 1240
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020759 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1608
[LightGBM] [Info] Number of data points in the train set: 2483, number of used features: 804
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500604 -> initscore=0.002416
[LightGBM] [Info] Start training from score 0.002416




[LightGBM] [Info] Number of positive: 1240, number of negative: 1243
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022076 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1608
[LightGBM] [Info] Number of data points in the train set: 2483, number of used features: 804
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499396 -> initscore=-0.002416
[LightGBM] [Info] Start training from score -0.002416




[LightGBM] [Info] Number of positive: 1257, number of negative: 1226
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1608
[LightGBM] [Info] Number of data points in the train set: 2483, number of used features: 804
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.506242 -> initscore=0.024971
[LightGBM] [Info] Start training from score 0.024971




[LightGBM] [Info] Number of positive: 1243, number of negative: 1240
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018476 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1608
[LightGBM] [Info] Number of data points in the train set: 2483, number of used features: 804
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500604 -> initscore=0.002416
[LightGBM] [Info] Start training from score 0.002416




[LightGBM] [Info] Number of positive: 1249, number of negative: 1234
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020230 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1608
[LightGBM] [Info] Number of data points in the train set: 2483, number of used features: 804
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503021 -> initscore=0.012082
[LightGBM] [Info] Start training from score 0.012082




[LightGBM] [Info] Number of positive: 1234, number of negative: 1249
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024213 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1608
[LightGBM] [Info] Number of data points in the train set: 2483, number of used features: 804
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496979 -> initscore=-0.012082
[LightGBM] [Info] Start training from score -0.012082




[LightGBM] [Info] Number of positive: 1248, number of negative: 1236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021134 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1608
[LightGBM] [Info] Number of data points in the train set: 2484, number of used features: 804
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502415 -> initscore=0.009662
[LightGBM] [Info] Start training from score 0.009662
Results for LightGBM:
Accuracy: 0.6241
Precision: 0.6274
Recall: 0.6209
F1 Score: 0.6230
MCC: 0.2494


Evaluating XGBoost...




Results for XGBoost:
Accuracy: 0.5999
Precision: 0.6036
Recall: 0.5945
F1 Score: 0.5976
MCC: 0.2013




In [6]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Perceptron, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

def load_sequence_data(file_path, label):
    with open(file_path, "r") as file:
        sequences = file.read().split("\n")
    data = [list(sequence) for sequence in sequences if sequence]
    df = pd.DataFrame(data)
    df['label'] = label
    return df

pos_data = load_sequence_data("oripos.txt", 1)
neg_data = load_sequence_data("orineg.txt", 0)
combined_data = pd.concat([pos_data, neg_data], ignore_index=True)

encoder = OneHotEncoder(sparse=False, dtype=int, handle_unknown='ignore')
X_encoded = encoder.fit_transform(combined_data.drop('label', axis=1))
y = combined_data['label']

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.1, random_state=101)

classifiers = {
    'SVM': SVC(probability=True, random_state=101),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=101),
    'Logistic Regression': LogisticRegression(random_state=101),
    'Gaussian Naive Bayes': GaussianNB(),
    'K-NN': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=101),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=101),
    'Decision Tree': DecisionTreeClassifier(random_state=101),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis(),
    'Ridge Classifier': RidgeClassifier(random_state=101),
    'Perceptron': Perceptron(random_state=101),
    'SGD Classifier': SGDClassifier(random_state=101),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=101),
    'LightGBM': LGBMClassifier(random_state=101),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=101)
}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"Results for {name}:")
    print(f"    Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"    Precision: {precision_score(y_test, y_pred, zero_division=0):.4f}")
    print(f"    Recall: {recall_score(y_test, y_pred, zero_division=0):.4f}")
    print(f"    F1 Score: {f1_score(y_test, y_pred, zero_division=0):.4f}\n")

voting_clf = VotingClassifier(estimators=[(name, clf) for name, clf in classifiers.items()], voting='hard')
voting_clf.fit(X_train, y_train)
y_pred_vote = voting_clf.predict(X_test)

print("Voting Classifier Performance:")
print(f"    Accuracy: {accuracy_score(y_test, y_pred_vote):.4f}")
print(f"    Precision: {precision_score(y_test, y_pred_vote, zero_division=0):.4f}")
print(f"    Recall: {recall_score(y_test, y_pred_vote, zero_division=0):.4f}")
print(f"    F1 Score: {f1_score(y_test, y_pred_vote, zero_division=0):.4f}")


Results for SVM:
    Accuracy: 0.7622
    Precision: 0.7391
    Recall: 0.7933
    F1 Score: 0.7653

Results for Random Forest:
    Accuracy: 0.7134
    Precision: 0.6914
    Recall: 0.7467
    F1 Score: 0.7179



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Results for Logistic Regression:
    Accuracy: 0.7231
    Precision: 0.7211
    Recall: 0.7067
    F1 Score: 0.7138

Results for Gaussian Naive Bayes:
    Accuracy: 0.7980
    Precision: 0.7785
    Recall: 0.8200
    F1 Score: 0.7987

Results for K-NN:
    Accuracy: 0.6840
    Precision: 0.8046
    Recall: 0.4667
    F1 Score: 0.5907

Results for Gradient Boosting:
    Accuracy: 0.7068
    Precision: 0.6765
    Recall: 0.7667
    F1 Score: 0.7188

Results for AdaBoost:
    Accuracy: 0.7622
    Precision: 0.7452
    Recall: 0.7800
    F1 Score: 0.7622

Results for Decision Tree:
    Accuracy: 0.5798
    Precision: 0.5652
    Recall: 0.6067
    F1 Score: 0.5852

Results for Linear Discriminant Analysis:
    Accuracy: 0.6743
    Precision: 0.6623
    Recall: 0.6800
    F1 Score: 0.6711





Results for Quadratic Discriminant Analysis:
    Accuracy: 0.5212
    Precision: 0.5283
    Recall: 0.1867
    F1 Score: 0.2759

Results for Ridge Classifier:
    Accuracy: 0.6743
    Precision: 0.6623
    Recall: 0.6800
    F1 Score: 0.6711

Results for Perceptron:
    Accuracy: 0.7134
    Precision: 0.7246
    Recall: 0.6667
    F1 Score: 0.6944

Results for SGD Classifier:
    Accuracy: 0.7134
    Precision: 0.7153
    Recall: 0.6867
    F1 Score: 0.7007

Results for CatBoost:
    Accuracy: 0.7622
    Precision: 0.7305
    Recall: 0.8133
    F1 Score: 0.7697

[LightGBM] [Info] Number of positive: 1383, number of negative: 1376
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065721 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4008
[LightGBM] [Info] Number of data points in the train set: 2759, number of used features: 2004
[LightGB

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 1383, number of negative: 1376
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052699 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4008
[LightGBM] [Info] Number of data points in the train set: 2759, number of used features: 2004
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501269 -> initscore=0.005074
[LightGBM] [Info] Start training from score 0.005074
Voting Classifier Performance:
    Accuracy: 0.7524
    Precision: 0.7569
    Recall: 0.7267
    F1 Score: 0.7415


In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Embedding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def load_sequence_data(file_path, label):
    """Load sequence data from a file and label it."""
    with open(file_path, "r") as file:
        sequences = file.read().split("\n")
    data = [list(sequence) for sequence in sequences if sequence]  # Exclude empty lines
    df = pd.DataFrame(data)
    df['label'] = label
    return df

# Load and label the data
pos_data = load_sequence_data("oripos.txt", 1)
neg_data = load_sequence_data("orineg.txt", 0)
combined_data = pd.concat([pos_data, neg_data], ignore_index=True)

# Define features and target
X = combined_data.drop('label', axis=1)
y = combined_data['label']

# Convert sequences to integers
label_encoder = LabelEncoder()
X_encoded = X.applymap(lambda x: label_encoder.fit_transform([x])[0])

# One-hot encoding
onehot_encoder = OneHotEncoder(sparse=False, dtype=int)
X_onehot = onehot_encoder.fit_transform(X_encoded)

# Reshape data for Conv1D layer
sequence_length = X_onehot.shape[1]
X_reshaped = X_onehot.reshape((X_onehot.shape[0], sequence_length, 1))

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=0.1, random_state=101)

# Define the CNN model
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(sequence_length, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, validation_split=0.1)  # Adjust epochs, add callbacks as needed

# Evaluate the model
accuracy = model.evaluate(X_test, y_test)[1]
print(f'Test Accuracy: {accuracy*100:.2f}%')


Epoch 1/10


  super().__init__(


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.5115 - loss: 0.6977 - val_accuracy: 0.5781 - val_loss: 0.6922
Epoch 2/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4995 - loss: 0.7017 - val_accuracy: 0.4219 - val_loss: 0.7204
Epoch 3/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4806 - loss: 0.7145 - val_accuracy: 0.4219 - val_loss: 0.6977
Epoch 4/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4724 - loss: 0.7021 - val_accuracy: 0.5781 - val_loss: 0.6824
Epoch 5/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5367 - loss: 0.6945 - val_accuracy: 0.4219 - val_loss: 0.7031
Epoch 6/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5247 - loss: 0.6992 - val_accuracy: 0.4219 - val_loss: 0.6946
Epoch 7/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━

In [13]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, BatchNormalization, MaxPooling1D, LSTM, Bidirectional, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

def load_sequence_data(file_path, label):
    with open(file_path, "r") as file:
        sequences = file.read().strip().split("\n")
    data = {'sequence': sequences, 'label': [label] * len(sequences)}
    return pd.DataFrame(data)

def preprocess_sequences(sequences, padding_value='N'):
    all_characters = [char for seq in sequences for char in seq] + [padding_value]
    label_encoder = LabelEncoder()
    label_encoder.fit(all_characters)
    encoded_seqs = [label_encoder.transform(list(seq)) for seq in sequences]
    padded_seqs = pad_sequences(encoded_seqs, padding='post', value=label_encoder.transform([padding_value])[0])
    return padded_seqs

pos_data = load_sequence_data("oripos.txt", 1)
neg_data = load_sequence_data("orineg.txt", 0)
combined_data = pd.concat([pos_data, neg_data], ignore_index=True)

max_seq_length = combined_data['sequence'].apply(len).max()
X = preprocess_sequences(combined_data['sequence'], padding_value='N')
y = combined_data['label'].values

seq_length = X.shape[1]
vocab_size = len(np.unique(X)) + 1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def get_compiled_model(seq_length, vocab_size):
    input_layer = Input(shape=(seq_length,))
    x = Embedding(input_dim=vocab_size, output_dim=50)(input_layer)
    x = Conv1D(filters=32, kernel_size=3, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Dropout(0.5)(x)
    x = Bidirectional(LSTM(64))(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    output_layer = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

model = get_compiled_model(seq_length, vocab_size)

early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min', restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.keras', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

model.fit(X_train, y_train, validation_split=0.2, epochs=200, batch_size=32, callbacks=[early_stopping, model_checkpoint], verbose=1)

model.load_weights('best_model.keras')
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {accuracy*100:.2f}%')


Epoch 1/200
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.5279 - loss: 0.6976
Epoch 1: val_accuracy improved from -inf to 0.59130, saving model to best_model.keras
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 75ms/step - accuracy: 0.5277 - loss: 0.6975 - val_accuracy: 0.5913 - val_loss: 0.6926
Epoch 2/200
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.5255 - loss: 0.6829
Epoch 2: val_accuracy did not improve from 0.59130
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.5271 - loss: 0.6828 - val_accuracy: 0.4870 - val_loss: 0.6929
Epoch 3/200
[1m13/15[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 33ms/step - accuracy: 0.6277 - loss: 0.6443
Epoch 3: val_accuracy did not improve from 0.59130
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.6268 - loss: 0.6462 - val_accuracy: 0.4783 - val_loss: 0.6927
Ep

In [15]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, BatchNormalization, MaxPooling1D, LSTM, Bidirectional, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

def load_sequence_data(file_path, label):
    """Load sequence data from a file and assign labels."""
    with open(file_path, "r") as file:
        sequences = file.read().strip().split("\n")
    data = {'sequence': sequences, 'label': [label] * len(sequences)}
    return pd.DataFrame(data)

def preprocess_sequences(sequences, padding_value='N'):
    """Encode sequences using LabelEncoder and pad them."""
    label_encoder = LabelEncoder().fit(list(set(''.join(sequences) + padding_value)))
    encoded_seqs = [label_encoder.transform(list(seq)) for seq in sequences]
    padded_seqs = pad_sequences(encoded_seqs, padding='post', value=label_encoder.transform([padding_value])[0])
    return padded_seqs

# Load and combine data
pos_data = load_sequence_data("oripos.txt", 1)
neg_data = load_sequence_data("orineg.txt", 0)
combined_data = pd.concat([pos_data, neg_data], ignore_index=True)

# Preprocess sequences
max_seq_length = combined_data['sequence'].apply(len).max()
X = preprocess_sequences(combined_data['sequence'], padding_value='N')
y = combined_data['label'].values

# Prepare data for training
seq_length = X.shape[1]
vocab_size = np.max(X) + 1  # Updated to directly compute the vocab size

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def get_compiled_model(seq_length, vocab_size):
    """Define and compile the model."""
    input_layer = Input(shape=(seq_length,))
    x = Embedding(input_dim=vocab_size, output_dim=50)(input_layer)
    x = Conv1D(filters=32, kernel_size=3, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Dropout(0.5)(x)
    x = Bidirectional(LSTM(64))(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    output_layer = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

model = get_compiled_model(seq_length, vocab_size)

# Setup callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min', restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.keras', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

# Train the model
model.fit(X_train, y_train, validation_split=0.2, epochs=200, batch_size=32, callbacks=[early_stopping, model_checkpoint], verbose=1)

# Evaluate the model
model.load_weights('best_model.keras')
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {accuracy*100:.2f}%')


Epoch 1/200
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.4725 - loss: 0.6987
Epoch 1: val_accuracy improved from -inf to 0.48696, saving model to best_model.keras
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 82ms/step - accuracy: 0.4750 - loss: 0.6985 - val_accuracy: 0.4870 - val_loss: 0.6929
Epoch 2/200
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.5635 - loss: 0.6675
Epoch 2: val_accuracy did not improve from 0.48696
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.5647 - loss: 0.6674 - val_accuracy: 0.4783 - val_loss: 0.6910
Epoch 3/200
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.6132 - loss: 0.6458
Epoch 3: val_accuracy improved from 0.48696 to 0.52174, saving model to best_model.keras
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - accuracy: 0.6135 - loss: 0.6453 - val_

Epoch 26/200
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.8423 - loss: 0.3440
Epoch 26: val_accuracy did not improve from 0.61739
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.8405 - loss: 0.3469 - val_accuracy: 0.5478 - val_loss: 0.8170
Epoch 26: early stopping
Restoring model weights from the end of the best epoch: 16.
Test Accuracy: 65.73%


In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, BatchNormalization, MaxPooling1D, LSTM, Bidirectional, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam

def load_and_process_sequences(file_path):
    with open(file_path, "r") as file:
        sequences = file.read().strip().split("\n")
    
    pos_data = [seq[5000:5301] for seq in sequences]
    neg_data = [seq[8000:8301] for seq in sequences]  # Use only the 8000-8201 region for negative data
    
    combined_data = [{'sequence': seq, 'label': 1} for seq in pos_data] + [{'sequence': seq, 'label': 0} for seq in neg_data]
    return pd.DataFrame(combined_data)

def preprocess_sequences(sequences):
    label_encoder = LabelEncoder().fit(list(set(''.join(sequences))))
    encoded_seqs = [label_encoder.transform(list(seq)) for seq in sequences]
    return np.array(encoded_seqs)

def get_compiled_model(seq_length, vocab_size):
    input_layer = Input(shape=(seq_length,))
    x = Embedding(input_dim=vocab_size, output_dim=50)(input_layer)
    
    # Deepen the network with additional layers
    for _ in range(3):  # Example to add 3 sets of Conv, BatchNorm, MaxPool, Dropout
        x = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same')(x)
        x = BatchNormalization()(x)
        x = MaxPooling1D(pool_size=2)(x)
        x = Dropout(0.5)(x)
    
    x = Bidirectional(LSTM(128))(x)  # Increased LSTM units
    x = Dense(256, activation='relu')(x)  # Increased Dense layer units
    x = Dropout(0.5)(x)
    output_layer = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', 
                  metrics=['accuracy'])
    return model


files = ["Arabidopsis_correct_10001"]
#files = ["Arabidopsis_correct_10001", "Candiada_correct_10001", "DM3_10001", 
#         "Kluy_correct_10001", "MM8_correct_10001", "Pichia_correct_10001",
#         "Sacharo_correct_10001", "Schizo_correct_10001"]

for file in files:
    combined_data = load_and_process_sequences(f"{file}")
    X = preprocess_sequences(combined_data['sequence'])
    y = combined_data['label'].values

    seq_length = X.shape[1]
    vocab_size = len(np.unique(X))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = get_compiled_model(seq_length, vocab_size)

    model_checkpoint = ModelCheckpoint(f'best_model_{file}.keras', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

    model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32, callbacks=[model_checkpoint], verbose=1)

    model.load_weights(f'best_model_{file}.keras')
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f'{file} - Test Accuracy: {accuracy*100:.2f}%')


Epoch 1/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step - accuracy: 0.5789 - loss: 0.6647
Epoch 1: val_accuracy improved from -inf to 0.51120, saving model to best_model_Arabidopsis_correct_10001.keras
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 196ms/step - accuracy: 0.5798 - loss: 0.6641 - val_accuracy: 0.5112 - val_loss: 0.7029
Epoch 2/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step - accuracy: 0.7072 - loss: 0.5940
Epoch 2: val_accuracy did not improve from 0.51120
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 137ms/step - accuracy: 0.7071 - loss: 0.5939 - val_accuracy: 0.5112 - val_loss: 0.9496
Epoch 3/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step - accuracy: 0.7001 - loss: 0.5779
Epoch 3: val_accuracy did not improve from 0.51120
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 138ms/step - accuracy: 0.7000 - loss: 0.5780 - val_accuracy

Epoch 25/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - accuracy: 0.7657 - loss: 0.4989
Epoch 25: val_accuracy did not improve from 0.71487
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 139ms/step - accuracy: 0.7654 - loss: 0.4991 - val_accuracy: 0.7149 - val_loss: 0.6109
Epoch 26/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - accuracy: 0.7453 - loss: 0.5224
Epoch 26: val_accuracy did not improve from 0.71487
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 135ms/step - accuracy: 0.7454 - loss: 0.5223 - val_accuracy: 0.6864 - val_loss: 0.6531
Epoch 27/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step - accuracy: 0.7551 - loss: 0.4911
Epoch 27: val_accuracy did not improve from 0.71487
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 163ms/step - accuracy: 0.7551 - loss: 0.4912 - val_accuracy: 0.6884 - val_loss: 0.6103
Epoch 28/50
[1m62/62[0m 

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - accuracy: 0.8378 - loss: 0.3612
Epoch 50: val_accuracy did not improve from 0.72505
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 138ms/step - accuracy: 0.8378 - loss: 0.3613 - val_accuracy: 0.6945 - val_loss: 0.7050
Arabidopsis_correct_10001 - Test Accuracy: 68.40%


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from concurrent.futures import ThreadPoolExecutor

def load_and_process_sequence(file_name):
    with open(file_name, "r") as file:
        sequences = file.read().strip().split("\n")

    pos_data = [seq[4900:5101] for seq in sequences]
    neg_data = [seq[8000:8201] for seq in sequences]

    combined_data = [{'sequence': seq, 'label': 1} for seq in pos_data] + [{'sequence': seq, 'label': 0} for seq in neg_data]
    return pd.DataFrame(combined_data)

def one_hot_encode_sequences_optimized(sequences):
    # Create an array of the sequences
    seq_array = np.array(sequences)

    # Define a mapping for each character to an integer
    char_to_int = np.array(['A', 'C', 'G', 'T'])

    # Initialize the encoded array
    encoded_seqs = np.zeros((seq_array.shape[0], seq_array.shape[1], 4))

    # Iterate through each character and one-hot encode
    for i, char in enumerate(char_to_int):
        encoded_seqs[seq_array == char, i] = 1

    return encoded_seqs

def get_compiled_model(seq_length):
    input_layer = tf.keras.layers.Input(shape=(seq_length, 4), name='input')
    
    # Convolutional and Pooling layers
    x = tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation=tf.nn.relu6, strides=1, kernel_regularizer=tf.keras.regularizers.L1L2())(input_layer)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Conv1D(filters=32, kernel_size=5, activation=tf.nn.relu6, strides=1, kernel_regularizer=tf.keras.regularizers.L1L2())(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.AvgPool1D(pool_size=2)(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    x = tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation=tf.nn.relu6, strides=1, kernel_regularizer=tf.keras.regularizers.L1L2())(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation=tf.nn.relu6, strides=1, kernel_regularizer=tf.keras.regularizers.L1L2())(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.AvgPool1D(pool_size=2)(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    x = tf.keras.layers.Conv1D(filters=128, kernel_size=3, activation=tf.nn.relu6, strides=1, kernel_regularizer=tf.keras.regularizers.L1L2())(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation=tf.nn.relu6, strides=1, kernel_regularizer=tf.keras.regularizers.L1L2())(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.AvgPool1D(pool_size=2)(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    x = tf.keras.layers.Conv1D(filters=256, kernel_size=3, activation=tf.nn.relu6, strides=1, kernel_regularizer=tf.keras.regularizers.L1L2())(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Conv1D(filters=256, kernel_size=5, activation=tf.nn.relu6, strides=1, kernel_regularizer=tf.keras.regularizers.L1L2())(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.AvgPool1D(pool_size=2)(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    
    # LSTM layer
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Reshape((1, -1))(x)
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=1024, activation='relu', return_sequences=False))(x)
    
    # Dense layers
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(units=1024, activation='relu')(x)
    output_layer = tf.keras.layers.Dense(2, activation='softmax')(x)
    
    model = tf.keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=tf.keras.optimizers.Adam(0.001), loss='categorical_crossentropy', metrics=['categorical_accuracy'])
    return model

def load_and_process_sequences_parallel(files):
    with ThreadPoolExecutor() as executor:
        combined_data = list(executor.map(load_and_process_sequence, files))
    return pd.concat(combined_data, ignore_index=True)

# Sample file names for demonstration (these need to be actual file paths in practice)
files = ["file1", "file2", "file3"]

# Load and process sequences in parallel
combined_data = load_and_process_sequences_parallel(files)

# One-hot encode sequences
X = one_hot_encode_sequences_optimized(combined_data['sequence'])

# Convert labels to categorical
y = to_categorical(combined_data['label'].values)

# Get sequence length
seq_length = X.shape[1]

# Compile the model
model = get_compiled_model(seq_length)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32, verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {accuracy*100:.2f}%')

# Note: Actual file paths need to be used in place of "file1", "file2", "file3" for this code to run successfully.
