In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Function to transform sequence data based on the encoding scheme
def transform_sequence(sequence, encoding):
    numerical_data = []
    for i in range(0, len(sequence), 2):  # Assuming the sequences can be divided into di-nucleotides
        di_nucleotide = sequence[i:i+2]
        numerical_data.append(encoding.get(di_nucleotide, 0))  # Default 0 for unknown di-nucleotides
    
    # Split the numerical data into non-overlapping windows of size 10 and calculate the average
    window_avgs = [np.mean(numerical_data[i:i+10]) for i in range(0, len(numerical_data), 10)]
    return window_avgs

# Encoding scheme
encoding = {
    'AA': 0, 'AC': -1.44, 'AG': -1.28, 'AT': -0.88, 
    'CA': -1.45, 'CC': -1.84, 'CG': -2.17, 'CT': -1.28, 
    'GA': -1.30, 'GC': -2.24, 'GG': -20, 'GT': -1.44, 
    'TA': -0.58, 'TC': -1.30, 'TG': -1.45, 'TT': 0
}

# Function to load and transform data
def load_and_transform_data(pos_file_path, neg_file_path, encoding):
    combined_data = pd.DataFrame()
    for file_path, label in [(pos_file_path, 1), (neg_file_path, 0)]:
        with open(file_path, 'r') as file:
            sequences = file.read().split('\n')
        transformed_data = [transform_sequence(seq, encoding) for seq in sequences if seq]
        labels = [label] * len(transformed_data)
        df = pd.DataFrame(transformed_data)
        df['label'] = labels
        combined_data = pd.concat([combined_data, df], ignore_index=True)
    # Handle varying window counts by padding with NaN and later imputing or dropping
    combined_data = combined_data.apply(lambda x: pd.Series(x.dropna().values)).fillna(0)
    return combined_data.drop('label', axis=1), combined_data['label']

# Load and transform data
X, y = load_and_transform_data('oripos.txt', 'orineg.txt', encoding)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)



classifiers = {
    # Excluded LDA and QDA due to collinearity warning
    'SVM': SVC(probability=True, random_state=101),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=101),
    'Logistic Regression': LogisticRegression(random_state=101),
    'Naive Bayes': GaussianNB(),
    'K-NN': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=101),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=101),
    'Decision Tree': DecisionTreeClassifier(random_state=101),
    'Perceptron': Perceptron(random_state=101),
    'SGD': SGDClassifier(random_state=101),
    'Bagging': BaggingClassifier(n_estimators=100, random_state=101),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=101),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=101),
    'LightGBM': LGBMClassifier(random_state=101),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=101)
}

# Train and evaluate classifiers
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"Results for {name}:")
    print(f"    Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"    Precision: {precision_score(y_test, y_pred, zero_division=0):.4f}")
    print(f"    Recall: {recall_score(y_test, y_pred, zero_division=0):.4f}")
    print(f"    F1 Score: {f1_score(y_test, y_pred, zero_division=0):.4f}\n")

# Evaluation and Voting Classifier setup (using hard voting to avoid the AttributeError)
voting_clf = VotingClassifier(estimators=[(name, clf) for name, clf in classifiers.items()], voting='hard')
voting_clf.fit(X_train, y_train)
y_pred_vote = voting_clf.predict(X_test)



print("Voting Classifier Performance:")
print(f"    Accuracy: {accuracy_score(y_test, y_pred_vote):.4f}")
print(f"    Precision: {precision_score(y_test, y_pred_vote, zero_division=0):.4f}")
print(f"    Recall: {recall_score(y_test, y_pred_vote, zero_division=0):.4f}")
print(f"    F1 Score: {f1_score(y_test, y_pred_vote, zero_division=0):.4f}")



Results for SVM:
    Accuracy: 0.6840
    Precision: 0.6568
    Recall: 0.7400
    F1 Score: 0.6959

Results for Random Forest:
    Accuracy: 0.7785
    Precision: 0.7562
    Recall: 0.8067
    F1 Score: 0.7806

Results for Logistic Regression:
    Accuracy: 0.7231
    Precision: 0.7044
    Recall: 0.7467
    F1 Score: 0.7249

Results for Naive Bayes:
    Accuracy: 0.6743
    Precision: 0.6667
    Recall: 0.6667
    F1 Score: 0.6667

Results for K-NN:
    Accuracy: 0.6319
    Precision: 0.6022
    Recall: 0.7267
    F1 Score: 0.6586

Results for Gradient Boosting:
    Accuracy: 0.7785
    Precision: 0.7470
    Recall: 0.8267
    F1 Score: 0.7848

Results for AdaBoost:
    Accuracy: 0.7622
    Precision: 0.7516
    Recall: 0.7667
    F1 Score: 0.7591

Results for Decision Tree:
    Accuracy: 0.6319
    Precision: 0.6242
    Recall: 0.6200
    F1 Score: 0.6221

Results for Perceptron:
    Accuracy: 0.6417
    Precision: 0.6220
    Recall: 0.6800
    F1 Score: 0.6497

Results for SGD:
   

KeyboardInterrupt: 

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

def transform_sequence(sequence, encoding, window_size=None):
    # Convert sequence to numerical data based on encoding
    numerical_data = [encoding.get(sequence[i:i+2], 0) for i in range(0, len(sequence), 2)]
    
    # If window_size is None or greater than sequence length, return the mean of the entire sequence
    if window_size is None or window_size >= len(numerical_data):
        return [np.mean(numerical_data)]
    
    # Otherwise, calculate window averages
    return [np.mean(numerical_data[i:i+window_size]) for i in range(0, len(numerical_data), window_size)]

def load_and_transform_data(pos_file_path, neg_file_path, encoding, window_size=None):
    combined_data = pd.DataFrame()
    for file_path, label in [(pos_file_path, 1), (neg_file_path, 0)]:
        with open(file_path, 'r') as file:
            sequences = file.read().split('\n')
        transformed_data = [transform_sequence(seq, encoding, window_size) for seq in sequences if seq]
        labels = [label] * len(transformed_data)
        df = pd.DataFrame(transformed_data)
        df['label'] = labels
        combined_data = pd.concat([combined_data, df], ignore_index=True)
    
    # Ensure uniform feature shape by padding with NaN and later filling with 0
    combined_data = combined_data.apply(lambda x: pd.Series(x.dropna().values)).fillna(0)
    return combined_data.drop('label', axis=1), combined_data['label']

# Encoding scheme
encoding = {
    'AA': -1.0, 'AC': -1.44, 'AG': -1.28, 'AT': -0.88, 
    'CA': -1.45, 'CC': -1.84, 'CG': -2.17, 'CT': -1.28, 
    'GA': -1.30, 'GC': -2.24, 'GG': -1.84, 'GT': -1.44, 
    'TA': -0.58, 'TC': -1.30, 'TG': -1.45, 'TT': -1.0
}

window_sizes = [None, 5, 10, 15, 20]  # Define desired window sizes including 'None' for no window
results = pd.DataFrame()

for window_size in window_sizes:
    X, y = load_and_transform_data('oripos.txt', 'orineg.txt', encoding, window_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Use a simple classifier for demonstration
    clf = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=100, random_state=42))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Store results
    window_desc = 'No window' if window_size is None else f'Window size {window_size}'
    results[window_desc] = [
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred, zero_division=0),
        recall_score(y_test, y_pred, zero_division=0),
        f1_score(y_test, y_pred, zero_division=0)
    ]

results.index = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
results.to_excel('classification_results.xlsx')

# Print out to confirm completion
print("Results saved to 'classification_results.xlsx'")


Results saved to 'classification_results.xlsx'


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             matthews_corrcoef, roc_auc_score, log_loss)
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, 
                              AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

def transform_sequence(sequence, encoding, window_size=None):
    numerical_data = [encoding.get(sequence[i:i+2], 0) for i in range(0, len(sequence), 2)]
    if window_size is None or window_size >= len(numerical_data):
        return [np.mean(numerical_data)]
    return [np.mean(numerical_data[i:i+window_size]) for i in range(0, len(numerical_data), window_size)]

def load_and_transform_data(pos_file_path, neg_file_path, encoding, window_size=None):
    combined_data = pd.DataFrame()
    for file_path, label in [(pos_file_path, 1), (neg_file_path, 0)]:
        with open(file_path, 'r') as file:
            sequences = file.read().split('\n')
        transformed_data = [transform_sequence(seq, encoding, window_size) for seq in sequences if seq]
        labels = [label] * len(transformed_data)
        df = pd.DataFrame(transformed_data)
        df['label'] = labels
        combined_data = pd.concat([combined_data, df], ignore_index=True)
    combined_data = combined_data.apply(lambda x: pd.Series(x.dropna().values)).fillna(0)
    return combined_data.drop('label', axis=1), combined_data['label']

encoding = {
    'AA': -1.0, 'AC': -1.44, 'AG': -1.28, 'AT': -0.88, 
    'CA': -1.45, 'CC': -1.84, 'CG': -2.17, 'CT': -1.28, 
    'GA': -1.30, 'GC': -2.24, 'GG': -1.84, 'GT': -1.44, 
    'TA': -0.58, 'TC': -1.30, 'TG': -1.45, 'TT': -1.0
}

window_sizes = [None, 5, 10, 15, 20]
results = pd.DataFrame()

classifiers = {
    'SVM': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Perceptron': Perceptron(random_state=42),
    'SGD': SGDClassifier(random_state=42),
    'Bagging': BaggingClassifier(n_estimators=100, random_state=42),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

for window_size in window_sizes:
    X, y = load_and_transform_data('oripos.txt', 'orineg.txt', encoding, window_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    for name, clf in classifiers.items():
        pipeline = make_pipeline(StandardScaler(), clf)
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        y_pred_proba = pipeline.predict_proba(X_test)[:, 1] if hasattr(clf, "predict_proba") else [0.5] * len(y_pred)
        
        window_desc = f"{name} - {'No window' if window_size is None else f'Window {window_size}'}"
        results[window_desc] = [
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred, zero_division=0),
            recall_score(y_test, y_pred, zero_division=0),
            f1_score(y_test, y_pred, zero_division=0),
            matthews_corrcoef(y_test, y_pred),
            roc_auc_score(y_test, y_pred_proba) if hasattr(clf, "predict_proba") else np.nan,
            log_loss(y_test, y_pred_proba) if hasattr(clf, "predict_proba") else np.nan
        ]

results.index = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'MCC', 'ROC AUC', 'Log Loss']
results.to_excel('classification_results_with_windows.xlsx')

print("Results saved to 'classification_results_with_windows.xlsx'")


[LightGBM] [Info] Number of positive: 1215, number of negative: 1237
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 2452, number of used features: 1
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495514 -> initscore=-0.017945
[LightGBM] [Info] Start training from score -0.017945
[LightGBM] [Info] Number of positive: 1215, number of negative: 1237
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003706 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12646
[LightGBM] [Info] Number of data points in the train set: 2452, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495514 -> initscore=-0.017945
[LightGBM] [Info] Start training from score -0.017945
[LightGBM] [Info] N

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

def transform_sequence(sequence, encoding, window_size=None):
    numerical_data = [encoding.get(sequence[i:i+2], 0) for i in range(0, len(sequence), 2)]
    if window_size is None or window_size >= len(numerical_data):
        return [np.mean(numerical_data)]
    return [np.mean(numerical_data[i:i+window_size]) for i in range(0, len(numerical_data), window_size)]

def load_and_transform_data(pos_file_path, neg_file_path, encoding, window_size=None):
    combined_data = pd.DataFrame()
    for file_path, label in [(pos_file_path, 1), (neg_file_path, 0)]:
        with open(file_path, 'r') as file:
            sequences = file.read().split('\n')
        transformed_data = [transform_sequence(seq, encoding, window_size) for seq in sequences if seq]
        df = pd.DataFrame(transformed_data)
        df['label'] = label
        combined_data = pd.concat([combined_data, df], ignore_index=True)
    combined_data = combined_data.apply(lambda x: pd.Series(x.dropna().values)).fillna(0)
    return combined_data.drop('label', axis=1), combined_data['label']

encoding = {
    'AA': -1.0, 'AC': -1.44, 'AG': -1.28, 'AT': -0.88, 
    'CA': -1.45, 'CC': -1.84, 'CG': -2.17, 'CT': -1.28, 
    'GA': -1.30, 'GC': -2.24, 'GG': -1.84, 'GT': -1.44, 
    'TA': -0.58, 'TC': -1.30, 'TG': -1.45, 'TT': -1.0
}


window_sizes = [None, 5, 10, 15, 20]
results = []

for window_size in window_sizes:
    X, y = load_and_transform_data('oripos.txt', 'orineg.txt', encoding, window_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=100, random_state=42))
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

    window_desc = 'No window' if window_size is None else f'Window size {window_size}'
    results.append({
        'Window': window_desc,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1 Score': f1_score(y_test, y_pred, zero_division=0),
        'MCC': matthews_corrcoef(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_pred_proba),
        'Log Loss': log_loss(y_test, y_pred_proba)
    })

results_df = pd.DataFrame(results)
results_df.to_csv('classification_results.csv', index=False)

print("Results saved to 'classification_results.csv'")


Results saved to 'classification_results.csv'


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             matthews_corrcoef, roc_auc_score)
from sklearn.ensemble import (VotingClassifier, RandomForestClassifier, GradientBoostingClassifier,
                              AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

def transform_sequence(sequence, encoding):
    numerical_data = [encoding.get(sequence[i:i+2], 0) for i in range(0, len(sequence), 2)]
    window_size = 2  # Fixed window size
    # Calculate the average for non-overlapping windows of size 15
    window_avgs = [np.mean(numerical_data[i:i+window_size]) for i in range(0, len(numerical_data), window_size)]
    return window_avgs

encoding = {
    'AA': -1.0, 'AC': -1.44, 'AG': -1.28, 'AT': -0.88, 
    'CA': -1.45, 'CC': -1.84, 'CG': -2.17, 'CT': -1.28, 
    'GA': -1.30, 'GC': -2.24, 'GG': -1.84, 'GT': -1.44, 
    'TA': -0.58, 'TC': -1.30, 'TG': -1.45, 'TT': -1.0
}


def load_and_transform_data(pos_file_path, neg_file_path, encoding):
    combined_data = pd.DataFrame()
    for file_path, label in [(pos_file_path, 1), (neg_file_path, 0)]:
        with open(file_path, 'r') as file:
            sequences = file.read().split('\n')
        transformed_data = [transform_sequence(seq, encoding) for seq in sequences if seq]
        labels = [label] * len(transformed_data)
        df = pd.DataFrame(transformed_data)
        df['label'] = labels
        combined_data = pd.concat([combined_data, df], ignore_index=True)
    combined_data = combined_data.apply(lambda x: pd.Series(x.dropna().values)).fillna(0)
    return combined_data.drop('label', axis=1), combined_data['label']

X, y = load_and_transform_data('oripos.txt', 'orineg.txt', encoding)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)

classifiers = {
    'SVM': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Perceptron': Perceptron(random_state=42),
    'SGD': SGDClassifier(random_state=42),
    'Bagging': BaggingClassifier(n_estimators=100, random_state=42),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# At the beginning, ensure results is a DataFrame
results = pd.DataFrame(columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'MCC', 'AUC'])

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    metrics = {
        'Classifier': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1 Score': f1_score(y_test, y_pred, zero_division=0),
        'MCC': matthews_corrcoef(y_test, y_pred)
    }
    # Calculate AUC for classifiers that have predict_proba
    if hasattr(clf, "predict_proba"):
        y_pred_proba = clf.predict_proba(X_test)[:, 1]
        metrics['AUC'] = roc_auc_score(y_test, y_pred_proba)
    else:
        metrics['AUC'] = 'N/A'
    
    # Temporarily convert metrics to DataFrame for appending
    temp_df = pd.DataFrame([metrics])
    results = pd.concat([results, temp_df], ignore_index=True)

results.to_csv('classification_results_with_metrics.csv', index=False)
print("Results saved to 'classification_results_with_metrics.csv'")



  results = pd.concat([results, temp_df], ignore_index=True)


[LightGBM] [Info] Number of positive: 1383, number of negative: 1376
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007713 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6217
[LightGBM] [Info] Number of data points in the train set: 2759, number of used features: 125
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501269 -> initscore=0.005074
[LightGBM] [Info] Start training from score 0.005074
Results saved to 'classification_results_with_metrics.csv'


In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Function to transform sequence data based on the encoding scheme
def transform_sequence(sequence, encoding):
    # Convert sequence to numerical data based on encoding
    numerical_data = [encoding.get(sequence[i:i+2], 0) for i in range(0, len(sequence), 2)]
    window_size = 15  # Fixed window size
    # Calculate the average for non-overlapping windows of size 15
    window_avgs = [np.mean(numerical_data[i:i+window_size]) for i in range(0, len(numerical_data), window_size)]
    return window_avgs

encoding = {
    'AA': -1.0, 'AC': -1.44, 'AG': -1.28, 'AT': -0.88, 
    'CA': -1.45, 'CC': -1.84, 'CG': -2.17, 'CT': -1.28, 
    'GA': -1.30, 'GC': -2.24, 'GG': -1.84, 'GT': -1.44, 
    'TA': -0.58, 'TC': -1.30, 'TG': -1.45, 'TT': -1.0
}
# Function to load and transform data
def load_and_transform_data(pos_file_path, neg_file_path, encoding):
    combined_data = pd.DataFrame()
    for file_path, label in [(pos_file_path, 1), (neg_file_path, 0)]:
        with open(file_path, 'r') as file:
            sequences = file.read().split('\n')
        transformed_data = [transform_sequence(seq, encoding) for seq in sequences if seq]
        labels = [label] * len(transformed_data)
        df = pd.DataFrame(transformed_data)
        df['label'] = labels
        combined_data = pd.concat([combined_data, df], ignore_index=True)
    # Handle varying window counts by padding with NaN and later filling with 0
    combined_data = combined_data.apply(lambda x: pd.Series(x.dropna().values)).fillna(0)
    return combined_data.drop('label', axis=1), combined_data['label']

X, y = load_and_transform_data('oripos.txt', 'orineg.txt', encoding)

classifiers = {
    'SVM': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Perceptron': Perceptron(random_state=42),
    'SGD': SGDClassifier(random_state=42),
    'Bagging': BaggingClassifier(n_estimators=100, random_state=42),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}


# Prepare for 10-fold cross-validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=101)
scorers = {
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score, zero_division=0),
    'Recall': make_scorer(recall_score),
    'F1 Score': make_scorer(f1_score),
    'ROC AUC': 'roc_auc'  # Directly use 'roc_auc' for AUC scoring
}

results = pd.DataFrame()

# Perform 10-fold cross-validation for each classifier
for name, clf in classifiers.items():
    print(f"Evaluating {name}")
    scores = {metric: cross_val_score(clf, X, y, scoring=scorer, cv=cv) for metric, scorer in scorers.items()}
    results[name] = [scores['Accuracy'].mean(), scores['Precision'].mean(), scores['Recall'].mean(), scores['F1 Score'].mean(), scores['ROC AUC'].mean()]

results.index = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']
results.to_csv('classification_results_10_fold_cv.csv')

print("10-fold cross-validation results saved to 'classification_results_10_fold_cv.csv'")


Evaluating SVM
Evaluating Random Forest
Evaluating Gradient Boosting
Evaluating AdaBoost
Evaluating Decision Tree
Evaluating Perceptron
Evaluating SGD
Evaluating Bagging
Evaluating Extra Trees
Evaluating CatBoost
Evaluating LightGBM
[LightGBM] [Info] Number of positive: 1379, number of negative: 1380
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000746 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4335
[LightGBM] [Info] Number of data points in the train set: 2759, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499819 -> initscore=-0.000725
[LightGBM] [Info] Start training from score -0.000725
[LightGBM] [Info] Number of positive: 1379, number of negative: 1380
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001012 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4335
[LightGBM] 

Results for SVM:
    Accuracy: 0.6645
    Precision: 0.6343
    Recall: 0.7400
    F1 Score: 0.6831

Results for Random Forest:
    Accuracy: 0.6775
    Precision: 0.6564
    Recall: 0.7133
    F1 Score: 0.6837

Results for Gradient Boosting:
    Accuracy: 0.6645
    Precision: 0.6374
    Recall: 0.7267
    F1 Score: 0.6791

Results for AdaBoost:
    Accuracy: 0.6515
    Precision: 0.6229
    Recall: 0.7267
    F1 Score: 0.6708

Results for Decision Tree:
    Accuracy: 0.6319
    Precision: 0.6276
    Recall: 0.6067
    F1 Score: 0.6169

Results for Perceptron:
    Accuracy: 0.4886
    Precision: 0.4886
    Recall: 1.0000
    F1 Score: 0.6565

Results for SGD:
    Accuracy: 0.6938
    Precision: 0.6538
    Recall: 0.7933
    F1 Score: 0.7169

Results for Bagging:
    Accuracy: 0.6743
    Precision: 0.6562
    Recall: 0.7000
    F1 Score: 0.6774

Results for Extra Trees:
    Accuracy: 0.6775
    Precision: 0.6545
    Recall: 0.7200
    F1 Score: 0.6857

Results for CatBoost:
    Accurac

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Function to transform sequence data based on the encoding scheme
def transform_sequence(sequence, encoding):
    numerical_data = []
    for i in range(0, len(sequence), 2):  # Assuming the sequences can be divided into di-nucleotides
        di_nucleotide = sequence[i:i+2]
        numerical_data.append(encoding.get(di_nucleotide, 0))  # Default 0 for unknown di-nucleotides
    
    # Split the numerical data into non-overlapping windows of size 10 and calculate the average
    window_avgs = [np.mean(numerical_data[i:i+2]) for i in range(0, len(numerical_data), 2)]
    return window_avgs

# Encoding scheme
encoding = {
    'AA': 0.00, 'AC': 0.50, 'AG': 0.50, 'AT': 0.00,
    'CA': 0.50, 'CC': 1.00, 'CG': 1.00, 'CT': 0.50,
    'GA': 0.50, 'GC': 1.00, 'GG': 1.00, 'GT': 0.50,
    'TA': 0.00, 'TC': 0.50, 'TG': 0.50, 'TT': 0.00
}

# Function to load and transform data
def load_and_transform_data(pos_file_path, neg_file_path, encoding):
    combined_data = pd.DataFrame()
    for file_path, label in [(pos_file_path, 1), (neg_file_path, 0)]:
        with open(file_path, 'r') as file:
            sequences = file.read().split('\n')
        transformed_data = [transform_sequence(seq, encoding) for seq in sequences if seq]
        labels = [label] * len(transformed_data)
        df = pd.DataFrame(transformed_data)
        df['label'] = labels
        combined_data = pd.concat([combined_data, df], ignore_index=True)
    # Handle varying window counts by padding with NaN and later imputing or dropping
    combined_data = combined_data.apply(lambda x: pd.Series(x.dropna().values)).fillna(0)
    return combined_data.drop('label', axis=1), combined_data['label']

# Load and transform data
X, y = load_and_transform_data('oripos.txt', 'orineg.txt', encoding)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)



classifiers = {
    # Excluded LDA and QDA due to collinearity warning
    'SVM': SVC(probability=True, random_state=101),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=101),
    'Logistic Regression': LogisticRegression(random_state=101),
    'Naive Bayes': GaussianNB(),
    'K-NN': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=101),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=101),
    'Decision Tree': DecisionTreeClassifier(random_state=101),
    'Perceptron': Perceptron(random_state=101),
    'SGD': SGDClassifier(random_state=101),
    'Bagging': BaggingClassifier(n_estimators=100, random_state=101),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=101),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=101),
    'LightGBM': LGBMClassifier(random_state=101),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=101)
}

# Train and evaluate classifiers
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"Results for {name}:")
    print(f"    Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"    Precision: {precision_score(y_test, y_pred, zero_division=0):.4f}")
    print(f"    Recall: {recall_score(y_test, y_pred, zero_division=0):.4f}")
    print(f"    F1 Score: {f1_score(y_test, y_pred, zero_division=0):.4f}\n")

# Create a list of tuples containing classifier name and the classifier object
classifier_list = [(name, clf) for name, clf in classifiers.items()]

# Create a VotingClassifier with soft voting
voting_clf_soft = VotingClassifier(estimators=classifier_list, voting='soft')

# Optionally, create a VotingClassifier with hard voting
voting_clf_hard = VotingClassifier(estimators=classifier_list, voting='hard')

# Train the ensemble classifier
voting_clf_soft.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_soft = voting_clf_soft.predict(X_test)
print(f"Soft Voting Accuracy: {accuracy_score(y_test, y_pred_soft):.4f}")

# Train the hard voting classifier and evaluate (if needed)
voting_clf_hard.fit(X_train, y_train)
y_pred_hard = voting_clf_hard.predict(X_test)
print(f"Hard Voting Accuracy: {accuracy_score(y_test, y_pred_hard):.4f}")



print("Voting Classifier Performance:")
print(f"    Accuracy: {accuracy_score(y_test, y_pred_vote):.4f}")
print(f"    Precision: {precision_score(y_test, y_pred_vote, zero_division=0):.4f}")
print(f"    Recall: {recall_score(y_test, y_pred_vote, zero_division=0):.4f}")
print(f"    F1 Score: {f1_score(y_test, y_pred_vote, zero_division=0):.4f}")


Results for SVM:
    Accuracy: 0.7647
    Precision: 0.7857
    Recall: 0.7857
    F1 Score: 0.7857

Results for Random Forest:
    Accuracy: 0.7255
    Precision: 0.7333
    Recall: 0.7857
    F1 Score: 0.7586

Results for Logistic Regression:
    Accuracy: 0.6863
    Precision: 0.7143
    Recall: 0.7143
    F1 Score: 0.7143

Results for Naive Bayes:
    Accuracy: 0.7843
    Precision: 0.7931
    Recall: 0.8214
    F1 Score: 0.8070

Results for K-NN:
    Accuracy: 0.6471
    Precision: 0.7500
    Recall: 0.5357
    F1 Score: 0.6250

Results for Gradient Boosting:
    Accuracy: 0.6667
    Precision: 0.7200
    Recall: 0.6429
    F1 Score: 0.6792

Results for AdaBoost:
    Accuracy: 0.6863
    Precision: 0.6875
    Recall: 0.7857
    F1 Score: 0.7333

Results for Decision Tree:
    Accuracy: 0.5686
    Precision: 0.6154
    Recall: 0.5714
    F1 Score: 0.5926

Results for Perceptron:
    Accuracy: 0.5882
    Precision: 0.6522
    Recall: 0.5357
    F1 Score: 0.5882

Results for SGD:
   

AttributeError: 'Perceptron' object has no attribute 'predict_proba'

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV

# Function to transform sequence data based on the encoding scheme
def transform_sequence(sequence, encoding):
    numerical_data = []
    for i in range(0, len(sequence), 2):  # Assuming the sequences can be divided into di-nucleotides
        di_nucleotide = sequence[i:i+2]
        numerical_data.append(encoding.get(di_nucleotide, 0))  # Default 0 for unknown di-nucleotides
    
    # Split the numerical data into non-overlapping windows of size 10 and calculate the average
    window_avgs = [np.mean(numerical_data[i:i+10]) for i in range(0, len(numerical_data), 10)]
    return window_avgs

# Encoding scheme
encoding = {
    'AA': 0.00, 'AC': 0.50, 'AG': 0.50, 'AT': 0.00,
    'CA': 0.50, 'CC': 1.00, 'CG': 1.00, 'CT': 0.50,
    'GA': 0.50, 'GC': 1.00, 'GG': 1.00, 'GT': 0.50,
    'TA': 0.00, 'TC': 0.50, 'TG': 0.50, 'TT': 0.00
}

# Function to load and transform data
def load_and_transform_data(pos_file_path, neg_file_path, encoding):
    combined_data = pd.DataFrame()
    for file_path, label in [(pos_file_path, 1), (neg_file_path, 0)]:
        with open(file_path, 'r') as file:
            sequences = file.read().split('\n')
        transformed_data = [transform_sequence(seq, encoding) for seq in sequences if seq]
        labels = [label] * len(transformed_data)
        df = pd.DataFrame(transformed_data)
        df['label'] = labels
        combined_data = pd.concat([combined_data, df], ignore_index=True)
    # Handle varying window counts by padding with NaN and later imputing or dropping
    combined_data = combined_data.apply(lambda x: pd.Series(x.dropna().values)).fillna(0)
    return combined_data.drop('label', axis=1), combined_data['label']

# Load and transform data
X, y = load_and_transform_data('oripos.txt', 'orineg.txt', encoding)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)



classifiers = {
    # Excluded LDA and QDA due to collinearity warning
    'SVM': SVC(probability=True, random_state=101),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=101),
    'Logistic Regression': LogisticRegression(random_state=101),
    'Naive Bayes': GaussianNB(),
    'K-NN': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=101),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=101),
    'Decision Tree': DecisionTreeClassifier(random_state=101),
    #'Perceptron': Perceptron(random_state=101),
    #'SGD': SGDClassifier(random_state=101),
    'Bagging': BaggingClassifier(n_estimators=100, random_state=101),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=101),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=101),
    'LightGBM': LGBMClassifier(random_state=101),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=101)
}


# Train individual classifiers and print their performance
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"Results for {name}:")
    print(f"    Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"    Precision: {precision_score(y_test, y_pred, zero_division=0):.4f}")
    print(f"    Recall: {recall_score(y_test, y_pred, zero_division=0):.4f}")
    print(f"    F1 Score: {f1_score(y_test, y_pred, zero_division=0):.4f}\n")

# Create a VotingClassifier with soft voting (ensure all classifiers support predict_proba)
voting_clf_soft = VotingClassifier(estimators=list(classifiers.items()), voting='soft')
voting_clf_soft.fit(X_train, y_train)
y_pred_soft = voting_clf_soft.predict(X_test)
print("Soft Voting Classifier Performance:")
print(f"    Accuracy: {accuracy_score(y_test, y_pred_soft):.4f}")
print(f"    Precision: {precision_score(y_test, y_pred_soft, zero_division=0):.4f}")
print(f"    Recall: {recall_score(y_test, y_pred_soft, zero_division=0):.4f}")
print(f"    F1 Score: {f1_score(y_test, y_pred_soft, zero_division=0):.4f}")

# Optionally, create and evaluate a VotingClassifier with hard voting
voting_clf_hard = VotingClassifier(estimators=list(classifiers.items()), voting='hard')
voting_clf_hard.fit(X_train, y_train)
y_pred_hard = voting_clf_hard.predict(X_test)
print("Hard Voting Classifier Performance:")
print(f"    Accuracy: {accuracy_score(y_test, y_pred_hard):.4f}")
print(f"    Precision: {precision_score(y_test, y_pred_hard, zero_division=0):.4f}")
print(f"    Recall: {recall_score(y_test, y_pred_hard, zero_division=0):.4f}")
print(f"    F1 Score: {f1_score(y_test, y_pred_hard, zero_division=0):.4f}")


Results for SVM:
    Accuracy: 0.7850
    Precision: 0.7471
    Recall: 0.8467
    F1 Score: 0.7937

Results for Random Forest:
    Accuracy: 0.7915
    Precision: 0.7622
    Recall: 0.8333
    F1 Score: 0.7962

Results for Logistic Regression:
    Accuracy: 0.7818
    Precision: 0.7578
    Recall: 0.8133
    F1 Score: 0.7846

Results for Naive Bayes:
    Accuracy: 0.8046
    Precision: 0.7711
    Recall: 0.8533
    F1 Score: 0.8101

Results for K-NN:
    Accuracy: 0.7296
    Precision: 0.7006
    Recall: 0.7800
    F1 Score: 0.7382

Results for Gradient Boosting:
    Accuracy: 0.7687
    Precision: 0.7484
    Recall: 0.7933
    F1 Score: 0.7702

Results for AdaBoost:
    Accuracy: 0.7785
    Precision: 0.7562
    Recall: 0.8067
    F1 Score: 0.7806

Results for Decision Tree:
    Accuracy: 0.7003
    Precision: 0.6747
    Recall: 0.7467
    F1 Score: 0.7089

Results for Bagging:
    Accuracy: 0.7883
    Precision: 0.7707
    Recall: 0.8067
    F1 Score: 0.7883

Results for Extra Trees

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV

# Function to transform sequence data
def transform_sequence(sequence, encoding):
    numerical_data = []
    for i in range(0, len(sequence), 2):
        di_nucleotide = sequence[i:i+2]
        numerical_data.append(encoding.get(di_nucleotide, 0))
    window_avgs = [np.mean(numerical_data[i:i+10]) for i in range(0, len(numerical_data), 10)]
    return window_avgs

# Encoding scheme
encoding = {
    'AA': 0.00, 'AC': 0.50, 'AG': 0.50, 'AT': 0.00,
    'CA': 0.50, 'CC': 1.00, 'CG': 1.00, 'CT': 0.50,
    'GA': 0.50, 'GC': 1.00, 'GG': 1.00, 'GT': 0.50,
    'TA': 0.00, 'TC': 0.50, 'TG': 0.50, 'TT': 0.00
}

# Load and transform data function
def load_and_transform_data(pos_file_path, neg_file_path, encoding):
    combined_data = pd.DataFrame()
    for file_path, label in [(pos_file_path, 1), (neg_file_path, 0)]:
        with open(file_path, 'r') as file:
            sequences = file.read().split('\n')
        transformed_data = [transform_sequence(seq, encoding) for seq in sequences if seq]
        labels = [label] * len(transformed_data)
        df = pd.DataFrame(transformed_data)
        df['label'] = labels
        combined_data = pd.concat([combined_data, df], ignore_index=True)
    combined_data = combined_data.apply(lambda x: pd.Series(x.dropna().values)).fillna(0)
    return combined_data.drop('label', axis=1), combined_data['label']

# Load and transform data
X, y = load_and_transform_data('oripos.txt', 'orineg.txt', encoding)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# Base classifiers for stacking
base_classifiers = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=101)),
    ('svc', SVC(probability=True, random_state=101)),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=101)),
    ('dt', DecisionTreeClassifier(random_state=101)),
    ('lr', LogisticRegression(random_state=101)),
    ('gnb', GaussianNB()),
    ('adb', AdaBoostClassifier(n_estimators=100, random_state=101)),
    ('et', ExtraTreesClassifier(n_estimators=100, random_state=101)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=101)),
    ('lgbm', LGBMClassifier(random_state=101)),
    ('catboost', CatBoostClassifier(verbose=0, random_state=101)),
    ('cal_sgd', CalibratedClassifierCV(SGDClassifier(random_state=101), method='sigmoid', cv=5))
]

# Meta-classifier for stacking
meta_classifier = RandomForestClassifier(n_estimators=100, random_state=101)

# Stacking classifier
stacking_clf = StackingClassifier(estimators=base_classifiers, final_estimator=meta_classifier, cv=5)
stacking_clf.fit(X_train, y_train)

# Predictions and evaluation
y_pred_stack = stacking_clf.predict(X_test)
print("Stacking Classifier Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_stack):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_stack, zero_division=0):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_stack, zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_stack, zero_division=0):.4f}")


[LightGBM] [Info] Number of positive: 264, number of negative: 283
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000905 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 357
[LightGBM] [Info] Number of data points in the train set: 547, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.482633 -> initscore=-0.069498
[LightGBM] [Info] Start training from score -0.069498
[LightGBM] [Info] Number of positive: 211, number of negative: 226
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000336 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 348
[LightGBM] [Info] Number of data points in the train set: 437, number of used features: 25
[LightGBM] [Info] [binary:BoostF

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from collections import Counter
import math
import time

# Encoding schemes
encodings = {
    'free_energy': {
        'AA': -1.0, 'AC': -1.44, 'AG': -1.28, 'AT': -0.88, 
        'CA': -1.45, 'CC': -1.84, 'CG': -2.17, 'CT': -1.28, 
        'GA': -1.30, 'GC': -2.24, 'GG': -1.84, 'GT': -1.44, 
        'TA': -0.58, 'TC': -1.30, 'TG': -1.45, 'TT': -1.0
    },
    # Add other encodings here if needed
}

# Function to calculate dinucleotide perplexity
def calculate_perplexity(seq, window_size=17):
    if len(seq) < window_size:
        return []

    seq_array = np.array(list(seq))
    perplexities = []

    for i in range(0, len(seq) - window_size + 1, 8):
        window = seq_array[i:i + window_size]
        dinucleotide_counts = Counter("".join(window[j:j + 2]) for j in range(window_size - 1))
        total_dinucleotides = window_size - 1

        probabilities = {k: v / total_dinucleotides for k, v in dinucleotide_counts.items()}
        entropy = -sum(p * math.log2(p) for p in probabilities.values() if p > 0)
        perplexity = 2 ** entropy
        perplexities.append(perplexity)
    
    return perplexities

# Function to transform sequence data
def transform_sequence(sequence, encodings, window_size=17):
    transformed = []
    for encoding in encodings.values():
        value = sum(encoding.get(sequence[i:i+2], 0) for i in range(len(sequence) - 1)) / (len(sequence) - 1)
        transformed.append(value)
    
    # Add perplexity as an additional feature
    perplexities = calculate_perplexity(sequence, window_size)
    if perplexities:
        transformed.append(np.mean(perplexities))
    else:
        transformed.append(0)
    
    return transformed

# Function to load and transform data from specific regions
def load_and_transform_data(file_path, pos_range, neg_range, encodings):
    with open(file_path, 'r') as file:
        sequences = file.read().split('\n')
    
    pos_sequences = [seq[pos_range[0]-1:pos_range[1]] for seq in sequences if len(seq) >= pos_range[1]]
    neg_sequences = [seq[neg_range[0]-1:neg_range[1]] for seq in sequences if len(seq) >= neg_range[1]]
    
    pos_data = pd.DataFrame([transform_sequence(seq, encodings) for seq in pos_sequences])
    pos_data['label'] = 1
    neg_data = pd.DataFrame([transform_sequence(seq, encodings) for seq in neg_sequences])
    neg_data['label'] = 0
    
    return pd.concat([pos_data, neg_data], ignore_index=True)

# Classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=101),
    'SVM': SVC(probability=True, random_state=101),
    'Logistic Regression': LogisticRegression(max_iter=5000, random_state=101),
    'Naive Bayes': GaussianNB(),
    'K-NN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=101),
    'AdaBoost': AdaBoostClassifier(random_state=101),
    'Decision Tree': DecisionTreeClassifier(random_state=101),
    'Perceptron': SGDClassifier(loss='perceptron', random_state=101),
    'SGD': SGDClassifier(random_state=101),
    'Bagging': BaggingClassifier(n_estimators=100, random_state=101),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=101),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=101, iterations=100),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=101)
}

# Combinations of positive and negative sequence ranges
combinations = [
    ((5000, 5100), (1000, 1100)),
    ((5000, 5200), (1000, 1200)),
    ((5000, 5300), (1000, 1300)),
    ((5000, 5400), (1000, 1400)),
    ((5000, 5500), (1000, 1500))
]

# Main process
start_time = time.time()
results = []
for pos_range, neg_range in combinations:
    data = load_and_transform_data('Arabidopsis_correct_10001', pos_range, neg_range, encodings)
    X_train, X_test, y_train, y_test = train_test_split(data.drop('label', axis=1), data['label'], test_size=0.1, random_state=101)
    
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        results.append({
            'Combination': f'Pos: {pos_range}, Neg: {neg_range}',
            'Classifier': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred, zero_division=0),
            'F1 Score': f1_score(y_test, y_pred, zero_division=0)
        })

# Convert results to DataFrame and display
results_df = pd.DataFrame(results)
print(f"Execution Time: {time.time() - start_time:.2f} seconds")
print(results_df)

# Save results to an Excel file
results_df.to_excel('classification_results.xlsx', index=False)

# Save to a CSV file
results_df.to_csv('classification_results.csv', index=False)


Execution Time: 62.75 seconds
                             Combination           Classifier  Accuracy  \
0   Pos: (5000, 5100), Neg: (1000, 1100)        Random Forest  0.485342   
1   Pos: (5000, 5100), Neg: (1000, 1100)                  SVM  0.488599   
2   Pos: (5000, 5100), Neg: (1000, 1100)  Logistic Regression  0.537459   
3   Pos: (5000, 5100), Neg: (1000, 1100)          Naive Bayes  0.534202   
4   Pos: (5000, 5100), Neg: (1000, 1100)                 K-NN  0.547231   
..                                   ...                  ...       ...   
65  Pos: (5000, 5500), Neg: (1000, 1500)                  SGD  0.765472   
66  Pos: (5000, 5500), Neg: (1000, 1500)              Bagging  0.713355   
67  Pos: (5000, 5500), Neg: (1000, 1500)          Extra Trees  0.716612   
68  Pos: (5000, 5500), Neg: (1000, 1500)             CatBoost  0.752443   
69  Pos: (5000, 5500), Neg: (1000, 1500)              XGBoost  0.732899   

    Precision    Recall  F1 Score  
0    0.474026  0.486667  0.480263

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from collections import Counter
import math
import time

# Function to calculate dinucleotide perplexity
def calculate_perplexity(seq, window_size=17):
    if len(seq) < window_size:
        return []

    seq_array = np.array(list(seq))
    perplexities = []

    for i in range(0, len(seq) - window_size + 1, 8):
        window = seq_array[i:i + window_size]
        dinucleotide_counts = Counter("".join(window[j:j + 2]) for j in range(window_size - 1))
        total_dinucleotides = window_size - 1

        probabilities = {k: v / total_dinucleotides for k, v in dinucleotide_counts.items()}
        entropy = -sum(p * math.log2(p) for p in probabilities.values() if p > 0)
        perplexity = 2 ** entropy
        perplexities.append(perplexity)
    
    return perplexities

# Function to transform sequence data
def transform_sequence(sequence, window_size=17):
    transformed = []

    # Add perplexity as the only feature
    perplexities = calculate_perplexity(sequence, window_size)
    if perplexities:
        transformed.append(np.mean(perplexities))
    else:
        transformed.append(0)
    
    return transformed

# Function to load and transform data from specific regions
def load_and_transform_data(file_path, pos_range, neg_range):
    with open(file_path, 'r') as file:
        sequences = file.read().split('\n')
    
    pos_sequences = [seq[pos_range[0]-1:pos_range[1]] for seq in sequences if len(seq) >= pos_range[1]]
    neg_sequences = [seq[neg_range[0]-1:neg_range[1]] for seq in sequences if len(seq) >= neg_range[1]]
    
    pos_data = pd.DataFrame([transform_sequence(seq) for seq in pos_sequences])
    pos_data['label'] = 1
    neg_data = pd.DataFrame([transform_sequence(seq) for seq in neg_sequences])
    neg_data['label'] = 0
    
    return pd.concat([pos_data, neg_data], ignore_index=True)

# Classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=101),
    'SVM': SVC(probability=True, random_state=101),
    'Logistic Regression': LogisticRegression(max_iter=5000, random_state=101),
    'Naive Bayes': GaussianNB(),
    'K-NN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=101),
    'AdaBoost': AdaBoostClassifier(random_state=101),
    'Decision Tree': DecisionTreeClassifier(random_state=101),
    'Perceptron': SGDClassifier(loss='perceptron', random_state=101),
    'SGD': SGDClassifier(random_state=101),
    'Bagging': BaggingClassifier(n_estimators=100, random_state=101),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=101),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=101, iterations=100),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=101)
}

# Combinations of positive and negative sequence ranges
combinations = [
    ((5000, 5100), (1000, 1100)),
    ((5000, 5200), (1000, 1200)),
    ((5000, 5300), (1000, 1300)),
    ((5000, 5400), (1000, 1400)),
    ((5000, 5500), (1000, 1500))
]

# Main process
start_time = time.time()
results = []
for pos_range, neg_range in combinations:
    data = load_and_transform_data('Arabidopsis_correct_10001', pos_range, neg_range)
    X_train, X_test, y_train, y_test = train_test_split(data.drop('label', axis=1), data['label'], test_size=0.1, random_state=101)
    
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        results.append({
            'Combination': f'Pos: {pos_range}, Neg: {neg_range}',
            'Classifier': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred, zero_division=0),
            'F1 Score': f1_score(y_test, y_pred, zero_division=0)
        })

# Convert results to DataFrame and display
results_df = pd.DataFrame(results)
print(f"Execution Time: {time.time() - start_time:.2f} seconds")
print(results_df)

# Save results to an Excel file
results_df.to_excel('classification_results.xlsx', index=False)

# Save to a CSV file
results_df.to_csv('classification_results.csv', index=False)


Execution Time: 48.03 seconds
                             Combination           Classifier  Accuracy  \
0   Pos: (5000, 5100), Neg: (1000, 1100)        Random Forest  0.514658   
1   Pos: (5000, 5100), Neg: (1000, 1100)                  SVM  0.501629   
2   Pos: (5000, 5100), Neg: (1000, 1100)  Logistic Regression  0.498371   
3   Pos: (5000, 5100), Neg: (1000, 1100)          Naive Bayes  0.521173   
4   Pos: (5000, 5100), Neg: (1000, 1100)                 K-NN  0.462541   
..                                   ...                  ...       ...   
65  Pos: (5000, 5500), Neg: (1000, 1500)                  SGD  0.638436   
66  Pos: (5000, 5500), Neg: (1000, 1500)              Bagging  0.527687   
67  Pos: (5000, 5500), Neg: (1000, 1500)          Extra Trees  0.517915   
68  Pos: (5000, 5500), Neg: (1000, 1500)             CatBoost  0.622150   
69  Pos: (5000, 5500), Neg: (1000, 1500)              XGBoost  0.557003   

    Precision    Recall  F1 Score  
0    0.503311  0.506667  0.504983

PermissionError: [Errno 13] Permission denied: 'classification_results.csv'